aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data/genbank
diff options
context:
space:
mode:
author: Pjotr Prins, 2021-01-03 09:48:14 +0000
committer: Pjotr Prins, 2021-01-03 09:48:14 +0000
commit: a10c61d1f5afca70f54b762beefb402372d7fd31 (patch)
tree: 40824a1aad7c532979c9dada50163d745d0e3373 /workflows/pull-data/genbank
parent: a5ba1a8062e7116c2951762f86a6ae6d1638261d (diff)
downloadbh20-seq-resource-a10c61d1f5afca70f54b762beefb402372d7fd31.tar.gz
bh20-seq-resource-a10c61d1f5afca70f54b762beefb402372d7fd31.tar.lz
bh20-seq-resource-a10c61d1f5afca70f54b762beefb402372d7fd31.zip
genbank: technology parsing
Diffstat (limited to 'workflows/pull-data/genbank')
-rw-r--r-- workflows/pull-data/genbank/genbank.py 23
-rw-r--r-- workflows/pull-data/genbank/ref.py 18
2 files changed, 22 insertions, 19 deletions
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 8f6ba06..314f50d 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -55,12 +55,15 @@ Example of an output JSON:
Note: missing data should be None! Do not fill in other data by
'guessing'.
+When data is malformed a warning should be issued.
+
"""
def get_metadata(id, gbseq):
host = types.SimpleNamespace()
sample = types.SimpleNamespace()
submitter = types.SimpleNamespace()
+ technology = types.SimpleNamespace()
warnings = []
def warn(msg):
@@ -95,6 +98,24 @@ def get_metadata(id, gbseq):
submitter.additional_submitter_information = n
pass
+ try:
+ n = gbseq.find("./GBSeq_comment").text
+ except AttributeError:
+ pass
+ if 'Assembly-Data' in n:
+ # print(n,file=sys.stderr)
+ # the following is wrong (de novo by default)
+ # technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628'
+ p = re.compile(r'.*Assembly Method :: ([^;]+).*')
+ m = p.match(n)
+ if m: technology.alignment_protocol = m.group(1)
+ p = re.compile(r'.*Coverage :: ([^;]+).*')
+ m = p.match(n)
+ if m: technology.sequencing_coverage = m.group(1)
+ p = re.compile(r'.*Sequencing Technology :: ([^;]+).*')
+ m = p.match(n)
+ if m: technology.sample_sequencing_technology = m.group(1).strip()
+
# --- Dates
n = gbseq.find("./GBSeq_create-date")
creation_date = dateparse(n.text).date()
@@ -117,7 +138,7 @@ def get_metadata(id, gbseq):
'host': host,
'sample': sample,
#'virus': virus,
- #'technology': technology,
+ 'technology': technology,
'submitter': submitter,
'warnings': warnings,
}
diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py
index 66c9fb0..d809d7f 100644
--- a/workflows/pull-data/genbank/ref.py
+++ b/workflows/pull-data/genbank/ref.py
@@ -1,23 +1,5 @@
# ---- BELOW IS JUST FOR REFERENCE ----
- # This script download and prepare data and metadata for assemblies samples
- technology['assembly_method'] = 'http://purl.obolibrary.org/obo/GENEPIO_0001628'
-
- GBSeq_comment = GBSeq.find('GBSeq_comment')
- if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text:
- prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly'
-
- GBSeq_comment_text = GBSeq_comment.text.split(
- '{}-Data-START## ; '.format(prefix_split_string)
- )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0]
-
- for info_to_check, field_in_yaml in zip(
- ['Assembly Method', 'Coverage', 'Sequencing Technology'],
- ['alignment_protocol', 'sequencing_coverage', 'sample_sequencing_technology']
- ):
- if info_to_check in GBSeq_comment_text:
- tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0]
-
if field_in_yaml == 'sequencing_coverage':
# A regular expression would be better!
try: