about | summary | refs | log | tree | commit | diff
path: root/workflows/pull-data
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/pull-data')
-rw-r--r-- workflows/pull-data/genbank/.guix-run 2
-rw-r--r-- workflows/pull-data/genbank/genbank.py 17
2 files changed, 7 insertions, 12 deletions
diff --git a/workflows/pull-data/genbank/.guix-run b/workflows/pull-data/genbank/.guix-run
index 6db7871..f6b1a0c 100644
--- a/workflows/pull-data/genbank/.guix-run
+++ b/workflows/pull-data/genbank/.guix-run
@@ -4,5 +4,5 @@
echo # next run:
echo 'export PATH=$GUIX_ENVIRONMENT/bin:$PATH'
-~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby
+~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby jq
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 026c03f..a994055 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -90,10 +90,9 @@ def get_metadata(id, gbseq):
except AttributeError:
warn("Missing "+msg)
- host.host_species = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
sample.sample_id = id
sample.database = "https://www.ncbi.nlm.nih.gov/genbank/"
- sample.source_database_accession = f"http://identifiers.org/insdc/{id}#sequence"
+ sample.source_database_accession = [ f"http://identifiers.org/insdc/{id}#sequence" ]
# <GBQualifier_value>USA: Cruise_Ship_1, California</GBQualifier_value>
n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='country']/../GBQualifier_value")
if n: sample.collection_location = n
@@ -112,7 +111,7 @@ def get_metadata(id, gbseq):
if n != 'Unpublished':
institute,address = n.split(',',1)
if ")" in institute:
- submitter.submitter_name = institute.split(')')[1]
+ submitter.submitter_name = [institute.split(')')[1].strip()]
submitter.submitter_address = address.strip()
except AttributeError:
pass
@@ -129,13 +128,13 @@ def get_metadata(id, gbseq):
# technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628'
p = re.compile(r'.*Assembly Method :: ([^;]+).*')
m = p.match(n)
- if m: technology.alignment_protocol = m.group(1)
+ if m: technology.alignment_protocol = m.group(1).strip()
p = re.compile(r'.*Coverage :: ([^;]+).*')
m = p.match(n)
if m: technology.sequencing_coverage = m.group(1)
p = re.compile(r'.*Sequencing Technology :: ([^;]+).*')
m = p.match(n)
- if m: technology.sample_sequencing_technology = m.group(1).strip()
+ if m: technology.sample_sequencing_technology = [m.group(1).strip()]
else: warn("Missing sample_sequencing_technology")
# --- Dates
@@ -167,10 +166,7 @@ def get_metadata(id, gbseq):
n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='host']/../GBQualifier_value")
if n:
list = n.split('; ')
- species = list[0]
- host.host_species = species
- if species != "Homo sapiens":
- warn(f"Species not understood: {species}")
+ host.host_species = list[0]
if len(list)>1:
sex = list[1]
if 'male' in sex or 'gender: M' in sex: host.host_sex = 'male'
@@ -183,13 +179,12 @@ def get_metadata(id, gbseq):
if m:
host.host_age = int(m.group(1))
host.host_age_unit = 'http://purl.obolibrary.org/obo/UO_0000036'
- # sys.exit(1)
n = fetch("virus_strain", ".//GBQualifier/GBQualifier_name/[.='isolate']/../GBQualifier_value")
if n: virus.virus_strain = n
n = fetch("virus_species", ".//GBQualifier/GBQualifier_name/[.='db_xref']/../GBQualifier_value")
if n: virus.virus_species = "http://purl.obolibrary.org/obo/NCBITaxon_"+n.split('taxon:')[1]
n = fetch("specimen_source", ".//GBQualifier/GBQualifier_name/[.='isolation_source']/../GBQualifier_value")
- if n: sample.specimen_source = n
+ if n: sample.specimen_source = [n]
info = {
'id': 'placeholder',