aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Peter Amstutz 2020-04-30 10:22:27 -0400
committer Peter Amstutz 2020-04-30 10:22:27 -0400
commit 347b8dce36832c6d3e379d81b3efefcbc88a3117 (patch)
tree 1d0f62247576dfd746511aed683acd65072b1783
parent 3475c3e30197a11ada10d1b0a4bafb026f2fb580 (diff)
download bh20-seq-resource-347b8dce36832c6d3e379d81b3efefcbc88a3117.tar.gz
bh20-seq-resource-347b8dce36832c6d3e379d81b3efefcbc88a3117.tar.lz
bh20-seq-resource-347b8dce36832c6d3e379d81b3efefcbc88a3117.zip
Wrap import script to run as a workflow
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
-rw-r--r-- scripts/docker/Dockerfile 10
-rwxr-xr-x scripts/from_genbank_to_fasta_and_yaml.py 30
-rw-r--r-- scripts/import.cwl 24
-rw-r--r-- scripts/import_to_arvados.py 13
4 files changed, 62 insertions, 15 deletions
diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile
new file mode 100644
index 0000000..5bd38dd
--- /dev/null
+++ b/scripts/docker/Dockerfile
@@ -0,0 +1,10 @@
+FROM debian:10
+# System packages installed ahead of the pip3 step. NOTE(review): python-pycurl is Debian's Python 2 package while the rest are python3-* -- confirm it is intended.
+RUN apt-get update && \
+ apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \
+ python3 python3-pip python3-setuptools python3-dev python-pycurl \
+ clustalw python3-biopython libcurl4-openssl-dev build-essential \
+ libssl-dev && \
+ apt-get clean
+# Install the bh20-seq-uploader package with pip3.
+RUN pip3 install bh20-seq-uploader
\ No newline at end of file
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 21ed3b2..2564b51 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -43,13 +43,13 @@ if not os.path.exists(dir_metadata):
print(term_list, len(id_set))
- with open(path_ncbi_virus_accession) as f:
- tmp_list = [line.strip('\n') for line in f]
-
- print('NCBI Virus', len(tmp_list))
- id_set.update(tmp_list)
-
- print(term_list + ['NCBI Virus'], len(id_set))
+ if os.path.exists(path_ncbi_virus_accession):
+ with open(path_ncbi_virus_accession) as f:
+ tmp_list = [line.strip('\n') for line in f]
+ print('NCBI Virus', len(tmp_list))
+ id_set.update(tmp_list)
+ term_list.append('NCBI Virus')
+ print(term_list, len(id_set))
for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i))
@@ -85,7 +85,7 @@ if not os.path.exists(dir_fasta_and_yaml):
os.makedirs(dir_fasta_and_yaml)
missing_value_list = []
-
+
for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
tree = ET.parse(path_metadata_xxx_xml)
GBSet = tree.getroot()
@@ -109,20 +109,20 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
'submitter': {}
}
-
+
info_for_yaml_dict['sample']['sample_id'] = accession_version
info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now
-
-
+
+
# submitter info
GBSeq_references = GBSeq.find('GBSeq_references')
if GBSeq_references is not None:
info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')]
-
+
GBReference = GBSeq_references.find('GBReference')
if GBReference is not None:
GBReference_journal = GBReference.find('GBReference_journal')
-
+
if GBReference_journal is not None and GBReference_journal.text != 'Unpublished':
if 'Submitted' in GBReference_journal.text:
info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())]
@@ -207,7 +207,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
elif GBQualifier_name_text == 'isolation_source':
if GBQualifier_value_text.upper() in term_to_uri_dict:
GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa'
-
+
if GBQualifier_value_text in term_to_uri_dict:
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
else:
@@ -250,7 +250,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw:
json.dump(info_for_yaml_dict, fw, indent=2)
-
+
if len(missing_value_list) > 0:
with open('missing_terms.tsv', 'w') as fw:
fw.write('\n'.join(missing_value_list))
diff --git a/scripts/import.cwl b/scripts/import.cwl
new file mode 100644
index 0000000..81752c8
--- /dev/null
+++ b/scripts/import.cwl
@@ -0,0 +1,24 @@
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3  # interpreter; the positional inputs below become its arguments
+inputs:
+ scripts:  # wrapper script that python3 executes (first positional argument)
+ type: File
+ default:
+ class: File
+ location: import_to_arvados.py
+ inputBinding: {position: 1}
+ importScript:  # comes after the wrapper, so the wrapper sees it as sys.argv[1]
+ type: File
+ default:
+ class: File
+ location: from_genbank_to_fasta_and_yaml.py
+ inputBinding: {position: 2}
+outputs: []  # nothing is collected as a CWL output
+requirements:
+ DockerRequirement:
+ dockerPull: bh20-seq-uploader/import
+ NetworkAccess:
+ networkAccess: true  # the tool is allowed outbound network access
+ WorkReuse:
+ workReuse: false  # do not reuse a cached result; run every time
diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py
new file mode 100644
index 0000000..07b7d71
--- /dev/null
+++ b/scripts/import_to_arvados.py
@@ -0,0 +1,13 @@
+import os
+import subprocess
+import glob
+import sys
+
+os.chdir(os.environ["TMPDIR"])
+subprocess.run(sys.argv[1])
+
+os.chdir("fasta_and_yaml")
+fasta_files = glob.glob("*.fasta")
+
+for f in fasta_files:
+ subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]])