diff options
author | Peter Amstutz | 2020-04-30 10:22:27 -0400 |
---|---|---|
committer | Peter Amstutz | 2020-04-30 10:22:27 -0400 |
commit | 347b8dce36832c6d3e379d81b3efefcbc88a3117 (patch) | |
tree | 1d0f62247576dfd746511aed683acd65072b1783 | |
parent | 3475c3e30197a11ada10d1b0a4bafb026f2fb580 (diff) | |
download | bh20-seq-resource-347b8dce36832c6d3e379d81b3efefcbc88a3117.tar.gz bh20-seq-resource-347b8dce36832c6d3e379d81b3efefcbc88a3117.tar.lz bh20-seq-resource-347b8dce36832c6d3e379d81b3efefcbc88a3117.zip |
Wrap import script to run as a workflow
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
-rw-r--r-- | scripts/docker/Dockerfile | 10 | ||||
-rwxr-xr-x | scripts/from_genbank_to_fasta_and_yaml.py | 30 | ||||
-rw-r--r-- | scripts/import.cwl | 24 | ||||
-rw-r--r-- | scripts/import_to_arvados.py | 13 |
4 files changed, 62 insertions, 15 deletions
diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile new file mode 100644 index 0000000..5bd38dd --- /dev/null +++ b/scripts/docker/Dockerfile @@ -0,0 +1,10 @@ +FROM debian:10 + +RUN apt-get update && \ + apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \ + python3 python3-pip python3-setuptools python3-dev python-pycurl \ + clustalw python3-biopython libcurl4-openssl-dev build-essential \ + libssl-dev && \ + apt-get clean + +RUN pip3 install bh20-seq-uploader
\ No newline at end of file diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 21ed3b2..2564b51 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -43,13 +43,13 @@ if not os.path.exists(dir_metadata): print(term_list, len(id_set)) - with open(path_ncbi_virus_accession) as f: - tmp_list = [line.strip('\n') for line in f] - - print('NCBI Virus', len(tmp_list)) - id_set.update(tmp_list) - - print(term_list + ['NCBI Virus'], len(id_set)) + if os.path.exists(path_ncbi_virus_accession): + with open(path_ncbi_virus_accession) as f: + tmp_list = [line.strip('\n') for line in f] + print('NCBI Virus', len(tmp_list)) + id_set.update(tmp_list) + term_list.append('NCBI Virus') + print(term_list, len(id_set)) for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i)) @@ -85,7 +85,7 @@ if not os.path.exists(dir_fasta_and_yaml): os.makedirs(dir_fasta_and_yaml) missing_value_list = [] - + for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: tree = ET.parse(path_metadata_xxx_xml) GBSet = tree.getroot() @@ -109,20 +109,20 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) 'submitter': {} } - + info_for_yaml_dict['sample']['sample_id'] = accession_version info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now - - + + # submitter info GBSeq_references = GBSeq.find('GBSeq_references') if GBSeq_references is not None: info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] - + GBReference = GBSeq_references.find('GBReference') if GBReference is not None: GBReference_journal = GBReference.find('GBReference_journal') - + if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': if 'Submitted' in GBReference_journal.text: info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] @@ -207,7 +207,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) elif GBQualifier_name_text == 'isolation_source': if GBQualifier_value_text.upper() in term_to_uri_dict: GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' - + if GBQualifier_value_text in term_to_uri_dict: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] else: @@ -250,7 +250,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) - + if len(missing_value_list) > 0: with open('missing_terms.tsv', 'w') as fw: fw.write('\n'.join(missing_value_list)) diff --git a/scripts/import.cwl b/scripts/import.cwl new file mode 100644 index 0000000..81752c8 --- /dev/null +++ b/scripts/import.cwl @@ -0,0 +1,24 @@ +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 +inputs: + scripts: + type: File + default: + class: File + location: import_to_arvados.py + inputBinding: {position: 1} + importScript: + type: File + default: + class: File + location: from_genbank_to_fasta_and_yaml.py + inputBinding: {position: 2} +outputs: [] +requirements: + DockerRequirement: + dockerPull: bh20-seq-uploader/import + NetworkAccess: + networkAccess: true + WorkReuse: + workReuse: false diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py new file mode 100644 index 0000000..07b7d71 --- /dev/null +++ b/scripts/import_to_arvados.py @@ -0,0 +1,13 @@ +import os +import subprocess +import glob +import sys + +os.chdir(os.environ["TMPDIR"]) +subprocess.run(sys.argv[1]) + +os.chdir("fasta_and_yaml") +fasta_files = glob.glob("*.fasta") + +for f in fasta_files: + subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]]) |