From 6032a373003affa641ca1e70a44c29a232b5b3ed Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 28 Apr 2020 20:31:42 +0200 Subject: Changes to the structure - we use lists now instead of strings where it makes sense. This allows us to have multiple values where in makes sense --- scripts/from_genbank_to_fasta_and_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'scripts/from_genbank_to_fasta_and_yaml.py') diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 5257bd1..148a7e1 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -112,7 +112,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['sample_id'] = accession_version - info_for_yaml_dict['sample']['source_database_accession'] = accession_version + info_for_yaml_dict['sample']['source_database_accession'] = "http://identifiers.org/insdc/"+accession_version+"#sequence" #accession is turned into resolvable URL/URI now # submitter info -- cgit v1.2.3 From 8f5853364360357e8424f21ea7ab05e73aa7a367 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Tue, 28 Apr 2020 22:52:55 +0200 Subject: updated to manage list fields and added new control on nasopharyngeal/throat swab --- scripts/from_genbank_to_fasta_and_yaml.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'scripts/from_genbank_to_fasta_and_yaml.py') diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 148a7e1..21ed3b2 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -37,8 +37,7 @@ if not os.path.exists(dir_metadata): tmp_list = [x.split('.')[0] for x in tmp_list] print(term, len(tmp_list)) - tmp_list=tmp_list - # tmp_list = tmp_list[0:2] # restricting to small run + #tmp_list = tmp_list[0:2] # restricting to small run id_set.update([x.split('.')[0] for x in tmp_list]) @@ -112,13 +111,13 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['sample_id'] = accession_version - info_for_yaml_dict['sample']['source_database_accession'] = "http://identifiers.org/insdc/"+accession_version+"#sequence" #accession is turned into resolvable URL/URI now + info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now # submitter info GBSeq_references = GBSeq.find('GBSeq_references') if GBSeq_references is not None: - info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq_references.iter('GBAuthor')]) + info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] GBReference = GBSeq_references.find('GBReference') if GBReference is not None: @@ -126,7 +125,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': if 'Submitted' in GBReference_journal.text: - info_for_yaml_dict['submitter']['submitter_name'] = GBReference_journal.text.split(') ')[1].split(',')[0].strip() + info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip() else: info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text @@ -146,8 +145,9 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if field_in_yaml == 'sequencing_coverage': # A regular expression would be better! try: - info_for_yaml_dict['technology'][field_in_yaml] = float( - tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) + info_for_yaml_dict['technology'][field_in_yaml] = [ + float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) + ] except ValueError: print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse) pass @@ -162,8 +162,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) new_seq_tec_list.append(seq_tec) - for n, seq_tec in enumerate(new_seq_tec_list): - info_for_yaml_dict['technology'][field_in_yaml + ('' if n == 0 else str(n + 1))] = seq_tec + info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list] else: info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse @@ -210,17 +209,14 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' if GBQualifier_value_text in term_to_uri_dict: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict[GBQualifier_value_text] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] else: if GBQualifier_value_text in ['NP/OP swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'np/np swab', 'np/op']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['oropharyngeal swab'] - elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab'] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] + elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal aspirate'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab'] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']] else: missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) elif GBQualifier_name_text == 'collection_date': -- cgit v1.2.3 From 61a083081cd2d70a25eba4cdae4f85c774b25b95 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Wed, 29 Apr 2020 17:03:25 +0200 Subject: the date is now handled more formally all the date are saved as "YYYY-MM-DD"--- scripts/from_genbank_to_fasta_and_yaml.py | 34 +++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'scripts/from_genbank_to_fasta_and_yaml.py') diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 21ed3b2..0175d3c 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -7,6 +7,8 @@ import xml.etree.ElementTree as ET import json import os +from dateutil import parser + num_ids_for_request = 100 dir_metadata = 'metadata_from_nuccore' @@ -221,7 +223,32 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) elif GBQualifier_name_text == 'collection_date': # TO_DO: which format we will use? - info_for_yaml_dict['sample']['collection_date'] = GBQualifier_value_text + date_to_write = GBQualifier_value_text + + if len(GBQualifier_value_text.split('-')) == 1: + if int(GBQualifier_value_text) < 2020: + date_to_write = "15 12 {}".format(GBQualifier_value_text) + else: + date_to_write = "15 01 {}".format(GBQualifier_value_text) + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 2: + date_to_write += '-15' + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 3: + GBQualifier_value_text_list = GBQualifier_value_text.split('-') + + if GBQualifier_value_text_list[1].isalpha(): + date_to_write = GBQualifier_value_text_list[1] + ' ' + GBQualifier_value_text_list[0] + ' ' + GBQualifier_value_text_list[2] + + info_for_yaml_dict['sample']['collection_date'] = date_to_write elif GBQualifier_name_text in ['lat_lon', 'country']: if GBQualifier_value_text == 'Hong Kong': GBQualifier_value_text = 'China: Hong Kong' @@ -233,7 +260,10 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text elif GBQualifier_name_text == 'note': - info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text + else: + info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text elif GBQualifier_name_text == 'isolate': info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text elif GBQualifier_name_text == 'db_xref': -- cgit v1.2.3 From 347b8dce36832c6d3e379d81b3efefcbc88a3117 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 30 Apr 2020 10:22:27 -0400 Subject: Wrap import script to run as a workflow Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- scripts/docker/Dockerfile | 10 ++++++++++ scripts/from_genbank_to_fasta_and_yaml.py | 30 +++++++++++++++--------------- scripts/import.cwl | 24 ++++++++++++++++++++++++ scripts/import_to_arvados.py | 13 +++++++++++++ 4 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 scripts/docker/Dockerfile create mode 100644 scripts/import.cwl create mode 100644 scripts/import_to_arvados.py (limited to 'scripts/from_genbank_to_fasta_and_yaml.py') diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile new file mode 100644 index 0000000..5bd38dd --- /dev/null +++ b/scripts/docker/Dockerfile @@ -0,0 +1,10 @@ +FROM debian:10 + +RUN apt-get update && \ + apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \ + python3 python3-pip python3-setuptools python3-dev python-pycurl \ + clustalw python3-biopython libcurl4-openssl-dev build-essential \ + libssl-dev && \ + apt-get clean + +RUN pip3 install bh20-seq-uploader \ No newline at end of file diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 21ed3b2..2564b51 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -43,13 +43,13 @@ if not os.path.exists(dir_metadata): print(term_list, len(id_set)) - with open(path_ncbi_virus_accession) as f: - tmp_list = [line.strip('\n') for line in f] - - print('NCBI Virus', len(tmp_list)) - id_set.update(tmp_list) - - print(term_list + ['NCBI Virus'], len(id_set)) + if os.path.exists(path_ncbi_virus_accession): + with open(path_ncbi_virus_accession) as f: + tmp_list = [line.strip('\n') for line in f] + print('NCBI Virus', len(tmp_list)) + id_set.update(tmp_list) + term_list.append('NCBI Virus') + print(term_list, len(id_set)) for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i)) @@ -85,7 +85,7 @@ if not os.path.exists(dir_fasta_and_yaml): os.makedirs(dir_fasta_and_yaml) missing_value_list = [] - + for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: tree = ET.parse(path_metadata_xxx_xml) GBSet = tree.getroot() @@ -109,20 +109,20 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) 'submitter': {} } - + info_for_yaml_dict['sample']['sample_id'] = accession_version info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now - - + + # submitter info GBSeq_references = GBSeq.find('GBSeq_references') if GBSeq_references is not None: info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] - + GBReference = GBSeq_references.find('GBReference') if GBReference is not None: GBReference_journal = GBReference.find('GBReference_journal') - + if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': if 'Submitted' in GBReference_journal.text: info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] @@ -207,7 +207,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) elif GBQualifier_name_text == 'isolation_source': if GBQualifier_value_text.upper() in term_to_uri_dict: GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' - + if GBQualifier_value_text in term_to_uri_dict: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] else: @@ -250,7 +250,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) - + if len(missing_value_list) > 0: with open('missing_terms.tsv', 'w') as fw: fw.write('\n'.join(missing_value_list)) diff --git a/scripts/import.cwl b/scripts/import.cwl new file mode 100644 index 0000000..81752c8 --- /dev/null +++ b/scripts/import.cwl @@ -0,0 +1,24 @@ +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 +inputs: + scripts: + type: File + default: + class: File + location: import_to_arvados.py + inputBinding: {position: 1} + importScript: + type: File + default: + class: File + location: from_genbank_to_fasta_and_yaml.py + inputBinding: {position: 2} +outputs: [] +requirements: + DockerRequirement: + dockerPull: bh20-seq-uploader/import + NetworkAccess: + networkAccess: true + WorkReuse: + workReuse: false diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py new file mode 100644 index 0000000..07b7d71 --- /dev/null +++ b/scripts/import_to_arvados.py @@ -0,0 +1,13 @@ +import os +import subprocess +import glob +import sys + +os.chdir(os.environ["TMPDIR"]) +subprocess.run(sys.argv[1]) + +os.chdir("fasta_and_yaml") +fasta_files = glob.glob("*.fasta") + +for f in fasta_files: + subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]]) -- cgit v1.2.3 From 6165495618b9c2ad3ad7b8bd95ed807d022ebf1c Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Thu, 30 Apr 2020 18:27:07 +0200 Subject: fixed UO_0000036 for year --- scripts/from_genbank_to_fasta_and_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'scripts/from_genbank_to_fasta_and_yaml.py') diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 900f087..6f046ea 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -200,7 +200,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if 'age' in GBQualifier_value_text: info_for_yaml_dict['host']['host_age'] = int(GBQualifier_value_text_list[2].split('age ')[1]) - info_for_yaml_dict['host']['host_age_unit'] = 'year' + info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036' elif GBQualifier_name_text == 'collected_by': if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]): info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text -- cgit v1.2.3