Diffstat (limited to 'scripts')
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_countries.csv          |   9
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_speciesman_source.csv  |   1
-rw-r--r--  scripts/docker/Dockerfile                                         |  10
-rwxr-xr-x  scripts/from_genbank_to_fasta_and_yaml.py                         |  96
-rw-r--r--  scripts/import.cwl                                                |  30
-rw-r--r--  scripts/import_to_arvados.py                                      |  14
-rw-r--r--  scripts/sequences.acc                                             | 297
7 files changed, 421 insertions, 36 deletions
diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv
index 20e8a9b..6b43137 100644
--- a/scripts/dict_ontology_standardization/ncbi_countries.csv
+++ b/scripts/dict_ontology_standardization/ncbi_countries.csv
@@ -39,6 +39,7 @@ Chad,http://www.wikidata.org/entity/Q657
 Chile,http://www.wikidata.org/entity/Q298
 China,http://www.wikidata.org/entity/Q148
 China: Anhui,http://www.wikidata.org/entity/Q40956
+"China: Anhui, Fuyang":http://www.wikidata.org/entity/Q360584
 China: Beijing,http://www.wikidata.org/entity/Q956
 China: Chongqing,http://www.wikidata.org/entity/Q11725
 China: Fujian,http://www.wikidata.org/entity/Q41705
@@ -48,6 +49,7 @@ China: Guangdong,http://www.wikidata.org/entity/Q15175
 China: Guangxi Zhuang Autonomous Region,http://www.wikidata.org/entity/Q15176
 China: Guangzhou,http://www.wikidata.org/entity/Q16572
 China: Guizhou,http://www.wikidata.org/entity/Q47097
+China: Hangzhou,http://www.wikidata.org/entity/Q4970
 China: Hainan,http://www.wikidata.org/entity/Q42200
 China: Hebei,http://www.wikidata.org/entity/Q21208
 China: Heilongjiang,http://www.wikidata.org/entity/Q19206
@@ -109,6 +111,7 @@ France,http://www.wikidata.org/entity/Q142
 Gabon,http://www.wikidata.org/entity/Q1000
 Georgia,http://www.wikidata.org/entity/Q230
 Germany,http://www.wikidata.org/entity/Q183
+Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718
 Ghana,http://www.wikidata.org/entity/Q117
 Greece,http://www.wikidata.org/entity/Q41
 Grenada,http://www.wikidata.org/entity/Q769
@@ -123,6 +126,7 @@ Iceland,http://www.wikidata.org/entity/Q189
 Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389
 India,http://www.wikidata.org/entity/Q668
 India: Kerala State,http://www.wikidata.org/entity/Q1186
+India: Rajkot,http://www.wikidata.org/entity/Q1815245
 Indonesia,http://www.wikidata.org/entity/Q252
 Iran,http://www.wikidata.org/entity/Q794
 Iran: Qum,http://www.wikidata.org/entity/Q131664
@@ -172,6 +176,7 @@ Mozambique,http://www.wikidata.org/entity/Q1029
 Myanmar,http://www.wikidata.org/entity/Q836
 Namibia,http://www.wikidata.org/entity/Q1030
 Nauru,http://www.wikidata.org/entity/Q697
+Netherlands: Milheeze,https://www.wikidata.org/wiki/Q3314115
 Nepal,http://www.wikidata.org/entity/Q837
 New Zealand,http://www.wikidata.org/entity/Q664
 Nicaragua,http://www.wikidata.org/entity/Q811
@@ -263,6 +268,7 @@ USA: CA,http://www.wikidata.org/entity/Q99
 "USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143
 USA: CO,http://www.wikidata.org/entity/Q1261
 USA: CT,http://www.wikidata.org/entity/Q779
+USA: DC,http://www.wikidata.org/entity/Q3551781
 USA: DE,http://www.wikidata.org/entity/Q1393
 USA: FL,http://www.wikidata.org/entity/Q812
 USA: GA,http://www.wikidata.org/entity/Q1428
@@ -293,6 +299,7 @@ USA: NM,http://www.wikidata.org/entity/Q1522
 USA: North Carolina,http://www.wikidata.org/entity/Q1454
 USA: NV,http://www.wikidata.org/entity/Q1227
 USA: NY,http://www.wikidata.org/entity/Q1384
+USA: New York,http://www.wikidata.org/entity/Q1384
 USA: OH,http://www.wikidata.org/entity/Q1397
 USA: OK,http://www.wikidata.org/entity/Q1649
 USA: OR,http://www.wikidata.org/entity/Q824
@@ -321,4 +328,4 @@ Viet Nam: Ho Chi Minh city,http://www.wikidata.org/entity/Q1854
 Vietnam,http://www.wikidata.org/entity/Q881
 Yemen,http://www.wikidata.org/entity/Q805
 Zambia,http://www.wikidata.org/entity/Q953
-Zimbabwe,http://www.wikidata.org/entity/Q954
+Zimbabwe,http://www.wikidata.org/entity/Q954
\ No newline at end of file
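The rows above (and the other CSVs in dict_ontology_standardization) each map a free-text NCBI value to a resolvable URI. The loading code is not part of this diff; a minimal sketch of how such two-column CSVs are presumably merged into the term_to_uri_dict used by from_genbank_to_fasta_and_yaml.py below (the helper name is hypothetical):

    import csv
    import os

    def load_term_to_uri_dict(dict_dir='dict_ontology_standardization'):
        # Hypothetical helper: merge every two-column term,URI CSV in the
        # directory into a single lookup table keyed by the free-text term.
        term_to_uri_dict = {}
        for name in sorted(os.listdir(dict_dir)):
            if not name.endswith('.csv'):
                continue
            with open(os.path.join(dict_dir, name)) as f:
                for row in csv.reader(f):
                    if len(row) == 2:
                        term_to_uri_dict[row[0].strip()] = row[1].strip()
        return term_to_uri_dict

    # e.g. load_term_to_uri_dict()['USA: DC'] -> 'http://www.wikidata.org/entity/Q3551781'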
diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
index f5aeaae..7fa67f8 100644
--- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
+++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
@@ -1,5 +1,6 @@
 nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
 nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
+nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831
 respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831
 naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
 nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831
diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile
new file mode 100644
index 0000000..5bd38dd
--- /dev/null
+++ b/scripts/docker/Dockerfile
@@ -0,0 +1,10 @@
+FROM debian:10
+
+RUN apt-get update && \
+    apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \
+    python3 python3-pip python3-setuptools python3-dev python-pycurl \
+    clustalw python3-biopython libcurl4-openssl-dev build-essential \
+    libssl-dev && \
+    apt-get clean
+
+RUN pip3 install bh20-seq-uploader
\ No newline at end of file
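The image is not pushed to a registry anywhere in this change; presumably it is built and tagged locally so that it matches the dockerPull: bh20-seq-uploader/import reference in import.cwl further down, e.g. with something like docker build -t bh20-seq-uploader/import scripts/docker (the exact build and tag step is an assumption, not part of this commit).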
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 5257bd1..6f046ea 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -7,6 +7,8 @@ import xml.etree.ElementTree as ET
 import json
 import os
 
+from dateutil import parser
+
 num_ids_for_request = 100
 
 dir_metadata = 'metadata_from_nuccore'
@@ -37,20 +39,19 @@ if not os.path.exists(dir_metadata):
         tmp_list = [x.split('.')[0] for x in tmp_list]
 
         print(term, len(tmp_list))
-        tmp_list=tmp_list
-        # tmp_list = tmp_list[0:2] # restricting to small run
+        #tmp_list = tmp_list[0:2] # restricting to small run
 
         id_set.update([x.split('.')[0] for x in tmp_list])
 
     print(term_list, len(id_set))
 
-    with open(path_ncbi_virus_accession) as f:
-        tmp_list = [line.strip('\n') for line in f]
-
-    print('NCBI Virus', len(tmp_list))
-    id_set.update(tmp_list)
-
-    print(term_list + ['NCBI Virus'], len(id_set))
+    if os.path.exists(path_ncbi_virus_accession):
+        with open(path_ncbi_virus_accession) as f:
+            tmp_list = [line.strip('\n') for line in f]
+        print('NCBI Virus', len(tmp_list))
+        id_set.update(tmp_list)
+        term_list.append('NCBI Virus')
+        print(term_list, len(id_set))
 
     for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
        path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i))
@@ -86,7 +87,7 @@ if not os.path.exists(dir_fasta_and_yaml):
     os.makedirs(dir_fasta_and_yaml)
 
 missing_value_list = []
-    
+
 for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
     tree = ET.parse(path_metadata_xxx_xml)
     GBSet = tree.getroot()
@@ -110,23 +111,23 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
             'submitter': {}
         }
 
-        
+
         info_for_yaml_dict['sample']['sample_id'] = accession_version
-        info_for_yaml_dict['sample']['source_database_accession'] = accession_version
-
-
+        info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now
+
+        # submitter info
         GBSeq_references = GBSeq.find('GBSeq_references')
         if GBSeq_references is not None:
-            info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq_references.iter('GBAuthor')])
-
+            info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')]
+
             GBReference = GBSeq_references.find('GBReference')
             if GBReference is not None:
                 GBReference_journal = GBReference.find('GBReference_journal')
-                
+
                 if GBReference_journal is not None and GBReference_journal.text != 'Unpublished':
                     if 'Submitted' in GBReference_journal.text:
-                        info_for_yaml_dict['submitter']['submitter_name'] = GBReference_journal.text.split(') ')[1].split(',')[0].strip()
+                        info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())]
                         info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip()
                     else:
                         info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text
@@ -146,8 +147,9 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                 if field_in_yaml == 'sequencing_coverage':
                     # A regular expression would be better!
                     try:
-                        info_for_yaml_dict['technology'][field_in_yaml] = float(
-                            tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>'))
+                        info_for_yaml_dict['technology'][field_in_yaml] = [
+                            float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>'))
+                        ]
                     except ValueError:
                         print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse)
                         pass
@@ -162,8 +164,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
                         new_seq_tec_list.append(seq_tec)
 
-                    for n, seq_tec in enumerate(new_seq_tec_list):
-                        info_for_yaml_dict['technology'][field_in_yaml + ('' if n == 0 else str(n + 1))] = seq_tec
+                    info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list]
 
                 else:
                     info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse
@@ -199,7 +200,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
                     if 'age' in GBQualifier_value_text:
                         info_for_yaml_dict['host']['host_age'] = int(GBQualifier_value_text_list[2].split('age ')[1])
-                        info_for_yaml_dict['host']['host_age_unit'] = 'year'
+                        info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036'
                 elif GBQualifier_name_text == 'collected_by':
                     if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]):
                         info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text
@@ -208,24 +209,46 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                 elif GBQualifier_name_text == 'isolation_source':
                     if GBQualifier_value_text.upper() in term_to_uri_dict:
                         GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa'
-
+
                     if GBQualifier_value_text in term_to_uri_dict:
-                        info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict[GBQualifier_value_text]
+                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
                     else:
                         if GBQualifier_value_text in ['NP/OP swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'np/np swab', 'np/op']:
-                            info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab']
-                            info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['oropharyngeal swab']
-                        elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab']:
-                            info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab']
-                            info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab']
+                            info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
+                        elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab']:
+                            info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
                         elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']:
-                            info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal aspirate']
-                            info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab']
+                            info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
                         else:
                             missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
                 elif GBQualifier_name_text == 'collection_date': # TO_DO: which format we will use?
-                    info_for_yaml_dict['sample']['collection_date'] = GBQualifier_value_text
+                    date_to_write = GBQualifier_value_text
+
+                    if len(GBQualifier_value_text.split('-')) == 1:
+                        if int(GBQualifier_value_text) < 2020:
+                            date_to_write = "15 12 {}".format(GBQualifier_value_text)
+                        else:
+                            date_to_write = "15 01 {}".format(GBQualifier_value_text)
+
+                        if 'additional_collection_information' in info_for_yaml_dict['sample']:
+                            info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                        else:
+                            info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                    elif len(GBQualifier_value_text.split('-')) == 2:
+                        date_to_write += '-15'
+
+                        if 'additional_collection_information' in info_for_yaml_dict['sample']:
+                            info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                        else:
+                            info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                    elif len(GBQualifier_value_text.split('-')) == 3:
+                        GBQualifier_value_text_list = GBQualifier_value_text.split('-')
+
+                        if GBQualifier_value_text_list[1].isalpha():
+                            date_to_write = GBQualifier_value_text_list[1] + ' ' + GBQualifier_value_text_list[0] + ' ' + GBQualifier_value_text_list[2]
+
+                    info_for_yaml_dict['sample']['collection_date'] = date_to_write
                 elif GBQualifier_name_text in ['lat_lon', 'country']:
                     if GBQualifier_value_text == 'Hong Kong':
                         GBQualifier_value_text = 'China: Hong Kong'
@@ -237,7 +260,10 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
                         info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text
                 elif GBQualifier_name_text == 'note':
-                    info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text
+                    if 'additional_collection_information' in info_for_yaml_dict['sample']:
+                        info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text
+                    else:
+                        info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text
                 elif GBQualifier_name_text == 'isolate':
                     info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
                 elif GBQualifier_name_text == 'db_xref':
@@ -254,7 +280,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
         with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw:
             json.dump(info_for_yaml_dict, fw, indent=2)
-    
+
 if len(missing_value_list) > 0:
     with open('missing_terms.tsv', 'w') as fw:
         fw.write('\n'.join(missing_value_list))
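The new collection_date branch above handles three shapes of GenBank dates (year only, year-month, and day-month-year with an alphabetic month) and records in additional_collection_information whenever it has to estimate a day or month. A minimal sketch restating just the rewriting rules as a standalone helper (hypothetical name, illustration only; the committed code keeps this logic inline):

    def normalize_collection_date(text):
        # Mirrors the rules added above: pad missing day/month, reorder
        # '21-Mar-2020'-style dates, leave full numeric dates untouched.
        parts = text.split('-')
        if len(parts) == 1:                 # e.g. '2019' or '2020'
            return ('15 12 ' if int(text) < 2020 else '15 01 ') + text
        if len(parts) == 2:                 # e.g. '2020-03'
            return text + '-15'
        if parts[1].isalpha():              # e.g. '21-Mar-2020'
            return parts[1] + ' ' + parts[0] + ' ' + parts[2]
        return text                         # e.g. '2020-03-21'

    assert normalize_collection_date('2019') == '15 12 2019'
    assert normalize_collection_date('2020-03') == '2020-03-15'
    assert normalize_collection_date('21-Mar-2020') == 'Mar 21 2020'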
diff --git a/scripts/import.cwl b/scripts/import.cwl
new file mode 100644
index 0000000..d84516b
--- /dev/null
+++ b/scripts/import.cwl
@@ -0,0 +1,30 @@
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+inputs:
+  scripts:
+    type: File
+    default:
+      class: File
+      location: import_to_arvados.py
+    inputBinding: {position: 1}
+  importScript:
+    type: File
+    default:
+      class: File
+      location: from_genbank_to_fasta_and_yaml.py
+    inputBinding: {position: 2}
+  dict:
+    type: Directory
+    default:
+      class: Directory
+      location: dict_ontology_standardization
+    inputBinding: {position: 3}
+outputs: []
+requirements:
+  DockerRequirement:
+    dockerPull: bh20-seq-uploader/import
+  NetworkAccess:
+    networkAccess: true
+  WorkReuse:
+    enableReuse: false
diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py
new file mode 100644
index 0000000..78cd13d
--- /dev/null
+++ b/scripts/import_to_arvados.py
@@ -0,0 +1,14 @@
+import os
+import subprocess
+import glob
+import sys
+
+os.chdir(os.environ["TMPDIR"])
+os.symlink(sys.argv[2], "dict_ontology_standardization")
+subprocess.run(sys.argv[1])
+
+os.chdir("fasta_and_yaml")
+fasta_files = glob.glob("*.fasta")
+
+for f in fasta_files:
+    subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]])
diff --git a/scripts/sequences.acc b/scripts/sequences.acc
index a99c4e6..697d868 100644
--- a/scripts/sequences.acc
+++ b/scripts/sequences.acc
@@ -1,4 +1,299 @@
 NC_045512
+MT394528
+MT394529
+MT394530
+MT394531
+MT394864
+MT396241
+MT396242
+MT396243
+MT396244
+MT396245
+MT396246
+MT396247
+MT396248
+MT396266
+MT380726
+MT380727
+MT380728
+MT380729
+MT380730
+MT380731
+MT380732
+MT380733
+MT380734
+MT385414
+MT385415
+MT385416
+MT385417
+MT385418
+MT385419
+MT385420
+MT385421
+MT385422
+MT385423
+MT385424
+MT385425
+MT385426
+MT385427
+MT385428
+MT385429
+MT385430
+MT385431
+MT385432
+MT385433
+MT385434
+MT385435
+MT385436
+MT385437
+MT385438
+MT385439
+MT385440
+MT385441
+MT385442
+MT385443
+MT385444
+MT385445
+MT385446
+MT385447
+MT385448
+MT385449
+MT385450
+MT385451
+MT385452
+MT385453
+MT385454
+MT385455
+MT385456
+MT385457
+MT385458
+MT385459
+MT385460
+MT385461
+MT385462
+MT385463
+MT385464
+MT385465
+MT385466
+MT385467
+MT385468
+MT385469
+MT385470
+MT385471
+MT385472
+MT385473
+MT385474
+MT385475
+MT385476
+MT385477
+MT385478
+MT385479
+MT385480
+MT385481
+MT385482
+MT385483
+MT385484
+MT385485
+MT385486
+MT385487
+MT385488
+MT385489
+MT385490
+MT385491
+MT385492
+MT385493
+MT385494
+MT385495
+MT385496
+MT385497
+MT186683
+MT252677
+MT252678
+MT252679
+MT252680
+MT252681
+MT252682
+MT252683
+MT252684
+MT252685
+MT252686
+MT252687
+MT252688
+MT252689
+MT252690
+MT252691
+MT252692
+MT252693
+MT252694
+MT252695
+MT252696
+MT252697
+MT252698
+MT252699
+MT252700
+MT252701
+MT252702
+MT252703
+MT252704
+MT252705
+MT252706
+MT252707
+MT252708
+MT252709
+MT252710
+MT252711
+MT252712
+MT252713
+MT252715
+MT252716
+MT252717
+MT252719
+MT252721
+MT252723
+MT252725
+MT252726
+MT252728
+MT252729
+MT252730
+MT252733
+MT252734
+MT252735
+MT252736
+MT252737
+MT252738
+MT252739
+MT252740
+MT252741
+MT252742
+MT252745
+MT252746
+MT252747
+MT252748
+MT252749
+MT252756
+MT252757
+MT252758
+MT252761
+MT252763
+MT252764
+MT252765
+MT252766
+MT252767
+MT252768
+MT252769
+MT252770
+MT252771
+MT252772
+MT252773
+MT252774
+MT252775
+MT252778
+MT252779
+MT252780
+MT252781
+MT252782
+MT252783
+MT252784
+MT252785
+MT252787
+MT252788
+MT252792
+MT252793
+MT252794
+MT252795
+MT252797
+MT252798
+MT252799
+MT252800
+MT252801
+MT252802
+MT252803
+MT252804
+MT252805
+MT252806
+MT252807
+MT252808
+MT252809
+MT252810
+MT252811
+MT252821
+MT252822
+MT252823
+MT252824
+MT339043
+MT365033
+MT374101
+MT374102
+MT374103
+MT374104
+MT374105
+MT374106
+MT374107
+MT374108
+MT374109
+MT374110
+MT374111
+MT374112
+MT374113
+MT374114
+MT374115
+MT374116
+MT375428
+MT375429
+MT375430
+MT375431
+MT375432
+MT375433
+MT375434
+MT375435
+MT375436
+MT375437
+MT375438
+MT375439
+MT375440
+MT375441
+MT375442
+MT375443
+MT375444
+MT375445
+MT375446
+MT375447
+MT375448
+MT375449
+MT375450
+MT375451
+MT375452
+MT375453
+MT375454
+MT375455
+MT375456
+MT375457
+MT375458
+MT375459
+MT375460
+MT375461
+MT375462
+MT375463
+MT375464
+MT375465
+MT375466
+MT375467
+MT375468
+MT375469
+MT375470
+MT375471
+MT375472
+MT375473
+MT375474
+MT375475
+MT375476
+MT375477
+MT375478
+MT375479
+MT375480
+MT375481
+MT375482
+MT375483
 MT370516
 MT370517
 MT370518
@@ -225,6 +520,8 @@ MT372480
 MT372481
 MT372482
 MT372483
+7BV2_P
+7BV2_T
 LC542976
 LC542809
 MT114412
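The accessions added to sequences.acc are fetched in batches: the context lines of from_genbank_to_fasta_and_yaml.py show chunks(list(id_set), num_ids_for_request) with num_ids_for_request = 100. The chunks helper itself is not shown in this diff; a typical implementation would look something like this sketch:

    def chunks(lst, n):
        # Yield successive n-sized slices of lst, e.g. 100 accessions per
        # NCBI efetch request when n == num_ids_for_request.
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    # list(chunks(['MT394528', 'MT394529', 'MT394530'], 2))
    # -> [['MT394528', 'MT394529'], ['MT394530']]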