From 976f0978b38ffcce090fecbf4c04cb7aeec8239c Mon Sep 17 00:00:00 2001 From: lltommy Date: Sun, 26 Apr 2020 12:06:55 +0200 Subject: Updating dics --- scripts/dict_ontology_standardization/ncbi_countries.csv | 4 ++++ scripts/dict_ontology_standardization/ncbi_speciesman_source.csv | 1 + 2 files changed, 5 insertions(+) (limited to 'scripts') diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 20e8a9b..c08b613 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -39,6 +39,7 @@ Chad,http://www.wikidata.org/entity/Q657 Chile,http://www.wikidata.org/entity/Q298 China,http://www.wikidata.org/entity/Q148 China: Anhui,http://www.wikidata.org/entity/Q40956 +"China: Anhui, Fuyang":http://www.wikidata.org/entity/Q360584 China: Beijing,http://www.wikidata.org/entity/Q956 China: Chongqing,http://www.wikidata.org/entity/Q11725 China: Fujian,http://www.wikidata.org/entity/Q41705 @@ -48,6 +49,7 @@ China: Guangdong,http://www.wikidata.org/entity/Q15175 China: Guangxi Zhuang Autonomous Region,http://www.wikidata.org/entity/Q15176 China: Guangzhou,http://www.wikidata.org/entity/Q16572 China: Guizhou,http://www.wikidata.org/entity/Q47097 +China: Hangzhou,http://www.wikidata.org/entity/Q4970 China: Hainan,http://www.wikidata.org/entity/Q42200 China: Hebei,http://www.wikidata.org/entity/Q21208 China: Heilongjiang,http://www.wikidata.org/entity/Q19206 @@ -123,6 +125,7 @@ Iceland,http://www.wikidata.org/entity/Q189 Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389 India,http://www.wikidata.org/entity/Q668 India: Kerala State,http://www.wikidata.org/entity/Q1186 +India: Rajkot,http://www.wikidata.org/entity/Q1815245 Indonesia,http://www.wikidata.org/entity/Q252 Iran,http://www.wikidata.org/entity/Q794 Iran: Qum,http://www.wikidata.org/entity/Q131664 @@ -263,6 +266,7 @@ USA: CA,http://www.wikidata.org/entity/Q99 "USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143 USA: CO,http://www.wikidata.org/entity/Q1261 USA: CT,http://www.wikidata.org/entity/Q779 +USA: DC,http://www.wikidata.org/entity/Q3551781 USA: DE,http://www.wikidata.org/entity/Q1393 USA: FL,http://www.wikidata.org/entity/Q812 USA: GA,http://www.wikidata.org/entity/Q1428 diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv index f5aeaae..7fa67f8 100644 --- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv +++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv @@ -1,5 +1,6 @@ nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831 respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831 naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831 -- cgit v1.2.3 From 6032a373003affa641ca1e70a44c29a232b5b3ed Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 28 Apr 2020 20:31:42 +0200 Subject: Changes to the structure - we use lists now instead of strings where it makes sense. This allows us to have multiple values where in makes sense --- bh20sequploader/bh20seq-options.yml | 30 ------------------ bh20sequploader/bh20seq-schema.yml | 52 +++++++++---------------------- bh20sequploader/bh20seq-shex.rdf | 11 ++++--- example/maximum_metadata_example.yaml | 44 ++++++++++++++++++++++++++ example/metadata.yaml | 43 ------------------------- example/minimal_example.yaml | 18 ----------- example/minimal_metadata_example.yaml | 0 scripts/from_genbank_to_fasta_and_yaml.py | 2 +- 8 files changed, 65 insertions(+), 135 deletions(-) create mode 100644 example/maximum_metadata_example.yaml delete mode 100644 example/metadata.yaml delete mode 100644 example/minimal_example.yaml create mode 100644 example/minimal_metadata_example.yaml (limited to 'scripts') diff --git a/bh20sequploader/bh20seq-options.yml b/bh20sequploader/bh20seq-options.yml index 104ed6c..c553f41 100644 --- a/bh20sequploader/bh20seq-options.yml +++ b/bh20sequploader/bh20seq-options.yml @@ -35,38 +35,8 @@ sample_sequencing_technology: Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 -sample_sequencing_technology2: - Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 - Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566 - Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567 - Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 - Illumina: http://purl.obolibrary.org/obo/OBI_0000759 - IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 - Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894 - Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 - Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 - Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 - -sample_sequencing_technology3: - Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 - Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566 - Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567 - Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 - Illumina: http://purl.obolibrary.org/obo/OBI_0000759 - IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 - Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894 - Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 - Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 - Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 - specimen_source: nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 sputum: http://purl.obolibrary.org/obo/NCIT_C13278 bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 - -specimen_source2: - nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 - oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 - sputum: http://purl.obolibrary.org/obo/NCIT_C13278 - bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index ea813a0..f36a6e6 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -48,6 +48,7 @@ $graph: type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C25688 + _type: "@id" host_treatment: doc: Process in which the act is intended to modify or alter host status type: string? @@ -55,7 +56,7 @@ $graph: _id: http://www.ebi.ac.uk/efo/EFO_0000727 host_vaccination: doc: List of vaccines given to the host - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/VO_0000002 additional_host_information: @@ -96,14 +97,7 @@ $graph: _id: http://purl.obolibrary.org/obo/NCIT_C41206 specimen_source: doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0001479 - _type: "@id" - noLinkCheck: true - specimen_source2: - doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb) - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001479 _type: "@id" @@ -119,10 +113,11 @@ $graph: jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 source_database_accession: - doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here - type: string? + doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here. Please use a resolveable URL (e.g. http://identifiers.org/insdc/LC522350.1#sequence) + type: string[]? jsonldPredicate: _id: http://edamontology.org/data_2091 + _type: "@id" - name: virusSchema type: record @@ -145,21 +140,7 @@ $graph: fields: sample_sequencing_technology: doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0600047 - _type: "@id" - noLinkCheck: true - sample_sequencing_technology2: - doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0600047 - _type: "@id" - noLinkCheck: true - sample_sequencing_technology3: - doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0600047 _type: "@id" @@ -170,13 +151,8 @@ $graph: jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: - doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) - type: float? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/FLU_0000848 - sequencing_coverage2: - doc: If a second sequence technology was used you can submit its coverage here - type: float? + doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. [100]) - if multiple technologies were used multiple float values can be submitted e.g. [100, 20] + type: int[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 additional_technology_information: @@ -189,13 +165,13 @@ $graph: type: record fields: authors: - doc: Name of the author(s) - type: string + doc: Name(s) of the author(s) + type: string[] jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C42781 submitter_name: - doc: Name of the submitter - type: string? + doc: Name of the submitter(s) + type: string[]? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000116 submitter_address: @@ -228,7 +204,7 @@ $graph: _id: http://purl.obolibrary.org/obo/NCIT_C19026 submitter_orcid: doc: ORCID of the submitter as a full URI, e.g. https://orcid.org/0000-0002-1825-0097 - type: string? + type: string[]? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000115 _type: "@id" diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index c3b0ae1..4ec957d 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -25,7 +25,7 @@ PREFIX wikidata: obo:NCIT_C42574 [ obo:UO_~ ] ?; obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ; efo:EFO_0000727 xsd:string ?; - obo:VO_0000002 xsd:string ?; + obo:VO_0000002 xsd:string {0,10}; sio:SIO_001167 xsd:string ?; } @@ -38,25 +38,26 @@ PREFIX wikidata: obo:OBI_0001479 IRI {0,2}; obo:OBI_0001472 xsd:string ?; sio:SIO_001167 xsd:string ?; + edam:data_2091 IRI {0,3}; } :submitterShape { - obo:NCIT_C42781 xsd:string ; - sio:SIO_000116 xsd:string ?; + obo:NCIT_C42781 xsd:string * ; + sio:SIO_000116 xsd:string *; sio:SIO_000172 xsd:string ?; obo:NCIT_C37984 xsd:string ?; obo:OBI_0600047 xsd:string ?; obo:NCIT_C37900 xsd:string ?; efo:EFO_0001741 xsd:string ?; obo:NCIT_C19026 xsd:string ?; - sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?; + sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/ {0,10}; sio:SIO_001167 xsd:string ?; } :technologyShape { obo:OBI_0600047 IRI {0,3} ; efo:EFO_0002699 xsd:string ?; - obo:FLU_0000848 xsd:double {0,2}; + obo:FLU_0000848 xsd:integer {0,2}; sio:SIO_001167 xsd:string ?; } diff --git a/example/maximum_metadata_example.yaml b/example/maximum_metadata_example.yaml new file mode 100644 index 0000000..0a6d910 --- /dev/null +++ b/example/maximum_metadata_example.yaml @@ -0,0 +1,44 @@ +id: placeholder + +host: + host_id: XX1 + host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 + host_sex: http://purl.obolibrary.org/obo/PATO_0000384 + host_age: 20 + host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 + host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269 + host_treatment: Process in which the act is intended to modify or alter host status (Compounds) + host_vaccination: [vaccines1,vaccine2] + additional_host_information: Optional free text field for addtional information + +sample: + sample_id: Id of the sample as defined by the submitter + collector_name: Name of the person that took the sample + collecting_institution: Institute that was responsible of sampling + specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835] + collection_date: "2020-01-01" + collection_location: http://www.wikidata.org/entity/Q148 + sample_storage_conditions: frozen specimen + source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] + additional_collection_information: Optional free text field for addtional information + +virus: + virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 + virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 + +technology: + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173] + sequence_assembly_method: Protocol used for assembly + sequencing_coverage: [70, 100] + additional_technology_information: Optional free text field for addtional information + +submitter: + submitter_name: [John Doe] + submitter_address: John Doe's adress + originating_lab: John Doe kitchen + lab_address: John Doe's address + provider_sample_id: XXX1 + submitter_sample_id: XXX2 + authors: [John Doe, Joe Boe, Jonny Oe] + submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001] + additional_submitter_information: Optional free text field for addtional information \ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml deleted file mode 100644 index a76616c..0000000 --- a/example/metadata.yaml +++ /dev/null @@ -1,43 +0,0 @@ -id: placeholder - -host: - host_id: XX1 - host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 - host_sex: http://purl.obolibrary.org/obo/NCIT_C27993 - host_age: 20 - host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 - host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269 - host_treatment: Process in which the act is intended to modify or alter host status (Compounds) - host_vaccination: List of vaccines given to the host (RRIDs?) - additional_host_information: Field for additional host information - -sample: - sample_id: Id of the sample as defined by the submitter - collector_name: Name of the person that took the sample - collecting_institution: Institute that was responsible of sampling - specimen_source: http://purl.obolibrary.org/obo/NCIT_C155831 - specimen_source2: http://purl.obolibrary.org/obo/NCIT_C155835 - collection_date: "2020-01-01" - collection_location: http://www.wikidata.org/entity/Q148 - sample_storage_conditions: XXX - additional_collection_information: XXX - -virus: - virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 - virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 - -technology: - sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173 - sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173 - sequence_assembly_method: Protocol used for assembly - sequencing_coverage: 70 - -submitter: - submitter_name: John Doe - submitter_address: John Doe's adress - originating_lab: John Doe kitchen - lab_address: John Doe's address - provider_sample_id: HmX - submitter_sample_id: xXx - authors: John Doe et all - submitter_orcid: https://orcid.org/0000-0000-0000-0000 \ No newline at end of file diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml deleted file mode 100644 index 0e36a25..0000000 --- a/example/minimal_example.yaml +++ /dev/null @@ -1,18 +0,0 @@ -id: placeholder - -host: - host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 - -sample: - sample_id: XX - collection_date: 2020-01 - collection_location: http://www.wikidata.org/entity/Q148 - -virus: - virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 - -technology: - sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632 - -submitter: - authors: John Doe \ No newline at end of file diff --git a/example/minimal_metadata_example.yaml b/example/minimal_metadata_example.yaml new file mode 100644 index 0000000..e69de29 diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 5257bd1..148a7e1 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -112,7 +112,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['sample_id'] = accession_version - info_for_yaml_dict['sample']['source_database_accession'] = accession_version + info_for_yaml_dict['sample']['source_database_accession'] = "http://identifiers.org/insdc/"+accession_version+"#sequence" #accession is turned into resolvable URL/URI now # submitter info -- cgit v1.2.3 From c1b24ed6ab4ad0697f472a2726b1e557297797a6 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Tue, 28 Apr 2020 22:49:13 +0200 Subject: Updated - 1731 IDs - 2020/04/28 --- scripts/sequences.acc | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) (limited to 'scripts') diff --git a/scripts/sequences.acc b/scripts/sequences.acc index a99c4e6..697d868 100644 --- a/scripts/sequences.acc +++ b/scripts/sequences.acc @@ -1,4 +1,299 @@ NC_045512 +MT394528 +MT394529 +MT394530 +MT394531 +MT394864 +MT396241 +MT396242 +MT396243 +MT396244 +MT396245 +MT396246 +MT396247 +MT396248 +MT396266 +MT380726 +MT380727 +MT380728 +MT380729 +MT380730 +MT380731 +MT380732 +MT380733 +MT380734 +MT385414 +MT385415 +MT385416 +MT385417 +MT385418 +MT385419 +MT385420 +MT385421 +MT385422 +MT385423 +MT385424 +MT385425 +MT385426 +MT385427 +MT385428 +MT385429 +MT385430 +MT385431 +MT385432 +MT385433 +MT385434 +MT385435 +MT385436 +MT385437 +MT385438 +MT385439 +MT385440 +MT385441 +MT385442 +MT385443 +MT385444 +MT385445 +MT385446 +MT385447 +MT385448 +MT385449 +MT385450 +MT385451 +MT385452 +MT385453 +MT385454 +MT385455 +MT385456 +MT385457 +MT385458 +MT385459 +MT385460 +MT385461 +MT385462 +MT385463 +MT385464 +MT385465 +MT385466 +MT385467 +MT385468 +MT385469 +MT385470 +MT385471 +MT385472 +MT385473 +MT385474 +MT385475 +MT385476 +MT385477 +MT385478 +MT385479 +MT385480 +MT385481 +MT385482 +MT385483 +MT385484 +MT385485 +MT385486 +MT385487 +MT385488 +MT385489 +MT385490 +MT385491 +MT385492 +MT385493 +MT385494 +MT385495 +MT385496 +MT385497 +MT186683 +MT252677 +MT252678 +MT252679 +MT252680 +MT252681 +MT252682 +MT252683 +MT252684 +MT252685 +MT252686 +MT252687 +MT252688 +MT252689 +MT252690 +MT252691 +MT252692 +MT252693 +MT252694 +MT252695 +MT252696 +MT252697 +MT252698 +MT252699 +MT252700 +MT252701 +MT252702 +MT252703 +MT252704 +MT252705 +MT252706 +MT252707 +MT252708 +MT252709 +MT252710 +MT252711 +MT252712 +MT252713 +MT252715 +MT252716 +MT252717 +MT252719 +MT252721 +MT252723 +MT252725 +MT252726 +MT252728 +MT252729 +MT252730 +MT252733 +MT252734 +MT252735 +MT252736 +MT252737 +MT252738 +MT252739 +MT252740 +MT252741 +MT252742 +MT252745 +MT252746 +MT252747 +MT252748 +MT252749 +MT252756 +MT252757 +MT252758 +MT252761 +MT252763 +MT252764 +MT252765 +MT252766 +MT252767 +MT252768 +MT252769 +MT252770 +MT252771 +MT252772 +MT252773 +MT252774 +MT252775 +MT252778 +MT252779 +MT252780 +MT252781 +MT252782 +MT252783 +MT252784 +MT252785 +MT252787 +MT252788 +MT252792 +MT252793 +MT252794 +MT252795 +MT252797 +MT252798 +MT252799 +MT252800 +MT252801 +MT252802 +MT252803 +MT252804 +MT252805 +MT252806 +MT252807 +MT252808 +MT252809 +MT252810 +MT252811 +MT252821 +MT252822 +MT252823 +MT252824 +MT339043 +MT365033 +MT374101 +MT374102 +MT374103 +MT374104 +MT374105 +MT374106 +MT374107 +MT374108 +MT374109 +MT374110 +MT374111 +MT374112 +MT374113 +MT374114 +MT374115 +MT374116 +MT375428 +MT375429 +MT375430 +MT375431 +MT375432 +MT375433 +MT375434 +MT375435 +MT375436 +MT375437 +MT375438 +MT375439 +MT375440 +MT375441 +MT375442 +MT375443 +MT375444 +MT375445 +MT375446 +MT375447 +MT375448 +MT375449 +MT375450 +MT375451 +MT375452 +MT375453 +MT375454 +MT375455 +MT375456 +MT375457 +MT375458 +MT375459 +MT375460 +MT375461 +MT375462 +MT375463 +MT375464 +MT375465 +MT375466 +MT375467 +MT375468 +MT375469 +MT375470 +MT375471 +MT375472 +MT375473 +MT375474 +MT375475 +MT375476 +MT375477 +MT375478 +MT375479 +MT375480 +MT375481 +MT375482 +MT375483 MT370516 MT370517 MT370518 @@ -225,6 +520,8 @@ MT372480 MT372481 MT372482 MT372483 +7BV2_P +7BV2_T LC542976 LC542809 MT114412 -- cgit v1.2.3 From a775ad53d42a71ec8758ac5a0d2115abeaf23dd4 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Tue, 28 Apr 2020 22:50:45 +0200 Subject: Updated with new terms --- scripts/dict_ontology_standardization/ncbi_countries.csv | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index c08b613..772c0f2 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -39,13 +39,13 @@ Chad,http://www.wikidata.org/entity/Q657 Chile,http://www.wikidata.org/entity/Q298 China,http://www.wikidata.org/entity/Q148 China: Anhui,http://www.wikidata.org/entity/Q40956 -"China: Anhui, Fuyang":http://www.wikidata.org/entity/Q360584 +"China: Anhui, Fuyang"":http://www.wikidata.org/entity/Q360584 China: Beijing,http://www.wikidata.org/entity/Q956 China: Chongqing,http://www.wikidata.org/entity/Q11725 China: Fujian,http://www.wikidata.org/entity/Q41705 China: Gansu,http://www.wikidata.org/entity/Q42392 China: Guangdong,http://www.wikidata.org/entity/Q15175 -"China: Guangdong, Guangzhou",http://www.wikidata.org/entity/Q16572 +""China: Guangdong, Guangzhou",http://www.wikidata.org/entity/Q16572 China: Guangxi Zhuang Autonomous Region,http://www.wikidata.org/entity/Q15176 China: Guangzhou,http://www.wikidata.org/entity/Q16572 China: Guizhou,http://www.wikidata.org/entity/Q47097 @@ -111,6 +111,7 @@ France,http://www.wikidata.org/entity/Q142 Gabon,http://www.wikidata.org/entity/Q1000 Georgia,http://www.wikidata.org/entity/Q230 Germany,http://www.wikidata.org/entity/Q183 +Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718 Ghana,http://www.wikidata.org/entity/Q117 Greece,http://www.wikidata.org/entity/Q41 Grenada,http://www.wikidata.org/entity/Q769 @@ -175,6 +176,7 @@ Mozambique,http://www.wikidata.org/entity/Q1029 Myanmar,http://www.wikidata.org/entity/Q836 Namibia,http://www.wikidata.org/entity/Q1030 Nauru,http://www.wikidata.org/entity/Q697 +Netherlands: Milheeze,https://www.wikidata.org/wiki/Q3314115 Nepal,http://www.wikidata.org/entity/Q837 New Zealand,http://www.wikidata.org/entity/Q664 Nicaragua,http://www.wikidata.org/entity/Q811 @@ -297,6 +299,7 @@ USA: NM,http://www.wikidata.org/entity/Q1522 USA: North Carolina,http://www.wikidata.org/entity/Q1454 USA: NV,http://www.wikidata.org/entity/Q1227 USA: NY,http://www.wikidata.org/entity/Q1384 +USA: New York,http://www.wikidata.org/entity/Q1384 USA: OH,http://www.wikidata.org/entity/Q1397 USA: OK,http://www.wikidata.org/entity/Q1649 USA: OR,http://www.wikidata.org/entity/Q824 -- cgit v1.2.3 From ceec48e78ab50e59431adf409d82ab38e702f517 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Tue, 28 Apr 2020 22:51:09 +0200 Subject: Updated with new terms --- .../ncbi_countries.csv | 360 ++------------------- 1 file changed, 29 insertions(+), 331 deletions(-) (limited to 'scripts') diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 772c0f2..b81da36 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -1,331 +1,29 @@ -30.59 N 114.3 E,http://www.wikidata.org/entity/Q11746 -35.92 N 74.33 E,http://www.wikidata.org/entity/Q609024 -39.54 N 116.23 E,http://www.wikidata.org/entity/Q198244 -Afghanistan,http://www.wikidata.org/entity/Q889 -Albania,http://www.wikidata.org/entity/Q222 -Algeria,http://www.wikidata.org/entity/Q262 -Andorra,http://www.wikidata.org/entity/Q228 -Angola,http://www.wikidata.org/entity/Q916 -Antigua and Barbuda,http://www.wikidata.org/entity/Q781 -Argentina,http://www.wikidata.org/entity/Q414 -Armenia,http://www.wikidata.org/entity/Q399 -Australia,http://www.wikidata.org/entity/Q408 -Australia: Queensland,http://www.wikidata.org/entity/Q36074 -Australia: Victoria,http://www.wikidata.org/entity/Q36687 -Austria,http://www.wikidata.org/entity/Q40 -Azerbaijan,http://www.wikidata.org/entity/Q227 -Bahrain,http://www.wikidata.org/entity/Q398 -Bangladesh,http://www.wikidata.org/entity/Q902 -Barbados,http://www.wikidata.org/entity/Q244 -Belarus,http://www.wikidata.org/entity/Q184 -Belgium,http://www.wikidata.org/entity/Q31 -Belize,http://www.wikidata.org/entity/Q242 -Benin,http://www.wikidata.org/entity/Q962 -Bhutan,http://www.wikidata.org/entity/Q917 -Bolivia,http://www.wikidata.org/entity/Q750 -Bosnia and Herzegovina,http://www.wikidata.org/entity/Q225 -Botswana,http://www.wikidata.org/entity/Q963 -Brazil,http://www.wikidata.org/entity/Q155 -Brunei,http://www.wikidata.org/entity/Q921 -Bulgaria,http://www.wikidata.org/entity/Q219 -Burkina Faso,http://www.wikidata.org/entity/Q965 -Burundi,http://www.wikidata.org/entity/Q967 -Cambodia,http://www.wikidata.org/entity/Q424 -Cameroon,http://www.wikidata.org/entity/Q1009 -Canada,http://www.wikidata.org/entity/Q16 -Cape Verde,http://www.wikidata.org/entity/Q1011 -Central African Republic,http://www.wikidata.org/entity/Q929 -Chad,http://www.wikidata.org/entity/Q657 -Chile,http://www.wikidata.org/entity/Q298 -China,http://www.wikidata.org/entity/Q148 -China: Anhui,http://www.wikidata.org/entity/Q40956 -"China: Anhui, Fuyang"":http://www.wikidata.org/entity/Q360584 -China: Beijing,http://www.wikidata.org/entity/Q956 -China: Chongqing,http://www.wikidata.org/entity/Q11725 -China: Fujian,http://www.wikidata.org/entity/Q41705 -China: Gansu,http://www.wikidata.org/entity/Q42392 -China: Guangdong,http://www.wikidata.org/entity/Q15175 -""China: Guangdong, Guangzhou",http://www.wikidata.org/entity/Q16572 -China: Guangxi Zhuang Autonomous Region,http://www.wikidata.org/entity/Q15176 -China: Guangzhou,http://www.wikidata.org/entity/Q16572 -China: Guizhou,http://www.wikidata.org/entity/Q47097 -China: Hangzhou,http://www.wikidata.org/entity/Q4970 -China: Hainan,http://www.wikidata.org/entity/Q42200 -China: Hebei,http://www.wikidata.org/entity/Q21208 -China: Heilongjiang,http://www.wikidata.org/entity/Q19206 -China: Henan,http://www.wikidata.org/entity/Q43684 -China: Hong Kong,http://www.wikidata.org/entity/Q8646 -China: HuaShang,http://www.wikidata.org/entity/Q148 -China: Hubei,http://www.wikidata.org/entity/Q46862 -"China: Hubei, Wuhan",http://www.wikidata.org/entity/Q11746 -China: Hunan,http://www.wikidata.org/entity/Q45761 -China: Inner Mongolia,http://www.wikidata.org/entity/Q41079 -China: Jiangsu,http://www.wikidata.org/entity/Q16963 -China: Jiangxi,http://www.wikidata.org/entity/Q57052 -China: Jilin,http://www.wikidata.org/entity/Q45208 -China: Liaoning,http://www.wikidata.org/entity/Q43934 -China: Macau,http://www.wikidata.org/entity/Q14773 -China: Nanchang,https://www.wikidata.org/wiki/Q171943 -China: Ningxia Hui Autonomous Region,http://www.wikidata.org/entity/Q57448 -China: Qinghai,http://www.wikidata.org/entity/Q45833 -China: Shaanxi,http://www.wikidata.org/entity/Q47974 -China: Shandong,http://www.wikidata.org/entity/Q43407 -China: Shanghai,http://www.wikidata.org/entity/Q8686 -China: Shanxi,http://www.wikidata.org/entity/Q46913 -China: Shenzhen,http://www.wikidata.org/entity/Q15174 -China: Sichuan,http://www.wikidata.org/entity/Q19770 -China: Tianjin,http://www.wikidata.org/entity/Q11736 -China: Tibet Autonomous Region,http://www.wikidata.org/entity/Q17269 -China: Wuhan,http://www.wikidata.org/entity/Q11746 -China:Wuhan,http://www.wikidata.org/entity/Q11746 -China: Xinjiang,http://www.wikidata.org/entity/Q34800 -China: Yunnan,http://www.wikidata.org/entity/Q43194 -China: Zhejiang,http://www.wikidata.org/entity/Q16967 -"China: Zhejiang, Hangzhou",http://www.wikidata.org/entity/Q4970 -Colombia,http://www.wikidata.org/entity/Q739 -Colombia: Antioquia,http://www.wikidata.org/entity/Q123304 -Comoros,http://www.wikidata.org/entity/Q970 -Costa Rica,http://www.wikidata.org/entity/Q800 -Croatia,http://www.wikidata.org/entity/Q224 -Cuba,http://www.wikidata.org/entity/Q241 -Czech Republic,http://www.wikidata.org/entity/Q213 -Democratic Republic of the Congo,http://www.wikidata.org/entity/Q974 -Denmark,http://www.wikidata.org/entity/Q35 -Djibouti,http://www.wikidata.org/entity/Q977 -Dominica,http://www.wikidata.org/entity/Q784 -Dominican Republic,http://www.wikidata.org/entity/Q786 -East Timor,http://www.wikidata.org/entity/Q574 -Ecuador,http://www.wikidata.org/entity/Q736 -Egypt,http://www.wikidata.org/entity/Q79 -El Salvador,http://www.wikidata.org/entity/Q792 -Equatorial Guinea,http://www.wikidata.org/entity/Q983 -Eritrea,http://www.wikidata.org/entity/Q986 -Estado Libre del Istmo,http://www.wikidata.org/entity/Q8842943 -Estonia,http://www.wikidata.org/entity/Q191 -Eswatini,http://www.wikidata.org/entity/Q1050 -Ethiopia,http://www.wikidata.org/entity/Q115 -Federated States of Micronesia,http://www.wikidata.org/entity/Q702 -Fiji,http://www.wikidata.org/entity/Q712 -Finland,http://www.wikidata.org/entity/Q33 -France,http://www.wikidata.org/entity/Q142 -Gabon,http://www.wikidata.org/entity/Q1000 -Georgia,http://www.wikidata.org/entity/Q230 -Germany,http://www.wikidata.org/entity/Q183 -Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718 -Ghana,http://www.wikidata.org/entity/Q117 -Greece,http://www.wikidata.org/entity/Q41 -Grenada,http://www.wikidata.org/entity/Q769 -Guatemala,http://www.wikidata.org/entity/Q774 -Guinea,http://www.wikidata.org/entity/Q1006 -Guinea-Bissau,http://www.wikidata.org/entity/Q1007 -Guyana,http://www.wikidata.org/entity/Q734 -Haiti,http://www.wikidata.org/entity/Q790 -Honduras,http://www.wikidata.org/entity/Q783 -Hungary,http://www.wikidata.org/entity/Q28 -Iceland,http://www.wikidata.org/entity/Q189 -Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389 -India,http://www.wikidata.org/entity/Q668 -India: Kerala State,http://www.wikidata.org/entity/Q1186 -India: Rajkot,http://www.wikidata.org/entity/Q1815245 -Indonesia,http://www.wikidata.org/entity/Q252 -Iran,http://www.wikidata.org/entity/Q794 -Iran: Qum,http://www.wikidata.org/entity/Q131664 -Iran: Tehran,http://www.wikidata.org/entity/Q3616 -Iraq,http://www.wikidata.org/entity/Q796 -Ireland,http://www.wikidata.org/entity/Q27 -Israel,http://www.wikidata.org/entity/Q801 -Italy,http://www.wikidata.org/entity/Q38 -Italy: Cagliari,http://www.wikidata.org/entity/Q1897 -Italy: Rome,http://www.wikidata.org/entity/Q220 -Ivory Coast,http://www.wikidata.org/entity/Q1008 -Jamaica,http://www.wikidata.org/entity/Q766 -Japan,http://www.wikidata.org/entity/Q17 -Jordan,http://www.wikidata.org/entity/Q810 -Kazakhstan,http://www.wikidata.org/entity/Q232 -Kenya,http://www.wikidata.org/entity/Q114 -Kingdom of Denmark,http://www.wikidata.org/entity/Q756617 -Kingdom of the Netherlands,http://www.wikidata.org/entity/Q29999 -Kiribati,http://www.wikidata.org/entity/Q710 -Kuwait,http://www.wikidata.org/entity/Q817 -Kyrgyzstan,http://www.wikidata.org/entity/Q813 -Laos,http://www.wikidata.org/entity/Q819 -Latvia,http://www.wikidata.org/entity/Q211 -Lebanon,http://www.wikidata.org/entity/Q822 -Lesotho,http://www.wikidata.org/entity/Q1013 -Liberia,http://www.wikidata.org/entity/Q1014 -Libya,http://www.wikidata.org/entity/Q1016 -Liechtenstein,http://www.wikidata.org/entity/Q347 -Lithuania,http://www.wikidata.org/entity/Q37 -Luxembourg,http://www.wikidata.org/entity/Q32 -Madagascar,http://www.wikidata.org/entity/Q1019 -Malawi,http://www.wikidata.org/entity/Q1020 -Malaysia,http://www.wikidata.org/entity/Q833 -Maldives,http://www.wikidata.org/entity/Q826 -Mali,http://www.wikidata.org/entity/Q912 -Malta,http://www.wikidata.org/entity/Q233 -Marshall Islands,http://www.wikidata.org/entity/Q709 -Mauritania,http://www.wikidata.org/entity/Q1025 -Mauritius,http://www.wikidata.org/entity/Q1027 -Mexico,http://www.wikidata.org/entity/Q96 -Moldova,http://www.wikidata.org/entity/Q217 -Monaco,http://www.wikidata.org/entity/Q235 -Mongolia,http://www.wikidata.org/entity/Q711 -Montenegro,http://www.wikidata.org/entity/Q236 -Morocco,http://www.wikidata.org/entity/Q1028 -Mozambique,http://www.wikidata.org/entity/Q1029 -Myanmar,http://www.wikidata.org/entity/Q836 -Namibia,http://www.wikidata.org/entity/Q1030 -Nauru,http://www.wikidata.org/entity/Q697 -Netherlands: Milheeze,https://www.wikidata.org/wiki/Q3314115 -Nepal,http://www.wikidata.org/entity/Q837 -New Zealand,http://www.wikidata.org/entity/Q664 -Nicaragua,http://www.wikidata.org/entity/Q811 -Niger,http://www.wikidata.org/entity/Q1032 -Nigeria,http://www.wikidata.org/entity/Q1033 -Nigeria: Lagos,http://www.wikidata.org/entity/Q8673 -North Korea,http://www.wikidata.org/entity/Q423 -North Macedonia,http://www.wikidata.org/entity/Q221 -Norway,http://www.wikidata.org/entity/Q20 -Oman,http://www.wikidata.org/entity/Q842 -Ottoman Empire,http://www.wikidata.org/entity/Q12560 -Pakistan,http://www.wikidata.org/entity/Q843 -Pakistan: Gilgit,http://www.wikidata.org/entity/Q609024 -Pakistan: KPK,http://www.wikidata.org/entity/Q183314 -Palau,http://www.wikidata.org/entity/Q695 -Panama,http://www.wikidata.org/entity/Q804 -Papua New Guinea,http://www.wikidata.org/entity/Q691 -Paraguay,http://www.wikidata.org/entity/Q733 -People's Republic of China,http://www.wikidata.org/entity/Q148 -Peru,http://www.wikidata.org/entity/Q419 -Philippines,http://www.wikidata.org/entity/Q928 -Poland,http://www.wikidata.org/entity/Q36 -Portugal,http://www.wikidata.org/entity/Q45 -Principality of Turov and Pinsk,http://www.wikidata.org/entity/Q671362 -Qatar,http://www.wikidata.org/entity/Q846 -Republic of Cyprus,http://www.wikidata.org/entity/Q229 -Republic of Geneva,http://www.wikidata.org/entity/Q23366230 -Republic of the Congo,http://www.wikidata.org/entity/Q971 -Romania,http://www.wikidata.org/entity/Q218 -Russia,http://www.wikidata.org/entity/Q159 -Rwanda,http://www.wikidata.org/entity/Q1037 -Saint Kitts and Nevis,http://www.wikidata.org/entity/Q763 -Saint Lucia,http://www.wikidata.org/entity/Q760 -Saint Vincent and the Grenadines,http://www.wikidata.org/entity/Q757 -Samoa,http://www.wikidata.org/entity/Q683 -San Marino,http://www.wikidata.org/entity/Q238 -São Tomé and Príncipe,http://www.wikidata.org/entity/Q1039 -Saudi Arabia,http://www.wikidata.org/entity/Q851 -Senegal,http://www.wikidata.org/entity/Q1041 -Serbia,http://www.wikidata.org/entity/Q403 -Seychelles,http://www.wikidata.org/entity/Q1042 -Sierra Leone,http://www.wikidata.org/entity/Q1044 -Singapore,http://www.wikidata.org/entity/Q334 -Slovakia,http://www.wikidata.org/entity/Q214 -Slovenia,http://www.wikidata.org/entity/Q215 -Solomon Islands,http://www.wikidata.org/entity/Q685 -Somalia,http://www.wikidata.org/entity/Q1045 -South Africa,http://www.wikidata.org/entity/Q258 -South Africa: KwaZulu-Natal,http://www.wikidata.org/entity/Q81725 -South African Republic,http://www.wikidata.org/entity/Q550374 -South Korea,http://www.wikidata.org/entity/Q884 -South Sudan,http://www.wikidata.org/entity/Q958 -Spain,http://www.wikidata.org/entity/Q29 -Spain: Valencia,http://www.wikidata.org/entity/Q8818 -Sri Lanka,http://www.wikidata.org/entity/Q854 -State of Los Altos,http://www.wikidata.org/entity/Q738264 -Sudan,http://www.wikidata.org/entity/Q1049 -Suriname,http://www.wikidata.org/entity/Q730 -Sweden,http://www.wikidata.org/entity/Q34 -Switzerland,http://www.wikidata.org/entity/Q39 -Syria,http://www.wikidata.org/entity/Q858 -Taiwan,http://www.wikidata.org/entity/Q865 -Tajikistan,http://www.wikidata.org/entity/Q863 -Tanzania,http://www.wikidata.org/entity/Q924 -Thailand,http://www.wikidata.org/entity/Q869 -The Bahamas,http://www.wikidata.org/entity/Q778 -The Gambia,http://www.wikidata.org/entity/Q1005 -Togo,http://www.wikidata.org/entity/Q945 -Tonga,http://www.wikidata.org/entity/Q678 -Trinidad and Tobago,http://www.wikidata.org/entity/Q754 -Tunisia,http://www.wikidata.org/entity/Q948 -Tunisia: Tunis,http://www.wikidata.org/entity/Q3572 -Turkey,http://www.wikidata.org/entity/Q43 -Turkmenistan,http://www.wikidata.org/entity/Q874 -Tuvalu,http://www.wikidata.org/entity/Q672 -Uganda,http://www.wikidata.org/entity/Q1036 -Ukraine,http://www.wikidata.org/entity/Q212 -United Arab Emirates,http://www.wikidata.org/entity/Q878 -United Arab Republic,http://www.wikidata.org/entity/Q170468 -United Kingdom,http://www.wikidata.org/entity/Q145 -United States of America,http://www.wikidata.org/entity/Q30 -Uruguay,http://www.wikidata.org/entity/Q77 -USA,http://www.wikidata.org/entity/Q30 -USA: AK,http://www.wikidata.org/entity/Q797 -USA: AL,http://www.wikidata.org/entity/Q173 -USA: AR,http://www.wikidata.org/entity/Q1612 -USA: AZ,http://www.wikidata.org/entity/Q816 -USA: CA,http://www.wikidata.org/entity/Q99 -"USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143 -USA: CO,http://www.wikidata.org/entity/Q1261 -USA: CT,http://www.wikidata.org/entity/Q779 -USA: DC,http://www.wikidata.org/entity/Q3551781 -USA: DE,http://www.wikidata.org/entity/Q1393 -USA: FL,http://www.wikidata.org/entity/Q812 -USA: GA,http://www.wikidata.org/entity/Q1428 -USA: HI,http://www.wikidata.org/entity/Q782 -USA: IA,http://www.wikidata.org/entity/Q1546 -USA: ID,http://www.wikidata.org/entity/Q1221 -USA: IL,http://www.wikidata.org/entity/Q1204 -USA: Illinois,http://www.wikidata.org/entity/Q1204 -USA: IN,http://www.wikidata.org/entity/Q1415 -USA: KS,http://www.wikidata.org/entity/Q1558 -USA: KY,http://www.wikidata.org/entity/Q1603 -USA: LA,http://www.wikidata.org/entity/Q1588 -"USA: New Orleans, LA",https://www.wikidata.org/wiki/Q34404 -USA: MA,http://www.wikidata.org/entity/Q771 -USA: MD,http://www.wikidata.org/entity/Q1391 -USA: ME,http://www.wikidata.org/entity/Q724 -USA: MI,http://www.wikidata.org/entity/Q1166 -USA: MN,http://www.wikidata.org/entity/Q1527 -USA: MO,http://www.wikidata.org/entity/Q1581 -USA: MS,http://www.wikidata.org/entity/Q1494 -USA: MT,http://www.wikidata.org/entity/Q1212 -USA: NC,http://www.wikidata.org/entity/Q1454 -USA: ND,http://www.wikidata.org/entity/Q1207 -USA: NE,http://www.wikidata.org/entity/Q1553 -USA: NH,http://www.wikidata.org/entity/Q759 -USA: NJ,http://www.wikidata.org/entity/Q1408 -USA: NM,http://www.wikidata.org/entity/Q1522 -USA: North Carolina,http://www.wikidata.org/entity/Q1454 -USA: NV,http://www.wikidata.org/entity/Q1227 -USA: NY,http://www.wikidata.org/entity/Q1384 -USA: New York,http://www.wikidata.org/entity/Q1384 -USA: OH,http://www.wikidata.org/entity/Q1397 -USA: OK,http://www.wikidata.org/entity/Q1649 -USA: OR,http://www.wikidata.org/entity/Q824 -USA: PA,http://www.wikidata.org/entity/Q1400 -USA: RI,http://www.wikidata.org/entity/Q1387 -"USA: San Francisco, CA",http://www.wikidata.org/entity/Q62 -USA: SC,http://www.wikidata.org/entity/Q1456 -USA: SD,http://www.wikidata.org/entity/Q1211 -"USA: Snohomish County, WA",http://www.wikidata.org/entity/Q110403 -USA: TN,http://www.wikidata.org/entity/Q1509 -USA: TX,http://www.wikidata.org/entity/Q1439 -USA: UT,http://www.wikidata.org/entity/Q829 -USA: VA,http://www.wikidata.org/entity/Q1370 -USA: VT,http://www.wikidata.org/entity/Q16551 -USA: WA,http://www.wikidata.org/entity/Q1223 -USA: WI,http://www.wikidata.org/entity/Q1537 -USA: WV,http://www.wikidata.org/entity/Q1371 -USA: WY,http://www.wikidata.org/entity/Q1214 -Uzbekistan,http://www.wikidata.org/entity/Q265 -Vanuatu,http://www.wikidata.org/entity/Q686 -Vatican City,http://www.wikidata.org/entity/Q237 -Venezuela,http://www.wikidata.org/entity/Q717 -Viet nam,http://www.wikidata.org/entity/Q881 -Viet Nam,http://www.wikidata.org/entity/Q881 -Viet Nam: Ho Chi Minh city,http://www.wikidata.org/entity/Q1854 -Vietnam,http://www.wikidata.org/entity/Q881 -Yemen,http://www.wikidata.org/entity/Q805 -Zambia,http://www.wikidata.org/entity/Q953 -Zimbabwe,http://www.wikidata.org/entity/Q954 +nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal swabs,http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831 +respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831 +naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831 +nasal swab specimen,http://purl.obolibrary.org/obo/NCIT_C155831 +pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 +respiratory secretion,http://purl.obolibrary.org/obo/NCIT_C155831 +mid-nasal swab,http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal (throat) washings,http://purl.obolibrary.org/obo/NCIT_C155831 +oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835 +throat swab,http://purl.obolibrary.org/obo/NCIT_C155835 +oro-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835 +buccal swab,http://purl.obolibrary.org/obo/NCIT_C155835 +throat washing,http://purl.obolibrary.org/obo/NCIT_C155835 +Throat Swab,http://purl.obolibrary.org/obo/NCIT_C155835 +throat (oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835 +bronchoalveolar lavage fluid,http://purl.obolibrary.org/obo/NCIT_C13195 +swab,http://purl.obolibrary.org/obo/NCIT_C13195 +oral swab,http://purl.obolibrary.org/obo/NCIT_C13195 +bronchoalveolar lavage,http://purl.obolibrary.org/obo/NCIT_C13195 +sputum,http://purl.obolibrary.org/obo/NCIT_C13278 +aspirate,http://purl.obolibrary.org/obo/NCIT_C13347 +stool,http://purl.obolibrary.org/obo/NCIT_C13234 +serum,http://purl.obolibrary.org/obo/NCIT_C13325 +saliva,http://purl.obolibrary.org/obo/NCIT_C13275 +nasal swab,http://purl.obolibrary.org/obo/NCIT_C132119 -- cgit v1.2.3 From 8f5853364360357e8424f21ea7ab05e73aa7a367 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Tue, 28 Apr 2020 22:52:55 +0200 Subject: updated to manage list fields and added new control on nasopharyngeal/throat swab --- scripts/from_genbank_to_fasta_and_yaml.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'scripts') diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 148a7e1..21ed3b2 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -37,8 +37,7 @@ if not os.path.exists(dir_metadata): tmp_list = [x.split('.')[0] for x in tmp_list] print(term, len(tmp_list)) - tmp_list=tmp_list - # tmp_list = tmp_list[0:2] # restricting to small run + #tmp_list = tmp_list[0:2] # restricting to small run id_set.update([x.split('.')[0] for x in tmp_list]) @@ -112,13 +111,13 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['sample_id'] = accession_version - info_for_yaml_dict['sample']['source_database_accession'] = "http://identifiers.org/insdc/"+accession_version+"#sequence" #accession is turned into resolvable URL/URI now + info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now # submitter info GBSeq_references = GBSeq.find('GBSeq_references') if GBSeq_references is not None: - info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq_references.iter('GBAuthor')]) + info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] GBReference = GBSeq_references.find('GBReference') if GBReference is not None: @@ -126,7 +125,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': if 'Submitted' in GBReference_journal.text: - info_for_yaml_dict['submitter']['submitter_name'] = GBReference_journal.text.split(') ')[1].split(',')[0].strip() + info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip() else: info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text @@ -146,8 +145,9 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if field_in_yaml == 'sequencing_coverage': # A regular expression would be better! try: - info_for_yaml_dict['technology'][field_in_yaml] = float( - tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) + info_for_yaml_dict['technology'][field_in_yaml] = [ + float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) + ] except ValueError: print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse) pass @@ -162,8 +162,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) new_seq_tec_list.append(seq_tec) - for n, seq_tec in enumerate(new_seq_tec_list): - info_for_yaml_dict['technology'][field_in_yaml + ('' if n == 0 else str(n + 1))] = seq_tec + info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list] else: info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse @@ -210,17 +209,14 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' if GBQualifier_value_text in term_to_uri_dict: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict[GBQualifier_value_text] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] else: if GBQualifier_value_text in ['NP/OP swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'np/np swab', 'np/op']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['oropharyngeal swab'] - elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab'] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] + elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal aspirate'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab'] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']] else: missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) elif GBQualifier_name_text == 'collection_date': -- cgit v1.2.3 From 5721e446df72d5563c7434bf1b583d3d0004cd09 Mon Sep 17 00:00:00 2001 From: lltommy Date: Wed, 29 Apr 2020 09:26:56 +0200 Subject: Reverting country list back, something went wrong there --- .../ncbi_countries.csv | 360 +++++++++++++++++++-- 1 file changed, 331 insertions(+), 29 deletions(-) (limited to 'scripts') diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index b81da36..6b43137 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -1,29 +1,331 @@ -nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 -nasopharyngeal swabs,http://purl.obolibrary.org/obo/NCIT_C155831 -nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 -nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831 -respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831 -naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 -nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831 -nasal swab specimen,http://purl.obolibrary.org/obo/NCIT_C155831 -pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 -respiratory secretion,http://purl.obolibrary.org/obo/NCIT_C155831 -mid-nasal swab,http://purl.obolibrary.org/obo/NCIT_C155831 -nasopharyngeal (throat) washings,http://purl.obolibrary.org/obo/NCIT_C155831 -oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835 -throat swab,http://purl.obolibrary.org/obo/NCIT_C155835 -oro-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835 -buccal swab,http://purl.obolibrary.org/obo/NCIT_C155835 -throat washing,http://purl.obolibrary.org/obo/NCIT_C155835 -Throat Swab,http://purl.obolibrary.org/obo/NCIT_C155835 -throat (oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835 -bronchoalveolar lavage fluid,http://purl.obolibrary.org/obo/NCIT_C13195 -swab,http://purl.obolibrary.org/obo/NCIT_C13195 -oral swab,http://purl.obolibrary.org/obo/NCIT_C13195 -bronchoalveolar lavage,http://purl.obolibrary.org/obo/NCIT_C13195 -sputum,http://purl.obolibrary.org/obo/NCIT_C13278 -aspirate,http://purl.obolibrary.org/obo/NCIT_C13347 -stool,http://purl.obolibrary.org/obo/NCIT_C13234 -serum,http://purl.obolibrary.org/obo/NCIT_C13325 -saliva,http://purl.obolibrary.org/obo/NCIT_C13275 -nasal swab,http://purl.obolibrary.org/obo/NCIT_C132119 +30.59 N 114.3 E,http://www.wikidata.org/entity/Q11746 +35.92 N 74.33 E,http://www.wikidata.org/entity/Q609024 +39.54 N 116.23 E,http://www.wikidata.org/entity/Q198244 +Afghanistan,http://www.wikidata.org/entity/Q889 +Albania,http://www.wikidata.org/entity/Q222 +Algeria,http://www.wikidata.org/entity/Q262 +Andorra,http://www.wikidata.org/entity/Q228 +Angola,http://www.wikidata.org/entity/Q916 +Antigua and Barbuda,http://www.wikidata.org/entity/Q781 +Argentina,http://www.wikidata.org/entity/Q414 +Armenia,http://www.wikidata.org/entity/Q399 +Australia,http://www.wikidata.org/entity/Q408 +Australia: Queensland,http://www.wikidata.org/entity/Q36074 +Australia: Victoria,http://www.wikidata.org/entity/Q36687 +Austria,http://www.wikidata.org/entity/Q40 +Azerbaijan,http://www.wikidata.org/entity/Q227 +Bahrain,http://www.wikidata.org/entity/Q398 +Bangladesh,http://www.wikidata.org/entity/Q902 +Barbados,http://www.wikidata.org/entity/Q244 +Belarus,http://www.wikidata.org/entity/Q184 +Belgium,http://www.wikidata.org/entity/Q31 +Belize,http://www.wikidata.org/entity/Q242 +Benin,http://www.wikidata.org/entity/Q962 +Bhutan,http://www.wikidata.org/entity/Q917 +Bolivia,http://www.wikidata.org/entity/Q750 +Bosnia and Herzegovina,http://www.wikidata.org/entity/Q225 +Botswana,http://www.wikidata.org/entity/Q963 +Brazil,http://www.wikidata.org/entity/Q155 +Brunei,http://www.wikidata.org/entity/Q921 +Bulgaria,http://www.wikidata.org/entity/Q219 +Burkina Faso,http://www.wikidata.org/entity/Q965 +Burundi,http://www.wikidata.org/entity/Q967 +Cambodia,http://www.wikidata.org/entity/Q424 +Cameroon,http://www.wikidata.org/entity/Q1009 +Canada,http://www.wikidata.org/entity/Q16 +Cape Verde,http://www.wikidata.org/entity/Q1011 +Central African Republic,http://www.wikidata.org/entity/Q929 +Chad,http://www.wikidata.org/entity/Q657 +Chile,http://www.wikidata.org/entity/Q298 +China,http://www.wikidata.org/entity/Q148 +China: Anhui,http://www.wikidata.org/entity/Q40956 +"China: Anhui, Fuyang":http://www.wikidata.org/entity/Q360584 +China: Beijing,http://www.wikidata.org/entity/Q956 +China: Chongqing,http://www.wikidata.org/entity/Q11725 +China: Fujian,http://www.wikidata.org/entity/Q41705 +China: Gansu,http://www.wikidata.org/entity/Q42392 +China: Guangdong,http://www.wikidata.org/entity/Q15175 +"China: Guangdong, Guangzhou",http://www.wikidata.org/entity/Q16572 +China: Guangxi Zhuang Autonomous Region,http://www.wikidata.org/entity/Q15176 +China: Guangzhou,http://www.wikidata.org/entity/Q16572 +China: Guizhou,http://www.wikidata.org/entity/Q47097 +China: Hangzhou,http://www.wikidata.org/entity/Q4970 +China: Hainan,http://www.wikidata.org/entity/Q42200 +China: Hebei,http://www.wikidata.org/entity/Q21208 +China: Heilongjiang,http://www.wikidata.org/entity/Q19206 +China: Henan,http://www.wikidata.org/entity/Q43684 +China: Hong Kong,http://www.wikidata.org/entity/Q8646 +China: HuaShang,http://www.wikidata.org/entity/Q148 +China: Hubei,http://www.wikidata.org/entity/Q46862 +"China: Hubei, Wuhan",http://www.wikidata.org/entity/Q11746 +China: Hunan,http://www.wikidata.org/entity/Q45761 +China: Inner Mongolia,http://www.wikidata.org/entity/Q41079 +China: Jiangsu,http://www.wikidata.org/entity/Q16963 +China: Jiangxi,http://www.wikidata.org/entity/Q57052 +China: Jilin,http://www.wikidata.org/entity/Q45208 +China: Liaoning,http://www.wikidata.org/entity/Q43934 +China: Macau,http://www.wikidata.org/entity/Q14773 +China: Nanchang,https://www.wikidata.org/wiki/Q171943 +China: Ningxia Hui Autonomous Region,http://www.wikidata.org/entity/Q57448 +China: Qinghai,http://www.wikidata.org/entity/Q45833 +China: Shaanxi,http://www.wikidata.org/entity/Q47974 +China: Shandong,http://www.wikidata.org/entity/Q43407 +China: Shanghai,http://www.wikidata.org/entity/Q8686 +China: Shanxi,http://www.wikidata.org/entity/Q46913 +China: Shenzhen,http://www.wikidata.org/entity/Q15174 +China: Sichuan,http://www.wikidata.org/entity/Q19770 +China: Tianjin,http://www.wikidata.org/entity/Q11736 +China: Tibet Autonomous Region,http://www.wikidata.org/entity/Q17269 +China: Wuhan,http://www.wikidata.org/entity/Q11746 +China:Wuhan,http://www.wikidata.org/entity/Q11746 +China: Xinjiang,http://www.wikidata.org/entity/Q34800 +China: Yunnan,http://www.wikidata.org/entity/Q43194 +China: Zhejiang,http://www.wikidata.org/entity/Q16967 +"China: Zhejiang, Hangzhou",http://www.wikidata.org/entity/Q4970 +Colombia,http://www.wikidata.org/entity/Q739 +Colombia: Antioquia,http://www.wikidata.org/entity/Q123304 +Comoros,http://www.wikidata.org/entity/Q970 +Costa Rica,http://www.wikidata.org/entity/Q800 +Croatia,http://www.wikidata.org/entity/Q224 +Cuba,http://www.wikidata.org/entity/Q241 +Czech Republic,http://www.wikidata.org/entity/Q213 +Democratic Republic of the Congo,http://www.wikidata.org/entity/Q974 +Denmark,http://www.wikidata.org/entity/Q35 +Djibouti,http://www.wikidata.org/entity/Q977 +Dominica,http://www.wikidata.org/entity/Q784 +Dominican Republic,http://www.wikidata.org/entity/Q786 +East Timor,http://www.wikidata.org/entity/Q574 +Ecuador,http://www.wikidata.org/entity/Q736 +Egypt,http://www.wikidata.org/entity/Q79 +El Salvador,http://www.wikidata.org/entity/Q792 +Equatorial Guinea,http://www.wikidata.org/entity/Q983 +Eritrea,http://www.wikidata.org/entity/Q986 +Estado Libre del Istmo,http://www.wikidata.org/entity/Q8842943 +Estonia,http://www.wikidata.org/entity/Q191 +Eswatini,http://www.wikidata.org/entity/Q1050 +Ethiopia,http://www.wikidata.org/entity/Q115 +Federated States of Micronesia,http://www.wikidata.org/entity/Q702 +Fiji,http://www.wikidata.org/entity/Q712 +Finland,http://www.wikidata.org/entity/Q33 +France,http://www.wikidata.org/entity/Q142 +Gabon,http://www.wikidata.org/entity/Q1000 +Georgia,http://www.wikidata.org/entity/Q230 +Germany,http://www.wikidata.org/entity/Q183 +Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718 +Ghana,http://www.wikidata.org/entity/Q117 +Greece,http://www.wikidata.org/entity/Q41 +Grenada,http://www.wikidata.org/entity/Q769 +Guatemala,http://www.wikidata.org/entity/Q774 +Guinea,http://www.wikidata.org/entity/Q1006 +Guinea-Bissau,http://www.wikidata.org/entity/Q1007 +Guyana,http://www.wikidata.org/entity/Q734 +Haiti,http://www.wikidata.org/entity/Q790 +Honduras,http://www.wikidata.org/entity/Q783 +Hungary,http://www.wikidata.org/entity/Q28 +Iceland,http://www.wikidata.org/entity/Q189 +Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389 +India,http://www.wikidata.org/entity/Q668 +India: Kerala State,http://www.wikidata.org/entity/Q1186 +India: Rajkot,http://www.wikidata.org/entity/Q1815245 +Indonesia,http://www.wikidata.org/entity/Q252 +Iran,http://www.wikidata.org/entity/Q794 +Iran: Qum,http://www.wikidata.org/entity/Q131664 +Iran: Tehran,http://www.wikidata.org/entity/Q3616 +Iraq,http://www.wikidata.org/entity/Q796 +Ireland,http://www.wikidata.org/entity/Q27 +Israel,http://www.wikidata.org/entity/Q801 +Italy,http://www.wikidata.org/entity/Q38 +Italy: Cagliari,http://www.wikidata.org/entity/Q1897 +Italy: Rome,http://www.wikidata.org/entity/Q220 +Ivory Coast,http://www.wikidata.org/entity/Q1008 +Jamaica,http://www.wikidata.org/entity/Q766 +Japan,http://www.wikidata.org/entity/Q17 +Jordan,http://www.wikidata.org/entity/Q810 +Kazakhstan,http://www.wikidata.org/entity/Q232 +Kenya,http://www.wikidata.org/entity/Q114 +Kingdom of Denmark,http://www.wikidata.org/entity/Q756617 +Kingdom of the Netherlands,http://www.wikidata.org/entity/Q29999 +Kiribati,http://www.wikidata.org/entity/Q710 +Kuwait,http://www.wikidata.org/entity/Q817 +Kyrgyzstan,http://www.wikidata.org/entity/Q813 +Laos,http://www.wikidata.org/entity/Q819 +Latvia,http://www.wikidata.org/entity/Q211 +Lebanon,http://www.wikidata.org/entity/Q822 +Lesotho,http://www.wikidata.org/entity/Q1013 +Liberia,http://www.wikidata.org/entity/Q1014 +Libya,http://www.wikidata.org/entity/Q1016 +Liechtenstein,http://www.wikidata.org/entity/Q347 +Lithuania,http://www.wikidata.org/entity/Q37 +Luxembourg,http://www.wikidata.org/entity/Q32 +Madagascar,http://www.wikidata.org/entity/Q1019 +Malawi,http://www.wikidata.org/entity/Q1020 +Malaysia,http://www.wikidata.org/entity/Q833 +Maldives,http://www.wikidata.org/entity/Q826 +Mali,http://www.wikidata.org/entity/Q912 +Malta,http://www.wikidata.org/entity/Q233 +Marshall Islands,http://www.wikidata.org/entity/Q709 +Mauritania,http://www.wikidata.org/entity/Q1025 +Mauritius,http://www.wikidata.org/entity/Q1027 +Mexico,http://www.wikidata.org/entity/Q96 +Moldova,http://www.wikidata.org/entity/Q217 +Monaco,http://www.wikidata.org/entity/Q235 +Mongolia,http://www.wikidata.org/entity/Q711 +Montenegro,http://www.wikidata.org/entity/Q236 +Morocco,http://www.wikidata.org/entity/Q1028 +Mozambique,http://www.wikidata.org/entity/Q1029 +Myanmar,http://www.wikidata.org/entity/Q836 +Namibia,http://www.wikidata.org/entity/Q1030 +Nauru,http://www.wikidata.org/entity/Q697 +Netherlands: Milheeze,https://www.wikidata.org/wiki/Q3314115 +Nepal,http://www.wikidata.org/entity/Q837 +New Zealand,http://www.wikidata.org/entity/Q664 +Nicaragua,http://www.wikidata.org/entity/Q811 +Niger,http://www.wikidata.org/entity/Q1032 +Nigeria,http://www.wikidata.org/entity/Q1033 +Nigeria: Lagos,http://www.wikidata.org/entity/Q8673 +North Korea,http://www.wikidata.org/entity/Q423 +North Macedonia,http://www.wikidata.org/entity/Q221 +Norway,http://www.wikidata.org/entity/Q20 +Oman,http://www.wikidata.org/entity/Q842 +Ottoman Empire,http://www.wikidata.org/entity/Q12560 +Pakistan,http://www.wikidata.org/entity/Q843 +Pakistan: Gilgit,http://www.wikidata.org/entity/Q609024 +Pakistan: KPK,http://www.wikidata.org/entity/Q183314 +Palau,http://www.wikidata.org/entity/Q695 +Panama,http://www.wikidata.org/entity/Q804 +Papua New Guinea,http://www.wikidata.org/entity/Q691 +Paraguay,http://www.wikidata.org/entity/Q733 +People's Republic of China,http://www.wikidata.org/entity/Q148 +Peru,http://www.wikidata.org/entity/Q419 +Philippines,http://www.wikidata.org/entity/Q928 +Poland,http://www.wikidata.org/entity/Q36 +Portugal,http://www.wikidata.org/entity/Q45 +Principality of Turov and Pinsk,http://www.wikidata.org/entity/Q671362 +Qatar,http://www.wikidata.org/entity/Q846 +Republic of Cyprus,http://www.wikidata.org/entity/Q229 +Republic of Geneva,http://www.wikidata.org/entity/Q23366230 +Republic of the Congo,http://www.wikidata.org/entity/Q971 +Romania,http://www.wikidata.org/entity/Q218 +Russia,http://www.wikidata.org/entity/Q159 +Rwanda,http://www.wikidata.org/entity/Q1037 +Saint Kitts and Nevis,http://www.wikidata.org/entity/Q763 +Saint Lucia,http://www.wikidata.org/entity/Q760 +Saint Vincent and the Grenadines,http://www.wikidata.org/entity/Q757 +Samoa,http://www.wikidata.org/entity/Q683 +San Marino,http://www.wikidata.org/entity/Q238 +São Tomé and Príncipe,http://www.wikidata.org/entity/Q1039 +Saudi Arabia,http://www.wikidata.org/entity/Q851 +Senegal,http://www.wikidata.org/entity/Q1041 +Serbia,http://www.wikidata.org/entity/Q403 +Seychelles,http://www.wikidata.org/entity/Q1042 +Sierra Leone,http://www.wikidata.org/entity/Q1044 +Singapore,http://www.wikidata.org/entity/Q334 +Slovakia,http://www.wikidata.org/entity/Q214 +Slovenia,http://www.wikidata.org/entity/Q215 +Solomon Islands,http://www.wikidata.org/entity/Q685 +Somalia,http://www.wikidata.org/entity/Q1045 +South Africa,http://www.wikidata.org/entity/Q258 +South Africa: KwaZulu-Natal,http://www.wikidata.org/entity/Q81725 +South African Republic,http://www.wikidata.org/entity/Q550374 +South Korea,http://www.wikidata.org/entity/Q884 +South Sudan,http://www.wikidata.org/entity/Q958 +Spain,http://www.wikidata.org/entity/Q29 +Spain: Valencia,http://www.wikidata.org/entity/Q8818 +Sri Lanka,http://www.wikidata.org/entity/Q854 +State of Los Altos,http://www.wikidata.org/entity/Q738264 +Sudan,http://www.wikidata.org/entity/Q1049 +Suriname,http://www.wikidata.org/entity/Q730 +Sweden,http://www.wikidata.org/entity/Q34 +Switzerland,http://www.wikidata.org/entity/Q39 +Syria,http://www.wikidata.org/entity/Q858 +Taiwan,http://www.wikidata.org/entity/Q865 +Tajikistan,http://www.wikidata.org/entity/Q863 +Tanzania,http://www.wikidata.org/entity/Q924 +Thailand,http://www.wikidata.org/entity/Q869 +The Bahamas,http://www.wikidata.org/entity/Q778 +The Gambia,http://www.wikidata.org/entity/Q1005 +Togo,http://www.wikidata.org/entity/Q945 +Tonga,http://www.wikidata.org/entity/Q678 +Trinidad and Tobago,http://www.wikidata.org/entity/Q754 +Tunisia,http://www.wikidata.org/entity/Q948 +Tunisia: Tunis,http://www.wikidata.org/entity/Q3572 +Turkey,http://www.wikidata.org/entity/Q43 +Turkmenistan,http://www.wikidata.org/entity/Q874 +Tuvalu,http://www.wikidata.org/entity/Q672 +Uganda,http://www.wikidata.org/entity/Q1036 +Ukraine,http://www.wikidata.org/entity/Q212 +United Arab Emirates,http://www.wikidata.org/entity/Q878 +United Arab Republic,http://www.wikidata.org/entity/Q170468 +United Kingdom,http://www.wikidata.org/entity/Q145 +United States of America,http://www.wikidata.org/entity/Q30 +Uruguay,http://www.wikidata.org/entity/Q77 +USA,http://www.wikidata.org/entity/Q30 +USA: AK,http://www.wikidata.org/entity/Q797 +USA: AL,http://www.wikidata.org/entity/Q173 +USA: AR,http://www.wikidata.org/entity/Q1612 +USA: AZ,http://www.wikidata.org/entity/Q816 +USA: CA,http://www.wikidata.org/entity/Q99 +"USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143 +USA: CO,http://www.wikidata.org/entity/Q1261 +USA: CT,http://www.wikidata.org/entity/Q779 +USA: DC,http://www.wikidata.org/entity/Q3551781 +USA: DE,http://www.wikidata.org/entity/Q1393 +USA: FL,http://www.wikidata.org/entity/Q812 +USA: GA,http://www.wikidata.org/entity/Q1428 +USA: HI,http://www.wikidata.org/entity/Q782 +USA: IA,http://www.wikidata.org/entity/Q1546 +USA: ID,http://www.wikidata.org/entity/Q1221 +USA: IL,http://www.wikidata.org/entity/Q1204 +USA: Illinois,http://www.wikidata.org/entity/Q1204 +USA: IN,http://www.wikidata.org/entity/Q1415 +USA: KS,http://www.wikidata.org/entity/Q1558 +USA: KY,http://www.wikidata.org/entity/Q1603 +USA: LA,http://www.wikidata.org/entity/Q1588 +"USA: New Orleans, LA",https://www.wikidata.org/wiki/Q34404 +USA: MA,http://www.wikidata.org/entity/Q771 +USA: MD,http://www.wikidata.org/entity/Q1391 +USA: ME,http://www.wikidata.org/entity/Q724 +USA: MI,http://www.wikidata.org/entity/Q1166 +USA: MN,http://www.wikidata.org/entity/Q1527 +USA: MO,http://www.wikidata.org/entity/Q1581 +USA: MS,http://www.wikidata.org/entity/Q1494 +USA: MT,http://www.wikidata.org/entity/Q1212 +USA: NC,http://www.wikidata.org/entity/Q1454 +USA: ND,http://www.wikidata.org/entity/Q1207 +USA: NE,http://www.wikidata.org/entity/Q1553 +USA: NH,http://www.wikidata.org/entity/Q759 +USA: NJ,http://www.wikidata.org/entity/Q1408 +USA: NM,http://www.wikidata.org/entity/Q1522 +USA: North Carolina,http://www.wikidata.org/entity/Q1454 +USA: NV,http://www.wikidata.org/entity/Q1227 +USA: NY,http://www.wikidata.org/entity/Q1384 +USA: New York,http://www.wikidata.org/entity/Q1384 +USA: OH,http://www.wikidata.org/entity/Q1397 +USA: OK,http://www.wikidata.org/entity/Q1649 +USA: OR,http://www.wikidata.org/entity/Q824 +USA: PA,http://www.wikidata.org/entity/Q1400 +USA: RI,http://www.wikidata.org/entity/Q1387 +"USA: San Francisco, CA",http://www.wikidata.org/entity/Q62 +USA: SC,http://www.wikidata.org/entity/Q1456 +USA: SD,http://www.wikidata.org/entity/Q1211 +"USA: Snohomish County, WA",http://www.wikidata.org/entity/Q110403 +USA: TN,http://www.wikidata.org/entity/Q1509 +USA: TX,http://www.wikidata.org/entity/Q1439 +USA: UT,http://www.wikidata.org/entity/Q829 +USA: VA,http://www.wikidata.org/entity/Q1370 +USA: VT,http://www.wikidata.org/entity/Q16551 +USA: WA,http://www.wikidata.org/entity/Q1223 +USA: WI,http://www.wikidata.org/entity/Q1537 +USA: WV,http://www.wikidata.org/entity/Q1371 +USA: WY,http://www.wikidata.org/entity/Q1214 +Uzbekistan,http://www.wikidata.org/entity/Q265 +Vanuatu,http://www.wikidata.org/entity/Q686 +Vatican City,http://www.wikidata.org/entity/Q237 +Venezuela,http://www.wikidata.org/entity/Q717 +Viet nam,http://www.wikidata.org/entity/Q881 +Viet Nam,http://www.wikidata.org/entity/Q881 +Viet Nam: Ho Chi Minh city,http://www.wikidata.org/entity/Q1854 +Vietnam,http://www.wikidata.org/entity/Q881 +Yemen,http://www.wikidata.org/entity/Q805 +Zambia,http://www.wikidata.org/entity/Q953 +Zimbabwe,http://www.wikidata.org/entity/Q954 \ No newline at end of file -- cgit v1.2.3 From 61a083081cd2d70a25eba4cdae4f85c774b25b95 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Wed, 29 Apr 2020 17:03:25 +0200 Subject: the date is now handled more formally all the date are saved as "YYYY-MM-DD"--- scripts/from_genbank_to_fasta_and_yaml.py | 34 +++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 21ed3b2..0175d3c 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -7,6 +7,8 @@ import xml.etree.ElementTree as ET import json import os +from dateutil import parser + num_ids_for_request = 100 dir_metadata = 'metadata_from_nuccore' @@ -221,7 +223,32 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) elif GBQualifier_name_text == 'collection_date': # TO_DO: which format we will use? - info_for_yaml_dict['sample']['collection_date'] = GBQualifier_value_text + date_to_write = GBQualifier_value_text + + if len(GBQualifier_value_text.split('-')) == 1: + if int(GBQualifier_value_text) < 2020: + date_to_write = "15 12 {}".format(GBQualifier_value_text) + else: + date_to_write = "15 01 {}".format(GBQualifier_value_text) + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 2: + date_to_write += '-15' + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 3: + GBQualifier_value_text_list = GBQualifier_value_text.split('-') + + if GBQualifier_value_text_list[1].isalpha(): + date_to_write = GBQualifier_value_text_list[1] + ' ' + GBQualifier_value_text_list[0] + ' ' + GBQualifier_value_text_list[2] + + info_for_yaml_dict['sample']['collection_date'] = date_to_write elif GBQualifier_name_text in ['lat_lon', 'country']: if GBQualifier_value_text == 'Hong Kong': GBQualifier_value_text = 'China: Hong Kong' @@ -233,7 +260,10 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text elif GBQualifier_name_text == 'note': - info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text + else: + info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text elif GBQualifier_name_text == 'isolate': info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text elif GBQualifier_name_text == 'db_xref': -- cgit v1.2.3 From 347b8dce36832c6d3e379d81b3efefcbc88a3117 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 30 Apr 2020 10:22:27 -0400 Subject: Wrap import script to run as a workflow Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- scripts/docker/Dockerfile | 10 ++++++++++ scripts/from_genbank_to_fasta_and_yaml.py | 30 +++++++++++++++--------------- scripts/import.cwl | 24 ++++++++++++++++++++++++ scripts/import_to_arvados.py | 13 +++++++++++++ 4 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 scripts/docker/Dockerfile create mode 100644 scripts/import.cwl create mode 100644 scripts/import_to_arvados.py (limited to 'scripts') diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile new file mode 100644 index 0000000..5bd38dd --- /dev/null +++ b/scripts/docker/Dockerfile @@ -0,0 +1,10 @@ +FROM debian:10 + +RUN apt-get update && \ + apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \ + python3 python3-pip python3-setuptools python3-dev python-pycurl \ + clustalw python3-biopython libcurl4-openssl-dev build-essential \ + libssl-dev && \ + apt-get clean + +RUN pip3 install bh20-seq-uploader \ No newline at end of file diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 21ed3b2..2564b51 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -43,13 +43,13 @@ if not os.path.exists(dir_metadata): print(term_list, len(id_set)) - with open(path_ncbi_virus_accession) as f: - tmp_list = [line.strip('\n') for line in f] - - print('NCBI Virus', len(tmp_list)) - id_set.update(tmp_list) - - print(term_list + ['NCBI Virus'], len(id_set)) + if os.path.exists(path_ncbi_virus_accession): + with open(path_ncbi_virus_accession) as f: + tmp_list = [line.strip('\n') for line in f] + print('NCBI Virus', len(tmp_list)) + id_set.update(tmp_list) + term_list.append('NCBI Virus') + print(term_list, len(id_set)) for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i)) @@ -85,7 +85,7 @@ if not os.path.exists(dir_fasta_and_yaml): os.makedirs(dir_fasta_and_yaml) missing_value_list = [] - + for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: tree = ET.parse(path_metadata_xxx_xml) GBSet = tree.getroot() @@ -109,20 +109,20 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) 'submitter': {} } - + info_for_yaml_dict['sample']['sample_id'] = accession_version info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now - - + + # submitter info GBSeq_references = GBSeq.find('GBSeq_references') if GBSeq_references is not None: info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] - + GBReference = GBSeq_references.find('GBReference') if GBReference is not None: GBReference_journal = GBReference.find('GBReference_journal') - + if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': if 'Submitted' in GBReference_journal.text: info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] @@ -207,7 +207,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) elif GBQualifier_name_text == 'isolation_source': if GBQualifier_value_text.upper() in term_to_uri_dict: GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' - + if GBQualifier_value_text in term_to_uri_dict: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] else: @@ -250,7 +250,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) - + if len(missing_value_list) > 0: with open('missing_terms.tsv', 'w') as fw: fw.write('\n'.join(missing_value_list)) diff --git a/scripts/import.cwl b/scripts/import.cwl new file mode 100644 index 0000000..81752c8 --- /dev/null +++ b/scripts/import.cwl @@ -0,0 +1,24 @@ +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 +inputs: + scripts: + type: File + default: + class: File + location: import_to_arvados.py + inputBinding: {position: 1} + importScript: + type: File + default: + class: File + location: from_genbank_to_fasta_and_yaml.py + inputBinding: {position: 2} +outputs: [] +requirements: + DockerRequirement: + dockerPull: bh20-seq-uploader/import + NetworkAccess: + networkAccess: true + WorkReuse: + workReuse: false diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py new file mode 100644 index 0000000..07b7d71 --- /dev/null +++ b/scripts/import_to_arvados.py @@ -0,0 +1,13 @@ +import os +import subprocess +import glob +import sys + +os.chdir(os.environ["TMPDIR"]) +subprocess.run(sys.argv[1]) + +os.chdir("fasta_and_yaml") +fasta_files = glob.glob("*.fasta") + +for f in fasta_files: + subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]]) -- cgit v1.2.3 From 02ecf15cdc04270e06f2be7457bb1b284eeddc56 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 30 Apr 2020 10:46:56 -0400 Subject: Import script fixes Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- scripts/import.cwl | 8 +++++++- scripts/import_to_arvados.py | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'scripts') diff --git a/scripts/import.cwl b/scripts/import.cwl index 81752c8..d84516b 100644 --- a/scripts/import.cwl +++ b/scripts/import.cwl @@ -14,6 +14,12 @@ inputs: class: File location: from_genbank_to_fasta_and_yaml.py inputBinding: {position: 2} + dict: + type: Directory + default: + class: Directory + location: dict_ontology_standardization + inputBinding: {position: 3} outputs: [] requirements: DockerRequirement: @@ -21,4 +27,4 @@ requirements: NetworkAccess: networkAccess: true WorkReuse: - workReuse: false + enableReuse: false diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py index 07b7d71..78cd13d 100644 --- a/scripts/import_to_arvados.py +++ b/scripts/import_to_arvados.py @@ -4,6 +4,7 @@ import glob import sys os.chdir(os.environ["TMPDIR"]) +os.symlink(sys.argv[2], "dict_ontology_standardization") subprocess.run(sys.argv[1]) os.chdir("fasta_and_yaml") -- cgit v1.2.3 From 6165495618b9c2ad3ad7b8bd95ed807d022ebf1c Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Thu, 30 Apr 2020 18:27:07 +0200 Subject: fixed UO_0000036 for year --- scripts/from_genbank_to_fasta_and_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'scripts') diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 900f087..6f046ea 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -200,7 +200,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if 'age' in GBQualifier_value_text: info_for_yaml_dict['host']['host_age'] = int(GBQualifier_value_text_list[2].split('age ')[1]) - info_for_yaml_dict['host']['host_age_unit'] = 'year' + info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036' elif GBQualifier_name_text == 'collected_by': if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]): info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text -- cgit v1.2.3