From 6032a373003affa641ca1e70a44c29a232b5b3ed Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 28 Apr 2020 20:31:42 +0200 Subject: Changes to the structure - we use lists now instead of strings where it makes sense. This allows us to have multiple values where in makes sense --- bh20sequploader/bh20seq-options.yml | 30 ------------------ bh20sequploader/bh20seq-schema.yml | 52 +++++++++---------------------- bh20sequploader/bh20seq-shex.rdf | 11 ++++--- example/maximum_metadata_example.yaml | 44 ++++++++++++++++++++++++++ example/metadata.yaml | 43 ------------------------- example/minimal_example.yaml | 18 ----------- example/minimal_metadata_example.yaml | 0 scripts/from_genbank_to_fasta_and_yaml.py | 2 +- 8 files changed, 65 insertions(+), 135 deletions(-) create mode 100644 example/maximum_metadata_example.yaml delete mode 100644 example/metadata.yaml delete mode 100644 example/minimal_example.yaml create mode 100644 example/minimal_metadata_example.yaml diff --git a/bh20sequploader/bh20seq-options.yml b/bh20sequploader/bh20seq-options.yml index 104ed6c..c553f41 100644 --- a/bh20sequploader/bh20seq-options.yml +++ b/bh20sequploader/bh20seq-options.yml @@ -35,38 +35,8 @@ sample_sequencing_technology: Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 -sample_sequencing_technology2: - Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 - Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566 - Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567 - Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 - Illumina: http://purl.obolibrary.org/obo/OBI_0000759 - IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 - Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894 - Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 - Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 - Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 - -sample_sequencing_technology3: - Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 - Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566 - Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567 - Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 - Illumina: http://purl.obolibrary.org/obo/OBI_0000759 - IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 - Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894 - Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 - Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 - Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 - specimen_source: nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 sputum: http://purl.obolibrary.org/obo/NCIT_C13278 bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 - -specimen_source2: - nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 - oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 - sputum: http://purl.obolibrary.org/obo/NCIT_C13278 - bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index ea813a0..f36a6e6 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -48,6 +48,7 @@ $graph: type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C25688 + _type: "@id" host_treatment: doc: Process in which the act is intended to modify or alter host status type: string? @@ -55,7 +56,7 @@ $graph: _id: http://www.ebi.ac.uk/efo/EFO_0000727 host_vaccination: doc: List of vaccines given to the host - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/VO_0000002 additional_host_information: @@ -96,14 +97,7 @@ $graph: _id: http://purl.obolibrary.org/obo/NCIT_C41206 specimen_source: doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0001479 - _type: "@id" - noLinkCheck: true - specimen_source2: - doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb) - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001479 _type: "@id" @@ -119,10 +113,11 @@ $graph: jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 source_database_accession: - doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here - type: string? + doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here. Please use a resolveable URL (e.g. http://identifiers.org/insdc/LC522350.1#sequence) + type: string[]? jsonldPredicate: _id: http://edamontology.org/data_2091 + _type: "@id" - name: virusSchema type: record @@ -145,21 +140,7 @@ $graph: fields: sample_sequencing_technology: doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0600047 - _type: "@id" - noLinkCheck: true - sample_sequencing_technology2: - doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0600047 - _type: "@id" - noLinkCheck: true - sample_sequencing_technology3: - doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0600047 _type: "@id" @@ -170,13 +151,8 @@ $graph: jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: - doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) - type: float? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/FLU_0000848 - sequencing_coverage2: - doc: If a second sequence technology was used you can submit its coverage here - type: float? + doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. [100]) - if multiple technologies were used multiple float values can be submitted e.g. [100, 20] + type: int[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 additional_technology_information: @@ -189,13 +165,13 @@ $graph: type: record fields: authors: - doc: Name of the author(s) - type: string + doc: Name(s) of the author(s) + type: string[] jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C42781 submitter_name: - doc: Name of the submitter - type: string? + doc: Name of the submitter(s) + type: string[]? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000116 submitter_address: @@ -228,7 +204,7 @@ $graph: _id: http://purl.obolibrary.org/obo/NCIT_C19026 submitter_orcid: doc: ORCID of the submitter as a full URI, e.g. https://orcid.org/0000-0002-1825-0097 - type: string? + type: string[]? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000115 _type: "@id" diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index c3b0ae1..4ec957d 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -25,7 +25,7 @@ PREFIX wikidata: obo:NCIT_C42574 [ obo:UO_~ ] ?; obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ; efo:EFO_0000727 xsd:string ?; - obo:VO_0000002 xsd:string ?; + obo:VO_0000002 xsd:string {0,10}; sio:SIO_001167 xsd:string ?; } @@ -38,25 +38,26 @@ PREFIX wikidata: obo:OBI_0001479 IRI {0,2}; obo:OBI_0001472 xsd:string ?; sio:SIO_001167 xsd:string ?; + edam:data_2091 IRI {0,3}; } :submitterShape { - obo:NCIT_C42781 xsd:string ; - sio:SIO_000116 xsd:string ?; + obo:NCIT_C42781 xsd:string * ; + sio:SIO_000116 xsd:string *; sio:SIO_000172 xsd:string ?; obo:NCIT_C37984 xsd:string ?; obo:OBI_0600047 xsd:string ?; obo:NCIT_C37900 xsd:string ?; efo:EFO_0001741 xsd:string ?; obo:NCIT_C19026 xsd:string ?; - sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?; + sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/ {0,10}; sio:SIO_001167 xsd:string ?; } :technologyShape { obo:OBI_0600047 IRI {0,3} ; efo:EFO_0002699 xsd:string ?; - obo:FLU_0000848 xsd:double {0,2}; + obo:FLU_0000848 xsd:integer {0,2}; sio:SIO_001167 xsd:string ?; } diff --git a/example/maximum_metadata_example.yaml b/example/maximum_metadata_example.yaml new file mode 100644 index 0000000..0a6d910 --- /dev/null +++ b/example/maximum_metadata_example.yaml @@ -0,0 +1,44 @@ +id: placeholder + +host: + host_id: XX1 + host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 + host_sex: http://purl.obolibrary.org/obo/PATO_0000384 + host_age: 20 + host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 + host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269 + host_treatment: Process in which the act is intended to modify or alter host status (Compounds) + host_vaccination: [vaccines1,vaccine2] + additional_host_information: Optional free text field for addtional information + +sample: + sample_id: Id of the sample as defined by the submitter + collector_name: Name of the person that took the sample + collecting_institution: Institute that was responsible of sampling + specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835] + collection_date: "2020-01-01" + collection_location: http://www.wikidata.org/entity/Q148 + sample_storage_conditions: frozen specimen + source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] + additional_collection_information: Optional free text field for addtional information + +virus: + virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 + virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 + +technology: + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173] + sequence_assembly_method: Protocol used for assembly + sequencing_coverage: [70, 100] + additional_technology_information: Optional free text field for addtional information + +submitter: + submitter_name: [John Doe] + submitter_address: John Doe's adress + originating_lab: John Doe kitchen + lab_address: John Doe's address + provider_sample_id: XXX1 + submitter_sample_id: XXX2 + authors: [John Doe, Joe Boe, Jonny Oe] + submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001] + additional_submitter_information: Optional free text field for addtional information \ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml deleted file mode 100644 index a76616c..0000000 --- a/example/metadata.yaml +++ /dev/null @@ -1,43 +0,0 @@ -id: placeholder - -host: - host_id: XX1 - host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 - host_sex: http://purl.obolibrary.org/obo/NCIT_C27993 - host_age: 20 - host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 - host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269 - host_treatment: Process in which the act is intended to modify or alter host status (Compounds) - host_vaccination: List of vaccines given to the host (RRIDs?) - additional_host_information: Field for additional host information - -sample: - sample_id: Id of the sample as defined by the submitter - collector_name: Name of the person that took the sample - collecting_institution: Institute that was responsible of sampling - specimen_source: http://purl.obolibrary.org/obo/NCIT_C155831 - specimen_source2: http://purl.obolibrary.org/obo/NCIT_C155835 - collection_date: "2020-01-01" - collection_location: http://www.wikidata.org/entity/Q148 - sample_storage_conditions: XXX - additional_collection_information: XXX - -virus: - virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 - virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 - -technology: - sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173 - sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173 - sequence_assembly_method: Protocol used for assembly - sequencing_coverage: 70 - -submitter: - submitter_name: John Doe - submitter_address: John Doe's adress - originating_lab: John Doe kitchen - lab_address: John Doe's address - provider_sample_id: HmX - submitter_sample_id: xXx - authors: John Doe et all - submitter_orcid: https://orcid.org/0000-0000-0000-0000 \ No newline at end of file diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml deleted file mode 100644 index 0e36a25..0000000 --- a/example/minimal_example.yaml +++ /dev/null @@ -1,18 +0,0 @@ -id: placeholder - -host: - host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 - -sample: - sample_id: XX - collection_date: 2020-01 - collection_location: http://www.wikidata.org/entity/Q148 - -virus: - virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 - -technology: - sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632 - -submitter: - authors: John Doe \ No newline at end of file diff --git a/example/minimal_metadata_example.yaml b/example/minimal_metadata_example.yaml new file mode 100644 index 0000000..e69de29 diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 5257bd1..148a7e1 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -112,7 +112,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['sample_id'] = accession_version - info_for_yaml_dict['sample']['source_database_accession'] = accession_version + info_for_yaml_dict['sample']['source_database_accession'] = "http://identifiers.org/insdc/"+accession_version+"#sequence" #accession is turned into resolvable URL/URI now # submitter info -- cgit v1.2.3