From 6032a373003affa641ca1e70a44c29a232b5b3ed Mon Sep 17 00:00:00 2001
From: lltommy
Date: Tue, 28 Apr 2020 20:31:42 +0200
Subject: Changes to the structure - we use lists now instead of strings where
it makes sense. This allows us to have multiple values where it makes sense
---
bh20sequploader/bh20seq-options.yml | 30 ------------------
bh20sequploader/bh20seq-schema.yml | 52 +++++++++----------------------
bh20sequploader/bh20seq-shex.rdf | 11 ++++---
example/maximum_metadata_example.yaml | 44 ++++++++++++++++++++++++++
example/metadata.yaml | 43 -------------------------
example/minimal_example.yaml | 18 -----------
example/minimal_metadata_example.yaml | 0
scripts/from_genbank_to_fasta_and_yaml.py | 2 +-
8 files changed, 65 insertions(+), 135 deletions(-)
create mode 100644 example/maximum_metadata_example.yaml
delete mode 100644 example/metadata.yaml
delete mode 100644 example/minimal_example.yaml
create mode 100644 example/minimal_metadata_example.yaml
diff --git a/bh20sequploader/bh20seq-options.yml b/bh20sequploader/bh20seq-options.yml
index 104ed6c..c553f41 100644
--- a/bh20sequploader/bh20seq-options.yml
+++ b/bh20sequploader/bh20seq-options.yml
@@ -35,38 +35,8 @@ sample_sequencing_technology:
Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
-sample_sequencing_technology2:
- Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
- Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566
- Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567
- Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205
- Illumina: http://purl.obolibrary.org/obo/OBI_0000759
- IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894
- Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894
- Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632
- Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
- Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
-
-sample_sequencing_technology3:
- Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
- Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566
- Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567
- Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205
- Illumina: http://purl.obolibrary.org/obo/OBI_0000759
- IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894
- Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894
- Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632
- Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
- Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
-
specimen_source:
nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831
oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835
sputum: http://purl.obolibrary.org/obo/NCIT_C13278
bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195
-
-specimen_source2:
- nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831
- oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835
- sputum: http://purl.obolibrary.org/obo/NCIT_C13278
- bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index ea813a0..f36a6e6 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -48,6 +48,7 @@ $graph:
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C25688
+ _type: "@id"
host_treatment:
doc: Process in which the act is intended to modify or alter host status
type: string?
@@ -55,7 +56,7 @@ $graph:
_id: http://www.ebi.ac.uk/efo/EFO_0000727
host_vaccination:
doc: List of vaccines given to the host
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/VO_0000002
additional_host_information:
@@ -96,14 +97,7 @@ $graph:
_id: http://purl.obolibrary.org/obo/NCIT_C41206
specimen_source:
doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab)
- type: string?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/OBI_0001479
- _type: "@id"
- noLinkCheck: true
- specimen_source2:
- doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb)
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
@@ -119,10 +113,11 @@ $graph:
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_001167
source_database_accession:
- doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here
- type: string?
+ doc: If data is deposited at a public resource (e.g. Genbank, ENA) enter the Accession Id here. Please use a resolvable URL (e.g. http://identifiers.org/insdc/LC522350.1#sequence)
+ type: string[]?
jsonldPredicate:
_id: http://edamontology.org/data_2091
+ _type: "@id"
- name: virusSchema
type: record
@@ -145,21 +140,7 @@ $graph:
fields:
sample_sequencing_technology:
doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
- type: string?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/OBI_0600047
- _type: "@id"
- noLinkCheck: true
- sample_sequencing_technology2:
- doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
- type: string?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/OBI_0600047
- _type: "@id"
- noLinkCheck: true
- sample_sequencing_technology3:
- doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
@@ -170,13 +151,8 @@ $graph:
jsonldPredicate:
_id: http://www.ebi.ac.uk/efo/EFO_0002699
sequencing_coverage:
- doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x)
- type: float?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/FLU_0000848
- sequencing_coverage2:
- doc: If a second sequence technology was used you can submit its coverage here
- type: float?
+ doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. [100]) - if multiple technologies were used, multiple integer values can be submitted, e.g. [100, 20]
+ type: int[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
additional_technology_information:
@@ -189,13 +165,13 @@ $graph:
type: record
fields:
authors:
- doc: Name of the author(s)
- type: string
+ doc: Name(s) of the author(s)
+ type: string[]
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C42781
submitter_name:
- doc: Name of the submitter
- type: string?
+ doc: Name of the submitter(s)
+ type: string[]?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000116
submitter_address:
@@ -228,7 +204,7 @@ $graph:
_id: http://purl.obolibrary.org/obo/NCIT_C19026
submitter_orcid:
doc: ORCID of the submitter as a full URI, e.g. https://orcid.org/0000-0002-1825-0097
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
_type: "@id"
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index c3b0ae1..4ec957d 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -25,7 +25,7 @@ PREFIX wikidata:
obo:NCIT_C42574 [ obo:UO_~ ] ?;
obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ;
efo:EFO_0000727 xsd:string ?;
- obo:VO_0000002 xsd:string ?;
+ obo:VO_0000002 xsd:string {0,10};
sio:SIO_001167 xsd:string ?;
}
@@ -38,25 +38,26 @@ PREFIX wikidata:
obo:OBI_0001479 IRI {0,2};
obo:OBI_0001472 xsd:string ?;
sio:SIO_001167 xsd:string ?;
+ edam:data_2091 IRI {0,3};
}
:submitterShape {
- obo:NCIT_C42781 xsd:string ;
- sio:SIO_000116 xsd:string ?;
+ obo:NCIT_C42781 xsd:string * ;
+ sio:SIO_000116 xsd:string *;
sio:SIO_000172 xsd:string ?;
obo:NCIT_C37984 xsd:string ?;
obo:OBI_0600047 xsd:string ?;
obo:NCIT_C37900 xsd:string ?;
efo:EFO_0001741 xsd:string ?;
obo:NCIT_C19026 xsd:string ?;
- sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
+ sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/ {0,10};
sio:SIO_001167 xsd:string ?;
}
:technologyShape {
obo:OBI_0600047 IRI {0,3} ;
efo:EFO_0002699 xsd:string ?;
- obo:FLU_0000848 xsd:double {0,2};
+ obo:FLU_0000848 xsd:integer {0,2};
sio:SIO_001167 xsd:string ?;
}
diff --git a/example/maximum_metadata_example.yaml b/example/maximum_metadata_example.yaml
new file mode 100644
index 0000000..0a6d910
--- /dev/null
+++ b/example/maximum_metadata_example.yaml
@@ -0,0 +1,44 @@
+id: placeholder
+
+host:
+ host_id: XX1
+ host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+ host_sex: http://purl.obolibrary.org/obo/PATO_0000384
+ host_age: 20
+ host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
+ host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
+ host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
+ host_vaccination: [vaccines1,vaccine2]
+ additional_host_information: Optional free text field for additional information
+
+sample:
+ sample_id: Id of the sample as defined by the submitter
+ collector_name: Name of the person that took the sample
+ collecting_institution: Institute that was responsible of sampling
+ specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835]
+ collection_date: "2020-01-01"
+ collection_location: http://www.wikidata.org/entity/Q148
+ sample_storage_conditions: frozen specimen
+ source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
+ additional_collection_information: Optional free text field for additional information
+
+virus:
+ virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+ virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
+
+technology:
+ sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173]
+ sequence_assembly_method: Protocol used for assembly
+ sequencing_coverage: [70, 100]
+ additional_technology_information: Optional free text field for additional information
+
+submitter:
+ submitter_name: [John Doe]
+ submitter_address: John Doe's address
+ originating_lab: John Doe kitchen
+ lab_address: John Doe's address
+ provider_sample_id: XXX1
+ submitter_sample_id: XXX2
+ authors: [John Doe, Joe Boe, Jonny Oe]
+ submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
+ additional_submitter_information: Optional free text field for additional information
\ No newline at end of file
diff --git a/example/metadata.yaml b/example/metadata.yaml
deleted file mode 100644
index a76616c..0000000
--- a/example/metadata.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-id: placeholder
-
-host:
- host_id: XX1
- host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
- host_sex: http://purl.obolibrary.org/obo/NCIT_C27993
- host_age: 20
- host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
- host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
- host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
- host_vaccination: List of vaccines given to the host (RRIDs?)
- additional_host_information: Field for additional host information
-
-sample:
- sample_id: Id of the sample as defined by the submitter
- collector_name: Name of the person that took the sample
- collecting_institution: Institute that was responsible of sampling
- specimen_source: http://purl.obolibrary.org/obo/NCIT_C155831
- specimen_source2: http://purl.obolibrary.org/obo/NCIT_C155835
- collection_date: "2020-01-01"
- collection_location: http://www.wikidata.org/entity/Q148
- sample_storage_conditions: XXX
- additional_collection_information: XXX
-
-virus:
- virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
- virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
-
-technology:
- sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173
- sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173
- sequence_assembly_method: Protocol used for assembly
- sequencing_coverage: 70
-
-submitter:
- submitter_name: John Doe
- submitter_address: John Doe's adress
- originating_lab: John Doe kitchen
- lab_address: John Doe's address
- provider_sample_id: HmX
- submitter_sample_id: xXx
- authors: John Doe et all
- submitter_orcid: https://orcid.org/0000-0000-0000-0000
\ No newline at end of file
diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml
deleted file mode 100644
index 0e36a25..0000000
--- a/example/minimal_example.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-id: placeholder
-
-host:
- host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
-
-sample:
- sample_id: XX
- collection_date: 2020-01
- collection_location: http://www.wikidata.org/entity/Q148
-
-virus:
- virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
-
-technology:
- sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632
-
-submitter:
- authors: John Doe
\ No newline at end of file
diff --git a/example/minimal_metadata_example.yaml b/example/minimal_metadata_example.yaml
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 5257bd1..148a7e1 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -112,7 +112,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
info_for_yaml_dict['sample']['sample_id'] = accession_version
- info_for_yaml_dict['sample']['source_database_accession'] = accession_version
+ info_for_yaml_dict['sample']['source_database_accession'] = "http://identifiers.org/insdc/"+accession_version+"#sequence" #accession is turned into resolvable URL/URI now
# submitter info
--
cgit v1.2.3