From 9ddcfeacb3191638f42b08af999889d867f0f81c Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 14:57:25 -0400
Subject: Better handling of duplicate sequences
Also save original fasta label in metadata
---
bh20sequploader/bh20seq-schema.yml | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 64008f2..982447c 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -18,6 +18,7 @@ $graph:
jsonldPredicate:
_id: http://www.ebi.ac.uk/efo/EFO_0000532
_type: "@id"
+ identity: true
host_id:
doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples
type: string
@@ -29,6 +30,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/PATO_0000047
_type: "@id"
+ identity: true
host_age:
doc: Age of the host as number (e.g. 50)
type: int?
@@ -40,6 +42,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C42574
_type: "@id"
+ identity: true
host_health_status:
doc: A condition or state at a particular time
type: string?
@@ -79,12 +82,14 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
+ identity: true
specimen_source2:
doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb)
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
+ identity: true
collection_date:
doc: Date when the sample was taken
type: string
@@ -96,6 +101,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/GAZ_00000448
_type: "@id"
+ identity: true
sample_storage_conditions:
doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh ....
type: string?
@@ -126,6 +132,7 @@ $graph:
jsonldPredicate:
_id: http://edamontology.org/data_1875
_type: "@id"
+ identity: true
virus_strain:
doc: Name of the virus strain
type: string?
@@ -141,12 +148,14 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
+ identity: true
sample_sequencing_technology2:
doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
+ identity: true
sequence_assembly_method:
doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome
type: string?
@@ -215,7 +224,7 @@ $graph:
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
_type: "@id"
- noLinkCheck: true
+ identity: true
- name: MainSchema
type: record
--
cgit v1.2.3
From a2a4b1a16cef38bb4ec9d222430fd396c70ba225 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 16:46:28 -0400
Subject: Schema changes from @LLTommy
---
bh20sequploader/bh20seq-schema.yml | 52 ++++++++++++++++++--------------------
bh20sequploader/bh20seq-shex.rdf | 25 +++++++++---------
2 files changed, 38 insertions(+), 39 deletions(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 982447c..3d8604a 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -21,7 +21,7 @@ $graph:
identity: true
host_id:
doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples
- type: string
+ type: string?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
host_sex:
@@ -66,16 +66,27 @@ $graph:
- name: sampleSchema
type: record
fields:
+ collection_date:
+ doc: Date when the sample was taken
+ type: string
+ jsonldPredicate:
+ _id: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164
+ collection_location:
+ doc: Geographical location where the sample was collected as wikidata reference, e.g. http://www.wikidata.org/entity/Q148 (China)
+ type: string
+ jsonldPredicate:
+ _id: http://purl.obolibrary.org/obo/GAZ_00000448
+ _type: "@id"
collector_name:
doc: Name of the person that took the sample
- type: string
+ type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001895
collecting_institution:
doc: Institute that was responsible of sampeling
- type: string
+ type: string?
jsonldPredicate:
- _id: http://semanticscience.org/resource/SIO_001167
+ _id: http://purl.obolibrary.org/obo/NCIT_C41206
specimen_source:
doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab)
type: string?
@@ -89,19 +100,6 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
- identity: true
- collection_date:
- doc: Date when the sample was taken
- type: string
- jsonldPredicate:
- _id: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164
- collection_location:
- doc: Geographical location where the sample was collected as wikidata reference, e.g. http://www.wikidata.org/entity/Q148 (China)
- type: string
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/GAZ_00000448
- _type: "@id"
- identity: true
sample_storage_conditions:
doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh ....
type: string?
@@ -114,7 +112,7 @@ $graph:
_id: http://semanticscience.org/resource/SIO_001167
sample_id:
doc: Id of the sample as defined by the submitter
- type: string
+ type: string?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
source_database_accession:
@@ -167,7 +165,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
sequencing_coverage2:
- doc: If a second sequence technology was use you can submit its coverage here
+ doc: If a second sequence technology was used you can submit its coverage here
type: float?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
@@ -180,9 +178,14 @@ $graph:
- name: submitterSchema
type: record
fields:
+ authors:
+ doc: Name of the author(s)
+ type: string
+ jsonldPredicate:
+ _id: http://purl.obolibrary.org/obo/NCIT_C42781
submitter_name:
doc: Name of the submitter
- type: string
+ type: string?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000116
submitter_address:
@@ -192,7 +195,7 @@ $graph:
_id: http://semanticscience.org/resource/SIO_000172
originating_lab:
doc: Name of the laboratory that took the sample
- type: string
+ type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C37984
lab_address:
@@ -208,11 +211,6 @@ $graph:
type: string?
jsonldPredicate:
_id: http://www.ebi.ac.uk/efo/EFO_0001741
- authors:
- doc: Name of the author(s)
- type: string?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/NCIT_C42781
publication:
doc: Reference to publication of this sample (e.g. DOI, pubmed ID, ...)
type: string?
@@ -232,7 +230,7 @@ $graph:
fields:
host: hostSchema
sample: sampleSchema
- virus: virusSchema?
+ virus: virusSchema
technology: technologySchema
submitter: submitterSchema
id:
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 8d3f5fc..6e646c7 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -7,6 +7,7 @@ PREFIX sio:
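PREFIX efo: <http://www.ebi.ac.uk/efo/>
PREFIX evs: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#>
PREFIX edam: <http://edamontology.org/>
+PREFIX wikidata: <http://www.wikidata.org/entity/>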
:submissionShape {
MainSchema:host @:hostShape ;
@@ -18,8 +19,8 @@ PREFIX edam:
:hostShape {
efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
- obo:PATO_0000047 [ obo:NCIT_C20197 obo:NCIT_C27993 obo:NCIT_C17998 ] ;
- sio:SIO_000115 xsd:string ;
+ sio:SIO_000115 xsd:string ?;
+ obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
obo:PATO_0000011 xsd:integer ?;
obo:NCIT_C42574 [ obo:UO_~ ] ?;
sio:SIO_001167 xsd:string ?;
@@ -27,20 +28,20 @@ PREFIX edam:
}
:sampleShape {
- obo:OBI_0001895 xsd:string ;
- sio:SIO_000115 xsd:string ;
- sio:SIO_001167 xsd:string ;
- evs:C25164 xsd:string ?;
- obo:GAZ_00000448 [obo:GAZ_~] ?;
+ evs:C25164 xsd:string?;
+ obo:GAZ_00000448 [wikidata:~] ;
+ obo:OBI_0001895 xsd:string ?;
+ sio:SIO_001167 xsd:string ?;
+ sio:SIO_000115 xsd:string ?;
obo:OBI_0001472 xsd:string ?;
- obo:OBI_0001479 xsd:string ?;
+ obo:OBI_0001479 IRI {0,2};
}
:submitterShape {
- sio:SIO_000116 xsd:string ;
- obo:NCIT_C37984 xsd:string ;
+ obo:NCIT_C42781 xsd:string ;
+ obo:NCIT_C37984 xsd:string ?;
obo:NCIT_C37900 xsd:string ?;
- obo:NCIT_C42781 xsd:string ?;
+ sio:SIO_000116 xsd:string ?;
obo:OBI_0600047 xsd:string ?;
sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
sio:SIO_000172 xsd:string ?;
@@ -48,7 +49,7 @@ PREFIX edam:
}
:technologyShape {
- obo:OBI_0600047 xsd:string ;
+ obo:OBI_0600047 IRI {0,2} ;
obo:FLU_0000848 xsd:integer ?;
efo:EFO_0002699 xsd:string ?;
}
--
cgit v1.2.3
From 5b4bad5571d76957ddb7f9121f1f5a694efaa856 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 17:00:21 -0400
Subject: Add identity:true to collection_location
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
bh20sequploader/bh20seq-schema.yml | 1 +
1 file changed, 1 insertion(+)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 3d8604a..efc60a3 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -77,6 +77,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/GAZ_00000448
_type: "@id"
+ identity: true
collector_name:
doc: Name of the person that took the sample
type: string?
--
cgit v1.2.3
From 5f44da5804547088d0f39d0687d81598598eebe5 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 17:01:30 -0400
Subject: Reconsidered these should be noLinkCheck
---
bh20sequploader/bh20seq-schema.yml | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index efc60a3..57f3b3d 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -18,7 +18,7 @@ $graph:
jsonldPredicate:
_id: http://www.ebi.ac.uk/efo/EFO_0000532
_type: "@id"
- identity: true
+ noLinkCheck: true
host_id:
doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples
type: string?
@@ -30,7 +30,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/PATO_0000047
_type: "@id"
- identity: true
+ noLinkCheck: true
host_age:
doc: Age of the host as number (e.g. 50)
type: int?
@@ -42,7 +42,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C42574
_type: "@id"
- identity: true
+ noLinkCheck: true
host_health_status:
doc: A condition or state at a particular time
type: string?
@@ -77,7 +77,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/GAZ_00000448
_type: "@id"
- identity: true
+ noLinkCheck: true
collector_name:
doc: Name of the person that took the sample
type: string?
@@ -94,7 +94,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
- identity: true
+ noLinkCheck: true
specimen_source2:
doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb)
type: string?
@@ -131,7 +131,7 @@ $graph:
jsonldPredicate:
_id: http://edamontology.org/data_1875
_type: "@id"
- identity: true
+ noLinkCheck: true
virus_strain:
doc: Name of the virus strain
type: string?
@@ -147,14 +147,14 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
- identity: true
+ noLinkCheck: true
sample_sequencing_technology2:
doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
- identity: true
+ noLinkCheck: true
sequence_assembly_method:
doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome
type: string?
@@ -223,7 +223,7 @@ $graph:
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
_type: "@id"
- identity: true
+ noLinkCheck: true
- name: MainSchema
type: record
--
cgit v1.2.3
From 85b85b676d7ecc218d9f84357b2e7ea0133eed94 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Tue, 21 Apr 2020 16:49:47 +0200
Subject: Updated shex and manditory fields and stuff
---
bh20sequploader/bh20seq-schema.yml | 10 +++++-----
bh20sequploader/bh20seq-shex.rdf | 4 ++--
example/minimal_example.yaml | 6 +-----
scripts/from_genbank_to_fasta_and_yaml.py | 19 +++++++++++++------
4 files changed, 21 insertions(+), 18 deletions(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 57f3b3d..75308ab 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -66,6 +66,11 @@ $graph:
- name: sampleSchema
type: record
fields:
+ sample_id:
+ doc: Id of the sample as defined by the submitter
+ type: string
+ jsonldPredicate:
+ _id: http://semanticscience.org/resource/SIO_000115
collection_date:
doc: Date when the sample was taken
type: string
@@ -111,11 +116,6 @@ $graph:
type: string?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_001167
- sample_id:
- doc: Id of the sample as defined by the submitter
- type: string?
- jsonldPredicate:
- _id: http://semanticscience.org/resource/SIO_000115
source_database_accession:
doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here
type: string?
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 6e646c7..59ee71b 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -28,11 +28,11 @@ PREFIX wikidata:
}
:sampleShape {
- evs:C25164 xsd:string?;
+ sio:SIO_000115 xsd:string;
obo:GAZ_00000448 [wikidata:~] ;
+ evs:C25164 xsd:string;
obo:OBI_0001895 xsd:string ?;
sio:SIO_001167 xsd:string ?;
- sio:SIO_000115 xsd:string ?;
obo:OBI_0001472 xsd:string ?;
obo:OBI_0001479 IRI {0,2};
}
diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml
index ed578e2..0e36a25 100644
--- a/example/minimal_example.yaml
+++ b/example/minimal_example.yaml
@@ -1,13 +1,10 @@
id: placeholder
host:
- host_id: XX1
host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
sample:
sample_id: XX
- collector_name: John Doe
- collecting_institution: Doe university
collection_date: 2020-01
collection_location: http://www.wikidata.org/entity/Q148
@@ -18,5 +15,4 @@ technology:
sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632
submitter:
- submitter_name: John Doe
- originating_lab: John Doe's kitchen
\ No newline at end of file
+ authors: John Doe
\ No newline at end of file
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 0c410d7..7e7c089 100644
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -1,5 +1,5 @@
from Bio import Entrez
-Entrez.email = 'insert_your_email@gmail.com'
+Entrez.email = 'another_email@gmail.com'
import xml.etree.ElementTree as ET
import yaml
@@ -31,6 +31,8 @@ for term in term_list:
tmp_list = [x.split('.')[0] for x in tmp_list]
print(term, len(tmp_list))
+ tmp_list=tmp_list
+# tmp_list = tmp_list[0:2] # restricting to small run
id_set.update([x.split('.')[0] for x in tmp_list])
@@ -78,7 +80,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
term_to_uri_dict[term] = uri
species_to_taxid_dict = {
- 'Homo sapiens': 9606
+ 'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'
}
@@ -108,8 +110,8 @@ if not os.path.exists(dir_fasta_and_yaml_today):
'submitter': {}
}
-
info_for_yaml_dict['sample']['sample_id'] = accession_version
+ info_for_yaml_dict['sample']['source_database_accession'] = accession_version
info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq.iter('GBAuthor')])
@@ -163,7 +165,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
if GBQualifier_name_text == 'host':
GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
- info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0]
+ #info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] # Removed
if GBQualifier_value_text_list[0] in species_to_taxid_dict:
info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]]
@@ -206,8 +208,13 @@ if not os.path.exists(dir_fasta_and_yaml_today):
elif GBQualifier_name_text == 'isolate':
info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
elif GBQualifier_name_text == 'db_xref':
- info_for_yaml_dict['virus']['virus_species'] = int(GBQualifier_value_text.split('taxon:')[1])
-
+ info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
+
+
+ #Remove technology key if empty!
+ if (info_for_yaml_dict['technology']=={}):
+ del info_for_yaml_dict['key']
+
with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw:
fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
--
cgit v1.2.3
From 88d81f853cf04b7f28681dd9cdee775b0422f252 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 12:53:19 -0400
Subject: Working on NCBI import
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
bh20sequploader/bh20seq-schema.yml | 4 ++--
bh20sequploader/main.py | 7 ++++---
scripts/foreach.sh | 18 ++++++++++++++++++
scripts/from_genbank_to_fasta_and_yaml.py | 26 ++++++++++++++------------
4 files changed, 38 insertions(+), 17 deletions(-)
create mode 100755 scripts/foreach.sh
mode change 100644 => 100755 scripts/from_genbank_to_fasta_and_yaml.py
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 75308ab..ebca35b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -162,12 +162,12 @@ $graph:
_id: http://www.ebi.ac.uk/efo/EFO_0002699
sequencing_coverage:
doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x)
- type: float?
+ type: ["null", float, int]
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
sequencing_coverage2:
doc: If a second sequence technology was used you can submit its coverage here
- type: float?
+ type: ["null", float, int]
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
additional_technology_information:
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 49d012d..2fda347 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -44,7 +44,8 @@ def main():
with col.open(target, "w") as f:
r = args.sequence.read(65536)
- print(r[0:20])
+ seqlabel = r[1:r.index("\n")]
+ print(seqlabel)
while r:
f.write(r)
r = args.sequence.read(65536)
@@ -67,8 +68,8 @@ def main():
"upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname())
}
- col.save_new(owner_uuid=UPLOAD_PROJECT, name="Uploaded by %s from %s" %
- (properties['upload_user'], properties['upload_ip']),
+ col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" %
+ (seqlabel, properties['upload_user'], properties['upload_ip']),
properties=properties, ensure_unique_name=True)
print("Done")
diff --git a/scripts/foreach.sh b/scripts/foreach.sh
new file mode 100755
index 0000000..35b07b8
--- /dev/null
+++ b/scripts/foreach.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+rm -rf validated fasta_and_yaml_*
+mkdir -p validated
+./from_genbank_to_fasta_and_yaml.py
+fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta")
+for f in $fasta_files ; do
+ yaml=$(echo $f | rev | cut -c7- | rev).yaml
+ echo $f
+ echo $yaml
+ if bh20-seq-uploader --validate $f $yaml ; then
+ sz=$(stat --format=%s $f)
+ if test $sz -gt 20000 ; then
+ mv $f $yaml validated
+ else
+ echo "Fasta file too small"
+ fi
+ fi
+done
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
old mode 100644
new mode 100755
index 7e7c089..1a12513
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -1,8 +1,10 @@
+#!/usr/bin/env python3
+
from Bio import Entrez
Entrez.email = 'another_email@gmail.com'
import xml.etree.ElementTree as ET
-import yaml
+import json
import os
from datetime import date
@@ -29,7 +31,7 @@ for term in term_list:
# Remove the version in the id
tmp_list = [x.split('.')[0] for x in tmp_list]
-
+
print(term, len(tmp_list))
tmp_list=tmp_list
# tmp_list = tmp_list[0:2] # restricting to small run
@@ -49,11 +51,11 @@ print(term_list + ['NCBI Virus'], len(id_set))
def chunks(lst, n):
for i in range(0, len(lst), n):
yield lst[i:i + n]
-
+
num_ids_for_request = 100
if not os.path.exists(dir_metadata_today):
os.makedirs(dir_metadata_today)
-
+
for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
path_metadata_xxx_xml = os.path.join(dir_metadata_today, 'metadata_{}.xml'.format(i))
print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml))
@@ -63,7 +65,7 @@ if not os.path.exists(dir_metadata_today):
Entrez.efetch(db='nuccore', id=id_x_list, retmode='xml').read()
)
-
+
term_to_uri_dict = {}
for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
@@ -74,7 +76,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
if len(line.split(',')) > 2:
term, uri = line.strip('\n').split('",')
term = term.strip('"')
- else:
+ else:
term, uri = line.strip('\n').split(',')
term_to_uri_dict[term] = uri
@@ -125,7 +127,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
):
if info_to_check in GBSeq_comment_text:
tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0]
-
+
if field_in_yaml == 'sequencing_coverage':
# A regular expression would be better!
info_for_yaml_dict['technology'][field_in_yaml] = ';'.join(
@@ -139,7 +141,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
seq_tec = term_to_uri_dict[seq_tec]
else:
print(accession_version, 'missing technologies:', seq_tec)
-
+
new_seq_tec_list.append(seq_tec)
for n, seq_tec in enumerate(new_seq_tec_list):
@@ -147,7 +149,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
else:
info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse
-
+
#term_to_uri_dict
for GBFeature in GBSeq.iter('GBFeature'):
@@ -211,12 +213,12 @@ if not os.path.exists(dir_fasta_and_yaml_today):
info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
- #Remove technology key if empty!
+ # Remove technology key if empty!
if (info_for_yaml_dict['technology']=={}):
- del info_for_yaml_dict['key']
+ del info_for_yaml_dict['technology']
with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw:
fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
with open(os.path.join(dir_fasta_and_yaml_today, '{}.yaml'.format(accession_version)), 'w') as fw:
- yaml.dump(info_for_yaml_dict, fw, default_flow_style=False)
+ json.dump(info_for_yaml_dict, fw, indent=2)
--
cgit v1.2.3
From 7e085b2958d9bd4f0a2b1912cf259a05b56366bc Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 13:22:53 -0400
Subject: Tweak handling of "coverage" also fix typo
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
bh20sequploader/bh20seq-schema.yml | 4 ++--
bh20sequploader/bh20seq-shex.rdf | 2 +-
scripts/dict_ontology_standardization/ncbi_speciesman_source.csv | 2 +-
scripts/from_genbank_to_fasta_and_yaml.py | 9 ++++++---
4 files changed, 10 insertions(+), 7 deletions(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index ebca35b..75308ab 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -162,12 +162,12 @@ $graph:
_id: http://www.ebi.ac.uk/efo/EFO_0002699
sequencing_coverage:
doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x)
- type: ["null", float, int]
+ type: float?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
sequencing_coverage2:
doc: If a second sequence technology was used you can submit its coverage here
- type: ["null", float, int]
+ type: float?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
additional_technology_information:
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 59ee71b..31e714f 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -50,7 +50,7 @@ PREFIX wikidata:
:technologyShape {
obo:OBI_0600047 IRI {0,2} ;
- obo:FLU_0000848 xsd:integer ?;
+ obo:FLU_0000848 xsd:double ?;
efo:EFO_0002699 xsd:string ?;
}
diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
index 2905588..909cf37 100644
--- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
+++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
@@ -1,4 +1,4 @@
-nasopharyngeal swab, http://purl.obolibrary.org/obo/NCIT_C155831
+nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831
naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 1a12513..00c0012 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -130,9 +130,12 @@ if not os.path.exists(dir_fasta_and_yaml_today):
if field_in_yaml == 'sequencing_coverage':
# A regular expression would be better!
- info_for_yaml_dict['technology'][field_in_yaml] = ';'.join(
- [x.strip('(average)').strip("reads/nt").replace(',', '.').strip(' xX>') for x in tech_info_to_parse.split(';')]
- )
+ try:
+ info_for_yaml_dict['technology'][field_in_yaml] = float(
+ tech_info_to_parse.strip('(average)').strip("reads/nt").replace(',', '.').strip(' xX>'))
+ except ValueError:
+ print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse)
+ pass
elif field_in_yaml == 'sample_sequencing_technology':
new_seq_tec_list = []
for seq_tec in tech_info_to_parse.split(';'):
--
cgit v1.2.3
From cad23032ecf6ef325aab2978d5df36609ad50088 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 18:16:47 +0000
Subject: add noLinkCheck to specimen_source2
---
bh20sequploader/bh20seq-schema.yml | 1 +
1 file changed, 1 insertion(+)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 75308ab..1ceebe2 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -106,6 +106,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
+ noLinkCheck: true
sample_storage_conditions:
doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh ....
type: string?
--
cgit v1.2.3
From f4c3da88c1233802fea46cc972a81dc3b5b51185 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 15:37:58 -0400
Subject: Work around CWL content size limit by chunking
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
bh20sequploader/main.py | 1 +
workflows/pangenome-generate/relabel-seqs.cwl | 31 +++++++++++++++++++++++----
workflows/pangenome-generate/relabel-seqs.py | 22 +++++++++++++------
3 files changed, 44 insertions(+), 10 deletions(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 2fda347..4c4711d 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -63,6 +63,7 @@ def main():
external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
properties = {
+ "sequence_label": seqlabel,
"upload_app": "bh20-seq-uploader",
"upload_ip": external_ip,
"upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname())
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
index 2b780d4..01196f6 100644
--- a/workflows/pangenome-generate/relabel-seqs.cwl
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -3,6 +3,10 @@ class: CommandLineTool
inputs:
readsFA: File[]
subjects: string[]
+ script:
+ type: File
+ default: {class: File, location: relabel-seqs.py}
+ inputBinding: {}
outputs:
relabeledSeqs:
type: File
@@ -15,11 +19,30 @@ outputs:
requirements:
InlineJavascriptRequirement: {}
InitialWorkDirRequirement:
- listing:
- - entry: {$include: relabel-seqs.py}
- entryname: relabel-seqs.py
+ listing: |
+ ${
+ var i = 0;
+ var b = 1;
+ var out = [];
+ for (; i < inputs.readsFA.length; i++) {
+ var block = [];
+ for (; i < (b*100) && i < inputs.readsFA.length; i++) {
+ block.push(inputs.readsFA[i]);
+ }
+ out.push({
+ entryname: "block"+b,
+ entry: JSON.stringify(block)
+ });
+ b++;
+ }
+ out.push({
+ entry: JSON.stringify(inputs.subjects),
+ entryname: "subjects"
+ });
+ return out;
+ }
hints:
DockerRequirement:
dockerPull: commonworkflowlanguage/cwltool_module
stdout:
-baseCommand: [python, relabel-seqs.py]
+baseCommand: [python]
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 1188ceb..970540f 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,5 +1,15 @@
-reads = $(inputs.readsFA)
-subjects = $(inputs.subjects)
+import os
+import json
+
+reads = []
+b = 1
+while os.path.exists("block%i" % b):
+ with open("block%i" % b) as f:
+ reads.extend(json.load(f))
+ b += 1
+
+with open("subjects") as f:
+ subjects = json.load(f)
relabeled_fasta = open("relabeledSeqs.fasta", "wt")
original_labels = open("originalLabels.ttl", "wt")
@@ -7,12 +17,12 @@ original_labels = open("originalLabels.ttl", "wt")
for i, r in enumerate(reads):
with open(r["path"], "rt") as fa:
label = fa.readline()
- original_labels.write("<%s> \\"%s\\" .\\n" % (subjects[i], label[1:].strip().replace('"', '\\\\"')))
- relabeled_fasta.write(">"+subjects[i]+"\\n")
+ original_labels.write("<%s> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"')))
+ relabeled_fasta.write(">"+subjects[i]+"\n")
data = fa.read(8096)
while data:
relabeled_fasta.write(data)
- endswithnewline = data.endswith("\\n")
+ endswithnewline = data.endswith("\n")
data = fa.read(8096)
if not endswithnewline:
- relabeled_fasta.write("\\n")
+ relabeled_fasta.write("\n")
--
cgit v1.2.3
From a12fe94f174da766be612fbb2712b4db2ba98296 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 19:41:27 +0200
Subject: Small changes all around, trying to make the importer/metadata better
---
bh20sequploader/bh20seq-schema.yml | 4 ++--
bh20sequploader/bh20seq-shex.rdf | 25 +++++++++++++++----------
example/metadata.yaml | 8 ++++----
scripts/foreach.sh | 2 +-
scripts/from_genbank_to_fasta_and_yaml.py | 12 ++++++++----
5 files changed, 30 insertions(+), 21 deletions(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 1ceebe2..80013c3 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -25,7 +25,7 @@ $graph:
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
host_sex:
- doc: Sex of the host as defined in NCIT, IRI expected (http://purl.obolibrary.org/obo/NCIT_C20197 (Male), http://purl.obolibrary.org/obo/NCIT_C27993 (Female), http://purl.obolibrary.org/obo/NCIT_C45908 (Intersex), or http://purl.obolibrary.org/obo/NCIT_C17998 (Unknown))
+ doc: Sex of the host as defined in PATO, expect male (http://purl.obolibrary.org/obo/PATO_0000384) or female (http://purl.obolibrary.org/obo/PATO_0000383)
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/PATO_0000047
@@ -144,7 +144,7 @@ $graph:
fields:
sample_sequencing_technology:
doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
- type: string
+ type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 31e714f..8d0055e 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -23,35 +23,40 @@ PREFIX wikidata:
obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
obo:PATO_0000011 xsd:integer ?;
obo:NCIT_C42574 [ obo:UO_~ ] ?;
- sio:SIO_001167 xsd:string ?;
+ obo:NCIT_C25688 xsd:string ? ;
efo:EFO_0000727 xsd:string ?;
+ obo:VO_0000002 xsd:string ?;
+ sio:SIO_001167 xsd:string ?;
}
:sampleShape {
sio:SIO_000115 xsd:string;
- obo:GAZ_00000448 [wikidata:~] ;
evs:C25164 xsd:string;
+ obo:GAZ_00000448 [wikidata:~] ;
obo:OBI_0001895 xsd:string ?;
- sio:SIO_001167 xsd:string ?;
- obo:OBI_0001472 xsd:string ?;
+ obo:NCIT_C41206 xsd:string ?;
obo:OBI_0001479 IRI {0,2};
+ obo:OBI_0001472 xsd:string ?;
+ sio:SIO_001167 xsd:string ?;
}
:submitterShape {
obo:NCIT_C42781 xsd:string ;
- obo:NCIT_C37984 xsd:string ?;
- obo:NCIT_C37900 xsd:string ?;
sio:SIO_000116 xsd:string ?;
- obo:OBI_0600047 xsd:string ?;
- sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
sio:SIO_000172 xsd:string ?;
+ obo:NCIT_C37984 xsd:string ?;
+ obo:OBI_0600047 xsd:string ?;
+ obo:NCIT_C37900 xsd:string ?;
efo:EFO_0001741 xsd:string ?;
+ obo:NCIT_C19026 xsd:string ?;
+ sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
}
:technologyShape {
- obo:OBI_0600047 IRI {0,2} ;
- obo:FLU_0000848 xsd:double ?;
+ obo:OBI_0600047 IRI {0,2} ?;
efo:EFO_0002699 xsd:string ?;
+ obo:FLU_0000848 xsd:double {0,2};
+ sio:SIO_001167 xsd:string ?;
}
:virusShape{
diff --git a/example/metadata.yaml b/example/metadata.yaml
index 57d90b5..d1b10c1 100644
--- a/example/metadata.yaml
+++ b/example/metadata.yaml
@@ -6,7 +6,7 @@ host:
host_sex: http://purl.obolibrary.org/obo/NCIT_C27993
host_age: 20
host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
- host_health_status: A condition or state at a particular time (Disease ontology)
+ host_health_status: A condition or state at a particular time
host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
host_vaccination: List of vaccines given to the host (RRIDs?)
additional_host_information: Field for additional host information
@@ -29,15 +29,15 @@ virus:
technology:
sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173
sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173
- sequence_assembly_method: Protocol used for assembly (CWL, WDL, NF, BCO?)
+ sequence_assembly_method: Protocol used for assembly
sequencing_coverage: 70
submitter:
- submitter_name: John Doe (ORCID?)
+ submitter_name: John Doe
submitter_address: John Doe's adress
originating_lab: John Doe kitchen
lab_address: John Doe's address
provider_sample_id: HmX
submitter_sample_id: xXx
authors: John Doe et all
- submitter_orcid: https://orcid.org/0000-0000-0000-0000 (if this is here, others can be optional?)
+ submitter_orcid: https://orcid.org/0000-0000-0000-0000
\ No newline at end of file
diff --git a/scripts/foreach.sh b/scripts/foreach.sh
index 35b07b8..ddc9387 100755
--- a/scripts/foreach.sh
+++ b/scripts/foreach.sh
@@ -2,7 +2,7 @@
rm -rf validated fasta_and_yaml_*
mkdir -p validated
./from_genbank_to_fasta_and_yaml.py
-fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta")
+fasta_files=$(find fasta_and_yaml/ -name "*.fasta")
for f in $fasta_files ; do
yaml=$(echo $f | rev | cut -c7- | rev).yaml
echo $f
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 00c0012..096a6af 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -8,10 +8,11 @@ import json
import os
from datetime import date
-today = date.today().strftime("%Y%m%d")
+#today = date.today().strftime("%Y%m%d")
-dir_metadata_today = 'metadata_from_nuccore_{}'.format(today)
-dir_fasta_and_yaml_today = 'fasta_and_yaml_{}'.format(today)
+
+dir_metadata_today = 'metadata_from_nuccore' #_{}'.format(today)
+dir_fasta_and_yaml_today = 'fasta_and_yaml' #'.format(today)
dir_dict_ontology_standardization = 'dict_ontology_standardization/'
@@ -177,7 +178,10 @@ if not os.path.exists(dir_fasta_and_yaml_today):
if len(GBQualifier_value_text_list) > 1:
if GBQualifier_value_text_list[1] in ['male', 'female']:
- info_for_yaml_dict['host']['host_sex'] = GBQualifier_value_text_list[1]
+ if GBQualifier_value_text_list[1]=='male':
+ info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384"
+ elif GBQualifier_value_text_list[1]=='female':
+ info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000383"
else:
info_for_yaml_dict['host']['host_health_status'] = GBQualifier_value_text_list[1]
--
cgit v1.2.3
From 2d3f8b9707bd13433ca82449ad82dbc406a28f95 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 20:43:09 +0200
Subject: Including restrictions to the host status
---
bh20sequploader/bh20seq-options.yml | 9 +++++++++
bh20sequploader/bh20seq-schema.yml | 2 +-
bh20sequploader/bh20seq-shex.rdf | 2 +-
3 files changed, 11 insertions(+), 2 deletions(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-options.yml b/bh20sequploader/bh20seq-options.yml
index da47e1a..68f6e79 100644
--- a/bh20sequploader/bh20seq-options.yml
+++ b/bh20sequploader/bh20seq-options.yml
@@ -14,6 +14,15 @@ host_sex:
Male: http://purl.obolibrary.org/obo/PATO_0000384
Female: http://purl.obolibrary.org/obo/PATO_0000383
+host_health_status:
+ healthy: http://purl.obolibrary.org/obo/NCIT_C115935
+ asymptomatic: http://purl.obolibrary.org/obo/NCIT_C3833
+ symptomatic: http://purl.obolibrary.org/obo/NCIT_C25269
+ admitted to hospital: http://purl.obolibrary.org/obo/GENEPIO_0002020
+ discharged from hospital: http://purl.obolibrary.org/obo/GENEPIO_0001849
+ dead: http://purl.obolibrary.org/obo/NCIT_C28554
+ alive: http://purl.obolibrary.org/obo/NCIT_C37987
+
sample_sequencing_technology:
Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 80013c3..232ccc6 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -44,7 +44,7 @@ $graph:
_type: "@id"
noLinkCheck: true
host_health_status:
- doc: A condition or state at a particular time
+ doc: A condition or state at a particular time, must be one of the following (obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987)
type: string?
jsonldPredicate: http://purl.obolibrary.org/obo/NCIT_C25688
host_treatment:
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 8d0055e..bb15f91 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -23,7 +23,7 @@ PREFIX wikidata:
obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
obo:PATO_0000011 xsd:integer ?;
obo:NCIT_C42574 [ obo:UO_~ ] ?;
- obo:NCIT_C25688 xsd:string ? ;
+ obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ;
efo:EFO_0000727 xsd:string ?;
obo:VO_0000002 xsd:string ?;
sio:SIO_001167 xsd:string ?;
--
cgit v1.2.3
From a448aba5afb633dec197c93ed5fcc6fa61c7c491 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 21:06:47 +0200
Subject: Forgot to add _id
---
bh20sequploader/bh20seq-schema.yml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 232ccc6..9a89979 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -46,7 +46,8 @@ $graph:
host_health_status:
doc: A condition or state at a particular time, must be one of the following (obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987)
type: string?
- jsonldPredicate: http://purl.obolibrary.org/obo/NCIT_C25688
+ jsonldPredicate:
+ _id: http://purl.obolibrary.org/obo/NCIT_C25688
host_treatment:
doc: Process in which the act is intended to modify or alter host status
type: string?
--
cgit v1.2.3
From 7ef2c5c45d3d1b6e71a08fd0bdf19c42ef9e1014 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 21:23:32 +0200
Subject: Fixing ShEx expression, one ? too much
---
bh20sequploader/bh20seq-shex.rdf | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'bh20sequploader')
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index bb15f91..246fd57 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -53,7 +53,7 @@ PREFIX wikidata:
}
:technologyShape {
- obo:OBI_0600047 IRI {0,2} ?;
+ obo:OBI_0600047 IRI {0,2} ;
efo:EFO_0002699 xsd:string ?;
obo:FLU_0000848 xsd:double {0,2};
sio:SIO_001167 xsd:string ?;
--
cgit v1.2.3