aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlltommy2020-04-21 16:49:47 +0200
committerlltommy2020-04-21 16:49:47 +0200
commit85b85b676d7ecc218d9f84357b2e7ea0133eed94 (patch)
tree0c2b445aa1fd653b57be6ddf341183f73b28350f
parentecae9863069585d88fc88b7f2a7434479f7425c1 (diff)
downloadbh20-seq-resource-85b85b676d7ecc218d9f84357b2e7ea0133eed94.tar.gz
bh20-seq-resource-85b85b676d7ecc218d9f84357b2e7ea0133eed94.tar.lz
bh20-seq-resource-85b85b676d7ecc218d9f84357b2e7ea0133eed94.zip
Updated shex and mandatory fields and stuff
-rw-r--r--bh20sequploader/bh20seq-schema.yml10
-rw-r--r--bh20sequploader/bh20seq-shex.rdf4
-rw-r--r--example/minimal_example.yaml6
-rw-r--r--scripts/from_genbank_to_fasta_and_yaml.py19
4 files changed, 21 insertions, 18 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 57f3b3d..75308ab 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -66,6 +66,11 @@ $graph:
- name: sampleSchema
type: record
fields:
+ sample_id:
+ doc: Id of the sample as defined by the submitter
+ type: string
+ jsonldPredicate:
+ _id: http://semanticscience.org/resource/SIO_000115
collection_date:
doc: Date when the sample was taken
type: string
@@ -111,11 +116,6 @@ $graph:
type: string?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_001167
- sample_id:
- doc: Id of the sample as defined by the submitter
- type: string?
- jsonldPredicate:
- _id: http://semanticscience.org/resource/SIO_000115
source_database_accession:
doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here
type: string?
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 6e646c7..59ee71b 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -28,11 +28,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
}
:sampleShape {
- evs:C25164 xsd:string?;
+ sio:SIO_000115 xsd:string;
obo:GAZ_00000448 [wikidata:~] ;
+ evs:C25164 xsd:string;
obo:OBI_0001895 xsd:string ?;
sio:SIO_001167 xsd:string ?;
- sio:SIO_000115 xsd:string ?;
obo:OBI_0001472 xsd:string ?;
obo:OBI_0001479 IRI {0,2};
}
diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml
index ed578e2..0e36a25 100644
--- a/example/minimal_example.yaml
+++ b/example/minimal_example.yaml
@@ -1,13 +1,10 @@
id: placeholder
host:
- host_id: XX1
host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
sample:
sample_id: XX
- collector_name: John Doe
- collecting_institution: Doe university
collection_date: 2020-01
collection_location: http://www.wikidata.org/entity/Q148
@@ -18,5 +15,4 @@ technology:
sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632
submitter:
- submitter_name: John Doe
- originating_lab: John Doe's kitchen \ No newline at end of file
+ authors: John Doe \ No newline at end of file
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 0c410d7..7e7c089 100644
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -1,5 +1,5 @@
from Bio import Entrez
-Entrez.email = 'insert_your_email@gmail.com'
+Entrez.email = 'another_email@gmail.com'
import xml.etree.ElementTree as ET
import yaml
@@ -31,6 +31,8 @@ for term in term_list:
tmp_list = [x.split('.')[0] for x in tmp_list]
print(term, len(tmp_list))
+ tmp_list=tmp_list
+# tmp_list = tmp_list[0:2] # restricting to small run
id_set.update([x.split('.')[0] for x in tmp_list])
@@ -78,7 +80,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
term_to_uri_dict[term] = uri
species_to_taxid_dict = {
- 'Homo sapiens': 9606
+ 'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'
}
@@ -108,8 +110,8 @@ if not os.path.exists(dir_fasta_and_yaml_today):
'submitter': {}
}
-
info_for_yaml_dict['sample']['sample_id'] = accession_version
+ info_for_yaml_dict['sample']['source_database_accession'] = accession_version
info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq.iter('GBAuthor')])
@@ -163,7 +165,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
if GBQualifier_name_text == 'host':
GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
- info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0]
+ #info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] # Removed
if GBQualifier_value_text_list[0] in species_to_taxid_dict:
info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]]
@@ -206,8 +208,13 @@ if not os.path.exists(dir_fasta_and_yaml_today):
elif GBQualifier_name_text == 'isolate':
info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
elif GBQualifier_name_text == 'db_xref':
- info_for_yaml_dict['virus']['virus_species'] = int(GBQualifier_value_text.split('taxon:')[1])
-
+ info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
+
+
+ #Remove technology key if empty!
+ if (info_for_yaml_dict['technology']=={}):
+ del info_for_yaml_dict['key']
+
with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw:
fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))