about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--bh20sequploader/bh20seq-schema.yml10
-rw-r--r--bh20sequploader/bh20seq-shex.rdf4
-rw-r--r--example/minimal_example.yaml6
-rw-r--r--scripts/from_genbank_to_fasta_and_yaml.py19
4 files changed, 21 insertions, 18 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 57f3b3d..75308ab 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -66,6 +66,11 @@ $graph:
 - name: sampleSchema
   type: record
   fields:
+    sample_id:
+      doc: Id of the sample as defined by the submitter
+      type: string
+      jsonldPredicate:
+        _id: http://semanticscience.org/resource/SIO_000115
     collection_date:
       doc: Date when the sample was taken
       type: string
@@ -111,11 +116,6 @@ $graph:
       type: string?
       jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_001167
-    sample_id:
-      doc: Id of the sample as defined by the submitter
-      type: string?
-      jsonldPredicate:
-          _id: http://semanticscience.org/resource/SIO_000115
     source_database_accession:
       doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here
       type: string?
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 6e646c7..59ee71b 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -28,11 +28,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 }
 
 :sampleShape  {
-	evs:C25164 xsd:string?;
+    sio:SIO_000115 xsd:string;
     obo:GAZ_00000448 [wikidata:~] ;
+	evs:C25164 xsd:string;
     obo:OBI_0001895 xsd:string ?;
     sio:SIO_001167 xsd:string ?;
-    sio:SIO_000115 xsd:string ?;
     obo:OBI_0001472 xsd:string ?;
     obo:OBI_0001479 IRI {0,2};
 }
diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml
index ed578e2..0e36a25 100644
--- a/example/minimal_example.yaml
+++ b/example/minimal_example.yaml
@@ -1,13 +1,10 @@
 id: placeholder
 
 host:
-    host_id: XX1
     host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
 
 sample:
     sample_id: XX
-    collector_name: John Doe
-    collecting_institution: Doe university
     collection_date: 2020-01
     collection_location: http://www.wikidata.org/entity/Q148
 
@@ -18,5 +15,4 @@ technology:
     sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632
 
 submitter:
-    submitter_name: John Doe
-    originating_lab: John Doe's kitchen
\ No newline at end of file
+    authors: John Doe
\ No newline at end of file
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 0c410d7..7e7c089 100644
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -1,5 +1,5 @@
 from Bio import Entrez
-Entrez.email = 'insert_your_email@gmail.com'
+Entrez.email = 'another_email@gmail.com'
 
 import xml.etree.ElementTree as ET
 import yaml
@@ -31,6 +31,8 @@ for term in term_list:
     tmp_list = [x.split('.')[0] for x in tmp_list]
     
     print(term, len(tmp_list))
+    tmp_list=tmp_list
+#    tmp_list = tmp_list[0:2] # restricting to small run
 
     id_set.update([x.split('.')[0] for x in tmp_list])
 
@@ -78,7 +80,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
             term_to_uri_dict[term] = uri
 
 species_to_taxid_dict = {
-    'Homo sapiens': 9606
+    'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'
 }
 
 
@@ -108,8 +110,8 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                 'submitter': {}
             }
 
-
             info_for_yaml_dict['sample']['sample_id'] = accession_version
+            info_for_yaml_dict['sample']['source_database_accession'] = accession_version
             info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq.iter('GBAuthor')])
 
 
@@ -163,7 +165,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                     if GBQualifier_name_text == 'host':
                         GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
 
-                        info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0]
+                        #info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] # Removed
 
                         if GBQualifier_value_text_list[0] in species_to_taxid_dict:
                             info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]]
@@ -206,8 +208,13 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                     elif GBQualifier_name_text == 'isolate':
                         info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
                     elif GBQualifier_name_text == 'db_xref':
-                        info_for_yaml_dict['virus']['virus_species'] = int(GBQualifier_value_text.split('taxon:')[1])
-                        
+                        info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
+
+
+            # Drop the 'technology' section when it is still an empty dict.
+            if (info_for_yaml_dict['technology']=={}):
+                del info_for_yaml_dict['technology']
+
             with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw:
                 fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))