about summary refs log tree commit diff
diff options
context:
space:
mode:
authorlltommy2020-04-22 19:41:27 +0200
committerlltommy2020-04-22 19:41:27 +0200
commita12fe94f174da766be612fbb2712b4db2ba98296 (patch)
tree0c1a21fcfc638460d91309f7dfac3fdc967ca7c1
parentba8b5d364f0ba96f3fef5214137d30ed00a8079d (diff)
downloadbh20-seq-resource-a12fe94f174da766be612fbb2712b4db2ba98296.tar.gz
bh20-seq-resource-a12fe94f174da766be612fbb2712b4db2ba98296.tar.lz
bh20-seq-resource-a12fe94f174da766be612fbb2712b4db2ba98296.zip
Small changes all around, trying to make the importer/metadata better
-rw-r--r--bh20sequploader/bh20seq-schema.yml4
-rw-r--r--bh20sequploader/bh20seq-shex.rdf25
-rw-r--r--example/metadata.yaml8
-rwxr-xr-xscripts/foreach.sh2
-rwxr-xr-xscripts/from_genbank_to_fasta_and_yaml.py12
5 files changed, 30 insertions, 21 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 1ceebe2..80013c3 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -25,7 +25,7 @@ $graph:
         jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_000115
     host_sex:
-        doc: Sex of the host as defined in NCIT, IRI expected (http://purl.obolibrary.org/obo/NCIT_C20197 (Male), http://purl.obolibrary.org/obo/NCIT_C27993 (Female), http://purl.obolibrary.org/obo/NCIT_C45908 (Intersex), or http://purl.obolibrary.org/obo/NCIT_C17998 (Unknown))
+        doc: Sex of the host as defined in PATO, IRI expected (http://purl.obolibrary.org/obo/PATO_0000384 (male) or http://purl.obolibrary.org/obo/PATO_0000383 (female))
         type: string?
         jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/PATO_0000047
@@ -144,7 +144,7 @@ $graph:
   fields:
     sample_sequencing_technology:
       doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
-      type: string
+      type: string?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/OBI_0600047
         _type: "@id"
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 31e714f..8d0055e 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -23,35 +23,40 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
     obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
     obo:PATO_0000011 xsd:integer ?;
     obo:NCIT_C42574 [ obo:UO_~ ] ?;
-    sio:SIO_001167 xsd:string ?;
+	obo:NCIT_C25688 xsd:string ? ;
     efo:EFO_0000727 xsd:string ?;
+    obo:VO_0000002 xsd:string ?;
+    sio:SIO_001167 xsd:string ?;
 }
 
 :sampleShape  {
     sio:SIO_000115 xsd:string;
-    obo:GAZ_00000448 [wikidata:~] ;
 	evs:C25164 xsd:string;
+	obo:GAZ_00000448 [wikidata:~] ;
     obo:OBI_0001895 xsd:string ?;
-    sio:SIO_001167 xsd:string ?;
-    obo:OBI_0001472 xsd:string ?;
+    obo:NCIT_C41206 xsd:string ?;
     obo:OBI_0001479 IRI {0,2};
+    obo:OBI_0001472 xsd:string ?;
+    sio:SIO_001167 xsd:string ?;
 }
 
 :submitterShape {
     obo:NCIT_C42781 xsd:string ;
-	obo:NCIT_C37984 xsd:string ?;
-	obo:NCIT_C37900 xsd:string ?;
     sio:SIO_000116 xsd:string ?;
-    obo:OBI_0600047 xsd:string ?;
-    sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
     sio:SIO_000172 xsd:string ?;
+    obo:NCIT_C37984 xsd:string ?;
+    obo:OBI_0600047 xsd:string ?;
+  	obo:NCIT_C37900 xsd:string ?;
     efo:EFO_0001741 xsd:string ?;
+    obo:NCIT_C19026 xsd:string ?;
+    sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
 }
 
 :technologyShape {
-    obo:OBI_0600047 IRI {0,2} ;
-    obo:FLU_0000848 xsd:double ?;
+    obo:OBI_0600047 IRI {0,2} ?;
     efo:EFO_0002699 xsd:string ?;
+    obo:FLU_0000848 xsd:double {0,2};
+    sio:SIO_001167 xsd:string ?;
 }
 
 :virusShape{
diff --git a/example/metadata.yaml b/example/metadata.yaml
index 57d90b5..d1b10c1 100644
--- a/example/metadata.yaml
+++ b/example/metadata.yaml
@@ -6,7 +6,7 @@ host:
     host_sex: http://purl.obolibrary.org/obo/NCIT_C27993
     host_age: 20
     host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
-    host_health_status: A condition or state at a particular time (Disease ontology)
+    host_health_status: A condition or state at a particular time
     host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
     host_vaccination: List of vaccines given to the host (RRIDs?)
     additional_host_information: Field for additional host information
@@ -29,15 +29,15 @@ virus:
 technology:
     sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173
     sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173
-    sequence_assembly_method: Protocol used for assembly (CWL, WDL, NF, BCO?)
+    sequence_assembly_method: Protocol used for assembly
     sequencing_coverage: 70
 
 submitter:
-    submitter_name: John Doe (ORCID?)
+    submitter_name: John Doe
     submitter_address: John Doe's adress
     originating_lab: John Doe kitchen
     lab_address: John Doe's address
     provider_sample_id: HmX
     submitter_sample_id: xXx
     authors: John Doe et all
-    submitter_orcid: https://orcid.org/0000-0000-0000-0000 (if this is here, others can be optional?)
+    submitter_orcid: https://orcid.org/0000-0000-0000-0000
\ No newline at end of file
diff --git a/scripts/foreach.sh b/scripts/foreach.sh
index 35b07b8..ddc9387 100755
--- a/scripts/foreach.sh
+++ b/scripts/foreach.sh
@@ -2,7 +2,7 @@
 rm -rf validated fasta_and_yaml_*
 mkdir -p validated
 ./from_genbank_to_fasta_and_yaml.py
-fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta")
+fasta_files=$(find fasta_and_yaml/ -name "*.fasta")
 for f in $fasta_files ; do
     yaml=$(echo $f | rev | cut -c7- | rev).yaml
     echo $f
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 00c0012..096a6af 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -8,10 +8,11 @@ import json
 import os
 
 from datetime import date
-today = date.today().strftime("%Y%m%d")
+#today = date.today().strftime("%Y%m%d")
 
-dir_metadata_today = 'metadata_from_nuccore_{}'.format(today)
-dir_fasta_and_yaml_today = 'fasta_and_yaml_{}'.format(today)
+
+dir_metadata_today = 'metadata_from_nuccore' #_{}'.format(today)
+dir_fasta_and_yaml_today = 'fasta_and_yaml' #'.format(today)
 
 dir_dict_ontology_standardization = 'dict_ontology_standardization/'
 
@@ -177,7 +178,10 @@ if not os.path.exists(dir_fasta_and_yaml_today):
 
                         if len(GBQualifier_value_text_list) > 1:
                             if GBQualifier_value_text_list[1] in ['male', 'female']:
-                                info_for_yaml_dict['host']['host_sex'] = GBQualifier_value_text_list[1]
+                                if GBQualifier_value_text_list[1]=='male':
+                                    info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384"
+                                elif GBQualifier_value_text_list[1]=='female':
+                                    info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000383"
                             else:
                                 info_for_yaml_dict['host']['host_health_status'] = GBQualifier_value_text_list[1]