From 9ddcfeacb3191638f42b08af999889d867f0f81c Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 14:57:25 -0400
Subject: Better handling of duplicate sequences

Also save original fasta label in metadata
---
 bh20sequploader/bh20seq-schema.yml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 64008f2..982447c 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -18,6 +18,7 @@ $graph:
         jsonldPredicate:
           _id: http://www.ebi.ac.uk/efo/EFO_0000532
           _type: "@id"
+          identity: true
     host_id:
         doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples
         type: string
@@ -29,6 +30,7 @@ $graph:
         jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/PATO_0000047
           _type: "@id"
+          identity: true
     host_age:
         doc: Age of the host as number (e.g. 50)
         type: int?
@@ -40,6 +42,7 @@ $graph:
         jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/NCIT_C42574
           _type: "@id"
+          identity: true
     host_health_status:
         doc: A condition or state at a particular time
         type: string?
@@ -79,12 +82,14 @@ $graph:
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/OBI_0001479
           _type: "@id"
+          identity: true
     specimen_source2:
       doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb)
       type: string?
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/OBI_0001479
           _type: "@id"
+          identity: true
     collection_date:
       doc: Date when the sample was taken
       type: string
@@ -96,6 +101,7 @@ $graph:
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/GAZ_00000448
         _type: "@id"
+        identity: true
     sample_storage_conditions:
       doc: Information about storage of a specified type, e.g.  frozen specimen, paraffin, fresh ....
       type: string?
@@ -126,6 +132,7 @@ $graph:
       jsonldPredicate:
           _id: http://edamontology.org/data_1875
           _type: "@id"
+          identity: true
     virus_strain:
       doc: Name of the virus strain
       type: string?
@@ -141,12 +148,14 @@ $graph:
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/OBI_0600047
         _type: "@id"
+        identity: true
     sample_sequencing_technology2:
       doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
       type: string?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/OBI_0600047
         _type: "@id"
+        identity: true
     sequence_assembly_method:
       doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome
       type: string?
@@ -215,7 +224,7 @@ $graph:
       jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_000115
           _type: "@id"
-          noLinkCheck: true
+          identity: true
 
 - name: MainSchema
   type: record
-- 
cgit 1.4.1


From a2a4b1a16cef38bb4ec9d222430fd396c70ba225 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 16:46:28 -0400
Subject: Schema changes from @LLTommy

---
 bh20sequploader/bh20seq-schema.yml | 52 ++++++++++++++++++--------------------
 bh20sequploader/bh20seq-shex.rdf   | 25 +++++++++---------
 2 files changed, 38 insertions(+), 39 deletions(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 982447c..3d8604a 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -21,7 +21,7 @@ $graph:
           identity: true
     host_id:
         doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples
-        type: string
+        type: string?
         jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_000115
     host_sex:
@@ -66,16 +66,27 @@ $graph:
 - name: sampleSchema
   type: record
   fields:
+    collection_date:
+      doc: Date when the sample was taken
+      type: string
+      jsonldPredicate:
+        _id: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164
+    collection_location:
+      doc: Geographical location where the sample was collected as wikidata reference, e.g. http://www.wikidata.org/entity/Q148 (China)
+      type: string
+      jsonldPredicate:
+        _id: http://purl.obolibrary.org/obo/GAZ_00000448
+        _type: "@id"
     collector_name:
       doc: Name of the person that took the sample
-      type: string
+      type: string?
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/OBI_0001895
     collecting_institution:
       doc: Institute that was responsible of sampeling
-      type: string
+      type: string?
       jsonldPredicate:
-          _id: http://semanticscience.org/resource/SIO_001167
+          _id: http://purl.obolibrary.org/obo/NCIT_C41206
     specimen_source:
       doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab)
       type: string?
@@ -89,19 +100,6 @@ $graph:
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/OBI_0001479
           _type: "@id"
-          identity: true
-    collection_date:
-      doc: Date when the sample was taken
-      type: string
-      jsonldPredicate:
-          _id: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164
-    collection_location:
-      doc: Geographical location where the sample was collected as wikidata reference, e.g. http://www.wikidata.org/entity/Q148 (China)
-      type: string
-      jsonldPredicate:
-        _id: http://purl.obolibrary.org/obo/GAZ_00000448
-        _type: "@id"
-        identity: true
     sample_storage_conditions:
       doc: Information about storage of a specified type, e.g.  frozen specimen, paraffin, fresh ....
       type: string?
@@ -114,7 +112,7 @@ $graph:
           _id: http://semanticscience.org/resource/SIO_001167
     sample_id:
       doc: Id of the sample as defined by the submitter
-      type: string
+      type: string?
       jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_000115
     source_database_accession:
@@ -167,7 +165,7 @@ $graph:
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/FLU_0000848
     sequencing_coverage2:
-      doc: If a second sequence technology was use you can submit its coverage here
+      doc: If a second sequence technology was used you can submit its coverage here
       type: float?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/FLU_0000848
@@ -180,9 +178,14 @@ $graph:
 - name: submitterSchema
   type: record
   fields:
+    authors:
+      doc: Name of the author(s)
+      type: string
+      jsonldPredicate:
+          _id: http://purl.obolibrary.org/obo/NCIT_C42781
     submitter_name:
       doc: Name of the submitter
-      type: string
+      type: string?
       jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_000116
     submitter_address:
@@ -192,7 +195,7 @@ $graph:
           _id: http://semanticscience.org/resource/SIO_000172
     originating_lab:
       doc: Name of the laboratory that took the sample
-      type: string
+      type: string?
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/NCIT_C37984
     lab_address:
@@ -208,11 +211,6 @@ $graph:
       type: string?
       jsonldPredicate:
           _id: http://www.ebi.ac.uk/efo/EFO_0001741
-    authors:
-      doc: Name of the author(s)
-      type: string?
-      jsonldPredicate:
-          _id: http://purl.obolibrary.org/obo/NCIT_C42781
     publication:
       doc: Reference to publication of this sample (e.g. DOI, pubmed ID, ...)
       type: string?
@@ -232,7 +230,7 @@ $graph:
   fields:
     host: hostSchema
     sample: sampleSchema
-    virus: virusSchema?
+    virus: virusSchema
     technology: technologySchema
     submitter: submitterSchema
     id:
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 8d3f5fc..6e646c7 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -7,6 +7,7 @@ PREFIX sio: <http://semanticscience.org/resource/>
 PREFIX efo: <http://www.ebi.ac.uk/efo/>
 PREFIX evs: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#>
 PREFIX edam: <http://edamontology.org/>
+PREFIX wikidata: <http://www.wikidata.org/entity/>
 
 :submissionShape {
   MainSchema:host   @:hostShape ;
@@ -18,8 +19,8 @@ PREFIX edam: <http://edamontology.org/>
 
 :hostShape  {
   	efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
-    obo:PATO_0000047 [ obo:NCIT_C20197  obo:NCIT_C27993  obo:NCIT_C17998 ] ;
-    sio:SIO_000115 xsd:string ;
+    sio:SIO_000115 xsd:string ?;
+    obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
     obo:PATO_0000011 xsd:integer ?;
     obo:NCIT_C42574 [ obo:UO_~ ] ?;
     sio:SIO_001167 xsd:string ?;
@@ -27,20 +28,20 @@ PREFIX edam: <http://edamontology.org/>
 }
 
 :sampleShape  {
-    obo:OBI_0001895 xsd:string ;
-    sio:SIO_000115 xsd:string ;
-    sio:SIO_001167 xsd:string ;
-	evs:C25164 xsd:string ?;
-    obo:GAZ_00000448 [obo:GAZ_~] ?;
+	evs:C25164 xsd:string?;
+    obo:GAZ_00000448 [wikidata:~] ;
+    obo:OBI_0001895 xsd:string ?;
+    sio:SIO_001167 xsd:string ?;
+    sio:SIO_000115 xsd:string ?;
     obo:OBI_0001472 xsd:string ?;
-    obo:OBI_0001479 xsd:string ?;
+    obo:OBI_0001479 IRI {0,2};
 }
 
 :submitterShape {
-    sio:SIO_000116 xsd:string ;
-	obo:NCIT_C37984 xsd:string ;
+    obo:NCIT_C42781 xsd:string ;
+	obo:NCIT_C37984 xsd:string ?;
 	obo:NCIT_C37900 xsd:string ?;
-    obo:NCIT_C42781 xsd:string ?;
+    sio:SIO_000116 xsd:string ?;
     obo:OBI_0600047 xsd:string ?;
     sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
     sio:SIO_000172 xsd:string ?;
@@ -48,7 +49,7 @@ PREFIX edam: <http://edamontology.org/>
 }
 
 :technologyShape {
-    obo:OBI_0600047 xsd:string ;
+    obo:OBI_0600047 IRI {0,2} ;
     obo:FLU_0000848 xsd:integer ?;
     efo:EFO_0002699 xsd:string ?;
 }
-- 
cgit 1.4.1


From 5b4bad5571d76957ddb7f9121f1f5a694efaa856 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 17:00:21 -0400
Subject: Add identity:true to collection_location

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
---
 bh20sequploader/bh20seq-schema.yml | 1 +
 1 file changed, 1 insertion(+)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 3d8604a..efc60a3 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -77,6 +77,7 @@ $graph:
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/GAZ_00000448
         _type: "@id"
+        identity: true
     collector_name:
       doc: Name of the person that took the sample
       type: string?
-- 
cgit 1.4.1


From 5f44da5804547088d0f39d0687d81598598eebe5 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 17:01:30 -0400
Subject: Reconsidered: these should be noLinkCheck

---
 bh20sequploader/bh20seq-schema.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index efc60a3..57f3b3d 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -18,7 +18,7 @@ $graph:
         jsonldPredicate:
           _id: http://www.ebi.ac.uk/efo/EFO_0000532
           _type: "@id"
-          identity: true
+          noLinkCheck: true
     host_id:
         doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples
         type: string?
@@ -30,7 +30,7 @@ $graph:
         jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/PATO_0000047
           _type: "@id"
-          identity: true
+          noLinkCheck: true
     host_age:
         doc: Age of the host as number (e.g. 50)
         type: int?
@@ -42,7 +42,7 @@ $graph:
         jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/NCIT_C42574
           _type: "@id"
-          identity: true
+          noLinkCheck: true
     host_health_status:
         doc: A condition or state at a particular time
         type: string?
@@ -77,7 +77,7 @@ $graph:
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/GAZ_00000448
         _type: "@id"
-        identity: true
+        noLinkCheck: true
     collector_name:
       doc: Name of the person that took the sample
       type: string?
@@ -94,7 +94,7 @@ $graph:
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/OBI_0001479
           _type: "@id"
-          identity: true
+          noLinkCheck: true
     specimen_source2:
       doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb)
       type: string?
@@ -131,7 +131,7 @@ $graph:
       jsonldPredicate:
           _id: http://edamontology.org/data_1875
           _type: "@id"
-          identity: true
+          noLinkCheck: true
     virus_strain:
       doc: Name of the virus strain
       type: string?
@@ -147,14 +147,14 @@ $graph:
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/OBI_0600047
         _type: "@id"
-        identity: true
+        noLinkCheck: true
     sample_sequencing_technology2:
       doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
       type: string?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/OBI_0600047
         _type: "@id"
-        identity: true
+        noLinkCheck: true
     sequence_assembly_method:
       doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome
       type: string?
@@ -223,7 +223,7 @@ $graph:
       jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_000115
           _type: "@id"
-          identity: true
+          noLinkCheck: true
 
 - name: MainSchema
   type: record
-- 
cgit 1.4.1


From 85b85b676d7ecc218d9f84357b2e7ea0133eed94 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Tue, 21 Apr 2020 16:49:47 +0200
Subject: Updated shex and mandatory fields and stuff

---
 bh20sequploader/bh20seq-schema.yml        | 10 +++++-----
 bh20sequploader/bh20seq-shex.rdf          |  4 ++--
 example/minimal_example.yaml              |  6 +-----
 scripts/from_genbank_to_fasta_and_yaml.py | 19 +++++++++++++------
 4 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 57f3b3d..75308ab 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -66,6 +66,11 @@ $graph:
 - name: sampleSchema
   type: record
   fields:
+    sample_id:
+      doc: Id of the sample as defined by the submitter
+      type: string
+      jsonldPredicate:
+        _id: http://semanticscience.org/resource/SIO_000115
     collection_date:
       doc: Date when the sample was taken
       type: string
@@ -111,11 +116,6 @@ $graph:
       type: string?
       jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_001167
-    sample_id:
-      doc: Id of the sample as defined by the submitter
-      type: string?
-      jsonldPredicate:
-          _id: http://semanticscience.org/resource/SIO_000115
     source_database_accession:
       doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here
       type: string?
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 6e646c7..59ee71b 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -28,11 +28,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 }
 
 :sampleShape  {
-	evs:C25164 xsd:string?;
+    sio:SIO_000115 xsd:string;
     obo:GAZ_00000448 [wikidata:~] ;
+	evs:C25164 xsd:string;
     obo:OBI_0001895 xsd:string ?;
     sio:SIO_001167 xsd:string ?;
-    sio:SIO_000115 xsd:string ?;
     obo:OBI_0001472 xsd:string ?;
     obo:OBI_0001479 IRI {0,2};
 }
diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml
index ed578e2..0e36a25 100644
--- a/example/minimal_example.yaml
+++ b/example/minimal_example.yaml
@@ -1,13 +1,10 @@
 id: placeholder
 
 host:
-    host_id: XX1
     host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
 
 sample:
     sample_id: XX
-    collector_name: John Doe
-    collecting_institution: Doe university
     collection_date: 2020-01
     collection_location: http://www.wikidata.org/entity/Q148
 
@@ -18,5 +15,4 @@ technology:
     sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632
 
 submitter:
-    submitter_name: John Doe
-    originating_lab: John Doe's kitchen
\ No newline at end of file
+    authors: John Doe
\ No newline at end of file
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 0c410d7..7e7c089 100644
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -1,5 +1,5 @@
 from Bio import Entrez
-Entrez.email = 'insert_your_email@gmail.com'
+Entrez.email = 'another_email@gmail.com'
 
 import xml.etree.ElementTree as ET
 import yaml
@@ -31,6 +31,8 @@ for term in term_list:
     tmp_list = [x.split('.')[0] for x in tmp_list]
     
     print(term, len(tmp_list))
+    tmp_list=tmp_list
+#    tmp_list = tmp_list[0:2] # restricting to small run
 
     id_set.update([x.split('.')[0] for x in tmp_list])
 
@@ -78,7 +80,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
             term_to_uri_dict[term] = uri
 
 species_to_taxid_dict = {
-    'Homo sapiens': 9606
+    'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'
 }
 
 
@@ -108,8 +110,8 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                 'submitter': {}
             }
 
-
             info_for_yaml_dict['sample']['sample_id'] = accession_version
+            info_for_yaml_dict['sample']['source_database_accession'] = accession_version
             info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq.iter('GBAuthor')])
 
 
@@ -163,7 +165,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                     if GBQualifier_name_text == 'host':
                         GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
 
-                        info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0]
+                        #info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] # Removed
 
                         if GBQualifier_value_text_list[0] in species_to_taxid_dict:
                             info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]]
@@ -206,8 +208,13 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                     elif GBQualifier_name_text == 'isolate':
                         info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
                     elif GBQualifier_name_text == 'db_xref':
-                        info_for_yaml_dict['virus']['virus_species'] = int(GBQualifier_value_text.split('taxon:')[1])
-                        
+                        info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
+
+
+            #Remove technology key if empty!
+            if (info_for_yaml_dict['technology']=={}):
+                del info_for_yaml_dict['key']
+
             with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw:
                 fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
 
-- 
cgit 1.4.1


From 88d81f853cf04b7f28681dd9cdee775b0422f252 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 12:53:19 -0400
Subject: Working on NCBI import

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
---
 bh20sequploader/bh20seq-schema.yml        |  4 ++--
 bh20sequploader/main.py                   |  7 ++++---
 scripts/foreach.sh                        | 18 ++++++++++++++++++
 scripts/from_genbank_to_fasta_and_yaml.py | 26 ++++++++++++++------------
 4 files changed, 38 insertions(+), 17 deletions(-)
 create mode 100755 scripts/foreach.sh
 mode change 100644 => 100755 scripts/from_genbank_to_fasta_and_yaml.py

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 75308ab..ebca35b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -162,12 +162,12 @@ $graph:
         _id: http://www.ebi.ac.uk/efo/EFO_0002699
     sequencing_coverage:
       doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x)
-      type: float?
+      type: ["null", float, int]
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/FLU_0000848
     sequencing_coverage2:
       doc: If a second sequence technology was used you can submit its coverage here
-      type: float?
+      type: ["null", float, int]
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/FLU_0000848
     additional_technology_information:
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 49d012d..2fda347 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -44,7 +44,8 @@ def main():
 
     with col.open(target, "w") as f:
         r = args.sequence.read(65536)
-        print(r[0:20])
+        seqlabel = r[1:r.index("\n")]
+        print(seqlabel)
         while r:
             f.write(r)
             r = args.sequence.read(65536)
@@ -67,8 +68,8 @@ def main():
         "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname())
     }
 
-    col.save_new(owner_uuid=UPLOAD_PROJECT, name="Uploaded by %s from %s" %
-                 (properties['upload_user'], properties['upload_ip']),
+    col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" %
+                 (seqlabel, properties['upload_user'], properties['upload_ip']),
                  properties=properties, ensure_unique_name=True)
 
     print("Done")
diff --git a/scripts/foreach.sh b/scripts/foreach.sh
new file mode 100755
index 0000000..35b07b8
--- /dev/null
+++ b/scripts/foreach.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+rm -rf validated fasta_and_yaml_*
+mkdir -p validated
+./from_genbank_to_fasta_and_yaml.py
+fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta")
+for f in $fasta_files ; do
+    yaml=$(echo $f | rev | cut -c7- | rev).yaml
+    echo $f
+    echo $yaml
+    if bh20-seq-uploader --validate $f $yaml ; then
+	sz=$(stat --format=%s $f)
+	if test $sz -gt 20000 ; then
+	    mv $f $yaml validated
+	else
+	    echo "Fasta file too small"
+	fi
+    fi
+done
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
old mode 100644
new mode 100755
index 7e7c089..1a12513
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -1,8 +1,10 @@
+#!/usr/bin/env python3
+
 from Bio import Entrez
 Entrez.email = 'another_email@gmail.com'
 
 import xml.etree.ElementTree as ET
-import yaml
+import json
 import os
 
 from datetime import date
@@ -29,7 +31,7 @@ for term in term_list:
 
     # Remove the version in the id
     tmp_list = [x.split('.')[0] for x in tmp_list]
-    
+
     print(term, len(tmp_list))
     tmp_list=tmp_list
 #    tmp_list = tmp_list[0:2] # restricting to small run
@@ -49,11 +51,11 @@ print(term_list + ['NCBI Virus'], len(id_set))
 def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
-        
+
 num_ids_for_request = 100
 if not os.path.exists(dir_metadata_today):
     os.makedirs(dir_metadata_today)
-    
+
     for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
         path_metadata_xxx_xml = os.path.join(dir_metadata_today, 'metadata_{}.xml'.format(i))
         print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml))
@@ -63,7 +65,7 @@ if not os.path.exists(dir_metadata_today):
                 Entrez.efetch(db='nuccore', id=id_x_list, retmode='xml').read()
             )
 
-            
+
 term_to_uri_dict = {}
 
 for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
@@ -74,7 +76,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
             if len(line.split(',')) > 2:
                 term, uri = line.strip('\n').split('",')
                 term = term.strip('"')
-            else:    
+            else:
                 term, uri = line.strip('\n').split(',')
 
             term_to_uri_dict[term] = uri
@@ -125,7 +127,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                 ):
                     if info_to_check in GBSeq_comment_text:
                         tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0]
-                        
+
                         if field_in_yaml == 'sequencing_coverage':
                             # A regular expression would be better!
                             info_for_yaml_dict['technology'][field_in_yaml] = ';'.join(
@@ -139,7 +141,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                                     seq_tec = term_to_uri_dict[seq_tec]
                                 else:
                                     print(accession_version, 'missing technologies:', seq_tec)
- 
+
                                 new_seq_tec_list.append(seq_tec)
 
                             for n, seq_tec in enumerate(new_seq_tec_list):
@@ -147,7 +149,7 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                         else:
                             info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse
 
-                        
+
                         #term_to_uri_dict
 
             for GBFeature in GBSeq.iter('GBFeature'):
@@ -211,12 +213,12 @@ if not os.path.exists(dir_fasta_and_yaml_today):
                         info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
 
 
-            #Remove technology key if empty!
+            # Remove technology key if empty!
             if (info_for_yaml_dict['technology']=={}):
-                del info_for_yaml_dict['key']
+                del info_for_yaml_dict['technology']
 
             with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw:
                 fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
 
             with open(os.path.join(dir_fasta_and_yaml_today, '{}.yaml'.format(accession_version)), 'w') as fw:
-                yaml.dump(info_for_yaml_dict, fw, default_flow_style=False)
+                json.dump(info_for_yaml_dict, fw, indent=2)
-- 
cgit 1.4.1


From 7e085b2958d9bd4f0a2b1912cf259a05b56366bc Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 13:22:53 -0400
Subject: Tweak handling of "coverage" also fix typo

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
---
 bh20sequploader/bh20seq-schema.yml                               | 4 ++--
 bh20sequploader/bh20seq-shex.rdf                                 | 2 +-
 scripts/dict_ontology_standardization/ncbi_speciesman_source.csv | 2 +-
 scripts/from_genbank_to_fasta_and_yaml.py                        | 9 ++++++---
 4 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index ebca35b..75308ab 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -162,12 +162,12 @@ $graph:
         _id: http://www.ebi.ac.uk/efo/EFO_0002699
     sequencing_coverage:
       doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x)
-      type: ["null", float, int]
+      type: float?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/FLU_0000848
     sequencing_coverage2:
       doc: If a second sequence technology was used you can submit its coverage here
-      type: ["null", float, int]
+      type: float?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/FLU_0000848
     additional_technology_information:
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 59ee71b..31e714f 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -50,7 +50,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 
 :technologyShape {
     obo:OBI_0600047 IRI {0,2} ;
-    obo:FLU_0000848 xsd:integer ?;
+    obo:FLU_0000848 xsd:double ?;
     efo:EFO_0002699 xsd:string ?;
 }
 
diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
index 2905588..909cf37 100644
--- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
+++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
@@ -1,4 +1,4 @@
-nasopharyngeal swab, http://purl.obolibrary.org/obo/NCIT_C155831
+nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
 nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
 respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831
 naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 1a12513..00c0012 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -130,9 +130,12 @@ if not os.path.exists(dir_fasta_and_yaml_today):
 
                         if field_in_yaml == 'sequencing_coverage':
                             # A regular expression would be better!
-                            info_for_yaml_dict['technology'][field_in_yaml] = ';'.join(
-                                [x.strip('(average)').strip("reads/nt").replace(',', '.').strip(' xX>') for x in tech_info_to_parse.split(';')]
-                            )
+                            try:
+                                info_for_yaml_dict['technology'][field_in_yaml] = float(
+                                    tech_info_to_parse.strip('(average)').strip("reads/nt").replace(',', '.').strip(' xX>'))
+                            except ValueError:
+                                print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse)
+                                pass
                         elif field_in_yaml == 'sample_sequencing_technology':
                             new_seq_tec_list = []
                             for seq_tec in tech_info_to_parse.split(';'):
-- 
cgit 1.4.1


From cad23032ecf6ef325aab2978d5df36609ad50088 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 18:16:47 +0000
Subject: add noLinkCheck to specimen_source2

---
 bh20sequploader/bh20seq-schema.yml | 1 +
 1 file changed, 1 insertion(+)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 75308ab..1ceebe2 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -106,6 +106,7 @@ $graph:
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/OBI_0001479
           _type: "@id"
+          noLinkCheck: true
     sample_storage_conditions:
       doc: Information about storage of a specified type, e.g.  frozen specimen, paraffin, fresh ....
       type: string?
-- 
cgit 1.4.1


From f4c3da88c1233802fea46cc972a81dc3b5b51185 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 21 Apr 2020 15:37:58 -0400
Subject: Work around CWL content size limit by chunking

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
---
 bh20sequploader/main.py                       |  1 +
 workflows/pangenome-generate/relabel-seqs.cwl | 31 +++++++++++++++++++++++----
 workflows/pangenome-generate/relabel-seqs.py  | 22 +++++++++++++------
 3 files changed, 44 insertions(+), 10 deletions(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 2fda347..4c4711d 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -63,6 +63,7 @@ def main():
     external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
 
     properties = {
+        "sequence_label": seqlabel,
         "upload_app": "bh20-seq-uploader",
         "upload_ip": external_ip,
         "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname())
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
index 2b780d4..01196f6 100644
--- a/workflows/pangenome-generate/relabel-seqs.cwl
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -3,6 +3,10 @@ class: CommandLineTool
 inputs:
   readsFA: File[]
   subjects: string[]
+  script:
+    type: File
+    default: {class: File, location: relabel-seqs.py}
+    inputBinding: {}
 outputs:
   relabeledSeqs:
     type: File
@@ -15,11 +19,30 @@ outputs:
 requirements:
   InlineJavascriptRequirement: {}
   InitialWorkDirRequirement:
-    listing:
-      - entry: {$include: relabel-seqs.py}
-        entryname: relabel-seqs.py
+    listing: |
+          ${
+          // Write readsFA as JSON files block1, block2, ... holding at most 100 entries each.
+          var i = 0, b = 1, out = [];
+          // Only the inner loop advances i, so no element is skipped between blocks.
+          while (i < inputs.readsFA.length) {
+            var block = [];
+            for (; i < (b*100) && i < inputs.readsFA.length; i++) {
+              block.push(inputs.readsFA[i]);
+            }
+            out.push({
+              entryname: "block"+b,
+              entry: JSON.stringify(block)
+            });
+            b++;
+          }
+          out.push({
+            entry: JSON.stringify(inputs.subjects),
+            entryname: "subjects"
+          });
+          return out;
+          }
 hints:
   DockerRequirement:
     dockerPull: commonworkflowlanguage/cwltool_module
 stdout:
-baseCommand: [python, relabel-seqs.py]
+baseCommand: [python]
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 1188ceb..970540f 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,5 +1,15 @@
-reads = $(inputs.readsFA)
-subjects = $(inputs.subjects)
+import os
+import json
+
+reads = []
+b = 1
+while os.path.exists("block%i" % b):
+    with open("block%i" % b) as f:
+        reads.extend(json.load(f))
+    b += 1
+
+with open("subjects") as f:
+    subjects = json.load(f)
 
 relabeled_fasta = open("relabeledSeqs.fasta", "wt")
 original_labels = open("originalLabels.ttl", "wt")
@@ -7,12 +17,12 @@ original_labels = open("originalLabels.ttl", "wt")
 for i, r in enumerate(reads):
     with open(r["path"], "rt") as fa:
         label = fa.readline()
-        original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \\"%s\\" .\\n" % (subjects[i], label[1:].strip().replace('"', '\\\\"')))
-        relabeled_fasta.write(">"+subjects[i]+"\\n")
+        original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"')))
+        relabeled_fasta.write(">"+subjects[i]+"\n")
         data = fa.read(8096)
         while data:
             relabeled_fasta.write(data)
-            endswithnewline = data.endswith("\\n")
+            endswithnewline = data.endswith("\n")
             data = fa.read(8096)
         if not endswithnewline:
-            relabeled_fasta.write("\\n")
+            relabeled_fasta.write("\n")
-- 
cgit 1.4.1


From a12fe94f174da766be612fbb2712b4db2ba98296 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 19:41:27 +0200
Subject: Small changes all around, trying to make the importer/metadata better

---
 bh20sequploader/bh20seq-schema.yml        |  4 ++--
 bh20sequploader/bh20seq-shex.rdf          | 25 +++++++++++++++----------
 example/metadata.yaml                     |  8 ++++----
 scripts/foreach.sh                        |  2 +-
 scripts/from_genbank_to_fasta_and_yaml.py | 12 ++++++++----
 5 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 1ceebe2..80013c3 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -25,7 +25,7 @@ $graph:
         jsonldPredicate:
           _id: http://semanticscience.org/resource/SIO_000115
     host_sex:
-        doc: Sex of the host as defined in NCIT, IRI expected (http://purl.obolibrary.org/obo/NCIT_C20197 (Male), http://purl.obolibrary.org/obo/NCIT_C27993 (Female), http://purl.obolibrary.org/obo/NCIT_C45908 (Intersex), or http://purl.obolibrary.org/obo/NCIT_C17998 (Unknown))
+        doc: Sex of the host as defined in PATO, expect male (http://purl.obolibrary.org/obo/PATO_0000384) or female (http://purl.obolibrary.org/obo/PATO_0000383)
         type: string?
         jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/PATO_0000047
@@ -144,7 +144,7 @@ $graph:
   fields:
     sample_sequencing_technology:
       doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
-      type: string
+      type: string?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/OBI_0600047
         _type: "@id"
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 31e714f..8d0055e 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -23,35 +23,40 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
     obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
     obo:PATO_0000011 xsd:integer ?;
     obo:NCIT_C42574 [ obo:UO_~ ] ?;
-    sio:SIO_001167 xsd:string ?;
+	obo:NCIT_C25688 xsd:string ? ;
     efo:EFO_0000727 xsd:string ?;
+    obo:VO_0000002 xsd:string ?;
+    sio:SIO_001167 xsd:string ?;
 }
 
 :sampleShape  {
     sio:SIO_000115 xsd:string;
-    obo:GAZ_00000448 [wikidata:~] ;
 	evs:C25164 xsd:string;
+	obo:GAZ_00000448 [wikidata:~] ;
     obo:OBI_0001895 xsd:string ?;
-    sio:SIO_001167 xsd:string ?;
-    obo:OBI_0001472 xsd:string ?;
+    obo:NCIT_C41206 xsd:string ?;
     obo:OBI_0001479 IRI {0,2};
+    obo:OBI_0001472 xsd:string ?;
+    sio:SIO_001167 xsd:string ?;
 }
 
 :submitterShape {
     obo:NCIT_C42781 xsd:string ;
-	obo:NCIT_C37984 xsd:string ?;
-	obo:NCIT_C37900 xsd:string ?;
     sio:SIO_000116 xsd:string ?;
-    obo:OBI_0600047 xsd:string ?;
-    sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
     sio:SIO_000172 xsd:string ?;
+    obo:NCIT_C37984 xsd:string ?;
+    obo:OBI_0600047 xsd:string ?;
+  	obo:NCIT_C37900 xsd:string ?;
     efo:EFO_0001741 xsd:string ?;
+    obo:NCIT_C19026 xsd:string ?;
+    sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
 }
 
 :technologyShape {
-    obo:OBI_0600047 IRI {0,2} ;
-    obo:FLU_0000848 xsd:double ?;
+    obo:OBI_0600047 IRI {0,2} ?;
     efo:EFO_0002699 xsd:string ?;
+    obo:FLU_0000848 xsd:double {0,2};
+    sio:SIO_001167 xsd:string ?;
 }
 
 :virusShape{
diff --git a/example/metadata.yaml b/example/metadata.yaml
index 57d90b5..d1b10c1 100644
--- a/example/metadata.yaml
+++ b/example/metadata.yaml
@@ -6,7 +6,7 @@ host:
     host_sex: http://purl.obolibrary.org/obo/NCIT_C27993
     host_age: 20
     host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
-    host_health_status: A condition or state at a particular time (Disease ontology)
+    host_health_status: A condition or state at a particular time
     host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
     host_vaccination: List of vaccines given to the host (RRIDs?)
     additional_host_information: Field for additional host information
@@ -29,15 +29,15 @@ virus:
 technology:
     sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173
     sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173
-    sequence_assembly_method: Protocol used for assembly (CWL, WDL, NF, BCO?)
+    sequence_assembly_method: Protocol used for assembly
     sequencing_coverage: 70
 
 submitter:
-    submitter_name: John Doe (ORCID?)
+    submitter_name: John Doe
     submitter_address: John Doe's adress
     originating_lab: John Doe kitchen
     lab_address: John Doe's address
     provider_sample_id: HmX
     submitter_sample_id: xXx
     authors: John Doe et all
-    submitter_orcid: https://orcid.org/0000-0000-0000-0000 (if this is here, others can be optional?)
+    submitter_orcid: https://orcid.org/0000-0000-0000-0000
\ No newline at end of file
diff --git a/scripts/foreach.sh b/scripts/foreach.sh
index 35b07b8..ddc9387 100755
--- a/scripts/foreach.sh
+++ b/scripts/foreach.sh
@@ -2,7 +2,7 @@
 rm -rf validated fasta_and_yaml_*
 mkdir -p validated
 ./from_genbank_to_fasta_and_yaml.py
-fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta")
+fasta_files=$(find fasta_and_yaml/ -name "*.fasta")
 for f in $fasta_files ; do
     yaml=$(echo $f | rev | cut -c7- | rev).yaml
     echo $f
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 00c0012..096a6af 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -8,10 +8,11 @@ import json
 import os
 
 from datetime import date
-today = date.today().strftime("%Y%m%d")
+# Directory names are fixed; they are no longer suffixed with today's date.
 
-dir_metadata_today = 'metadata_from_nuccore_{}'.format(today)
-dir_fasta_and_yaml_today = 'fasta_and_yaml_{}'.format(today)
+
+dir_metadata_today = 'metadata_from_nuccore'
+dir_fasta_and_yaml_today = 'fasta_and_yaml'
 
 dir_dict_ontology_standardization = 'dict_ontology_standardization/'
 
@@ -177,7 +178,10 @@ if not os.path.exists(dir_fasta_and_yaml_today):
 
                         if len(GBQualifier_value_text_list) > 1:
                             if GBQualifier_value_text_list[1] in ['male', 'female']:
-                                info_for_yaml_dict['host']['host_sex'] = GBQualifier_value_text_list[1]
+                                if GBQualifier_value_text_list[1]=='male':
+                                    info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384"
+                                elif GBQualifier_value_text_list[1]=='female':
+                                    info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000383"
                             else:
                                 info_for_yaml_dict['host']['host_health_status'] = GBQualifier_value_text_list[1]
 
-- 
cgit 1.4.1


From 2d3f8b9707bd13433ca82449ad82dbc406a28f95 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 20:43:09 +0200
Subject: Including restrictions to the host status

---
 bh20sequploader/bh20seq-options.yml | 9 +++++++++
 bh20sequploader/bh20seq-schema.yml  | 2 +-
 bh20sequploader/bh20seq-shex.rdf    | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-options.yml b/bh20sequploader/bh20seq-options.yml
index da47e1a..68f6e79 100644
--- a/bh20sequploader/bh20seq-options.yml
+++ b/bh20sequploader/bh20seq-options.yml
@@ -14,6 +14,15 @@ host_sex:
   Male: http://purl.obolibrary.org/obo/PATO_0000384
   Female: http://purl.obolibrary.org/obo/PATO_0000383
 
+host_health_status:
+  healthy: http://purl.obolibrary.org/obo/NCIT_C115935
+  asymptomatic: http://purl.obolibrary.org/obo/NCIT_C3833
+  symptomatic: http://purl.obolibrary.org/obo/NCIT_C25269
+  admitted to hospital: http://purl.obolibrary.org/obo/GENEPIO_0002020
+  discharged from hospital: http://purl.obolibrary.org/obo/GENEPIO_0001849
+  dead: http://purl.obolibrary.org/obo/NCIT_C28554
+  alive: http://purl.obolibrary.org/obo/NCIT_C37987
+
 sample_sequencing_technology:
   Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
   Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 80013c3..232ccc6 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -44,7 +44,7 @@ $graph:
           _type: "@id"
           noLinkCheck: true
     host_health_status:
-        doc: A condition or state at a particular time
+        doc: A condition or state at a particular time, must be one of the following (obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987)
         type: string?
         jsonldPredicate: http://purl.obolibrary.org/obo/NCIT_C25688
     host_treatment:
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 8d0055e..bb15f91 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -23,7 +23,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
     obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
     obo:PATO_0000011 xsd:integer ?;
     obo:NCIT_C42574 [ obo:UO_~ ] ?;
-	obo:NCIT_C25688 xsd:string ? ;
+	obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ;
     efo:EFO_0000727 xsd:string ?;
     obo:VO_0000002 xsd:string ?;
     sio:SIO_001167 xsd:string ?;
-- 
cgit 1.4.1


From a448aba5afb633dec197c93ed5fcc6fa61c7c491 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 21:06:47 +0200
Subject: Forgot to add _id

---
 bh20sequploader/bh20seq-schema.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 232ccc6..9a89979 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -46,7 +46,8 @@ $graph:
     host_health_status:
         doc: A condition or state at a particular time, must be one of the following (obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987)
         type: string?
-        jsonldPredicate: http://purl.obolibrary.org/obo/NCIT_C25688
+        jsonldPredicate:
+          _id: http://purl.obolibrary.org/obo/NCIT_C25688
     host_treatment:
       doc: Process in which the act is intended to modify or alter host status
       type: string?
-- 
cgit 1.4.1


From 7ef2c5c45d3d1b6e71a08fd0bdf19c42ef9e1014 Mon Sep 17 00:00:00 2001
From: lltommy
Date: Wed, 22 Apr 2020 21:23:32 +0200
Subject: Fixing ShEx expression, one ? too much

---
 bh20sequploader/bh20seq-shex.rdf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'bh20sequploader')

diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index bb15f91..246fd57 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -53,7 +53,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 }
 
 :technologyShape {
-    obo:OBI_0600047 IRI {0,2} ?;
+    obo:OBI_0600047 IRI {0,2} ;
     efo:EFO_0002699 xsd:string ?;
     obo:FLU_0000848 xsd:double {0,2};
     sio:SIO_001167 xsd:string ?;
-- 
cgit 1.4.1