about summary refs log tree commit diff
path: root/bh20sequploader
diff options
context:
space:
mode:
author Pjotr Prins 2020-07-17 11:08:15 +0100
committer Pjotr Prins 2020-07-17 11:08:15 +0100
commit 16bb5df907c79cd0ce6bea0015821a2ce51fb992 (patch)
tree ddb9677cddcc463bb514300189cbd4300b9117ed /bh20sequploader
parent 0be9983ef88fd3b925d8fa53e7f9ab2a28703bc0 (diff)
parent c69046ee9a5e24eadcd8cb885633328b0fd88011 (diff)
download bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.gz
bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.lz
bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.zip
Merge branch 'master' into ebi-submit
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- bh20sequploader/bh20seq-schema.yml 15
-rw-r--r-- bh20sequploader/bh20seq-shex.rdf 25
-rw-r--r-- bh20sequploader/main.py 22
-rw-r--r-- bh20sequploader/qc_fasta.py 4
4 files changed, 40 insertions, 26 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index b3d4d12..0aead3b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -1,6 +1,7 @@
 $base: http://biohackathon.org/bh20-seq-schema
 $namespaces:
   cc:  http://creativecommons.org/ns#
+  dc:  http://purl.org/metadata/dublin_core_elements#
   sch: https://schema.org/
   efo: http://www.ebi.ac.uk/efo/
   obo: http://purl.obolibrary.org/obo/
@@ -15,24 +16,29 @@ $graph:
   fields:
     license_type:
       doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
-      type: string?
+      type: string
       jsonldPredicate:
           _id: https://creativecommons.org/ns#License
     title:
       doc: Attribution title related to data license
       type: string?
       jsonldPredicate:
-          _id: http://semanticscience.org/resource/SIO_001167
+          _id: http://purl.org/metadata/dublin_core_elements#Title
+    attribution_name:
+      doc: Attribution NAME related to data license
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#attributionName
     attribution_url:
       doc: Attribution URL related to data license
       type: string?
       jsonldPredicate:
-          _id: https://creativecommons.org/ns#Work
+          _id: https://creativecommons.org/ns#attributionURL
     attribution_source:
       doc: Attribution source URL related to data license
       type: string?
       jsonldPredicate:
-          _id: https://creativecommons.org/ns#Work
+          _id: https://creativecommons.org/ns#attributionSource
 
 - name: hostSchema
   type: record
@@ -258,6 +264,7 @@ $graph:
     virus: virusSchema
     technology: technologySchema
     submitter: submitterSchema
+    license: ["null", licenseSchema]
     id:
       doc: The subject (eg the fasta/fastq file) that the metadata describes
       type: string
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 965229c..bbc7309 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -1,6 +1,8 @@
 PREFIX : <https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#>
 PREFIX MainSchema: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
 PREFIX hostSchema: <http://biohackathon.org/bh20-seq-schema#hostSchema/>
+PREFIX cc:  <http://creativecommons.org/ns#>
+PREFIX dc:  <http://purl.org/metadata/dublin_core_elements#>
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
 PREFIX obo: <http://purl.obolibrary.org/obo/>
 PREFIX sio: <http://semanticscience.org/resource/>
@@ -15,10 +17,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
   MainSchema:submitter @:submitterShape ;
   MainSchema:technology @:technologyShape ;
   MainSchema:virus @:virusShape;
+  MainSchema:license @:licenseShape ?;
 }
 
 :hostShape  {
-  	efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
+    efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
     sio:SIO_000115 xsd:string ?;
     obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?;
     obo:PATO_0000011 xsd:integer ?;
@@ -32,14 +35,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 
 :sampleShape  {
     sio:SIO_000115 xsd:string;
-	  evs:C25164 xsd:string;
-	  obo:GAZ_00000448 [wikidata:~] ;
+    evs:C25164 xsd:string;
+    obo:GAZ_00000448 [wikidata:~] ;
     obo:OBI_0001895 xsd:string ?;
     obo:NCIT_C41206 xsd:string ?;
     obo:OBI_0001479 IRI {0,2};
     obo:OBI_0001472 xsd:string ?;
     sio:SIO_001167 xsd:string ?;
-	edam:data_2091 IRI {0,3};
+    edam:data_2091 IRI {0,3};
 }
 
 :submitterShape {
@@ -47,7 +50,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
     sio:SIO_000116 xsd:string *;
     sio:SIO_000172 xsd:string ?;
     obo:NCIT_C37984 xsd:string ?;
-  	obo:NCIT_C37900 xsd:string ?;
+    obo:NCIT_C37900 xsd:string ?;
     efo:EFO_0001741 xsd:string ?;
     obo:NCIT_C42781 xsd:string ?;
     obo:NCIT_C19026 xsd:string ?;
@@ -63,6 +66,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 }
 
 :virusShape{
-	edam:data_1875 [ obo:NCBITaxon_~ ] ;
-  	sio:SIO_010055 xsd:string ?;
+    edam:data_1875 [ obo:NCBITaxon_~ ] ;
+    sio:SIO_010055 xsd:string ?;
 }
+
+:licenseShape{
+    cc:License xsd:string ;
+    dc:Title xsd:string ?;
+    cc:attributionName xsd:string ?;
+    cc:attributionURL xsd:string ?;
+    cc:attributionSource xsd:string ?;
+}
\ No newline at end of file
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index f744a8c..6049bf9 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -29,11 +29,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
     try:
         log.debug("Checking metadata" if do_qc else "Skipping metadata check")
         if do_qc and not qc_metadata(metadata.name):
-            log.warning("Failed metadata qc")
+            log.warning("Failed metadata QC")
             failed = True
     except Exception as e:
-        log.debug(e)
-        print(e)
+        log.exception("Failed metadata QC")
         failed = True
 
     target = []
@@ -45,8 +44,7 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
             target[0] = ("reads_1."+target[0][0][6:], target[0][1])
             target[1] = ("reads_2."+target[1][0][6:], target[0][1])
     except Exception as e:
-        log.debug(e)
-        print(e)
+        log.exception("Failed sequence QC")
         failed = True
 
     if failed:
@@ -82,7 +80,7 @@ def main():
     seqlabel = target[0][1]
 
     if args.validate:
-        print("Valid")
+        log.info("Valid")
         exit(0)
 
     col = arvados.collection.Collection(api_client=api)
@@ -91,10 +89,10 @@ def main():
     if args.sequence_p2:
         upload_sequence(col, target[1], args.sequence_p2)
 
-    print("Reading metadata")
+    log.info("Reading metadata")
     with col.open("metadata.yaml", "w") as f:
         r = args.metadata.read(65536)
-        print(r[0:20])
+        log.info(r[0:20])
         while r:
             f.write(r)
             r = args.metadata.read(65536)
@@ -118,7 +116,7 @@ def main():
                                            ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
     if dup["items"]:
         # This exact collection has been uploaded before.
-        print("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+        log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
         exit(1)
 
     if args.trusted:
@@ -131,9 +129,9 @@ def main():
                  (seqlabel, properties['upload_user'], properties['upload_ip']),
                  properties=properties, ensure_unique_name=True)
 
-    print("Saved to %s" % col.manifest_locator())
-
-    print("Done")
+    log.info("Saved to %s" % col.manifest_locator())
+    log.info("Done")
+    exit(0)
 
 if __name__ == "__main__":
     main()
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 37eb4e8..0c7e16d 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True):
                 except Exception as e:
                     logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e)
 
-                if similarity and similarity < 70.0:
+                if similarity < 70.0:
                     raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity))
-                if similarity == 0:
-                    raise ValueError("QC fail")
 
         return ("sequence.fasta"+gz, seqlabel)
     elif seq_type == "text/fastq":