about summary refs log tree commit diff
path: root/bh20sequploader
diff options
context:
space:
mode:
author: lltommy, 2020-11-11 09:56:12 +0100
committer: lltommy, 2020-11-11 09:56:12 +0100
commit: d6aa323b6fc7a82e45cc1df51fc72c2d547146eb (patch)
tree: 6e8b77bde4dc34fab3fa8804906f3cb821f61dae /bh20sequploader
parent: c5fe5de7e4c77bfb48b1ae2f662c2d9cc120c06e (diff)
parent: c872248e43c1c66e5fed8ef341f7b4ac21d63e6f (diff)
download: bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.gz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.lz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.zip
Merge branch 'master' of https://github.com/arvados/bh20-seq-resource
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- bh20sequploader/bh20seq-schema.yml 10
-rw-r--r-- bh20sequploader/bh20seq-shex.rdf 7
-rw-r--r-- bh20sequploader/main.py 5
-rw-r--r-- bh20sequploader/qc_fasta.py 9
4 files changed, 20 insertions, 11 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 0aead3b..645be5e 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -1,6 +1,6 @@
 $base: http://biohackathon.org/bh20-seq-schema
 $namespaces:
-  cc:  http://creativecommons.org/ns#
+  cc:  https://creativecommons.org/ns#
   dc:  http://purl.org/metadata/dublin_core_elements#
   sch: https://schema.org/
   efo: http://www.ebi.ac.uk/efo/
@@ -19,6 +19,8 @@ $graph:
       type: string
       jsonldPredicate:
           _id: https://creativecommons.org/ns#License
+          _type: "@id"
+          noLinkCheck: true
     title:
       doc: Attribution title related to data license
       type: string?
@@ -34,11 +36,15 @@ $graph:
       type: string?
       jsonldPredicate:
           _id: https://creativecommons.org/ns#attributionURL
+          _type: "@id"
+          noLinkCheck: true
     attribution_source:
       doc: Attribution source URL related to data license
       type: string?
       jsonldPredicate:
           _id: https://creativecommons.org/ns#attributionSource
+          _type: "@id"
+          noLinkCheck: true
 
 - name: hostSchema
   type: record
@@ -186,7 +192,7 @@ $graph:
         _type: "@id"
         noLinkCheck: true
     sequence_assembly_method:
-      doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome
+      doc: Field for additional information on the pipeline applied to obtain the assembly
       type: string?
       jsonldPredicate:
         _id: http://www.ebi.ac.uk/efo/EFO_0002699
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 6139e55..11eb75e 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -1,7 +1,8 @@
 PREFIX : <https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#>
 PREFIX MainSchema: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
 PREFIX hostSchema: <http://biohackathon.org/bh20-seq-schema#hostSchema/>
-PREFIX cc:  <http://creativecommons.org/ns#>
+PREFIX cc:  <https://creativecommons.org/ns#>
+PREFIX cclicenses:  <https://creativecommons.org/licenses/>
 PREFIX dc:  <http://purl.org/metadata/dublin_core_elements#>
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
 PREFIX obo: <http://purl.obolibrary.org/obo/>
@@ -71,9 +72,9 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 }
 
 :licenseShape{
-    cc:License xsd:string ;
+    cc:License [ cclicenses:~ ] ;
     dc:Title xsd:string ?;
     cc:attributionName xsd:string ?;
-    cc:attributionURL xsd:string ?;
+    cc:attributionURL /^http/ ;
     cc:attributionSource xsd:string ?;
 }
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index f89b458..ea0fa70 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -49,7 +49,7 @@ sequence for enough overlap with the reference genome
                 failed = True
     except Exception as e:
         log.exception("Failed metadata QC")
-        failed = True
+        failed = True # continue with the FASTA checker
 
     target = []
     try:
@@ -64,13 +64,14 @@ sequence for enough overlap with the reference genome
             target[1] = ("reads_2."+target[1][0][6:], target[1][1], target[1][2])
 
         if do_qc and target[0][2] == 'text/fasta' and sample_id != target[0][1]:
-            raise ValueError("The sample_id field in the metadata must be the same as the FASTA header")
+            raise ValueError(f"The sample_id field in the metadata ({sample_id}) must be the same as the FASTA header ({target[0][1]})")
 
     except Exception as e:
         log.exception("Failed sequence QC")
         failed = True
 
     if failed:
+        log.debug("Bailing out!")
         exit(1)
 
     return target
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index f567f0a..814fb3e 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -66,7 +66,8 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True):
 
                     similarity = 0
                     try:
-                        cmd = ["minimap2", "-c -x asm20", tmp1.name, tmp2.name]
+                        log.debug("Trying to run minimap2")
+                        cmd = ["minimap2", "-c", "-x", "asm20", tmp1.name, tmp2.name]
                         logging.info("QC checking similarity to reference")
                         logging.info(" ".join(cmd))
                         result = subprocess.run(cmd, stdout=subprocess.PIPE)
@@ -83,9 +84,7 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True):
 
                     if similarity < 70.0:
                         raise ValueError(
-                            "QC fail for {}: alignment to reference was less than 70%% (was %2.2f%%)".format(
-                                seqlabel, similarity
-                            ))
+                            f"QC fail for {seqlabel}: alignment to reference was less than 70% (was {similarity})")
 
         return "sequence.fasta" + gz, seqlabel, seq_type
     elif seq_type == "text/fastq":
@@ -93,4 +92,6 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True):
         sequence.detach()
         return "reads.fastq" + gz, seqlabel, seq_type
     else:
+        log.debug(seqlabel)
+        log.debug(seq_type)
         raise ValueError("Sequence file ({}) does not look like a DNA FASTA or FASTQ".format(arg_sequence))