diff options
Diffstat (limited to 'bh20sequploader')
| -rw-r--r-- | bh20sequploader/bh20seq-schema.yml | 10 | ||||
| -rw-r--r-- | bh20sequploader/bh20seq-shex.rdf | 7 | ||||
| -rw-r--r-- | bh20sequploader/main.py | 5 | ||||
| -rw-r--r-- | bh20sequploader/qc_fasta.py | 9 | 
4 files changed, 20 insertions, 11 deletions
| diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 0aead3b..645be5e 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,6 +1,6 @@ $base: http://biohackathon.org/bh20-seq-schema $namespaces: - cc: http://creativecommons.org/ns# + cc: https://creativecommons.org/ns# dc: http://purl.org/metadata/dublin_core_elements# sch: https://schema.org/ efo: http://www.ebi.ac.uk/efo/ @@ -19,6 +19,8 @@ $graph: type: string jsonldPredicate: _id: https://creativecommons.org/ns#License + _type: "@id" + noLinkCheck: true title: doc: Attribution title related to data license type: string? @@ -34,11 +36,15 @@ $graph: type: string? jsonldPredicate: _id: https://creativecommons.org/ns#attributionURL + _type: "@id" + noLinkCheck: true attribution_source: doc: Attribution source URL related to data license type: string? jsonldPredicate: _id: https://creativecommons.org/ns#attributionSource + _type: "@id" + noLinkCheck: true - name: hostSchema type: record @@ -186,7 +192,7 @@ $graph: _type: "@id" noLinkCheck: true sequence_assembly_method: - doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome + doc: Field for additional information on the pipeline applied to obtain the assembly type: string? jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0002699 diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 6139e55..11eb75e 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -1,7 +1,8 @@ PREFIX : <https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#> PREFIX MainSchema: <http://biohackathon.org/bh20-seq-schema#MainSchema/> PREFIX hostSchema: <http://biohackathon.org/bh20-seq-schema#hostSchema/> -PREFIX cc: <http://creativecommons.org/ns#> +PREFIX cc: <https://creativecommons.org/ns#> +PREFIX cclicenses: <https://creativecommons.org/licenses/> PREFIX dc: <http://purl.org/metadata/dublin_core_elements#> PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> PREFIX obo: <http://purl.obolibrary.org/obo/> @@ -71,9 +72,9 @@ PREFIX wikidata: <http://www.wikidata.org/entity/> } :licenseShape{ - cc:License xsd:string ; + cc:License [ cclicenses:~ ] ; dc:Title xsd:string ?; cc:attributionName xsd:string ?; - cc:attributionURL xsd:string ?; + cc:attributionURL /^http/ ; cc:attributionSource xsd:string ?; } diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index f89b458..ea0fa70 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -49,7 +49,7 @@ sequence for enough overlap with the reference genome failed = True except Exception as e: log.exception("Failed metadata QC") - failed = True + failed = True # continue with the FASTA checker target = [] try: @@ -64,13 +64,14 @@ sequence for enough overlap with the reference genome target[1] = ("reads_2."+target[1][0][6:], target[1][1], target[1][2]) if do_qc and target[0][2] == 'text/fasta' and sample_id != target[0][1]: - raise ValueError("The sample_id field in the metadata must be the same as the FASTA header") + raise ValueError(f"The sample_id field in the metadata ({sample_id}) must be the same as the FASTA header ({target[0][1]})") except Exception as e: log.exception("Failed sequence QC") failed = True if failed: + log.debug("Bailing out!") exit(1) return target diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index f567f0a..814fb3e 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -66,7 +66,8 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): similarity = 0 try: - cmd = ["minimap2", "-c -x asm20", tmp1.name, tmp2.name] + log.debug("Trying to run minimap2") + cmd = ["minimap2", "-c", "-x", "asm20", tmp1.name, tmp2.name] logging.info("QC checking similarity to reference") logging.info(" ".join(cmd)) result = subprocess.run(cmd, stdout=subprocess.PIPE) @@ -83,9 +84,7 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): if similarity < 70.0: raise ValueError( - "QC fail for {}: alignment to reference was less than 70%% (was %2.2f%%)".format( - seqlabel, similarity - )) + f"QC fail for {seqlabel}: alignment to reference was less than 70% (was {similarity})") return "sequence.fasta" + gz, seqlabel, seq_type elif seq_type == "text/fastq": @@ -93,4 +92,6 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): sequence.detach() return "reads.fastq" + gz, seqlabel, seq_type else: + log.debug(seqlabel) + log.debug(seq_type) raise ValueError("Sequence file ({}) does not look like a DNA FASTA or FASTQ".format(arg_sequence)) | 
