From f616b4200a5a72c0c9cd0ee911d5048b67cdcca2 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 4 Nov 2020 16:11:37 +0100 Subject: fixed sequence_assembly_method field description --- bh20sequploader/bh20seq-schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 0aead3b..861a5ca 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -186,7 +186,7 @@ $graph: _type: "@id" noLinkCheck: true sequence_assembly_method: - doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome + doc: Field for additional information on the pipeline applied to obtain the assembly type: string? jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0002699 -- cgit v1.2.3 From 5fdfece97fb2d50a10eab5004a6467ec0097ece8 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 11:19:28 +0000 Subject: Uploader script improvements --- bh20sequploader/main.py | 5 +++-- bh20sequploader/qc_fasta.py | 9 +++++---- doc/INSTALL.md | 8 +++++++- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index f89b458..ea0fa70 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -49,7 +49,7 @@ sequence for enough overlap with the reference genome failed = True except Exception as e: log.exception("Failed metadata QC") - failed = True + failed = True # continue with the FASTA checker target = [] try: @@ -64,13 +64,14 @@ sequence for enough overlap with the reference genome target[1] = ("reads_2."+target[1][0][6:], target[1][1], target[1][2]) if do_qc and target[0][2] == 'text/fasta' and sample_id != target[0][1]: - raise ValueError("The sample_id field in the metadata must be the same as the FASTA header") + raise ValueError(f"The sample_id field in the metadata ({sample_id}) must be the same as the FASTA header ({target[0][1]})") except Exception as e: log.exception("Failed sequence QC") failed = True if failed: + log.debug("Bailing out!") exit(1) return target diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index f567f0a..814fb3e 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -66,7 +66,8 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): similarity = 0 try: - cmd = ["minimap2", "-c -x asm20", tmp1.name, tmp2.name] + log.debug("Trying to run minimap2") + cmd = ["minimap2", "-c", "-x", "asm20", tmp1.name, tmp2.name] logging.info("QC checking similarity to reference") logging.info(" ".join(cmd)) result = subprocess.run(cmd, stdout=subprocess.PIPE) @@ -83,9 +84,7 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): if similarity < 70.0: raise ValueError( - "QC fail for {}: alignment to reference was less than 70%% (was %2.2f%%)".format( - seqlabel, similarity - )) + f"QC fail for {seqlabel}: alignment to reference was less than 70% (was {similarity})") return "sequence.fasta" + gz, seqlabel, seq_type elif seq_type == "text/fastq": @@ -93,4 +92,6 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): sequence.detach() return "reads.fastq" + gz, seqlabel, seq_type else: + log.debug(seqlabel) + log.debug(seq_type) raise ValueError("Sequence file ({}) does not look like a DNA FASTA or FASTQ".format(arg_sequence)) diff --git a/doc/INSTALL.md b/doc/INSTALL.md index f54c8f2..45aca0f 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -31,7 +31,7 @@ arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-clien 3. Run the tool directly with ```sh -guix environment guix --ad-hoc git python openssl python-pycurl python-magic nss-certs python-pyshex -- python3 bh20sequploader/main.py example/sequence.fasta example/maximum_metadata_example.yaml +guix environment guix --ad-hoc git python openssl python-pycurl python-magic nss-certs python-pyshex -- python3 bh20sequploader/main.py example/maximum_metadata_example.yaml example/sequence.fasta ``` Note that python-pyshex is packaged in @@ -44,6 +44,12 @@ repository. E.g. env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/opt/guix/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp python3 bh20sequploader/main.py --help ``` +Latest successful Guix run + +```sh +env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/opt/guix/bin/guix environment guix --ad-hoc git python openssl python-pycurl python-magic nss-certs python-pyshex python-arvados-python-client python-schema-salad minimap2 -- python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/AL_UT14.yaml scripts/uthsc_samples/yaml/AL_UT14.fa +``` + ### Using the Web Uploader To run the web uploader in a GNU Guix environment/container run it with something like -- cgit v1.2.3 From 0a2d9ef27d6c1b768873252287c6d967cf912025 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 7 Nov 2020 09:04:03 +0000 Subject: Fixed the License error (https!!) --- bh20sequploader/bh20seq-shex.rdf | 2 +- test/test_shex.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 89ecd00..fa765dd 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -1,7 +1,7 @@ PREFIX : PREFIX MainSchema: PREFIX hostSchema: -PREFIX cc: +PREFIX cc: PREFIX dc: PREFIX xsd: PREFIX obo: diff --git a/test/test_shex.py b/test/test_shex.py index e094184..50ce723 100644 --- a/test/test_shex.py +++ b/test/test_shex.py @@ -24,6 +24,7 @@ class TestStringMethods(unittest.TestCase): print(doc) g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx) shex = pkg_resources.resource_stream(__name__, "../bh20sequploader/bh20seq-shex.rdf").read().decode("utf-8") + # Note the https link simply acts as a URI descriptor (it does not fetch) rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape") g.serialize(format="ntriples") -- cgit v1.2.3 From 9709eca5c76aabe823ba34d976c5d11a9d150b76 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 7 Nov 2020 10:06:30 +0000 Subject: RDF: use URIs --- bh20sequploader/bh20seq-schema.yml | 8 ++++- bh20sequploader/bh20seq-shex.rdf | 5 +-- test/data/input/TN_UT2.yaml | 2 +- test/data/regression/TN_UT2.rdf | 64 +++++++++++++++++++------------------- 4 files changed, 43 insertions(+), 36 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 861a5ca..645be5e 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,6 +1,6 @@ $base: http://biohackathon.org/bh20-seq-schema $namespaces: - cc: http://creativecommons.org/ns# + cc: https://creativecommons.org/ns# dc: http://purl.org/metadata/dublin_core_elements# sch: https://schema.org/ efo: http://www.ebi.ac.uk/efo/ @@ -19,6 +19,8 @@ $graph: type: string jsonldPredicate: _id: https://creativecommons.org/ns#License + _type: "@id" + noLinkCheck: true title: doc: Attribution title related to data license type: string? @@ -34,11 +36,15 @@ $graph: type: string? jsonldPredicate: _id: https://creativecommons.org/ns#attributionURL + _type: "@id" + noLinkCheck: true attribution_source: doc: Attribution source URL related to data license type: string? jsonldPredicate: _id: https://creativecommons.org/ns#attributionSource + _type: "@id" + noLinkCheck: true - name: hostSchema type: record diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index fa765dd..1b15c58 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -2,6 +2,7 @@ PREFIX : PREFIX hostSchema: PREFIX cc: +PREFIX cclicenses: PREFIX dc: PREFIX xsd: PREFIX obo: @@ -71,9 +72,9 @@ PREFIX wikidata: } :licenseShape{ - cc:License xsd:string ; + cc:License [ cclicenses:~ ] ; dc:Title xsd:string ?; cc:attributionName xsd:string ?; - cc:attributionURL xsd:string ?; + cc:attributionURL /^http/ ; cc:attributionSource xsd:string ?; } diff --git a/test/data/input/TN_UT2.yaml b/test/data/input/TN_UT2.yaml index 2c5f987..9c4370d 100644 --- a/test/data/input/TN_UT2.yaml +++ b/test/data/input/TN_UT2.yaml @@ -1,7 +1,7 @@ id: placeholder license: - license_type: http://creativecommons.org/licenses/by/4.0/ + license_type: https://creativecommons.org/licenses/by/4.0/ title: "TN_UT2 - Pegram, Tennessee, USA" attribution_name: "Mariah Taylor, Colleen Jonsson" attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php diff --git a/test/data/regression/TN_UT2.rdf b/test/data/regression/TN_UT2.rdf index aa1356a..84353a5 100644 --- a/test/data/regression/TN_UT2.rdf +++ b/test/data/regression/TN_UT2.rdf @@ -1,33 +1,33 @@ -_:Nc519408f538a4bb39ea09f56ce9bb88e "TN_UT2" . -_:N299826fb66794f11991cebc5f8d8b24a "https://bio.tools/BWA#!" . -_:N982774a13a39474aae39272447a09574 "https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php" . -_:N7857c4718ab741169bae9fafa9b47c75 "2020-04-26" . -_:N708d19e87323455c9f9bb317538f742d "Pjotr Prins" . -_:N95d4a000e14d4bd99b46d6ec84989aaf "SARS-CoV-2/human/USA/TN_UT2/2020" . - _:N982774a13a39474aae39272447a09574 . -_:N95d4a000e14d4bd99b46d6ec84989aaf . -_:Nc519408f538a4bb39ea09f56ce9bb88e . -_:N708d19e87323455c9f9bb317538f742d "Colleen Jonsson" . -_:N982774a13a39474aae39272447a09574 "Mariah Taylor, Colleen Jonsson" . - _:N7857c4718ab741169bae9fafa9b47c75 . -_:N708d19e87323455c9f9bb317538f742d "Regional Biocontainment Laboratory, Memphis, TN" . -_:N299826fb66794f11991cebc5f8d8b24a . -_:N708d19e87323455c9f9bb317538f742d "UTHSC, Memphis, Tennessee 38163, USA" . -_:N7857c4718ab741169bae9fafa9b47c75 . -_:N708d19e87323455c9f9bb317538f742d "TN_UT2" . -_:N7857c4718ab741169bae9fafa9b47c75 "TN_UT2" . -_:N708d19e87323455c9f9bb317538f742d . - _:N708d19e87323455c9f9bb317538f742d . - _:N299826fb66794f11991cebc5f8d8b24a . -_:N982774a13a39474aae39272447a09574 "TN_UT2 - Pegram, Tennessee, USA" . -_:N299826fb66794f11991cebc5f8d8b24a "Oxford Nanopore MiniIon RNA long reads" . -_:N7857c4718ab741169bae9fafa9b47c75 . - _:Nc519408f538a4bb39ea09f56ce9bb88e . -_:N708d19e87323455c9f9bb317538f742d "Mariah Taylor" . -_:N708d19e87323455c9f9bb317538f742d . - _:N95d4a000e14d4bd99b46d6ec84989aaf . -_:N708d19e87323455c9f9bb317538f742d "Colleen B. Jonsson" . -_:N708d19e87323455c9f9bb317538f742d "TN_UT2" . -_:N982774a13a39474aae39272447a09574 "http://creativecommons.org/licenses/by/4.0/" . -_:N708d19e87323455c9f9bb317538f742d "Mariah Taylor" . +_:N850430bbc537473e8fbab09c300daf2b . +_:Ncab40e8bd4b84431aa6782f7e7480ef3 . +_:N3c28bc8014134d77823451de114c6085 "Mariah Taylor" . +_:Ncab40e8bd4b84431aa6782f7e7480ef3 "TN_UT2" . + _:N850430bbc537473e8fbab09c300daf2b . +_:N3c28bc8014134d77823451de114c6085 "TN_UT2" . +_:N850430bbc537473e8fbab09c300daf2b . +_:N3c28bc8014134d77823451de114c6085 "UTHSC, Memphis, Tennessee 38163, USA" . +_:Nebe62f68542a47d0936e953195cf25d7 . +_:N3c28bc8014134d77823451de114c6085 "Colleen B. Jonsson" . +_:N78bfdbe9d12745e99164d619cb125277 . +_:Nbf9e7b47381b4965926a47bf5b34e489 "Oxford Nanopore MiniIon RNA long reads" . + _:Ncab40e8bd4b84431aa6782f7e7480ef3 . +_:Nbf9e7b47381b4965926a47bf5b34e489 "https://bio.tools/BWA#!" . +_:Nebe62f68542a47d0936e953195cf25d7 "TN_UT2" . + _:Nebe62f68542a47d0936e953195cf25d7 . +_:N850430bbc537473e8fbab09c300daf2b "Mariah Taylor, Colleen Jonsson" . +_:N78bfdbe9d12745e99164d619cb125277 "SARS-CoV-2/human/USA/TN_UT2/2020" . +_:N3c28bc8014134d77823451de114c6085 "Colleen Jonsson" . +_:N3c28bc8014134d77823451de114c6085 . +_:Nebe62f68542a47d0936e953195cf25d7 "2020-04-26" . +_:Nebe62f68542a47d0936e953195cf25d7 . +_:N850430bbc537473e8fbab09c300daf2b "TN_UT2 - Pegram, Tennessee, USA" . +_:Nbf9e7b47381b4965926a47bf5b34e489 . +_:N3c28bc8014134d77823451de114c6085 "Mariah Taylor" . +_:N3c28bc8014134d77823451de114c6085 "TN_UT2" . +_:N3c28bc8014134d77823451de114c6085 "Pjotr Prins" . + _:N3c28bc8014134d77823451de114c6085 . +_:N3c28bc8014134d77823451de114c6085 . + _:N78bfdbe9d12745e99164d619cb125277 . +_:N3c28bc8014134d77823451de114c6085 "Regional Biocontainment Laboratory, Memphis, TN" . + _:Nbf9e7b47381b4965926a47bf5b34e489 . -- cgit v1.2.3