From 925058d0b3db70803d322cc2a33801240899a20a Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 10 Apr 2020 15:52:37 -0400 Subject: Fix up fasta/fastq validation --- bh20seqanalyzer/main.py | 9 ++++++++- bh20sequploader/main.py | 29 +++++------------------------ setup.py | 5 +++-- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 1fb51b5..c05b402 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -8,6 +8,7 @@ import json import logging import ruamel.yaml from bh20sequploader.qc_metadata import qc_metadata +from bh20sequploader.qc_fasta import qc_fasta import pkg_resources from schema_salad.sourceline import add_lc_filename @@ -38,7 +39,13 @@ def validate_upload(api, collection, validated_project, logging.warn("Failed metadata qc") if valid: - if "sequence.fasta" not in col: + if "sequence.fasta" in col: + try: + qc_fasta(col.open("sequence.fasta")) + except Exception as e: + logging.warn(e) + valid = False + else: if "reads.fastq" in col: start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) return False diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 2032508..4a225f6 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -8,7 +8,8 @@ from pathlib import Path import urllib.request import socket import getpass -from qc_metadata import qc_metadata +from .qc_metadata import qc_metadata +from .qc_fasta import qc_fasta ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -22,34 +23,14 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - if not bh20sequploader.qc_metadata.qc_metadata(args.metadata.name): + target = qc_fasta(args.sequence) + + if not qc_metadata(args.metadata.name): print("Failed metadata qc") exit(1) col = arvados.collection.Collection(api_client=api) - magic_file = Path(__file__).parent / "validation" / "formats.mgc" - val = magic.Magic(magic_file=magic_file.resolve().as_posix(), - uncompress=False, mime=True) - seq_type = val.from_file(args.sequence.name).lower() - print(f"Sequence type: {seq_type}") - if seq_type == "text/fasta": - # ensure that contains only one entry - entries = 0 - for line in args.sequence: - if line.startswith(">"): - entries += 1 - if entries > 1: - raise ValueError("FASTA file contains multiple entries") - break - args.sequence.close() - args.sequence = open(args.sequence.name, "r") - target = "reads.fastq" - elif seq_type == "text/fastq": - target = "sequence.fasta" - else: - raise ValueError("Sequence file does not look like FASTA or FASTQ") - with col.open(target, "w") as f: r = args.sequence.read(65536) print(r[0:20]) diff --git a/setup.py b/setup.py index 41ace7b..18e858e 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client", "schema-salad"] +install_requires = ["arvados-python-client", "schema-salad", "python-magic"] web_requires = ["flask", "pyyaml"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) @@ -31,7 +31,8 @@ setup( author_email="peter.amstutz@curii.com", license="Apache 2.0", packages=["bh20sequploader", "bh20seqanalyzer", "bh20simplewebuploader"], - package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, + package_data={"bh20sequploader": ["bh20seq-schema.yml", "validation/formats"], + }, install_requires=install_requires, extras_require={ 'web': web_requires -- cgit v1.2.3