From bef2a43185f9494398f5d5a8cdb6c5f34352f912 Mon Sep 17 00:00:00 2001
From: Alex Kanitz
Date: Fri, 10 Apr 2020 18:27:44 +0200
Subject: validate seq format with magic file

Detect the sequence format from the file content with a compiled libmagic
database (validation/formats.mgc) instead of trusting the file name suffix.
FASTA input must contain a single entry and is stored as sequence.fasta,
FASTQ input is stored as reads.fastq, and anything else is rejected with a
ValueError.

---
 bh20sequploader/main.py                |  29 +++++++++++++++++++++++++----
 bh20sequploader/validation/Makefile    |   4 ++++
 bh20sequploader/validation/formats     |   4 ++++
 bh20sequploader/validation/formats.mgc | Bin 0 -> 1032 bytes
 4 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 bh20sequploader/validation/Makefile
 create mode 100644 bh20sequploader/validation/formats
 create mode 100644 bh20sequploader/validation/formats.mgc

diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index bf74ea5..1d5b9c3 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -3,6 +3,8 @@ import time
 import arvados
 import arvados.collection
 import json
+import magic
+from pathlib import Path
 import urllib.request
 import socket
 import getpass
@@ -14,7 +16,7 @@ UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa'
 
 def main():
     parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
-    parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA')
+    parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ')
     parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
     args = parser.parse_args()
 
@@ -26,10 +28,27 @@ def main():
 
     col = arvados.collection.Collection(api_client=api)
 
-    if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"):
-        target = "sequence.fasta"
-    elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"):
+    magic_file = Path(__file__).parent / "validation" / "formats.mgc"
+    val = magic.Magic(magic_file=magic_file.resolve().as_posix(),
+                      uncompress=False, mime=True)
+    seq_type = val.from_file(args.sequence.name).lower()
+    print(f"Sequence type: {seq_type}")
+    if seq_type == "text/fasta":
+        # ensure that the file contains only one entry
+        entries = 0
+        for line in args.sequence:
+            if line.startswith(">"):
+                entries += 1
+                if entries > 1:
+                    raise ValueError("FASTA file contains multiple entries")
+                    break
+        args.sequence.close()
+        args.sequence = open(args.sequence.name, "r")
+        target = "sequence.fasta"
+    elif seq_type == "text/fastq":
         target = "reads.fastq"
+    else:
+        raise ValueError("Sequence file does not look like FASTA or FASTQ")
 
     with col.open(target, "w") as f:
         r = args.sequence.read(65536)
@@ -37,6 +56,7 @@ def main():
         while r:
             f.write(r)
             r = args.sequence.read(65536)
+    args.sequence.close()
 
     print("Reading metadata")
     with col.open("metadata.yaml", "w") as f:
@@ -45,6 +65,7 @@ def main():
         while r:
             f.write(r)
             r = args.metadata.read(65536)
+    args.metadata.close()
 
     external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
 
diff --git a/bh20sequploader/validation/Makefile b/bh20sequploader/validation/Makefile
new file mode 100644
index 0000000..1ca13fb
--- /dev/null
+++ b/bh20sequploader/validation/Makefile
@@ -0,0 +1,4 @@
+compile: formats.mgc
+
+formats.mgc :
+	file -C -m formats
diff --git a/bh20sequploader/validation/formats b/bh20sequploader/validation/formats
new file mode 100644
index 0000000..ac804cf
--- /dev/null
+++ b/bh20sequploader/validation/formats
@@ -0,0 +1,4 @@
+0 regex \^\>.+\r?\n([acgtnACGTN]+\r?\n)*[acgtnACGTN]+(\r?\n)?$ FASTA
+!:mime text/fasta
+0 regex \^@.+\r?\n[acgtnACGTN]*\n\\+.*\n[!-i]*(\r\n)? FASTQ
+!:mime text/fastq
\ No newline at end of file
diff --git a/bh20sequploader/validation/formats.mgc b/bh20sequploader/validation/formats.mgc
new file mode 100644
index 0000000..bff282a
Binary files /dev/null and b/bh20sequploader/validation/formats.mgc differ
-- 
cgit v1.2.3