diff options
author | Alex Kanitz | 2020-04-10 18:27:44 +0200 |
---|---|---|
committer | Alex Kanitz | 2020-04-10 18:27:44 +0200 |
commit | bef2a43185f9494398f5d5a8cdb6c5f34352f912 (patch) | |
tree | 28cfcbeea67a10c0b6c06e452e33de3e4b4fa457 /bh20sequploader | |
parent | b1039ff369ee88770e42d1bbbdc402b1181c2aa5 (diff) | |
download | bh20-seq-resource-bef2a43185f9494398f5d5a8cdb6c5f34352f912.tar.gz bh20-seq-resource-bef2a43185f9494398f5d5a8cdb6c5f34352f912.tar.lz bh20-seq-resource-bef2a43185f9494398f5d5a8cdb6c5f34352f912.zip |
validate seq format with magic file
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- | bh20sequploader/main.py | 29 | ||||
-rw-r--r-- | bh20sequploader/validation/Makefile | 4 | ||||
-rw-r--r-- | bh20sequploader/validation/formats | 4 | ||||
-rw-r--r-- | bh20sequploader/validation/formats.mgc | bin | 0 -> 1032 bytes |
4 files changed, 33 insertions, 4 deletions
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index bf74ea5..1d5b9c3 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -3,6 +3,8 @@ import time import arvados import arvados.collection import json +import magic +from pathlib import Path import urllib.request import socket import getpass @@ -14,7 +16,7 @@ UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa' def main(): parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis') - parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA') + parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ') parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json') args = parser.parse_args() @@ -26,10 +28,27 @@ def main(): col = arvados.collection.Collection(api_client=api) - if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"): - target = "sequence.fasta" - elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"): + magic_file = Path(__file__).parent / "validation" / "formats.mgc" + val = magic.Magic(magic_file=magic_file.resolve().as_posix(), + uncompress=False, mime=True) + seq_type = val.from_file(args.sequence.name).lower() + print(f"Sequence type: {seq_type}") + if seq_type == "text/fasta": + # ensure that contains only one entry + entries = 0 + for line in args.sequence: + if line.startswith(">"): + entries += 1 + if entries > 1: + raise ValueError("FASTA file contains multiple entries") + break + args.sequence.close() + args.sequence = open(args.sequence.name, "r") target = "reads.fastq" + elif seq_type == "text/fastq": + target = "sequence.fasta" + else: + raise ValueError("Sequence file does not look like FASTA or FASTQ") with col.open(target, "w") as f: r = args.sequence.read(65536) @@ -37,6 +56,7 @@ def main(): while r: f.write(r) r = args.sequence.read(65536) + args.sequence.close() print("Reading metadata") with 
col.open("metadata.yaml", "w") as f: @@ -45,6 +65,7 @@ def main(): while r: f.write(r) r = args.metadata.read(65536) + args.metadata.close() external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8') diff --git a/bh20sequploader/validation/Makefile b/bh20sequploader/validation/Makefile new file mode 100644 index 0000000..1ca13fb --- /dev/null +++ b/bh20sequploader/validation/Makefile @@ -0,0 +1,4 @@ +compile: formats.mgc + +formats.mgc : + file -C -m formats diff --git a/bh20sequploader/validation/formats b/bh20sequploader/validation/formats new file mode 100644 index 0000000..ac804cf --- /dev/null +++ b/bh20sequploader/validation/formats @@ -0,0 +1,4 @@ +0 regex \^\>.+\r?\n([acgtnACGTN]+\r?\n)*[acgtnACGTN]+(\r?\n)?$ FASTA +!:mime text/fasta +0 regex \^@.+\r?\n[acgtnACGTN]*\n\\+.*\n[!-i]*(\r\n)? FASTQ +!:mime text/fastq
\ No newline at end of file diff --git a/bh20sequploader/validation/formats.mgc b/bh20sequploader/validation/formats.mgc Binary files differ new file mode 100644 index 0000000..bff282a --- /dev/null +++ b/bh20sequploader/validation/formats.mgc |