about summary refs log tree commit diff
path: root/bh20sequploader
diff options
context:
space:
mode:
author Alex Kanitz 2020-04-10 18:27:44 +0200
committer Alex Kanitz 2020-04-10 18:27:44 +0200
commit bef2a43185f9494398f5d5a8cdb6c5f34352f912 (patch)
tree 28cfcbeea67a10c0b6c06e452e33de3e4b4fa457 /bh20sequploader
parent b1039ff369ee88770e42d1bbbdc402b1181c2aa5 (diff)
download bh20-seq-resource-bef2a43185f9494398f5d5a8cdb6c5f34352f912.tar.gz
bh20-seq-resource-bef2a43185f9494398f5d5a8cdb6c5f34352f912.tar.lz
bh20-seq-resource-bef2a43185f9494398f5d5a8cdb6c5f34352f912.zip
validate seq format with magic file
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- bh20sequploader/main.py 29
-rw-r--r-- bh20sequploader/validation/Makefile 4
-rw-r--r-- bh20sequploader/validation/formats 4
-rw-r--r-- bh20sequploader/validation/formats.mgc bin 0 -> 1032 bytes
4 files changed, 33 insertions, 4 deletions
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index bf74ea5..1d5b9c3 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -3,6 +3,8 @@ import time
import arvados
import arvados.collection
import json
+import magic
+from pathlib import Path
import urllib.request
import socket
import getpass
@@ -14,7 +16,7 @@ UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa'
def main():
parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
- parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA')
+ parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ')
parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
args = parser.parse_args()
@@ -26,10 +28,27 @@ def main():
col = arvados.collection.Collection(api_client=api)
- if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"):
- target = "sequence.fasta"
- elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"):
+ magic_file = Path(__file__).parent / "validation" / "formats.mgc"
+ val = magic.Magic(magic_file=magic_file.resolve().as_posix(),
+ uncompress=False, mime=True)
+ seq_type = val.from_file(args.sequence.name).lower()
+ print(f"Sequence type: {seq_type}")
+ if seq_type == "text/fasta":
+ # ensure that the file contains only one entry
+ entries = 0
+ for line in args.sequence:
+ if line.startswith(">"):
+ entries += 1
+ if entries > 1:
+ raise ValueError("FASTA file contains multiple entries")
+ break
+ args.sequence.close()
+ args.sequence = open(args.sequence.name, "r")
target = "reads.fastq"
+ elif seq_type == "text/fastq":
+ target = "sequence.fasta"
+ else:
+ raise ValueError("Sequence file does not look like FASTA or FASTQ")
with col.open(target, "w") as f:
r = args.sequence.read(65536)
@@ -37,6 +56,7 @@ def main():
while r:
f.write(r)
r = args.sequence.read(65536)
+ args.sequence.close()
print("Reading metadata")
with col.open("metadata.yaml", "w") as f:
@@ -45,6 +65,7 @@ def main():
while r:
f.write(r)
r = args.metadata.read(65536)
+ args.metadata.close()
external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
diff --git a/bh20sequploader/validation/Makefile b/bh20sequploader/validation/Makefile
new file mode 100644
index 0000000..1ca13fb
--- /dev/null
+++ b/bh20sequploader/validation/Makefile
@@ -0,0 +1,4 @@
+compile: formats.mgc
+
+formats.mgc :
+ file -C -m formats
diff --git a/bh20sequploader/validation/formats b/bh20sequploader/validation/formats
new file mode 100644
index 0000000..ac804cf
--- /dev/null
+++ b/bh20sequploader/validation/formats
@@ -0,0 +1,4 @@
+0 regex \^\>.+\r?\n([acgtnACGTN]+\r?\n)*[acgtnACGTN]+(\r?\n)?$ FASTA
+!:mime text/fasta
+0 regex \^@.+\r?\n[acgtnACGTN]*\n\\+.*\n[!-i]*(\r\n)? FASTQ
+!:mime text/fastq
\ No newline at end of file
diff --git a/bh20sequploader/validation/formats.mgc b/bh20sequploader/validation/formats.mgc
new file mode 100644
index 0000000..bff282a
--- /dev/null
+++ b/bh20sequploader/validation/formats.mgc
Binary files differ