about summary refs log tree commit diff
path: root/bh20sequploader/qc_fasta.py
diff options
context:
space:
mode:
author Pjotr Prins 2020-04-12 12:49:06 -0500
committer GitHub 2020-04-12 12:49:06 -0500
commit 23722a63682fdffe51efca55b40573fa27370973 (patch)
tree 7628ccda2671fe9a3e9ff204918c965d4bb9e257 /bh20sequploader/qc_fasta.py
parent 10ccb97cab69cb704c154387d544a74cd38d3cdf (diff)
parent 92e1608b2d8b21f2001d7bf480301d314337fdc0 (diff)
download bh20-seq-resource-23722a63682fdffe51efca55b40573fa27370973.tar.gz
bh20-seq-resource-23722a63682fdffe51efca55b40573fa27370973.tar.lz
bh20-seq-resource-23722a63682fdffe51efca55b40573fa27370973.zip
Merge branch 'master' into patch-1
Diffstat (limited to 'bh20sequploader/qc_fasta.py')
-rw-r--r-- bh20sequploader/qc_fasta.py 28
1 files changed, 28 insertions, 0 deletions
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
new file mode 100644
index 0000000..e47d66b
--- /dev/null
+++ b/bh20sequploader/qc_fasta.py
@@ -0,0 +1,28 @@
+import pkg_resources
+import tempfile
+import magic
+
+def qc_fasta(sequence):
+    """Classify an uploaded sequence stream and return its canonical file name.
+
+    The first 4096 units read from ``sequence`` are matched against the
+    packaged ``validation/formats`` magic rules. A FASTA stream is then
+    scanned in full to make sure it holds at most one ``>`` header line.
+
+    Args:
+        sequence: A seekable file-like object. It is read, rewound with
+            ``seek(0)`` and iterated line by line; the header check uses a
+            ``str`` prefix, so iteration must yield ``str`` lines.
+
+    Returns:
+        ``"sequence.fasta"`` for a FASTA stream, ``"reads.fastq"`` for a
+        FASTQ stream. In both cases ``sequence`` is left rewound to offset 0.
+
+    Raises:
+        ValueError: If a FASTA stream has more than one header line, or if
+            the detected type is neither ``text/fasta`` nor ``text/fastq``.
+    """
+    # Read the packaged rules as bytes in one call so no resource stream is
+    # left open behind us.
+    schema = pkg_resources.resource_string(__name__, "validation/formats")
+    # magic.Magic takes the rules as a file path, so spill them to a
+    # temporary file that exists only while the detector is built.
+    with tempfile.NamedTemporaryFile() as tmp:
+        tmp.write(schema)
+        tmp.flush()  # make sure the bytes are on disk before the path is opened
+        val = magic.Magic(magic_file=tmp.name,
+                          uncompress=False, mime=True)
+    # NOTE(review): val is used after the temporary file has been removed;
+    # this presumably relies on Magic loading the rules at construction -
+    # confirm against the python-magic version in use.
+    seq_type = val.from_buffer(sequence.read(4096)).lower()
+    sequence.seek(0)
+    if seq_type == "text/fasta":
+        # Ensure the file contains only one entry.
+        entries = 0
+        for line in sequence:
+            if line.startswith(">"):
+                entries += 1
+                if entries > 1:
+                    raise ValueError("FASTA file contains multiple entries")
+        sequence.seek(0)
+        return "sequence.fasta"
+    elif seq_type == "text/fastq":
+        return "reads.fastq"
+    else:
+        raise ValueError("Sequence file does not look like FASTA or FASTQ")