about summary refs log tree commit diff
path: root/bh20sequploader
diff options
context:
space:
mode:
Diffstat (limited to 'bh20sequploader')
-rw-r--r--bh20sequploader/main.py58
-rw-r--r--bh20sequploader/qc_fasta.py35
2 files changed, 64 insertions, 29 deletions
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index a2e62fa..c442af0 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -22,18 +22,10 @@ ARVADOS_API_HOST='lugli.arvadosapi.com'
 ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462'
 UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa'
 
-def main():
-    parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
-    parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ')
-    parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
-    parser.add_argument("--validate", action="store_true", help="Dry run, validate only")
-    args = parser.parse_args()
-
-    api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
-
+def qa_stuff(metadata, sequence_p1, sequence_p2):
     try:
         log.debug("Checking metadata")
-        if not qc_metadata(args.metadata.name):
+        if not qc_metadata(metadata.name):
             log.warning("Failed metadata qc")
             exit(1)
     except ValueError as e:
@@ -42,29 +34,52 @@ def main():
         print(e)
         exit(1)
 
+    target = []
     try:
-        log.debug("Checking FASTA QC")
-        target = qc_fasta(args.sequence)
+        log.debug("Checking FASTA/FASTQ QC")
+        target.append(qc_fasta(sequence_p1))
+        if sequence_p2:
+            target.append(qc_fasta(sequence_p2))
+            target[0] = ("reads_1."+target[0][0][6:], target[0][1])
+            target[1] = ("reads_2."+target[1][0][6:], target[0][1])
     except ValueError as e:
         log.debug(e)
         log.debug("Failed FASTA qc")
         print(e)
         exit(1)
 
+    return target
+
+def upload_sequence(col, target, sequence):
+    with col.open(target[0], "wb") as f:
+        r = sequence.read(65536)
+        while r:
+            f.write(r)
+            r = sequence.read(65536)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
+    parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
+    parser.add_argument('sequence_p1', type=argparse.FileType('rb'), help='sequence FASTA/FASTQ')
+    parser.add_argument('sequence_p2', type=argparse.FileType('rb'), default=None, help='sequence FASTQ pair')
+    parser.add_argument("--validate", action="store_true", help="Dry run, validate only")
+    args = parser.parse_args()
+
+    api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
+
+    target = qa_stuff(args.metadata, args.sequence_p1, args.sequence_p2)
+    seqlabel = target[0][1]
+
     if args.validate:
         print("Valid")
         exit(0)
 
     col = arvados.collection.Collection(api_client=api)
 
-    with col.open(target, "w") as f:
-        r = args.sequence.read(65536)
-        seqlabel = r[1:r.index("\n")]
-        print(seqlabel)
-        while r:
-            f.write(r)
-            r = args.sequence.read(65536)
-    args.sequence.close()
+    upload_sequence(col, target[0], args.sequence_p1)
+    if args.sequence_p2:
+        upload_sequence(col, target[1], args.sequence_p2)
 
     print("Reading metadata")
     with col.open("metadata.yaml", "w") as f:
@@ -73,7 +88,6 @@ def main():
         while r:
             f.write(r)
             r = args.metadata.read(65536)
-    args.metadata.close()
 
     external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
 
@@ -93,6 +107,8 @@ def main():
                  (seqlabel, properties['upload_user'], properties['upload_ip']),
                  properties=properties, ensure_unique_name=True)
 
+    print("Saved to %s" % col.manifest_locator())
+
     print("Done")
 
 if __name__ == "__main__":
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 5c8cf3a..e198430 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -5,6 +5,8 @@ import subprocess
 import tempfile
 import logging
 import re
+import io
+import gzip
 
 log = logging.getLogger(__name__ )
 
@@ -23,7 +25,7 @@ def read_fasta(sequence):
             raise ValueError("FASTA file contains multiple entries")
     return label, bases
 
-def qc_fasta(sequence):
+def qc_fasta(arg_sequence):
     log.debug("Starting qc_fasta")
     schema_resource = pkg_resources.resource_stream(__name__, "validation/formats")
     with tempfile.NamedTemporaryFile() as tmp:
@@ -31,12 +33,24 @@ def qc_fasta(sequence):
         tmp.flush()
         val = magic.Magic(magic_file=tmp.name,
                           uncompress=False, mime=True)
-    seq_type = val.from_buffer(sequence.read(4096)).lower()
+
+    gz = ""
+    if arg_sequence.name.endswith(".gz"):
+        sequence = gzip.GzipFile(fileobj=arg_sequence, mode='rb')
+        gz = ".gz"
+    else:
+        sequence = arg_sequence
+
+    sequence = io.TextIOWrapper(sequence)
+    r = sequence.read(4096)
     sequence.seek(0)
+
+    seqlabel = r[1:r.index("\n")]
+    seq_type = val.from_buffer(r).lower()
+
     if seq_type == "text/fasta":
         # ensure that contains only one entry
         submitlabel, submitseq = read_fasta(sequence)
-        sequence.seek(0)
 
         with tempfile.NamedTemporaryFile() as tmp1:
             refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta")
@@ -44,6 +58,9 @@ def qc_fasta(sequence):
             tmp1.write(submitlabel.encode("utf8"))
             tmp1.write(("".join(submitseq)).encode("utf8"))
             tmp1.flush()
+            subbp = 0
+            refbp = 0
+            similarity = 0
             try:
                 cmd = ["clustalw", "-infile="+tmp1.name,
                        "-quicktree", "-iteration=none", "-type=DNA"]
@@ -64,15 +81,17 @@ def qc_fasta(sequence):
             except Exception as e:
                 logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e)
 
-            if (subbp/refbp) < .7:
+            if refbp and (subbp/refbp) < .7:
                 raise ValueError("QC fail: submit sequence length is shorter than 70% reference")
-            if (subbp/refbp) > 1.3:
+            if refbp and (subbp/refbp) > 1.3:
                 raise ValueError("QC fail: submit sequence length is greater than 130% reference")
-            if similarity < 70.0:
+            if similarity and similarity < 70.0:
                 raise ValueError("QC fail: submit similarity is less than 70%")
+            if refbp == 0 or similarity == 0:
+                raise ValueError("QC fail")
 
-        return "sequence.fasta"
+        return ("sequence.fasta"+gz, seqlabel)
     elif seq_type == "text/fastq":
-        return "reads.fastq"
+        return ("reads.fastq"+gz, seqlabel)
     else:
         raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ")