aboutsummaryrefslogtreecommitdiff
path: root/bh20sequploader
diff options
context:
space:
mode:
Diffstat (limited to 'bh20sequploader')
-rw-r--r--bh20sequploader/main.py58
-rw-r--r--bh20sequploader/qc_fasta.py35
2 files changed, 64 insertions, 29 deletions
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index a2e62fa..c442af0 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -22,18 +22,10 @@ ARVADOS_API_HOST='lugli.arvadosapi.com'
ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462'
UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa'
-def main():
- parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
- parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ')
- parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
- parser.add_argument("--validate", action="store_true", help="Dry run, validate only")
- args = parser.parse_args()
-
- api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
-
+def qa_stuff(metadata, sequence_p1, sequence_p2):
try:
log.debug("Checking metadata")
- if not qc_metadata(args.metadata.name):
+ if not qc_metadata(metadata.name):
log.warning("Failed metadata qc")
exit(1)
except ValueError as e:
@@ -42,29 +34,52 @@ def main():
print(e)
exit(1)
+ target = []
try:
- log.debug("Checking FASTA QC")
- target = qc_fasta(args.sequence)
+ log.debug("Checking FASTA/FASTQ QC")
+ target.append(qc_fasta(sequence_p1))
+ if sequence_p2:
+ target.append(qc_fasta(sequence_p2))
+ target[0] = ("reads_1."+target[0][0][6:], target[0][1])
+ target[1] = ("reads_2."+target[1][0][6:], target[0][1])
except ValueError as e:
log.debug(e)
log.debug("Failed FASTA qc")
print(e)
exit(1)
+ return target
+
+def upload_sequence(col, target, sequence):
+ with col.open(target[0], "wb") as f:
+ r = sequence.read(65536)
+ while r:
+ f.write(r)
+ r = sequence.read(65536)
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
+ parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
+ parser.add_argument('sequence_p1', type=argparse.FileType('rb'), help='sequence FASTA/FASTQ')
+ parser.add_argument('sequence_p2', type=argparse.FileType('rb'), default=None, help='sequence FASTQ pair')
+ parser.add_argument("--validate", action="store_true", help="Dry run, validate only")
+ args = parser.parse_args()
+
+ api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
+
+ target = qa_stuff(args.metadata, args.sequence_p1, args.sequence_p2)
+ seqlabel = target[0][1]
+
if args.validate:
print("Valid")
exit(0)
col = arvados.collection.Collection(api_client=api)
- with col.open(target, "w") as f:
- r = args.sequence.read(65536)
- seqlabel = r[1:r.index("\n")]
- print(seqlabel)
- while r:
- f.write(r)
- r = args.sequence.read(65536)
- args.sequence.close()
+ upload_sequence(col, target[0], args.sequence_p1)
+ if args.sequence_p2:
+ upload_sequence(col, target[1], args.sequence_p2)
print("Reading metadata")
with col.open("metadata.yaml", "w") as f:
@@ -73,7 +88,6 @@ def main():
while r:
f.write(r)
r = args.metadata.read(65536)
- args.metadata.close()
external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
@@ -93,6 +107,8 @@ def main():
(seqlabel, properties['upload_user'], properties['upload_ip']),
properties=properties, ensure_unique_name=True)
+ print("Saved to %s" % col.manifest_locator())
+
print("Done")
if __name__ == "__main__":
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 5c8cf3a..e198430 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -5,6 +5,8 @@ import subprocess
import tempfile
import logging
import re
+import io
+import gzip
log = logging.getLogger(__name__ )
@@ -23,7 +25,7 @@ def read_fasta(sequence):
raise ValueError("FASTA file contains multiple entries")
return label, bases
-def qc_fasta(sequence):
+def qc_fasta(arg_sequence):
log.debug("Starting qc_fasta")
schema_resource = pkg_resources.resource_stream(__name__, "validation/formats")
with tempfile.NamedTemporaryFile() as tmp:
@@ -31,12 +33,24 @@ def qc_fasta(sequence):
tmp.flush()
val = magic.Magic(magic_file=tmp.name,
uncompress=False, mime=True)
- seq_type = val.from_buffer(sequence.read(4096)).lower()
+
+ gz = ""
+ if arg_sequence.name.endswith(".gz"):
+ sequence = gzip.GzipFile(fileobj=arg_sequence, mode='rb')
+ gz = ".gz"
+ else:
+ sequence = arg_sequence
+
+ sequence = io.TextIOWrapper(sequence)
+ r = sequence.read(4096)
sequence.seek(0)
+
+ seqlabel = r[1:r.index("\n")]
+ seq_type = val.from_buffer(r).lower()
+
if seq_type == "text/fasta":
# ensure that contains only one entry
submitlabel, submitseq = read_fasta(sequence)
- sequence.seek(0)
with tempfile.NamedTemporaryFile() as tmp1:
refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta")
@@ -44,6 +58,9 @@ def qc_fasta(sequence):
tmp1.write(submitlabel.encode("utf8"))
tmp1.write(("".join(submitseq)).encode("utf8"))
tmp1.flush()
+ subbp = 0
+ refbp = 0
+ similarity = 0
try:
cmd = ["clustalw", "-infile="+tmp1.name,
"-quicktree", "-iteration=none", "-type=DNA"]
@@ -64,15 +81,17 @@ def qc_fasta(sequence):
except Exception as e:
logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e)
- if (subbp/refbp) < .7:
+ if refbp and (subbp/refbp) < .7:
raise ValueError("QC fail: submit sequence length is shorter than 70% reference")
- if (subbp/refbp) > 1.3:
+ if refbp and (subbp/refbp) > 1.3:
raise ValueError("QC fail: submit sequence length is greater than 130% reference")
- if similarity < 70.0:
+ if similarity and similarity < 70.0:
raise ValueError("QC fail: submit similarity is less than 70%")
+ if refbp == 0 or similarity == 0:
+ raise ValueError("QC fail")
- return "sequence.fasta"
+ return ("sequence.fasta"+gz, seqlabel)
elif seq_type == "text/fastq":
- return "reads.fastq"
+ return ("reads.fastq"+gz, seqlabel)
else:
raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ")