From 7daa9ff2cdba742a811db00c924ccde25fa2c9b6 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Mon, 22 Jun 2020 15:06:10 +0000 Subject: Handle upload & assembly of gzipped, paired-end fastq --- bh20sequploader/main.py | 58 +++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 21 deletions(-) (limited to 'bh20sequploader/main.py') diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index a2e62fa..c442af0 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -22,18 +22,10 @@ ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa' -def main(): - parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis') - parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ') - parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json') - parser.add_argument("--validate", action="store_true", help="Dry run, validate only") - args = parser.parse_args() - - api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - +def qa_stuff(metadata, sequence_p1, sequence_p2): try: log.debug("Checking metadata") - if not qc_metadata(args.metadata.name): + if not qc_metadata(metadata.name): log.warning("Failed metadata qc") exit(1) except ValueError as e: @@ -42,29 +34,52 @@ def main(): print(e) exit(1) + target = [] try: - log.debug("Checking FASTA QC") - target = qc_fasta(args.sequence) + log.debug("Checking FASTA/FASTQ QC") + target.append(qc_fasta(sequence_p1)) + if sequence_p2: + target.append(qc_fasta(sequence_p2)) + target[0] = ("reads_1."+target[0][0][6:], target[0][1]) + target[1] = ("reads_2."+target[1][0][6:], target[0][1]) except ValueError as e: log.debug(e) log.debug("Failed FASTA qc") print(e) exit(1) + return target + +def upload_sequence(col, target, sequence): + with col.open(target[0], "wb") as f: + r = sequence.read(65536) + while r: + f.write(r) + r = sequence.read(65536) + + +def main(): + parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis') + parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json') + parser.add_argument('sequence_p1', type=argparse.FileType('rb'), help='sequence FASTA/FASTQ') + parser.add_argument('sequence_p2', type=argparse.FileType('rb'), default=None, help='sequence FASTQ pair') + parser.add_argument("--validate", action="store_true", help="Dry run, validate only") + args = parser.parse_args() + + api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) + + target = qa_stuff(args.metadata, args.sequence_p1, args.sequence_p2) + seqlabel = target[0][1] + if args.validate: print("Valid") exit(0) col = arvados.collection.Collection(api_client=api) - with col.open(target, "w") as f: - r = args.sequence.read(65536) - seqlabel = r[1:r.index("\n")] - print(seqlabel) - while r: - f.write(r) - r = args.sequence.read(65536) - args.sequence.close() + upload_sequence(col, target[0], args.sequence_p1) + if args.sequence_p2: + upload_sequence(col, target[1], args.sequence_p2) print("Reading metadata") with col.open("metadata.yaml", "w") as f: @@ -73,7 +88,6 @@ def main(): while r: f.write(r) r = args.metadata.read(65536) - args.metadata.close() external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8') @@ -93,6 +107,8 @@ def main(): (seqlabel, properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) + print("Saved to %s" % col.manifest_locator()) + print("Done") if __name__ == "__main__": -- cgit v1.2.3