about | summary | refs | log | tree | commit | diff
path: root/bh20sequploader/main.py
diff options
context:
space:
mode:
author: Peter Amstutz, 2020-04-10 15:52:37 -0400
committer: Peter Amstutz, 2020-04-10 15:52:37 -0400
commit: 925058d0b3db70803d322cc2a33801240899a20a (patch)
tree: 41e4552336d7d3bf17be8b1df6cbc280ef86eb6a /bh20sequploader/main.py
parent: a6ba9a5203a568611a94c043fd13e2ec50f071da (diff)
download: bh20-seq-resource-925058d0b3db70803d322cc2a33801240899a20a.tar.gz
bh20-seq-resource-925058d0b3db70803d322cc2a33801240899a20a.tar.lz
bh20-seq-resource-925058d0b3db70803d322cc2a33801240899a20a.zip
Fix up fasta/fastq validation
Diffstat (limited to 'bh20sequploader/main.py')
-rw-r--r-- bh20sequploader/main.py 29
1 files changed, 5 insertions, 24 deletions
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 2032508..4a225f6 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -8,7 +8,8 @@ from pathlib import Path
import urllib.request
import socket
import getpass
-from qc_metadata import qc_metadata
+from .qc_metadata import qc_metadata
+from .qc_fasta import qc_fasta
ARVADOS_API_HOST='lugli.arvadosapi.com'
ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462'
@@ -22,34 +23,14 @@ def main():
api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
- if not bh20sequploader.qc_metadata.qc_metadata(args.metadata.name):
+ target = qc_fasta(args.sequence)
+
+ if not qc_metadata(args.metadata.name):
print("Failed metadata qc")
exit(1)
col = arvados.collection.Collection(api_client=api)
- magic_file = Path(__file__).parent / "validation" / "formats.mgc"
- val = magic.Magic(magic_file=magic_file.resolve().as_posix(),
- uncompress=False, mime=True)
- seq_type = val.from_file(args.sequence.name).lower()
- print(f"Sequence type: {seq_type}")
- if seq_type == "text/fasta":
- # ensure that contains only one entry
- entries = 0
- for line in args.sequence:
- if line.startswith(">"):
- entries += 1
- if entries > 1:
- raise ValueError("FASTA file contains multiple entries")
- break
- args.sequence.close()
- args.sequence = open(args.sequence.name, "r")
- target = "reads.fastq"
- elif seq_type == "text/fastq":
- target = "sequence.fasta"
- else:
- raise ValueError("Sequence file does not look like FASTA or FASTQ")
-
with col.open(target, "w") as f:
r = args.sequence.read(65536)
print(r[0:20])