From 88d81f853cf04b7f28681dd9cdee775b0422f252 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 12:53:19 -0400 Subject: Working on NCBI import Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/bh20seq-schema.yml | 4 ++-- bh20sequploader/main.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 75308ab..ebca35b 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -162,12 +162,12 @@ $graph: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) - type: float? + type: ["null", float, int] jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 sequencing_coverage2: doc: If a second sequence technology was used you can submit its coverage here - type: float? + type: ["null", float, int] jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 additional_technology_information: diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 49d012d..2fda347 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -44,7 +44,8 @@ def main(): with col.open(target, "w") as f: r = args.sequence.read(65536) - print(r[0:20]) + seqlabel = r[1:r.index("\n")] + print(seqlabel) while r: f.write(r) r = args.sequence.read(65536) @@ -67,8 +68,8 @@ def main(): "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname()) } - col.save_new(owner_uuid=UPLOAD_PROJECT, name="Uploaded by %s from %s" % - (properties['upload_user'], properties['upload_ip']), + col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" % + (seqlabel, properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) print("Done") -- cgit v1.2.3 From 7e085b2958d9bd4f0a2b1912cf259a05b56366bc Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 13:22:53 -0400 Subject: Tweak handling of "coverage" also fix typo Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/bh20seq-schema.yml | 4 ++-- bh20sequploader/bh20seq-shex.rdf | 2 +- scripts/dict_ontology_standardization/ncbi_speciesman_source.csv | 2 +- scripts/from_genbank_to_fasta_and_yaml.py | 9 ++++++--- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index ebca35b..75308ab 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -162,12 +162,12 @@ $graph: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) - type: ["null", float, int] + type: float? jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 sequencing_coverage2: doc: If a second sequence technology was used you can submit its coverage here - type: ["null", float, int] + type: float? jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 additional_technology_information: diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 59ee71b..31e714f 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -50,7 +50,7 @@ PREFIX wikidata: :technologyShape { obo:OBI_0600047 IRI {0,2} ; - obo:FLU_0000848 xsd:integer ?; + obo:FLU_0000848 xsd:double ?; efo:EFO_0002699 xsd:string ?; } diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv index 2905588..909cf37 100644 --- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv +++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv @@ -1,4 +1,4 @@ -nasopharyngeal swab, http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831 naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 1a12513..00c0012 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -130,9 +130,12 @@ if not os.path.exists(dir_fasta_and_yaml_today): if field_in_yaml == 'sequencing_coverage': # A regular expression would be better! - info_for_yaml_dict['technology'][field_in_yaml] = ';'.join( - [x.strip('(average)').strip("reads/nt").replace(',', '.').strip(' xX>') for x in tech_info_to_parse.split(';')] - ) + try: + info_for_yaml_dict['technology'][field_in_yaml] = float( + tech_info_to_parse.strip('(average)').strip("reads/nt").replace(',', '.').strip(' xX>')) + except ValueError: + print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse) + pass elif field_in_yaml == 'sample_sequencing_technology': new_seq_tec_list = [] for seq_tec in tech_info_to_parse.split(';'): -- cgit v1.2.3 From cad23032ecf6ef325aab2978d5df36609ad50088 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 18:16:47 +0000 Subject: add noLinkCheck to specimen_source2 --- bh20sequploader/bh20seq-schema.yml | 1 + 1 file changed, 1 insertion(+) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 75308ab..1ceebe2 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -106,6 +106,7 @@ $graph: jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001479 _type: "@id" + noLinkCheck: true sample_storage_conditions: doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh .... type: string? -- cgit v1.2.3 From f4c3da88c1233802fea46cc972a81dc3b5b51185 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 15:37:58 -0400 Subject: Work around CWL content size limit by chunking Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/main.py | 1 + workflows/pangenome-generate/relabel-seqs.cwl | 31 +++++++++++++++++++++++---- workflows/pangenome-generate/relabel-seqs.py | 22 +++++++++++++------ 3 files changed, 44 insertions(+), 10 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 2fda347..4c4711d 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -63,6 +63,7 @@ def main(): external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8') properties = { + "sequence_label": seqlabel, "upload_app": "bh20-seq-uploader", "upload_ip": external_ip, "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname()) diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl index 2b780d4..01196f6 100644 --- a/workflows/pangenome-generate/relabel-seqs.cwl +++ b/workflows/pangenome-generate/relabel-seqs.cwl @@ -3,6 +3,10 @@ class: CommandLineTool inputs: readsFA: File[] subjects: string[] + script: + type: File + default: {class: File, location: relabel-seqs.py} + inputBinding: {} outputs: relabeledSeqs: type: File @@ -15,11 +19,30 @@ outputs: requirements: InlineJavascriptRequirement: {} InitialWorkDirRequirement: - listing: - - entry: {$include: relabel-seqs.py} - entryname: relabel-seqs.py + listing: | + ${ + var i = 0; + var b = 1; + var out = []; + for (; i < inputs.readsFA.length; i++) { + var block = []; + for (; i < (b*100) && i < inputs.readsFA.length; i++) { + block.push(inputs.readsFA[i]); + } + out.push({ + entryname: "block"+b, + entry: JSON.stringify(block) + }); + b++; + } + out.push({ + entry: JSON.stringify(inputs.subjects), + entryname: "subjects" + }); + return out; + } hints: DockerRequirement: dockerPull: commonworkflowlanguage/cwltool_module stdout: -baseCommand: [python, relabel-seqs.py] +baseCommand: [python] diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py index 1188ceb..970540f 100644 --- a/workflows/pangenome-generate/relabel-seqs.py +++ b/workflows/pangenome-generate/relabel-seqs.py @@ -1,5 +1,15 @@ -reads = $(inputs.readsFA) -subjects = $(inputs.subjects) +import os +import json + +reads = [] +b = 1 +while os.path.exists("block%i" % b): + with open("block%i" % b) as f: + reads.extend(json.load(f)) + b += 1 + +with open("subjects") as f: + subjects = json.load(f) relabeled_fasta = open("relabeledSeqs.fasta", "wt") original_labels = open("originalLabels.ttl", "wt") @@ -7,12 +17,12 @@ original_labels = open("originalLabels.ttl", "wt") for i, r in enumerate(reads): with open(r["path"], "rt") as fa: label = fa.readline() - original_labels.write("<%s> \\"%s\\" .\\n" % (subjects[i], label[1:].strip().replace('"', '\\\\"'))) - relabeled_fasta.write(">"+subjects[i]+"\\n") + original_labels.write("<%s> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"'))) + relabeled_fasta.write(">"+subjects[i]+"\n") data = fa.read(8096) while data: relabeled_fasta.write(data) - endswithnewline = data.endswith("\\n") + endswithnewline = data.endswith("\n") data = fa.read(8096) if not endswithnewline: - relabeled_fasta.write("\\n") + relabeled_fasta.write("\n") -- cgit v1.2.3