From 535b8017ddd27a9db683f6d29368258b5c48cf5a Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Fri, 3 Jul 2020 20:45:17 +0000
Subject: Improving genbank import workflow

---
Review notes (text between the three-dash line and the first diff is not
part of the commit message):

 * scripts/split_into_arrays.cwl: the sort comparator now returns -1/0/1.
   The earlier callback returned the boolean `a.basename < b.basename`,
   which never produces a negative value, so the sorted order was
   implementation-dependent. The upload step pairs fasta[i] with
   metadata[i] (dotproduct scatter), and only the array lengths are
   checked, so a deterministic name order is required.
   The `index` blob id recorded for that file predates this one-line fix.

 * NOTE(review): this copy of the patch had its line breaks and
   indentation collapsed. Hunk line counts were re-checked against the
   `@@` headers, but the leading indentation of Python/Dockerfile context
   lines and the exact trailing whitespace in the whitespace-only hunks
   are reconstructed -- confirm against the repository before applying
   (or apply with --ignore-whitespace).

 scripts/docker/Dockerfile                          |  2 +-
 .../from_genbank_to_fasta_and_yaml.py              | 10 +++--
 scripts/fetch_from_genbank.cwl                     | 49 ++++++++++++++++++++++
 scripts/import.cwl                                 |  2 +-
 scripts/import_from_genbank.cwl                    | 27 ++++++++++++
 scripts/import_to_arvados.py                       |  2 +-
 scripts/split_into_arrays.cwl                      | 30 +++++++++++++
 7 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 scripts/fetch_from_genbank.cwl
 create mode 100644 scripts/import_from_genbank.cwl
 create mode 100644 scripts/split_into_arrays.cwl

diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile
index 9fb33d5..8811927 100644
--- a/scripts/docker/Dockerfile
+++ b/scripts/docker/Dockerfile
@@ -7,4 +7,4 @@ RUN apt-get update && \
     libssl-dev libmagic-dev python3-magic && \
     apt-get clean
 
-RUN pip3 install bh20-seq-uploader
+RUN pip3 install bh20-seq-uploader py-dateutil
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index d76f56b..4bb4964 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -4,6 +4,8 @@ import argparse
 parser = argparse.ArgumentParser()
 parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
 parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False)
+parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
+                    default='../dict_ontology_standardization/',required=False)
 args = parser.parse_args()
 
 from Bio import Entrez
@@ -22,7 +24,7 @@ num_ids_for_request = 100
 
 dir_metadata = 'metadata_from_nuccore'
 dir_fasta_and_yaml = 'fasta_and_yaml'
-dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+dir_dict_ontology_standardization = args.dict_ontology
 
 today_date = date.today().strftime("%Y.%m.%d")
 path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date)
@@ -126,7 +128,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
             if term in term_to_uri_dict:
                 print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
                 continue
-                
+
             term_to_uri_dict[term] = uri
 
 if not os.path.exists(dir_fasta_and_yaml):
@@ -274,7 +276,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                                 if host_sex in ['male', 'female']:
                                     info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
                                 elif GBQualifier_value_text_list[1] in term_to_uri_dict:
-                                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]] 
+                                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]
                                 else:
                                     missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]]))
 
@@ -391,5 +393,5 @@ if len(accession_with_errors_list) > 0:
     print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
     with open(path_accession_with_errors_tsv, 'w') as fw:
         fw.write('\n'.join(accession_with_errors_list))
-        
+
 print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))
diff --git a/scripts/fetch_from_genbank.cwl b/scripts/fetch_from_genbank.cwl
new file mode 100644
index 0000000..45c8eec
--- /dev/null
+++ b/scripts/fetch_from_genbank.cwl
@@ -0,0 +1,49 @@
+cwlVersion: v1.1
+class: CommandLineTool
+inputs:
+  importScript:
+    type: File
+    default:
+      class: File
+      location: download_genbank_data/from_genbank_to_fasta_and_yaml.py
+    inputBinding: {position: 1}
+  dict:
+    type: Directory
+    inputBinding:
+      prefix: --dict-ontology
+      position: 2
+    default:
+      class: Directory
+      location: dict_ontology_standardization
+  existing_metadata_from_nuccore:
+    type: Directory?
+    inputBinding:
+      valueFrom: "--skip-request"
+      position: 3
+outputs:
+  fasta_and_yaml:
+    type: Directory
+    outputBinding:
+      glob: fasta_and_yaml
+  metadata_from_nuccore:
+    type: Directory
+    outputBinding:
+      glob: metadata_from_nuccore
+  accessions:
+    type: File?
+    outputBinding:
+      glob: "*.acc"
+  missing_terms:
+    type: File
+    outputBinding:
+      glob: missing_terms.tsv
+requirements:
+  InitialWorkDirRequirement:
+    listing:
+      - entry: $(inputs.existing_metadata_from_nuccore)
+        entryname: metadata_from_nuccore
+  DockerRequirement:
+    dockerPull: bh20-seq-uploader/import
+  NetworkAccess:
+    networkAccess: true
+baseCommand: python3
diff --git a/scripts/import.cwl b/scripts/import.cwl
index d84516b..4b4b8ca 100644
--- a/scripts/import.cwl
+++ b/scripts/import.cwl
@@ -12,7 +12,7 @@ inputs:
     type: File
     default:
       class: File
-      location: from_genbank_to_fasta_and_yaml.py
+      location: download_genbank_data/from_genbank_to_fasta_and_yaml.py
     inputBinding: {position: 2}
   dict:
     type: Directory
diff --git a/scripts/import_from_genbank.cwl b/scripts/import_from_genbank.cwl
new file mode 100644
index 0000000..dcf9acb
--- /dev/null
+++ b/scripts/import_from_genbank.cwl
@@ -0,0 +1,27 @@
+cwlVersion: v1.1
+class: Workflow
+inputs:
+  existing_metadata_from_nuccore:
+    type: Directory?
+outputs: []
+requirements:
+  ScatterFeatureRequirement: {}
+steps:
+  fetch_from_genbank:
+    in:
+      existing_metadata_from_nuccore: existing_metadata_from_nuccore
+    out: [fasta_and_yaml, metadata_from_nuccore, accessions]
+    run: fetch_from_genbank.cwl
+  split_into_arrays:
+    in:
+      dir: fetch_from_genbank/fasta_and_yaml
+    out: [fasta, metadata]
+    run: split_into_arrays.cwl
+  upload:
+    in:
+      fasta: split_into_arrays/fasta
+      metadata: split_into_arrays/metadata
+    out: []
+    scatter: [fasta, metadata]
+    scatterMethod: dotproduct
+    run: upload.cwl
diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py
index 78cd13d..aca72d6 100644
--- a/scripts/import_to_arvados.py
+++ b/scripts/import_to_arvados.py
@@ -11,4 +11,4 @@ os.chdir("fasta_and_yaml")
 fasta_files = glob.glob("*.fasta")
 
 for f in fasta_files:
-    subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]])
+    subprocess.run(["bh20-seq-uploader", "%s.yaml" %f[:-6], f])
diff --git a/scripts/split_into_arrays.cwl b/scripts/split_into_arrays.cwl
new file mode 100644
index 0000000..102fe7d
--- /dev/null
+++ b/scripts/split_into_arrays.cwl
@@ -0,0 +1,30 @@
+cwlVersion: v1.1
+class: ExpressionTool
+requirements:
+  InlineJavascriptRequirement: {}
+inputs:
+  dir:
+    type: Directory
+    loadListing: shallow_listing
+outputs:
+  fasta: File[]
+  metadata: File[]
+expression: |
+  ${
+    var dir = inputs.dir;
+    var fasta = [];
+    var metadata = [];
+    dir.listing.sort(function(a, b) { return a.basename < b.basename ? -1 : (a.basename > b.basename ? 1 : 0); });
+    for (var i = 0; i < dir.listing.length; i++) {
+      if (dir.listing[i].basename.substr(-6) == ".fasta") {
+        fasta.push(dir.listing[i]);
+      }
+      if (dir.listing[i].basename.substr(-5) == ".yaml") {
+        metadata.push(dir.listing[i]);
+      }
+    }
+    if (fasta.length != metadata.length) {
+      throw "They dont match";
+    }
+    return {"fasta": fasta, "metadata": metadata};
+  }
-- 
cgit v1.2.3