Improving genbank import workflow

author: Peter Amstutz 2020-07-03 20:45:17 +0000
committer: Peter Amstutz 2020-07-03 20:47:14 +0000
commit: 535b8017ddd27a9db683f6d29368258b5c48cf5a (patch)
tree: 521a3e446f0eb9cbdc6e8736a13706c8141f8351
parent: 841af02eaa45c1b7395d5f4e4711de3c3661c146 (diff)
download: bh20-seq-resource-535b8017ddd27a9db683f6d29368258b5c48cf5a.tar.gz
bh20-seq-resource-535b8017ddd27a9db683f6d29368258b5c48cf5a.tar.lz
bh20-seq-resource-535b8017ddd27a9db683f6d29368258b5c48cf5a.zip
7 files changed, 115 insertions, 7 deletions
diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile
index 9fb33d5..8811927 100644
--- a/scripts/docker/Dockerfile
+++ b/scripts/docker/Dockerfile
@@ -7,4 +7,4 @@ RUN apt-get update && \
     libssl-dev libmagic-dev python3-magic && \
     apt-get clean
 
-RUN pip3 install bh20-seq-uploader
+RUN pip3 install bh20-seq-uploader py-dateutil
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index d76f56b..4bb4964 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -4,6 +4,8 @@ import argparse
 parser = argparse.ArgumentParser()
 parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
 parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False)
+parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
+                    default='../dict_ontology_standardization/',required=False)
 args = parser.parse_args()
 
 from Bio import Entrez
@@ -22,7 +24,7 @@ num_ids_for_request = 100
 
 dir_metadata = 'metadata_from_nuccore'
 dir_fasta_and_yaml = 'fasta_and_yaml'
-dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+dir_dict_ontology_standardization = args.dict_ontology
 
 today_date = date.today().strftime("%Y.%m.%d")
 path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date)
@@ -126,7 +128,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
             if term in term_to_uri_dict:
                 print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
                 continue
-                
+
             term_to_uri_dict[term] = uri
 
 if not os.path.exists(dir_fasta_and_yaml):
@@ -274,7 +276,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                             if host_sex in ['male', 'female']:
                                 info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
                             elif GBQualifier_value_text_list[1] in term_to_uri_dict:
-                                info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]                            
+                                info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]
                             else:
                                 missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]]))
 
@@ -391,5 +393,5 @@ if len(accession_with_errors_list) > 0:
     print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
     with open(path_accession_with_errors_tsv, 'w') as fw:
         fw.write('\n'.join(accession_with_errors_list))
-        
+
 print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))
diff --git a/scripts/fetch_from_genbank.cwl b/scripts/fetch_from_genbank.cwl
new file mode 100644
index 0000000..45c8eec
--- /dev/null
+++ b/scripts/fetch_from_genbank.cwl
@@ -0,0 +1,49 @@
+cwlVersion: v1.1
+class: CommandLineTool  # runs the GenBank import script with python3 (see baseCommand at the bottom)
+inputs:
+  importScript:  # the script file, passed as the first positional argument
+    type: File
+    default:
+      class: File
+      location: download_genbank_data/from_genbank_to_fasta_and_yaml.py
+    inputBinding: {position: 1}
+  dict:  # ontology dictionary directory, handed to the script through --dict-ontology
+    type: Directory
+    inputBinding:
+      prefix: --dict-ontology
+      position: 2
+    default:
+      class: Directory
+      location: dict_ontology_standardization
+  existing_metadata_from_nuccore:  # optional directory; staged as metadata_from_nuccore by InitialWorkDirRequirement below
+    type: Directory?
+    inputBinding:
+      valueFrom: "--skip-request"  # when the directory is supplied, this flag is emitted instead of its path
+      position: 3
+outputs:  # each output is collected with the glob pattern given in its outputBinding
+  fasta_and_yaml:
+    type: Directory
+    outputBinding:
+      glob: fasta_and_yaml
+  metadata_from_nuccore:
+    type: Directory
+    outputBinding:
+      glob: metadata_from_nuccore
+  accessions:
+    type: File?  # optional output
+    outputBinding:
+      glob: "*.acc"
+  missing_terms:
+    type: File  # NOTE(review): required output - confirm the script always writes missing_terms.tsv
+    outputBinding:
+      glob: missing_terms.tsv
+requirements:
+  InitialWorkDirRequirement:
+    listing:
+      - entry: $(inputs.existing_metadata_from_nuccore)
+        entryname: metadata_from_nuccore  # name under which the supplied directory is staged
+  DockerRequirement:
+    dockerPull: bh20-seq-uploader/import
+  NetworkAccess:
+    networkAccess: true  # NOTE(review): presumably needed for the script's Entrez requests - confirm
+baseCommand: python3
diff --git a/scripts/import.cwl b/scripts/import.cwl
index d84516b..4b4b8ca 100644
--- a/scripts/import.cwl
+++ b/scripts/import.cwl
@@ -12,7 +12,7 @@ inputs:
     type: File
     default:
       class: File
-      location: from_genbank_to_fasta_and_yaml.py
+      location: download_genbank_data/from_genbank_to_fasta_and_yaml.py
     inputBinding: {position: 2}
   dict:
     type: Directory
diff --git a/scripts/import_from_genbank.cwl b/scripts/import_from_genbank.cwl
new file mode 100644
index 0000000..dcf9acb
--- /dev/null
+++ b/scripts/import_from_genbank.cwl
@@ -0,0 +1,27 @@
+cwlVersion: v1.1
+class: Workflow  # three steps: fetch from GenBank, split the result directory into arrays, scatter the upload
+inputs:
+  existing_metadata_from_nuccore:  # optional directory forwarded unchanged to the fetch step
+    type: Directory?
+outputs: []  # the workflow declares no outputs
+requirements:
+  ScatterFeatureRequirement: {}  # needed because the upload step uses scatter
+steps:
+  fetch_from_genbank:
+    in:
+      existing_metadata_from_nuccore: existing_metadata_from_nuccore
+    out: [fasta_and_yaml, metadata_from_nuccore, accessions]
+    run: fetch_from_genbank.cwl
+  split_into_arrays:
+    in:
+      dir: fetch_from_genbank/fasta_and_yaml
+    out: [fasta, metadata]
+    run: split_into_arrays.cwl
+  upload:
+    in:
+      fasta: split_into_arrays/fasta
+      metadata: split_into_arrays/metadata
+    out: []
+    scatter: [fasta, metadata]
+    scatterMethod: dotproduct  # pairs fasta[i] with metadata[i], so both arrays must be in the same order
+    run: upload.cwl
diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py
index 78cd13d..aca72d6 100644
--- a/scripts/import_to_arvados.py
+++ b/scripts/import_to_arvados.py
@@ -11,4 +11,4 @@ os.chdir("fasta_and_yaml")
 fasta_files = glob.glob("*.fasta")
 
 for f in fasta_files:
-    subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]])
+    subprocess.run(["bh20-seq-uploader", "%s.yaml" %f[:-6], f])
diff --git a/scripts/split_into_arrays.cwl b/scripts/split_into_arrays.cwl
new file mode 100644
index 0000000..102fe7d
--- /dev/null
+++ b/scripts/split_into_arrays.cwl
@@ -0,0 +1,30 @@
+cwlVersion: v1.1
+class: ExpressionTool  # splits a directory listing into two File arrays: *.fasta and *.yaml
+requirements:
+  InlineJavascriptRequirement: {}
+inputs:
+  dir:
+    type: Directory
+    loadListing: shallow_listing  # only the top-level entries of dir are inspected
+outputs:
+  fasta: File[]
+  metadata: File[]
+expression: |
+  ${
+  var dir = inputs.dir;
+  var fasta = [];
+  var metadata = [];
+  dir.listing.sort(function(a, b) { return a.basename < b.basename ? -1 : (a.basename > b.basename ? 1 : 0); }); // comparator must return a negative, zero or positive number, not a boolean
+  for (var i = 0; i < dir.listing.length; i++) {
+    if (dir.listing[i].basename.substr(-6) == ".fasta") {
+      fasta.push(dir.listing[i]);
+    }
+    if (dir.listing[i].basename.substr(-5) == ".yaml") {
+      metadata.push(dir.listing[i]);
+    }
+  }
+  if (fasta.length != metadata.length) { // only the counts are compared, not the individual basenames
+    throw "They dont match";
+  }
+  return {"fasta": fasta, "metadata": metadata}; // both arrays keep the sorted listing order
+  }
author	Peter Amstutz	2020-07-03 20:45:17 +0000
committer	Peter Amstutz	2020-07-03 20:47:14 +0000
commit	535b8017ddd27a9db683f6d29368258b5c48cf5a (patch)
tree	521a3e446f0eb9cbdc6e8736a13706c8141f8351
parent	841af02eaa45c1b7395d5f4e4711de3c3661c146 (diff)
download	bh20-seq-resource-535b8017ddd27a9db683f6d29368258b5c48cf5a.tar.gz bh20-seq-resource-535b8017ddd27a9db683f6d29368258b5c48cf5a.tar.lz bh20-seq-resource-535b8017ddd27a9db683f6d29368258b5c48cf5a.zip