about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author AndreaGuarracino 2020-08-27 00:18:24 +0200
committer AndreaGuarracino 2020-08-27 00:18:24 +0200
commit 4299c750728bbad4bdbf0311ff2a4b9c65d9883c (patch)
tree 07515a9ad6d07eca22853806f879fb39d2375c09
parent b69b8560132786e16ea6a997d65fd8c43381cc03 (diff)
download bh20-seq-resource-4299c750728bbad4bdbf0311ff2a4b9c65d9883c.tar.gz
bh20-seq-resource-4299c750728bbad4bdbf0311ff2a4b9c65d9883c.tar.lz
bh20-seq-resource-4299c750728bbad4bdbf0311ff2a4b9c65d9883c.zip
updated dependency from clustalw to minimap2; the genbank script no longer creates YAML/FASTA pairs for too short sequences
-rw-r--r-- .guix-deploy 2
-rw-r--r-- doc/INSTALL.md 2
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py 19
3 files changed, 11 insertions, 12 deletions
diff --git a/.guix-deploy b/.guix-deploy
index e978e23..570ae10 100644
--- a/.guix-deploy
+++ b/.guix-deploy
@@ -6,5 +6,5 @@ export GUILE_LOAD_COMPILED_PATH=$GUIX_PROFILE/share/guile/site/3.0/
ls $GUILE_LOAD_PATH
-env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_RUN_PORT=5067 FLASK_APP=bh20simplewebuploader/main.py flask run
+env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_RUN_PORT=5067 FLASK_APP=bh20simplewebuploader/main.py flask run
diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 3b270dd..e31b7d7 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -59,7 +59,7 @@ WIP: add gunicorn container
Currently the full webserver container deploy command looks like
```
-penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-oinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
+penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-oinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
```
Note: see above on GUIX_PACKAGE_PATH.
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 272b5ba..8ef76e1 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -145,7 +145,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
if not os.path.exists(dir_fasta_and_yaml):
os.makedirs(dir_fasta_and_yaml)
-min_len_to_count = 27500
+min_len_to_count = 15000
num_seq_with_len_ge_X_bp = 0
missing_value_list = []
@@ -411,18 +411,17 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
not_created_accession_dict[accession_version] = []
not_created_accession_dict[accession_version].append('host_species not found')
- if accession_version in not_created_accession_dict:
- continue
-
- with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
- fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
+ if len(GBSeq_sequence.text) < min_len_to_count:
+ not_created_accession_dict[accession_version].append('sequence shorter than {} bp'.format(min_len_to_count))
- with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw:
- json.dump(info_for_yaml_dict, fw, indent=2)
+ if accession_version not in not_created_accession_dict:
+ num_seq_with_len_ge_X_bp += 1
+ with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
+ fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
- if(len(GBSeq_sequence.text) >= min_len_to_count):
- num_seq_with_len_ge_X_bp += 1
+ with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw:
+ json.dump(info_for_yaml_dict, fw, indent=2)
except:
print("Unexpected error for the ID {}: {}".format(accession_version, sys.exc_info()[0]))
accession_with_errors_list.append(accession_version)