aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: AndreaGuarracino, 2020-09-04 11:13:25 +0200
committer: AndreaGuarracino, 2020-09-04 11:13:25 +0200
commit: 327b6db4e7c9ceda190617f2a793e6a646bb30cc (patch)
tree: a4e4750c85dcd4352bce415091a084fe9e64b707
parent: 1430c62ff9245bfecb1d41cc87bbafafcfc81ca3 (diff)
download: bh20-seq-resource-327b6db4e7c9ceda190617f2a793e6a646bb30cc.tar.gz
bh20-seq-resource-327b6db4e7c9ceda190617f2a793e6a646bb30cc.tar.lz
bh20-seq-resource-327b6db4e7c9ceda190617f2a793e6a646bb30cc.zip
Added an option to the SRA script to include only a subset of IDs
-rw-r--r-- scripts/create_sra_metadata/create_sra_metadata.py | 29
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 3
2 files changed, 28 insertions, 4 deletions
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index 09cc51b..7e0fc83 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -3,6 +3,7 @@
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--ids-to-ignore', type=str, help='file with ids to ignore in all steps, 1 id per line', required=False)
+parser.add_argument('--ids-to-consider', type=str, help='file with ids to consider in all steps, 1 id per line', required=False)
parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
default='../dict_ontology_standardization/', required=False)
@@ -45,7 +46,22 @@ if args.ids_to_ignore:
with open(args.ids_to_ignore) as f:
accession_to_ignore_set.update(set([x.split('.')[0] for x in f.read().strip('\n').split('\n')]))
- print('There are {} accessions to ignore.'.format(len(accession_to_ignore_set)))
+
+ print('There are {} accessions to ignore.'.format(len(accession_to_ignore_set)))
+
+
+accession_to_consider_set = set()
+
+if args.ids_to_consider:
+ if not os.path.exists(args.ids_to_consider):
+ print("\tThe '{}' file doesn't exist.".format(args.ids_to_consider))
+ sys.exit(-1)
+
+ with open(args.ids_to_consider) as f:
+ accession_to_consider_set.update(set([x.split('.')[0] for x in f.read().strip('\n').split('\n')]))
+
+ if len(accession_to_consider_set) > 0:
+ print('There are {} accessions to consider.'.format(len(accession_to_consider_set)))
term_to_uri_dict = {}
@@ -103,9 +119,16 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
RUN_SET = EXPERIMENT_PACKAGE.find('RUN_SET')
RUN = RUN_SET.find('RUN')
accession = RUN.attrib['accession']
+
run_accession_set.add(accession)
#print(accession)
+ if accession in accession_to_ignore_set:
+ continue
+
+ if len(accession_to_consider_set) > 0 and accession not in accession_to_consider_set:
+ continue
+
info_for_yaml_dict['sample']['sample_id'] = accession
#SRAFiles = RUN.find('SRAFiles')
@@ -305,8 +328,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if accession not in not_created_accession_dict:
num_yaml_created += 1
- with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
- json.dump(info_for_yaml_dict, fw, indent=2)
+ #with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
+ # json.dump(info_for_yaml_dict, fw, indent=2)
if len(missing_value_list) > 0:
path_missing_terms_tsv = 'missing_terms.sra.tsv'
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 7251819..52aee4e 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -58,7 +58,8 @@ if args.ids_to_ignore:
with open(args.ids_to_ignore) as f:
accession_to_ignore_set.update(set([x.split('.')[0] for x in f.read().strip('\n').split('\n')]))
- print('There are {} accessions to ignore.'.format(len(accession_to_ignore_set)))
+
+ print('There are {} accessions to ignore.'.format(len(accession_to_ignore_set)))
accession_already_downloaded_set = set()