diff options
author | Pjotr Prins | 2020-04-19 12:19:01 -0500 |
---|---|---|
committer | GitHub | 2020-04-19 12:19:01 -0500 |
commit | ccc4bfa4fe8466b6c19185d7d9d3e2b7a3ce30e2 (patch) | |
tree | a7e5de66a4a0aaf04c61e5a0f112341a2d956b02 /scripts | |
parent | bbca5ac9b2538e410efe3e09651f87e5573145de (diff) | |
parent | addbd80878cc4fedaf785c147073bb72ef8b54b4 (diff) | |
download | bh20-seq-resource-ccc4bfa4fe8466b6c19185d7d9d3e2b7a3ce30e2.tar.gz bh20-seq-resource-ccc4bfa4fe8466b6c19185d7d9d3e2b7a3ce30e2.tar.lz bh20-seq-resource-ccc4bfa4fe8466b6c19185d7d9d3e2b7a3ce30e2.zip |
Merge pull request #22 from AndreaGuarracino/patch-4
added type id check
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/from_genbank_to_fasta_and_yaml.py | 12 |
1 files changed, 9 insertions, 3 deletions
```diff
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 0cc1a57..6a55b5e 100644
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -7,7 +7,7 @@ import os
 path_ncbi_virus_accession = 'sequences.acc'
-date = '20200414'
+date = '20200415'
 path_seq_fasta = 'seq_from_nuccore.{}.fasta'.format(date)
 path_metadata_xml = 'metadata_from_nuccore.{}.xml'.format(date)
@@ -19,9 +19,15 @@ for term in term_list:
     tmp_list = Entrez.read(
         Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000')
     )['IdList']
-    print(term, len(tmp_list))
-
+
+    # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq)
+    tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']]
+    # Remove the version in the id
+    tmp_list = [x.split('.')[0] for x in tmp_list]
+
+    print(term, len(tmp_list))
+
     id_set.update([x.split('.')[0] for x in tmp_list])
 print(term_list, len(id_set))
```