Added accession ID type check

Anything that is not genomic DNA is removed.
author: Andrea Guarracino 2020-04-15 15:54:26 +0200
committer: GitHub 2020-04-15 15:54:26 +0200
commit: addbd80878cc4fedaf785c147073bb72ef8b54b4 (patch)
tree: 9641c33fdb2f9169d7cf6cc55f276e8945a56dd1
parent: 7cf44a7aed6dd190e16ac94958aefcf7bffed6a0 (diff)
download: bh20-seq-resource-addbd80878cc4fedaf785c147073bb72ef8b54b4.tar.gz
bh20-seq-resource-addbd80878cc4fedaf785c147073bb72ef8b54b4.tar.lz
bh20-seq-resource-addbd80878cc4fedaf785c147073bb72ef8b54b4.zip
1 files changed, 9 insertions, 3 deletions
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 0cc1a57..6a55b5e 100644
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -7,7 +7,7 @@ import os
 
 path_ncbi_virus_accession = 'sequences.acc'
 
-date = '20200414'
+date = '20200415'
 path_seq_fasta = 'seq_from_nuccore.{}.fasta'.format(date)
 path_metadata_xml = 'metadata_from_nuccore.{}.xml'.format(date)
 
@@ -19,9 +19,15 @@ for term in term_list:
     tmp_list = Entrez.read(
         Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000')
     )['IdList']
-    print(term, len(tmp_list))
-    
+
+    # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq)
+    tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']]
+
     # Remove the version in the id
+    tmp_list = [x.split('.')[0] for x in tmp_list]
+    
+    print(term, len(tmp_list))
+
     id_set.update([x.split('.')[0] for x in tmp_list])
 
 print(term_list, len(id_set))
author	Andrea Guarracino	2020-04-15 15:54:26 +0200
committer	GitHub	2020-04-15 15:54:26 +0200
commit	addbd80878cc4fedaf785c147073bb72ef8b54b4 (patch)
tree	9641c33fdb2f9169d7cf6cc55f276e8945a56dd1
parent	7cf44a7aed6dd190e16ac94958aefcf7bffed6a0 (diff)
download	bh20-seq-resource-addbd80878cc4fedaf785c147073bb72ef8b54b4.tar.gz bh20-seq-resource-addbd80878cc4fedaf785c147073bb72ef8b54b4.tar.lz bh20-seq-resource-addbd80878cc4fedaf785c147073bb72ef8b54b4.zip