about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- bh20sequploader/bh20seq-schema.yml 22
-rw-r--r-- bh20sequploader/supporting_webuploader.yml 28
-rw-r--r-- scripts/from_genbank_to_fasta_and_yaml.py 12
3 files changed, 58 insertions, 4 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index d8641a6..7ffc15b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -84,6 +84,11 @@ $graph:
       type: string?
       jsonldPredicate:
           _id: http://purl.obolibrary.org/obo/OBI_0001479
+    specimen_source2:
+      doc: A specimen that derives from an anatomical part or substance arising from an organism, e.g.  tissue, organ
+      type: string?
+      jsonldPredicate:
+          _id: http://purl.obolibrary.org/obo/OBI_0001479
     collection_date:
       doc: Date when the sample was taken
       type: string?
@@ -139,6 +144,11 @@ $graph:
       type: string
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/OBI_0600047
+    sample_sequencing_technology2:
+      doc: Technology that was used to sequence this sample (e.g. Sanger, Nanopore MinION)
+      type: string?
+      jsonldPredicate:
+        _id: http://purl.obolibrary.org/obo/OBI_0600047
     sequence_assembly_method:
       doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome
       type: string?
@@ -146,9 +156,19 @@ $graph:
         _id: http://www.ebi.ac.uk/efo/EFO_0002699
     sequencing_coverage:
       doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x)
-      type: int?
+      type: float?
+      jsonldPredicate:
+        _id: http://purl.obolibrary.org/obo/FLU_0000848
+    sequencing_coverage2:
+      doc: If a second sequencing technology was used, you can submit its coverage here
+      type: float?
       jsonldPredicate:
         _id: http://purl.obolibrary.org/obo/FLU_0000848
+    additional_technology_information:
+      doc: Field for additional technology information
+      type: string?
+      jsonldPredicate:
+        _id: http://semanticscience.org/resource/SIO_001167
 
 - name: submitterSchema
   type: record
diff --git a/bh20sequploader/supporting_webuploader.yml b/bh20sequploader/supporting_webuploader.yml
index 6b8946f..3650526 100644
--- a/bh20sequploader/supporting_webuploader.yml
+++ b/bh20sequploader/supporting_webuploader.yml
@@ -10,6 +10,34 @@ host_sex:
   Female: http://purl.obolibrary.org/obo/NCIT_C27993
   unknown: http://purl.obolibrary.org/obo/NCIT_C17998
 
+sample_sequencing_technology:
+  Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
+  Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632
+  Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205
+  IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894
+  Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
+  Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
+
+sample_sequencing_technology2:
+  Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
+  Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632
+  Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205
+  IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894
+  Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
+  Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
+
+specimen_source:
+  nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831
+  oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835
+  sputum: http://purl.obolibrary.org/obo/NCIT_C13278
+  bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195
+
+specimen_source2:
+  nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831
+  oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835
+  sputum: http://purl.obolibrary.org/obo/NCIT_C13278
+  bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195
+
 host_species:
   OLS-ontology: ncbitaxon
 
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 0cc1a57..6a55b5e 100644
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -7,7 +7,7 @@ import os
 
 path_ncbi_virus_accession = 'sequences.acc'
 
-date = '20200414'
+date = '20200415'
 path_seq_fasta = 'seq_from_nuccore.{}.fasta'.format(date)
 path_metadata_xml = 'metadata_from_nuccore.{}.xml'.format(date)
 
@@ -19,9 +19,15 @@ for term in term_list:
     tmp_list = Entrez.read(
         Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000')
     )['IdList']
-    print(term, len(tmp_list))
-    
+
+    # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq)
+    tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']]
+
     # Remove the version in the id
+    tmp_list = [x.split('.')[0] for x in tmp_list]
+    
+    print(term, len(tmp_list))
+
     id_set.update([x.split('.')[0] for x in tmp_list])
 
 print(term_list, len(id_set))