diff options
-rw-r--r-- | bh20sequploader/bh20seq-schema.yml | 22 | ||||
-rw-r--r-- | bh20sequploader/supporting_webuploader.yml | 28 | ||||
-rw-r--r-- | scripts/from_genbank_to_fasta_and_yaml.py | 12 |
3 files changed, 58 insertions, 4 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index d8641a6..7ffc15b 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -84,6 +84,11 @@ $graph: type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001479 + specimen_source2: + doc: A specimen that derives from an anatomical part or substance arising from an organism, e.g. tissue, organ + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001479 collection_date: doc: Date when the sample was taken type: string? @@ -139,6 +144,11 @@ $graph: type: string jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0600047 + sample_sequencing_technology2: + doc: Technology that was used to sequence this sample (e.g. Sanger, Nanopore MinION) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0600047 sequence_assembly_method: doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome type: string? @@ -146,9 +156,19 @@ $graph: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) - type: int? + type: float? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 + sequencing_coverage2: + doc: If a second sequencing technology was used, you can submit its coverage here + type: float? jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 + additional_technology_information: + doc: Field for additional technology information + type: string? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 - name: submitterSchema type: record diff --git a/bh20sequploader/supporting_webuploader.yml b/bh20sequploader/supporting_webuploader.yml index 6b8946f..3650526 100644 --- a/bh20sequploader/supporting_webuploader.yml +++ b/bh20sequploader/supporting_webuploader.yml @@ -10,6 +10,34 @@ host_sex: Female: http://purl.obolibrary.org/obo/NCIT_C27993 unknown: http://purl.obolibrary.org/obo/NCIT_C17998 +sample_sequencing_technology: + Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 + Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 + Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 + IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 + Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 + Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 + +sample_sequencing_technology2: + Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 + Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 + Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 + IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 + Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 + Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 + +specimen_source: + nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 + oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 + sputum: http://purl.obolibrary.org/obo/NCIT_C13278 + bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 + +specimen_source2: + nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 + oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 + sputum: http://purl.obolibrary.org/obo/NCIT_C13278 + bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 + host_species: OLS-ontology: ncbitaxon diff --git a/scripts/from_genbank_to_fasta_and_yaml.py 
b/scripts/from_genbank_to_fasta_and_yaml.py index 0cc1a57..6a55b5e 100644 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -7,7 +7,7 @@ import os path_ncbi_virus_accession = 'sequences.acc' -date = '20200414' +date = '20200415' path_seq_fasta = 'seq_from_nuccore.{}.fasta'.format(date) path_metadata_xml = 'metadata_from_nuccore.{}.xml'.format(date) @@ -19,9 +19,15 @@ for term in term_list: tmp_list = Entrez.read( Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000') )['IdList'] - print(term, len(tmp_list)) - + + # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) + tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']] + # Remove the version in the id + tmp_list = [x.split('.')[0] for x in tmp_list] + + print(term, len(tmp_list)) + id_set.update([x.split('.')[0] for x in tmp_list]) print(term_list, len(id_set)) |