From 88d81f853cf04b7f28681dd9cdee775b0422f252 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 12:53:19 -0400 Subject: Working on NCBI import Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/bh20seq-schema.yml | 4 ++-- bh20sequploader/main.py | 7 ++++--- scripts/foreach.sh | 18 ++++++++++++++++++ scripts/from_genbank_to_fasta_and_yaml.py | 26 ++++++++++++++------------ 4 files changed, 38 insertions(+), 17 deletions(-) create mode 100755 scripts/foreach.sh mode change 100644 => 100755 scripts/from_genbank_to_fasta_and_yaml.py diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 75308ab..ebca35b 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -162,12 +162,12 @@ $graph: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) - type: float? + type: ["null", float, int] jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 sequencing_coverage2: doc: If a second sequence technology was used you can submit its coverage here - type: float? + type: ["null", float, int] jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 additional_technology_information: diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 49d012d..2fda347 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -44,7 +44,8 @@ def main(): with col.open(target, "w") as f: r = args.sequence.read(65536) - print(r[0:20]) + seqlabel = r[1:r.index("\n")] + print(seqlabel) while r: f.write(r) r = args.sequence.read(65536) @@ -67,8 +68,8 @@ def main(): "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname()) } - col.save_new(owner_uuid=UPLOAD_PROJECT, name="Uploaded by %s from %s" % - (properties['upload_user'], properties['upload_ip']), + col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" % + (seqlabel, properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) print("Done") diff --git a/scripts/foreach.sh b/scripts/foreach.sh new file mode 100755 index 0000000..35b07b8 --- /dev/null +++ b/scripts/foreach.sh @@ -0,0 +1,18 @@ +#!/bin/sh +rm -rf validated fasta_and_yaml_* +mkdir -p validated +./from_genbank_to_fasta_and_yaml.py +fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta") +for f in $fasta_files ; do + yaml=$(echo $f | rev | cut -c7- | rev).yaml + echo $f + echo $yaml + if bh20-seq-uploader --validate $f $yaml ; then + sz=$(stat --format=%s $f) + if test $sz -gt 20000 ; then + mv $f $yaml validated + else + echo "Fasta file too small" + fi + fi +done diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py old mode 100644 new mode 100755 index 7e7c089..1a12513 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -1,8 +1,10 @@ +#!/usr/bin/env python3 + from Bio import Entrez Entrez.email = 'another_email@gmail.com' import xml.etree.ElementTree as ET -import yaml +import json import os from datetime import date @@ -29,7 +31,7 @@ for term in term_list: # Remove the version in the id tmp_list = [x.split('.')[0] for x in tmp_list] - + print(term, len(tmp_list)) tmp_list=tmp_list # tmp_list = tmp_list[0:2] # restricting to small run @@ -49,11 +51,11 @@ print(term_list + ['NCBI Virus'], len(id_set)) def chunks(lst, n): for i in range(0, len(lst), n): yield lst[i:i + n] - + num_ids_for_request = 100 if not os.path.exists(dir_metadata_today): os.makedirs(dir_metadata_today) - + for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): path_metadata_xxx_xml = os.path.join(dir_metadata_today, 'metadata_{}.xml'.format(i)) print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml)) @@ -63,7 +65,7 @@ if not os.path.exists(dir_metadata_today): Entrez.efetch(db='nuccore', id=id_x_list, retmode='xml').read() ) - + term_to_uri_dict = {} for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: @@ -74,7 +76,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x if len(line.split(',')) > 2: term, uri = line.strip('\n').split('",') term = term.strip('"') - else: + else: term, uri = line.strip('\n').split(',') term_to_uri_dict[term] = uri @@ -125,7 +127,7 @@ if not os.path.exists(dir_fasta_and_yaml_today): ): if info_to_check in GBSeq_comment_text: tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0] - + if field_in_yaml == 'sequencing_coverage': # A regular expression would be better! info_for_yaml_dict['technology'][field_in_yaml] = ';'.join( @@ -139,7 +141,7 @@ if not os.path.exists(dir_fasta_and_yaml_today): seq_tec = term_to_uri_dict[seq_tec] else: print(accession_version, 'missing technologies:', seq_tec) - + new_seq_tec_list.append(seq_tec) for n, seq_tec in enumerate(new_seq_tec_list): @@ -147,7 +149,7 @@ if not os.path.exists(dir_fasta_and_yaml_today): else: info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse - + #term_to_uri_dict for GBFeature in GBSeq.iter('GBFeature'): @@ -211,12 +213,12 @@ if not os.path.exists(dir_fasta_and_yaml_today): info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1] - #Remove technology key if empty! + # Remove technology key if empty! if (info_for_yaml_dict['technology']=={}): - del info_for_yaml_dict['key'] + del info_for_yaml_dict['technology'] with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw: fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper())) with open(os.path.join(dir_fasta_and_yaml_today, '{}.yaml'.format(accession_version)), 'w') as fw: - yaml.dump(info_for_yaml_dict, fw, default_flow_style=False) + json.dump(info_for_yaml_dict, fw, indent=2) -- cgit v1.2.3