-rw-r--r--  doc/blog/using-covid-19-pubseq-part3.org                  | 16
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_countries.csv  |  2
-rw-r--r--  scripts/esr_samples/esr_samples.py                        | 40
3 files changed, 31 insertions, 27 deletions
diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org
index 4d70e7c..abc260c 100644
--- a/doc/blog/using-covid-19-pubseq-part3.org
+++ b/doc/blog/using-covid-19-pubseq-part3.org
@@ -21,6 +21,7 @@
- [[#bulk-sequence-uploader][Bulk sequence uploader]]
- [[#run-the-uploader-cli][Run the uploader (CLI)]]
- [[#example-uploading-bulk-genbank-sequences][Example: uploading bulk GenBank sequences]]
+ - [[#example-preparing-metadata][Example: preparing metadata]]
* Uploading Data
@@ -232,6 +233,7 @@ Guix package manager).
The web interface uses this exact same script, so it should just work
(TM).
+
** Example: uploading bulk GenBank sequences
We also use the above script to bulk upload GenBank sequences with a [[https://github.com/arvados/bh20-seq-resource/blob/master/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py][FASTA
@@ -250,3 +252,17 @@ ls $dir_fasta_and_yaml/*.yaml | while read path_code_yaml; do
bh20-seq-uploader --skip-qc $path_code_yaml $path_code_fasta
done
#+END_SRC
+
+
+** Example: preparing metadata
+
+Usually, metadata are available in tabular format, such as spreadsheets. As an example, we provide the script
+[[https://github.com/arvados/bh20-seq-resource/tree/master/scripts/esr_samples][esr_samples.py]] to show you how to convert
+your metadata into YAML files ready for upload. To execute the script, go into the
+~bh20-seq-resource/scripts/esr_samples~ directory and execute
+
+#+BEGIN_SRC sh
+python3 esr_samples.py
+#+END_SRC
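+
+Note that the script relies on a few third-party Python libraries:
+~pandas~ (plus an Excel backend for ~pandas.read_excel~, e.g. ~openpyxl~
+or ~xlrd~) and ~python-dateutil~. Assuming a pip-based setup, something
+like this installs them:
+
+#+BEGIN_SRC sh
+pip3 install pandas openpyxl python-dateutil
+#+END_SRC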
+
+You will find the YAML files in the ~yaml~ folder, which will be created in the same directory.
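+
+For reference, a generated YAML file could look something like the
+following sketch. The field names come from ~esr_samples.py~ itself; the
+exact nesting and any additional fields are defined by ~template.yaml~,
+and the values here are purely illustrative:
+
+#+BEGIN_SRC yaml
+host:
+    host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+sample:
+    sample_id: 20VR1234
+    collection_date: "2020-04-15"
+    collection_location: http://www.wikidata.org/entity/Q664
+    specimen_source: http://purl.obolibrary.org/obo/NCIT_C155831
+virus:
+    virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+submitter:
+    submitter_sample_id: SAMN00000000
+#+END_SRC
\ No newline at end of file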
diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv
index 90d9af3..a710906 100644
--- a/scripts/dict_ontology_standardization/ncbi_countries.csv
+++ b/scripts/dict_ontology_standardization/ncbi_countries.csv
@@ -280,7 +280,7 @@ Italy:Bologna,http://www.wikidata.org/entity/Q1891
Italy:Cagliari,http://www.wikidata.org/entity/Q1897
Italy:Lazio,http://www.wikidata.org/entity/Q1282
Italy:Milan,http://www.wikidata.org/entity/Q490
-Italy:Lombardia, Milan,http://www.wikidata.org/entity/Q490
+"Italy:Lombardia, Milan",http://www.wikidata.org/entity/Q490
Italy:Palermo,http://www.wikidata.org/entity/Q2656
Italy:Rome,http://www.wikidata.org/entity/Q220
Italy:Turin,http://www.wikidata.org/entity/Q495
diff --git a/scripts/esr_samples/esr_samples.py b/scripts/esr_samples/esr_samples.py
index bd59612..06f3d51 100644
--- a/scripts/esr_samples/esr_samples.py
+++ b/scripts/esr_samples/esr_samples.py
@@ -3,6 +3,12 @@ import pandas as pd
from string import Template
from dateutil.parser import parse
+import sys
+
+sys.path.append('../')
+from utils import check_and_get_ontology_dictionaries
+
+# Metadata in tabular format
path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx'
path_template_yaml = 'template.yaml'
@@ -13,7 +19,10 @@ path_template_yaml = 'template.yaml'
# attribution_name: "ESR"
# attribution_url: "https://www.esr.cri.nz/"
+
+# Read the dictionaries for the ontology
dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization)
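+# field_to_term_to_uri_dict maps each dictionary name (the CSV file name
+# without extension, e.g. 'ncbi_countries') to its term -> URI mapping,
+# e.g. field_to_term_to_uri_dict['ncbi_countries']['New Zealand'] gives
+# the Wikidata URI for New Zealand.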
dir_output = 'yaml'
suffix = '.consensus'
@@ -21,27 +30,6 @@ suffix = '.consensus'
if not os.path.exists(dir_output):
os.makedirs(dir_output)
-term_to_uri_dict = {}
-
-for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in
- os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
- print('Read {}'.format(path_dict_xxx_csv))
-
- with open(path_dict_xxx_csv) as f:
- for line in f:
- if len(line.split(',')) > 2:
- term, uri = line.strip('\n').split('",')
- else:
- term, uri = line.strip('\n').split(',')
-
- term = term.strip('"')
-
- if term in term_to_uri_dict:
- print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
- continue
-
- term_to_uri_dict[term] = uri
-
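+# skiprows=12 jumps past the first rows of the spreadsheet (presumably the
+# template's preamble) so that pandas picks up the real column headers.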
metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12)
# Maybe not the best pandas-way to do this
@@ -49,8 +37,8 @@ for index, row in metadata_df.iterrows():
# print(row['*sample_name'])
geo_loc_name = row['*geo_loc_name'].replace(': ', ':')
- country = ''
- if not geo_loc_name in term_to_uri_dict:
+
+ if geo_loc_name not in field_to_term_to_uri_dict['ncbi_countries']:
if geo_loc_name in [
'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern',
'New Zealand:Waikato',
@@ -63,14 +51,14 @@ for index, row in metadata_df.iterrows():
print(geo_loc_name)
break
- country = term_to_uri_dict[geo_loc_name]
+ country = field_to_term_to_uri_dict['ncbi_countries'][geo_loc_name]
d = {
- 'host_species': term_to_uri_dict[row['*host']],
+ 'host_species': field_to_term_to_uri_dict['ncbi_host_species'][row['*host']],
'sample_id': row['*sample_name'],
'collection_date': parse(row['*collection_date']).strftime('%Y-%m-%d'),
'collection_location': country,
- 'specimen_source': term_to_uri_dict[row['*isolation_source']],
+ 'specimen_source': field_to_term_to_uri_dict['ncbi_speciesman_source'][row['*isolation_source']],
'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049',
'submitter_sample_id': row['bioproject_accession'],