From 7c12e4976337a063301be260cb3954bf4303f5e0 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 26 Sep 2020 12:12:17 +0200 Subject: script for processing the metadata of the ESR samples; moved delete_entries_on_arvados script into the scripts directory --- scripts/README.md | 2 +- scripts/delete_entries_on_arvados.py | 34 +++++++++ scripts/esr_samples/Pathogen.cl.1.0.xlsx | Bin 0 -> 48488 bytes scripts/esr_samples/esr_samples.py | 85 +++++++++++++++++++++ scripts/esr_samples/template.yaml | 29 +++++++ .../delete_entries_on_arvados.py | 34 --------- 6 files changed, 149 insertions(+), 35 deletions(-) create mode 100644 scripts/delete_entries_on_arvados.py create mode 100644 scripts/esr_samples/Pathogen.cl.1.0.xlsx create mode 100644 scripts/esr_samples/esr_samples.py create mode 100644 scripts/esr_samples/template.yaml delete mode 100644 workflows/pangenome-generate/delete_entries_on_arvados.py diff --git a/scripts/README.md b/scripts/README.md index a3ed1b6..f6a7e29 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,4 +1,4 @@ -### Instructions for download and/or prepare the data and/ or the metadata +### Instructions for download and/or prepare the data and/or the metadata Just go into the `download_genbank_data` or `download_sra_data` directory and execute the python3 script inside. 
diff --git a/scripts/delete_entries_on_arvados.py b/scripts/delete_entries_on_arvados.py new file mode 100644 index 0000000..d281456 --- /dev/null +++ b/scripts/delete_entries_on_arvados.py @@ -0,0 +1,34 @@ +import sys +import arvados +import arvados.collection + +from datetime import datetime + +date_time_str = '2020-08-20' +date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d') + +api = arvados.api() +keepclient = arvados.keep.KeepClient(api_client=api) + +validated = arvados.util.list_all(api.collections().list, filters=[ + ["owner_uuid", "=", sys.argv[1]], +# ["properties.status", "=", "validated"] +]) + +# validated.sort(key=lambda v: v["portable_data_hash"]) + +num_sample_deleted = 0 +for item in validated: + sequence_label = item['properties']["sequence_label"] + + # The SRA samples start with SRR or ERR + if not sequence_label.startswith('SRR') and not sequence_label.startswith('ERR'): + created_at_obj = datetime.strptime(item["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ') + # print(item, created_at_obj) + + if created_at_obj < date_time_obj: + api.collections().delete(uuid=item['current_version_uuid']).execute() + num_sample_deleted += 1 + print(sequence_label) + +print('num_sample_deleted: {}'.format(num_sample_deleted)) diff --git a/scripts/esr_samples/Pathogen.cl.1.0.xlsx b/scripts/esr_samples/Pathogen.cl.1.0.xlsx new file mode 100644 index 0000000..c7c6393 Binary files /dev/null and b/scripts/esr_samples/Pathogen.cl.1.0.xlsx differ diff --git a/scripts/esr_samples/esr_samples.py b/scripts/esr_samples/esr_samples.py new file mode 100644 index 0000000..bd59612 --- /dev/null +++ b/scripts/esr_samples/esr_samples.py @@ -0,0 +1,85 @@ +import os +import pandas as pd +from string import Template +from dateutil.parser import parse + +path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx' + +path_template_yaml = 'template.yaml' +# Removed from the template (for now) +# license: +# license_type: "http://creativecommons.org/licenses/by/4.0/" +# title: "SARS-CoV-2 New 
Zealand" +# attribution_name: "ESR" +# attribution_url: "https://www.esr.cri.nz/" + +dir_dict_ontology_standardization = '../dict_ontology_standardization/' + +dir_output = 'yaml' +suffix = '.consensus' + +if not os.path.exists(dir_output): + os.makedirs(dir_output) + +term_to_uri_dict = {} + +for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in + os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: + print('Read {}'.format(path_dict_xxx_csv)) + + with open(path_dict_xxx_csv) as f: + for line in f: + if len(line.split(',')) > 2: + term, uri = line.strip('\n').split('",') + else: + term, uri = line.strip('\n').split(',') + + term = term.strip('"') + + if term in term_to_uri_dict: + print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) + continue + + term_to_uri_dict[term] = uri + +metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12) + +# Maybe not the best pandas-way to do this +for index, row in metadata_df.iterrows(): + # print(row['*sample_name']) + + geo_loc_name = row['*geo_loc_name'].replace(': ', ':') + country = '' + if not geo_loc_name in term_to_uri_dict: + if geo_loc_name in [ + 'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern', + 'New Zealand:Waikato', + 'New Zealand:Lakes', 'New Zealand:Nelson Marlborough', 'New Zealand:South Canterbury', + 'New Zealand:MidCentral', + 'New Zealand:Tairawhiti', 'New Zealand:Hawkes Bay', 'New Zealand:NA', 'New Zealand:Taranaki' + ]: + geo_loc_name = 'New Zealand' + else: + print(geo_loc_name) + break + + country = term_to_uri_dict[geo_loc_name] + + d = { + 'host_species': term_to_uri_dict[row['*host']], + 'sample_id': row['*sample_name'], + 'collection_date': parse(row['*collection_date']).strftime('%Y-%m-%d'), + 'collection_location': country, + 'specimen_source': term_to_uri_dict[row['*isolation_source']], + 'virus_species': 
'http://purl.obolibrary.org/obo/NCBITaxon_2697049', + + 'submitter_sample_id': row['bioproject_accession'], + } + + with open(path_template_yaml) as f: + src = Template(f.read()) + + with open(os.path.join(dir_output, '{}{}.yaml'.format(row['*sample_name'], suffix)), 'w') as fw: + fw.write(src.substitute(d)) + +print('{} YAML files created.'.format(len([x for x in os.listdir(dir_output) if x.endswith('.yaml')]))) diff --git a/scripts/esr_samples/template.yaml b/scripts/esr_samples/template.yaml new file mode 100644 index 0000000..98f08a6 --- /dev/null +++ b/scripts/esr_samples/template.yaml @@ -0,0 +1,29 @@ +id: placeholder + +host: + host_species: "$host_species" + +sample: + sample_id: "$sample_id" + collection_date: "$collection_date" + collection_location: "$collection_location" + specimen_source: ["$specimen_source"] + +virus: + virus_species: "$virus_species" + +technology: + sample_sequencing_technology: ["http://www.ebi.ac.uk/efo/EFO_0008632"] + sequence_assembly_method: "https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics" + additional_technology_information: "Artic V3 workflow" + +submitter: + authors: ["Jemma L Geoghegan", "Xiaoyun Ren", "Matthew Storey", "James Hadfield", "Lauren Jelley", "Sarah Jefferies", "Jill Sherwood", "Shevaun Paine", "Sue Huang", "Jordan Douglas", "Fabio K Mendes", "Andrew Sporle", "Michael G Baker", "David R Murdoch", "Nigel French", "Colin R Simpson", "David Welch", "Alexei J Drummond", "Edward C Holmes", "Sebastian Duchene", "Joep de Ligt"] + submitter_name: ["Joep de Ligt"] + submitter_address: "PO Box 50348, Porirua 5240, New Zealand" + originating_lab: "ESR" + submitter_sample_id: "$submitter_sample_id" + publication: "https://doi.org/10.1101/2020.08.05.20168930" + submitter_orcid: ["https://orcid.org/0000-0003-0970-0153"] + additional_submitter_information: "2020-08-20" + \ No newline at end of file diff --git a/workflows/pangenome-generate/delete_entries_on_arvados.py 
b/workflows/pangenome-generate/delete_entries_on_arvados.py deleted file mode 100644 index d281456..0000000 --- a/workflows/pangenome-generate/delete_entries_on_arvados.py +++ /dev/null @@ -1,34 +0,0 @@ -import sys -import arvados -import arvados.collection - -from datetime import datetime - -date_time_str = '2020-08-20' -date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d') - -api = arvados.api() -keepclient = arvados.keep.KeepClient(api_client=api) - -validated = arvados.util.list_all(api.collections().list, filters=[ - ["owner_uuid", "=", sys.argv[1]], -# ["properties.status", "=", "validated"] -]) - -# validated.sort(key=lambda v: v["portable_data_hash"]) - -num_sample_deleted = 0 -for item in validated: - sequence_label = item['properties']["sequence_label"] - - # The SRA samples start with SRR or ERR - if not sequence_label.startswith('SRR') and not sequence_label.startswith('ERR'): - created_at_obj = datetime.strptime(item["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ') - # print(item, created_at_obj) - - if created_at_obj < date_time_obj: - api.collections().delete(uuid=item['current_version_uuid']).execute() - num_sample_deleted += 1 - print(sequence_label) - -print('num_sample_deleted: {}'.format(num_sample_deleted)) -- cgit v1.2.3