From 7c12e4976337a063301be260cb3954bf4303f5e0 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Sat, 26 Sep 2020 12:12:17 +0200
Subject: script for processing the metadata of the ESR samples; moved
 delete_entries_on_arvados script to the scripts directory

---
 scripts/README.md                                  |   2 +-
 scripts/delete_entries_on_arvados.py               |  34 +++++++++
 scripts/esr_samples/Pathogen.cl.1.0.xlsx           | Bin 0 -> 48488 bytes
 scripts/esr_samples/esr_samples.py                 |  85 +++++++++++++++++++++
 scripts/esr_samples/template.yaml                  |  29 +++++++
 .../delete_entries_on_arvados.py                   |  34 ---------
 6 files changed, 149 insertions(+), 35 deletions(-)
 create mode 100644 scripts/delete_entries_on_arvados.py
 create mode 100644 scripts/esr_samples/Pathogen.cl.1.0.xlsx
 create mode 100644 scripts/esr_samples/esr_samples.py
 create mode 100644 scripts/esr_samples/template.yaml
 delete mode 100644 workflows/pangenome-generate/delete_entries_on_arvados.py

diff --git a/scripts/README.md b/scripts/README.md
index a3ed1b6..f6a7e29 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -1,4 +1,4 @@
-### Instructions for download and/or prepare the data and/ or the metadata
+### Instructions for downloading and/or preparing the data and/or the metadata
 
 Just go into the `download_genbank_data` or `download_sra_data` directory and execute the python3 script inside.
 
diff --git a/scripts/delete_entries_on_arvados.py b/scripts/delete_entries_on_arvados.py
new file mode 100644
index 0000000..d281456
--- /dev/null
+++ b/scripts/delete_entries_on_arvados.py
@@ -0,0 +1,49 @@
+"""Delete old non-SRA sample collections from an Arvados project.
+
+Usage: python3 delete_entries_on_arvados.py <owner_uuid>
+
+Lists every collection owned by <owner_uuid> and deletes those whose
+``sequence_label`` property does not start with SRR/ERR and whose
+``created_at`` timestamp is earlier than the cutoff date set below. The label
+of each deleted collection is printed, followed by the total count.
+"""
+import sys
+import arvados
+import arvados.collection
+
+from datetime import datetime
+
+# Only collections created before this date are deleted.
+date_time_str = '2020-08-20'
+date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d')
+
+# Exit with a usage message instead of an IndexError when the UUID is missing.
+if len(sys.argv) < 2:
+    sys.exit('Usage: {} <owner_uuid>'.format(sys.argv[0]))
+
+api = arvados.api()
+
+validated = arvados.util.list_all(api.collections().list, filters=[
+    ["owner_uuid", "=", sys.argv[1]],
+#    ["properties.status", "=", "validated"]
+])
+
+num_sample_deleted = 0
+for item in validated:
+    sequence_label = item['properties'].get('sequence_label')
+    if sequence_label is None:
+        # An unlabelled collection cannot be classified, so it is never deleted.
+        print('Skipping {}: no sequence_label'.format(item['uuid']), file=sys.stderr)
+        continue
+
+    # The SRA samples start with SRR or ERR
+    if sequence_label.startswith(('SRR', 'ERR')):
+        continue
+
+    created_at_obj = datetime.strptime(item["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ')
+    if created_at_obj < date_time_obj:
+        api.collections().delete(uuid=item['current_version_uuid']).execute()
+        num_sample_deleted += 1
+        print(sequence_label)
+
+print('num_sample_deleted: {}'.format(num_sample_deleted))
diff --git a/scripts/esr_samples/Pathogen.cl.1.0.xlsx b/scripts/esr_samples/Pathogen.cl.1.0.xlsx
new file mode 100644
index 0000000..c7c6393
Binary files /dev/null and b/scripts/esr_samples/Pathogen.cl.1.0.xlsx differ
diff --git a/scripts/esr_samples/esr_samples.py b/scripts/esr_samples/esr_samples.py
new file mode 100644
index 0000000..bd59612
--- /dev/null
+++ b/scripts/esr_samples/esr_samples.py
@@ -0,0 +1,103 @@
+"""Create one metadata YAML file per ESR sample listed in the spreadsheet.
+
+Run it from this directory: the spreadsheet, the YAML template and the ontology
+dictionaries are located through paths relative to the working directory.
+"""
+import os
+import sys
+import pandas as pd
+from string import Template
+from dateutil.parser import parse
+
+path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx'
+
+path_template_yaml = 'template.yaml'
+# Removed from the template (for now)
+# license:
+#    license_type: "http://creativecommons.org/licenses/by/4.0/"
+#    title: "SARS-CoV-2 New Zealand"
+#    attribution_name: "ESR"
+#    attribution_url: "https://www.esr.cri.nz/"
+
+dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+
+dir_output = 'yaml'
+suffix = '.consensus'
+
+# Locations that have no dictionary entry of their own; the loop below
+# reports all of them as plain 'New Zealand'.
+new_zealand_regions = {
+    'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern',
+    'New Zealand:Waikato',
+    'New Zealand:Lakes', 'New Zealand:Nelson Marlborough', 'New Zealand:South Canterbury',
+    'New Zealand:MidCentral',
+    'New Zealand:Tairawhiti', 'New Zealand:Hawkes Bay', 'New Zealand:NA', 'New Zealand:Taranaki',
+}
+
+os.makedirs(dir_output, exist_ok=True)
+
+# Maps every term of the dictionary CSV files to its ontology URI.
+term_to_uri_dict = {}
+
+for name_xxx_csv in os.listdir(dir_dict_ontology_standardization):
+    if not name_xxx_csv.endswith('.csv'):
+        continue
+
+    path_dict_xxx_csv = os.path.join(dir_dict_ontology_standardization, name_xxx_csv)
+    print('Read {}'.format(path_dict_xxx_csv))
+
+    with open(path_dict_xxx_csv) as f:
+        for line in f:
+            line = line.strip('\n')
+
+            # More than one comma means the term itself contains commas; such a
+            # term is wrapped in double quotes, so split on the closing quote.
+            if len(line.split(',')) > 2:
+                term, uri = line.split('",')
+            else:
+                term, uri = line.split(',')
+
+            term = term.strip('"')
+
+            if term in term_to_uri_dict:
+                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                continue
+
+            term_to_uri_dict[term] = uri
+
+metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12)
+
+# The template is the same for every sample, so it is read only once.
+with open(path_template_yaml) as f:
+    src = Template(f.read())
+
+# Files written by this run; a set, so a repeated sample name counts once.
+paths_written = set()
+
+# Maybe not the best pandas-way to do this
+for _, row in metadata_df.iterrows():
+    geo_loc_name = row['*geo_loc_name'].replace(': ', ':')
+    if geo_loc_name not in term_to_uri_dict:
+        if geo_loc_name not in new_zealand_regions:
+            # Abort with an error so an unmapped location is not mistaken for a complete run.
+            sys.exit('Unknown geo_loc_name: {}'.format(geo_loc_name))
+        geo_loc_name = 'New Zealand'
+
+    d = {
+        'host_species': term_to_uri_dict[row['*host']],
+        'sample_id': row['*sample_name'],
+        # str() because pandas may hand back a date cell as a timestamp instead of text.
+        'collection_date': parse(str(row['*collection_date'])).strftime('%Y-%m-%d'),
+        'collection_location': term_to_uri_dict[geo_loc_name],
+        'specimen_source': term_to_uri_dict[row['*isolation_source']],
+        'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049',
+
+        'submitter_sample_id': row['bioproject_accession'],
+    }
+
+    path_yaml = os.path.join(dir_output, '{}{}.yaml'.format(row['*sample_name'], suffix))
+    with open(path_yaml, 'w') as fw:
+        fw.write(src.substitute(d))
+    paths_written.add(path_yaml)
+
+print('{} YAML files created.'.format(len(paths_written)))
diff --git a/scripts/esr_samples/template.yaml b/scripts/esr_samples/template.yaml
new file mode 100644
index 0000000..98f08a6
--- /dev/null
+++ b/scripts/esr_samples/template.yaml
@@ -0,0 +1,29 @@
+id: placeholder
+
+host:
+    host_species: "$host_species"
+
+sample:
+    sample_id: "$sample_id"
+    collection_date: "$collection_date"
+    collection_location: "$collection_location"
+    specimen_source: ["$specimen_source"]
+
+virus:
+    virus_species: "$virus_species"
+
+technology:
+    sample_sequencing_technology: ["http://www.ebi.ac.uk/efo/EFO_0008632"]
+    sequence_assembly_method: "https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics"
+    additional_technology_information: "Artic V3 workflow"
+
+submitter:
+    authors: ["Jemma L Geoghegan", "Xiaoyun Ren", "Matthew Storey", "James Hadfield", "Lauren Jelley", "Sarah Jefferies", "Jill Sherwood", "Shevaun Paine", "Sue Huang", "Jordan Douglas", "Fabio K Mendes", "Andrew Sporle", "Michael G Baker", "David R Murdoch", "Nigel French", "Colin R Simpson", "David Welch", "Alexei J Drummond", "Edward C Holmes", "Sebastian Duchene", "Joep de Ligt"]
+    submitter_name: ["Joep de Ligt"]
+    submitter_address: "PO Box 50348, Porirua 5240, New Zealand"
+    originating_lab: "ESR"
+    submitter_sample_id: "$submitter_sample_id"
+    publication: "https://doi.org/10.1101/2020.08.05.20168930"
+    submitter_orcid: ["https://orcid.org/0000-0003-0970-0153"]
+    additional_submitter_information: "2020-08-20"
+    
\ No newline at end of file
diff --git a/workflows/pangenome-generate/delete_entries_on_arvados.py b/workflows/pangenome-generate/delete_entries_on_arvados.py
deleted file mode 100644
index d281456..0000000
--- a/workflows/pangenome-generate/delete_entries_on_arvados.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import sys
-import arvados
-import arvados.collection
-
-from datetime import datetime
-
-date_time_str = '2020-08-20'
-date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d')
-
-api = arvados.api()
-keepclient = arvados.keep.KeepClient(api_client=api)
-
-validated = arvados.util.list_all(api.collections().list, filters=[
-    ["owner_uuid", "=", sys.argv[1]],
-#    ["properties.status", "=", "validated"]
-])
-
-# validated.sort(key=lambda v: v["portable_data_hash"])
-
-num_sample_deleted = 0
-for item in validated:
-    sequence_label = item['properties']["sequence_label"]
-
-    # The SRA samples start with SRR or ERR
-    if not sequence_label.startswith('SRR') and not sequence_label.startswith('ERR'):
-        created_at_obj = datetime.strptime(item["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ')
-        # print(item, created_at_obj)
-
-        if created_at_obj < date_time_obj:
-            api.collections().delete(uuid=item['current_version_uuid']).execute()
-            num_sample_deleted += 1
-            print(sequence_label)
-
-print('num_sample_deleted: {}'.format(num_sample_deleted))
-- 
cgit v1.2.3