about summary refs log tree commit diff
path: root/scripts/uthsc_samples/uthsc_samples.py
diff options
context:
space:
mode:
authorlltommy2020-11-11 09:56:12 +0100
committerlltommy2020-11-11 09:56:12 +0100
commitd6aa323b6fc7a82e45cc1df51fc72c2d547146eb (patch)
tree6e8b77bde4dc34fab3fa8804906f3cb821f61dae /scripts/uthsc_samples/uthsc_samples.py
parentc5fe5de7e4c77bfb48b1ae2f662c2d9cc120c06e (diff)
parentc872248e43c1c66e5fed8ef341f7b4ac21d63e6f (diff)
downloadbh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.gz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.lz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.zip
Merge branch 'master' of https://github.com/arvados/bh20-seq-resource
Diffstat (limited to 'scripts/uthsc_samples/uthsc_samples.py')
-rw-r--r--scripts/uthsc_samples/uthsc_samples.py59
1 files changed, 59 insertions, 0 deletions
diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
new file mode 100644
index 0000000..54c70ee
--- /dev/null
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -0,0 +1,59 @@
+import os
+import pandas as pd
+from string import Template
+from dateutil.parser import parse
+import re
+
+import sys
+
+# Metadata in tabular format in a spreadsheet(?!)
+xlsx = '../../test/data/10_samples.xlsx'
+
+# Template in a text file
+template_yaml = 'template.yaml'
+
+dir_output = 'yaml'
+
+if not os.path.exists(dir_output):
+    os.makedirs(dir_output)
+
+table = pd.read_excel(xlsx)
+
+print(table)
+
+for index, row in table.iterrows():
+    sample = row['Sample ID']
+    print(f"Processing sample {sample}...")
+
+    with open(template_yaml) as f:
+      text = Template(f.read())
+      with open(os.path.join(dir_output,f"{sample}.yaml"), 'w') as fw:
+          sample_id = sample
+          sample_name = sample
+          collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
+          locationx = row['City']+", "+row['State']+", USA"
+          location = "http://www.wikidata.org/entity/Q16563" # Memphis by default
+          map = {
+              "Pegram": "http://www.wikidata.org/entity/Q3289517",
+              "Alexander": "http://www.wikidata.org/entity/Q79663",
+              "Smithville": "http://www.wikidata.org/entity/Q2145339",
+              "Nashville": "http://www.wikidata.org/entity/Q23197",
+              "Madison": "http://www.wikidata.org/entity/Q494755"
+              }
+
+          for name in map:
+              p = re.compile(name)
+              if p.match(locationx):
+                  location = map[name]
+                  break
+
+          strain = f"SARS-CoV-2/human/USA/{sample}/2020"
+          fw.write(text.substitute(sample_id=sample_id,
+                                   sample_name=sample_name,
+                                   collection_date=collection_date,
+                                   location=location,
+                                   locationx=locationx,
+                                   strain=strain
+                                   ))
+
+          print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa")