1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
import os
import pandas as pd
from string import Template
from dateutil.parser import parse
import re
import sys
# Spreadsheet holding the sample metadata, one row per sample.
xlsx = '../../test/data/10_samples.xlsx'

# Text file with the string.Template placeholders for one sample's YAML.
template_yaml = 'template.yaml'

# Directory that receives the generated <sample>.yaml files.
dir_output = 'yaml'

# Wikidata entity used when the city is not listed in CITY_ENTITIES (Memphis).
DEFAULT_LOCATION = "http://www.wikidata.org/entity/Q16563"

# City name -> Wikidata entity URI. Built once instead of once per row.
CITY_ENTITIES = {
    "Pegram": "http://www.wikidata.org/entity/Q3289517",
    "Alexander": "http://www.wikidata.org/entity/Q79663",
    "Smithville": "http://www.wikidata.org/entity/Q2145339",
    "Nashville": "http://www.wikidata.org/entity/Q23197",
    "Madison": "http://www.wikidata.org/entity/Q494755",
}


def resolve_location(locationx):
    """Return the Wikidata entity URI for a "City, State, USA" string.

    The first key of CITY_ENTITIES that *locationx* starts with wins
    (dictionary order); DEFAULT_LOCATION is returned when none does.
    The test is a prefix test only, so "Madison" also matches a string
    that merely begins with "Madison".
    """
    for city, entity in CITY_ENTITIES.items():
        if locationx.startswith(city):
            return entity
    return DEFAULT_LOCATION


def build_fields(row):
    """Return the template substitutions for one spreadsheet row.

    *row* is indexed by the column names 'Sample ID', 'Collection Date',
    'City' and 'State'. The returned dict has the keys sample_id,
    sample_name, collection_date (formatted as YYYY-MM-DD), location,
    locationx and strain.

    Raises:
        TypeError: if the 'City' or 'State' cell is not a string.
    """
    sample = row['Sample ID']
    # join() rejects non-string cells instead of silently writing them out.
    locationx = ", ".join([row['City'], row['State'], "USA"])
    return {
        "sample_id": sample,
        "sample_name": sample,
        "collection_date": parse(str(row['Collection Date'])).strftime('%Y-%m-%d'),
        "location": resolve_location(locationx),
        "locationx": locationx,
        "strain": f"SARS-CoV-2/human/USA/{sample}/2020",
    }


def main():
    """Write one YAML file per row of the spreadsheet into dir_output.

    Prints the loaded table, a progress line per sample, and the uploader
    command line to run for each generated file.
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(dir_output, exist_ok=True)

    table = pd.read_excel(xlsx)
    print(table)

    # The template does not depend on the row, so it is read only once.
    with open(template_yaml) as f:
        text = Template(f.read())

    for _, row in table.iterrows():
        sample = row['Sample ID']
        print(f"Processing sample {sample}...")

        # Compute every value before opening the output file, so that a bad
        # row does not leave an empty, truncated YAML file behind.
        fields = build_fields(row)
        with open(os.path.join(dir_output, f"{sample}.yaml"), 'w') as fw:
            fw.write(text.substitute(**fields))

        print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa")


if __name__ == "__main__":
    main()
|