scripts/uthsc_samples/uthsc_samples.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

import os
import pandas as pd
from string import Template
from dateutil.parser import parse
import re

import sys

# Sample metadata, one row per sample, kept in a spreadsheet.
# NOTE: all paths below are relative, so they resolve against the current
# working directory the script is started from.
xlsx = '../../test/data/10_samples.xlsx'

# Text template (string.Template placeholders) filled in once per sample.
template_yaml = 'template.yaml'

# Directory that receives one generated "<Sample ID>.yaml" file per row.
dir_output = 'yaml'

# exist_ok=True replaces the check-then-create sequence, which could fail if
# the directory appeared between the existence test and the makedirs() call.
os.makedirs(dir_output, exist_ok=True)

table = pd.read_excel(xlsx)

# Echo the loaded metadata so the operator can eyeball it before files are
# generated.
print(table)

# Wikidata entity used when the sample's city is not listed below (Memphis).
default_location = "http://www.wikidata.org/entity/Q16563"

# City name -> Wikidata entity. Built once instead of on every row, and no
# longer bound to the name `map`, which shadowed the builtin.
city_locations = {
    "Pegram": "http://www.wikidata.org/entity/Q3289517",
    "Alexander": "http://www.wikidata.org/entity/Q79663",
    "Smithville": "http://www.wikidata.org/entity/Q2145339",
    "Nashville": "http://www.wikidata.org/entity/Q23197",
    "Madison": "http://www.wikidata.org/entity/Q494755",
}

# The template is identical for every sample, so read and parse it a single
# time rather than re-opening the file for each row.
with open(template_yaml) as f:
    text = Template(f.read())

for _, row in table.iterrows():
    sample = row['Sample ID']
    print(f"Processing sample {sample}...")

    # Normalise whatever date representation the spreadsheet holds to
    # ISO 8601 (YYYY-MM-DD).
    collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')

    # Human-readable location; always suffixed with ", USA".
    locationx = row['City'] + ", " + row['State'] + ", USA"

    # The first listed city that is a prefix of the location string wins;
    # anything unrecognised falls back to the default entity.
    location = next(
        (uri for city, uri in city_locations.items()
         if locationx.startswith(city)),
        default_location,
    )

    # NOTE(review): the year is hard-coded to 2020 and is not derived from
    # collection_date -- confirm this is intended for later samples.
    strain = f"SARS-CoV-2/human/USA/{sample}/2020"

    # Render before opening the output file so that a failed substitution
    # (e.g. a placeholder missing from the keyword list) does not leave an
    # empty, truncated YAML file behind.
    rendered = text.substitute(
        sample_id=sample,
        sample_name=sample,
        collection_date=collection_date,
        location=location,
        locationx=locationx,
        strain=strain,
    )

    with open(os.path.join(dir_output, f"{sample}.yaml"), 'w') as fw:
        fw.write(rendered)

    print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa")