aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPjotr Prins2020-05-06 09:03:08 -0500
committerPjotr Prins2020-05-06 09:03:08 -0500
commit0031e778ee1ad8b934411da5082fcb3115646e67 (patch)
treebe17fb9aff6351e967a9210cd4c58a60ad55a382
parent7d7af6cde75d09da7a05cf5bc05ef2556c3aea92 (diff)
parentb6d846b5de6c67b28adab1fa520953115a1a1e30 (diff)
downloadbh20-seq-resource-0031e778ee1ad8b934411da5082fcb3115646e67.tar.gz
bh20-seq-resource-0031e778ee1ad8b934411da5082fcb3115646e67.tar.lz
bh20-seq-resource-0031e778ee1ad8b934411da5082fcb3115646e67.zip
Merge branch 'master' of github.com:arvados/bh20-seq-resource
-rw-r--r--.gitignore16
-rw-r--r--README.md10
-rw-r--r--bh20seqanalyzer/main.py2
-rw-r--r--bh20sequploader/SARS-CoV-2-reference.fasta430
-rw-r--r--bh20sequploader/bh20seq-options.yml30
-rw-r--r--bh20sequploader/bh20seq-schema.yml65
-rw-r--r--bh20sequploader/bh20seq-shex.rdf12
-rw-r--r--bh20sequploader/main.py7
-rw-r--r--bh20sequploader/qc_fasta.py63
-rw-r--r--bh20simplewebuploader/__init__.py0
-rw-r--r--bh20simplewebuploader/main.py127
-rw-r--r--bh20simplewebuploader/static/main.css269
-rw-r--r--bh20simplewebuploader/static/main.js47
-rw-r--r--bh20simplewebuploader/templates/form.html325
-rw-r--r--example/maximum_metadata_example.yaml46
-rw-r--r--example/metadata.yaml43
-rw-r--r--example/minimal_metadata_example.yaml (renamed from example/minimal_example.yaml)6
-rw-r--r--scripts/dict_ontology_standardization/ncbi_countries.csv9
-rw-r--r--scripts/dict_ontology_standardization/ncbi_speciesman_source.csv1
-rw-r--r--scripts/docker/Dockerfile10
-rwxr-xr-xscripts/from_genbank_to_fasta_and_yaml.py96
-rw-r--r--scripts/import.cwl30
-rw-r--r--scripts/import_to_arvados.py14
-rw-r--r--scripts/sequences.acc297
-rw-r--r--semantic_enrichment/countries.ttl279
-rw-r--r--semantic_enrichment/labels.ttl24
-rw-r--r--setup.py8
-rw-r--r--workflows/pangenome-generate/minimap2.cwl2
-rw-r--r--workflows/pangenome-generate/odgi_to_rdf.cwl4
29 files changed, 1862 insertions, 410 deletions
diff --git a/.gitignore b/.gitignore
index 925698c..9057a4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,20 @@
*.py~
+
+# Distribution / packaging
build/
cache.txt
metadata.ttl
+__pycache__/
+eggs/
+.eggs/
+*.egg-info/
+*.egg
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
diff --git a/README.md b/README.md
index 7588bbc..e8896a0 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,14 @@ Note that you will need to repeat the `. venv/bin/activate` step from this direc
3. **Install the tool.** Once in your virtualenv, install this project:
+Install from PyPI:
+
+```sh
+pip3 install bh20-seq-uploader
+```
+
+Install from git:
+
```sh
pip3 install git+https://github.com/arvados/bh20-seq-resource.git@master
```
@@ -166,7 +174,7 @@ To run it locally:
```
virtualenv --python python3 venv
. venv/bin/activate
-pip install -e .[web]
+pip install -e ".[web]"
env FLASK_APP=bh20simplewebuploader/main.py flask run
```
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 8d0f562..07e5f69 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -90,7 +90,6 @@ def run_workflow(api, parent_project, workflow_uuid, name, inputobj):
cmd = ["arvados-cwl-runner",
"--submit",
"--no-wait",
- "--debug",
"--project-uuid=%s" % project["uuid"],
"arvwf:%s" % workflow_uuid,
tmp.name]
@@ -137,6 +136,7 @@ def start_pangenome_analysis(api,
"location": schema_ref
}
}
+ validated.sort(key=lambda v: v["portable_data_hash"])
for v in validated:
inputobj["inputReads"].append({
"class": "File",
diff --git a/bh20sequploader/SARS-CoV-2-reference.fasta b/bh20sequploader/SARS-CoV-2-reference.fasta
new file mode 100644
index 0000000..b364687
--- /dev/null
+++ b/bh20sequploader/SARS-CoV-2-reference.fasta
@@ -0,0 +1,430 @@
+>NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
+ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA
+CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC
+TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG
+TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC
+CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC
+GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG
+CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT
+GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC
+GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT
+TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA
+GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG
+TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG
+CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTG
+TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTG
+CTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAA
+ATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA
+CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCAC
+CAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA
+GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACT
+ACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAG
+GACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCG
+CACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCA
+CGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACA
+ACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA
+GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGAT
+TATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAG
+GTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCG
+TGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCC
+GCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTG
+ATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG
+GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTT
+AAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA
+TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGT
+AAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTA
+GGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCC
+TACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT
+AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAA
+GCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGT
+ACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAA
+GGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT
+GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAA
+ATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC
+ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAA
+TTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAG
+AAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATT
+TGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAA
+CAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTC
+AACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT
+AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACA
+GTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTA
+CTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAG
+TTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGT
+GAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTAT
+TATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA
+TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGT
+GAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA
+AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAAC
+TCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCA
+GATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTG
+ATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT
+GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAAT
+GGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTA
+TTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGC
+AGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA
+TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAA
+CAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA
+TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTT
+TCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAG
+AACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACA
+ACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCAC
+CTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTA
+AGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA
+ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGT
+AAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTG
+ATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAA
+TGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAA
+ATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTA
+ACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT
+GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGT
+GGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT
+TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTC
+ACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGT
+GAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAG
+ACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG
+TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAG
+TTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAAC
+CATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAA
+CCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT
+GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAAC
+CTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG
+TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGA
+ATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGA
+AAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAA
+TAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTT
+ACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTG
+CTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC
+AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTA
+TTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAG
+CAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAA
+TTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTAC
+TCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAG
+GCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT
+TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAA
+TGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT
+ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTC
+TTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATC
+TTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTT
+GTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG
+GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGT
+GATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAA
+GACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCA
+TCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC
+AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAAT
+GTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT
+AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTT
+AATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTG
+AACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGT
+TGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTT
+ACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTG
+GTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT
+ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAG
+AATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAG
+CACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTT
+TGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAA
+ATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTA
+ACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC
+ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGC
+ACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC
+CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTT
+TAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTAT
+GAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACC
+TTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC
+AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCA
+GGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTG
+GTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTA
+CTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC
+CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTT
+ACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT
+CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGG
+TTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTG
+CGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTAC
+GCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGC
+TACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTC
+TTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC
+ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTT
+GATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAG
+ATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGG
+ACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAG
+TTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTT
+ACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG
+TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCAT
+GCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA
+CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTT
+TCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTA
+ACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTG
+CTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA
+TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACA
+ATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTC
+AATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTC
+TGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC
+ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATA
+TGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT
+AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTG
+ACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCT
+CTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTG
+TGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTC
+TTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTG
+GTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA
+GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTA
+GCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAAC
+TCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAA
+AGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTA
+GACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTA
+GTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA
+TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCA
+GCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG
+AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAA
+TGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACA
+ACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACAT
+TTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG
+TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCT
+GCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTA
+CACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACT
+TGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC
+TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTAT
+ACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT
+ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGAT
+GCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGT
+GTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGG
+TGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTA
+AAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAG
+TCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA
+GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCA
+CAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAA
+ATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTT
+GTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTC
+CAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCA
+ACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC
+ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATG
+ATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT
+AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAA
+GATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTG
+TAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGT
+TGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA
+AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATG
+ACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGG
+ACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAG
+CTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT
+ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTC
+AGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT
+GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTC
+AGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAG
+ACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCT
+AACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGAC
+TTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCC
+TACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC
+TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAG
+GAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAG
+TGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTT
+AGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATA
+GATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACC
+AGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC
+ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTAC
+AACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC
+ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTAT
+GCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTA
+TGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATAC
+AATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC
+GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTA
+TAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATA
+CATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGAT
+AACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG
+TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTT
+ATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT
+GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATT
+GTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAA
+TACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGT
+GATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTG
+AGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCT
+TTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT
+AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACC
+GAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATT
+AAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATC
+TCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGG
+GACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGT
+GTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT
+AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACAT
+TAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA
+AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTAC
+ATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATT
+TCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCC
+TGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA
+GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCAC
+AAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTA
+TAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGC
+TCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA
+ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTT
+GCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC
+TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACA
+CTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACT
+CATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAA
+GAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTG
+TTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTA
+TGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA
+CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAA
+GTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATC
+TATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTT
+TCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTA
+TGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCA
+TGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT
+AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAA
+AGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA
+CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGT
+GACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTG
+TATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAG
+AGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC
+ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTC
+CATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTAT
+AACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCT
+TATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA
+ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGG
+ACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA
+GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTA
+AACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGA
+CTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAA
+CCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTAT
+TTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCC
+CAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG
+AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTA
+AACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATT
+AGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTA
+CTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTA
+CAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTT
+ATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG
+ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAA
+AATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT
+ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTC
+GCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTA
+TACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTAC
+GGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT
+TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAA
+ATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCT
+AGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATG
+GGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG
+GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTG
+GAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA
+AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAG
+GTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAA
+CAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCA
+ATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCA
+GTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATG
+TCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC
+TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCC
+CTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCAT
+TTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGC
+GAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTC
+AAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTA
+TTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT
+TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCA
+GGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA
+ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTT
+GAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATT
+GTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTG
+TTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC
+ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTAT
+GCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTG
+ATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTC
+TAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA
+GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACT
+TTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT
+TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAAC
+AAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTC
+TGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGA
+GATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAAC
+CAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTA
+CTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC
+TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACT
+CAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTG
+GTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTAC
+CACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCA
+ACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAA
+TAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC
+AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCA
+TTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT
+GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACC
+TTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGG
+ACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTG
+GAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA
+AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCA
+CAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATA
+TCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAG
+TTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT
+ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTA
+TGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA
+GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTT
+TCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACA
+CATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACC
+TGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTA
+GGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTG
+CCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC
+ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGT
+ATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACG
+ACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGA
+ATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTC
+GCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCT
+TGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT
+GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTG
+GCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT
+AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTT
+CTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTA
+CTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATG
+GGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA
+ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGC
+CTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAAT
+TTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTAC
+TCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT
+TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGT
+GAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT
+CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGA
+TTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTC
+CTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTA
+AGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAAT
+AAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTC
+ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC
+TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGT
+GATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAA
+GAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTG
+ACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAG
+CAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTAC
+TATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA
+AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAAC
+CAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA
+GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACA
+TACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAAT
+TTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACT
+GTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT
+ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTG
+CTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAA
+GATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTG
+TAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC
+GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAA
+TTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT
+GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGA
+AGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTG
+ATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAG
+TAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACT
+GCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTC
+CAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG
+TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCT
+GGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAA
+AAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAAC
+ATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGT
+AGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCA
+ATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG
+TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGG
+CAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC
+AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACA
+ATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACG
+TGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGC
+TGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC
+TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGAT
+TTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATG
+CAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTT
+GTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT
+TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTAC
+GATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT
+TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAA
+
diff --git a/bh20sequploader/bh20seq-options.yml b/bh20sequploader/bh20seq-options.yml
index 104ed6c..c553f41 100644
--- a/bh20sequploader/bh20seq-options.yml
+++ b/bh20sequploader/bh20seq-options.yml
@@ -35,38 +35,8 @@ sample_sequencing_technology:
Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
-sample_sequencing_technology2:
- Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
- Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566
- Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567
- Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205
- Illumina: http://purl.obolibrary.org/obo/OBI_0000759
- IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894
- Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894
- Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632
- Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
- Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
-
-sample_sequencing_technology3:
- Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
- Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566
- Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567
- Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205
- Illumina: http://purl.obolibrary.org/obo/OBI_0000759
- IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894
- Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894
- Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632
- Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
- Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
-
specimen_source:
nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831
oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835
sputum: http://purl.obolibrary.org/obo/NCIT_C13278
bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195
-
-specimen_source2:
- nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831
- oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835
- sputum: http://purl.obolibrary.org/obo/NCIT_C13278
- bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index ea813a0..99e1a11 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -48,6 +48,8 @@ $graph:
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C25688
+ _type: "@id"
+ noLinkCheck: true
host_treatment:
doc: Process in which the act is intended to modify or alter host status
type: string?
@@ -55,9 +57,16 @@ $graph:
_id: http://www.ebi.ac.uk/efo/EFO_0000727
host_vaccination:
doc: List of vaccines given to the host
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/VO_0000002
+ ethnicity:
+ doc: Ethnicity of the host e.g. http://purl.obolibrary.org/obo/HANCESTRO_0010
+ type: string?
+ jsonldPredicate:
+ _id: http://semanticscience.org/resource/SIO_001014
+ _type: "@id"
+ noLinkCheck: true
additional_host_information:
doc: Field for additional host information
type: string?
@@ -90,20 +99,13 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001895
collecting_institution:
- doc: Institute that was responsible of sampeling
+ doc: Institute that was responsible for sampling
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C41206
specimen_source:
doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab)
- type: string?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/OBI_0001479
- _type: "@id"
- noLinkCheck: true
- specimen_source2:
- doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb)
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
@@ -119,16 +121,18 @@ $graph:
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_001167
source_database_accession:
- doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here
- type: string?
+ doc: If data is deposited at a public resource (e.g. Genbank, ENA) enter the Accession Id here. Please use a resolvable URL (e.g. http://identifiers.org/insdc/LC522350.1#sequence)
+ type: string[]?
jsonldPredicate:
_id: http://edamontology.org/data_2091
+ _type: "@id"
+ noLinkCheck: true
- name: virusSchema
type: record
fields:
virus_species:
- doc: The name of a taxon from the NCBI taxonomy database
+ doc: The name of virus species from the NCBI taxonomy database, e.g. http://purl.obolibrary.org/obo/NCBITaxon_2697049 for Severe acute respiratory syndrome coronavirus 2
type: string
jsonldPredicate:
_id: http://edamontology.org/data_1875
@@ -145,21 +149,7 @@ $graph:
fields:
sample_sequencing_technology:
doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
- type: string?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/OBI_0600047
- _type: "@id"
- noLinkCheck: true
- sample_sequencing_technology2:
- doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
- type: string?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/OBI_0600047
- _type: "@id"
- noLinkCheck: true
- sample_sequencing_technology3:
- doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
@@ -170,13 +160,8 @@ $graph:
jsonldPredicate:
_id: http://www.ebi.ac.uk/efo/EFO_0002699
sequencing_coverage:
- doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x)
- type: float?
- jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/FLU_0000848
- sequencing_coverage2:
- doc: If a second sequence technology was used you can submit its coverage here
- type: float?
+ doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. [100]) - if multiple technologies were used multiple float values can be submitted e.g. [100, 20]
+ type: double[]?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/FLU_0000848
additional_technology_information:
@@ -189,13 +174,13 @@ $graph:
type: record
fields:
authors:
- doc: Name of the author(s)
- type: string
+ doc: Name(s) of the author(s)
+ type: string[]
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C42781
submitter_name:
- doc: Name of the submitter
- type: string?
+ doc: Name of the submitter(s)
+ type: string[]?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000116
submitter_address:
@@ -228,7 +213,7 @@ $graph:
_id: http://purl.obolibrary.org/obo/NCIT_C19026
submitter_orcid:
doc: ORCID of the submitter as a full URI, e.g. https://orcid.org/0000-0002-1825-0097
- type: string?
+ type: string[]?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
_type: "@id"
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index c3b0ae1..cdf2296 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -25,8 +25,9 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
obo:NCIT_C42574 [ obo:UO_~ ] ?;
obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ;
efo:EFO_0000727 xsd:string ?;
- obo:VO_0000002 xsd:string ?;
+ obo:VO_0000002 xsd:string {0,10};
sio:SIO_001167 xsd:string ?;
+ sio:SIO_001014 [ obo:HANCESTRO_~ ] ? ; #ethnicity
}
:sampleShape {
@@ -38,25 +39,26 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
obo:OBI_0001479 IRI {0,2};
obo:OBI_0001472 xsd:string ?;
sio:SIO_001167 xsd:string ?;
+ edam:data_2091 IRI {0,3};
}
:submitterShape {
- obo:NCIT_C42781 xsd:string ;
- sio:SIO_000116 xsd:string ?;
+ obo:NCIT_C42781 xsd:string + ;
+ sio:SIO_000116 xsd:string *;
sio:SIO_000172 xsd:string ?;
obo:NCIT_C37984 xsd:string ?;
obo:OBI_0600047 xsd:string ?;
obo:NCIT_C37900 xsd:string ?;
efo:EFO_0001741 xsd:string ?;
obo:NCIT_C19026 xsd:string ?;
- sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?;
+ sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/ {0,10};
sio:SIO_001167 xsd:string ?;
}
:technologyShape {
obo:OBI_0600047 IRI {0,3} ;
efo:EFO_0002699 xsd:string ?;
- obo:FLU_0000848 xsd:double {0,2};
+ obo:FLU_0000848 xsd:double OR xsd:integer {0,3};
sio:SIO_001167 xsd:string ?;
}
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 4c4711d..10d1029 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -62,11 +62,16 @@ def main():
external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
+ try:
+ username = getpass.getuser()
+ except KeyError:
+ username = "unknown"
+
properties = {
"sequence_label": seqlabel,
"upload_app": "bh20-seq-uploader",
"upload_ip": external_ip,
- "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname())
+ "upload_user": "%s@%s" % (username, socket.gethostname())
}
col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" %
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index e47d66b..16cf2c9 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -1,6 +1,25 @@
import pkg_resources
import tempfile
import magic
+import subprocess
+import tempfile
+import logging
+import re
+
+def read_fasta(sequence):
+ entries = 0
+ bases = []
+ label = None
+ for line in sequence:
+ if line.startswith(">"):
+ label = line
+ entries += 1
+ else:
+ bases.append(line)
+ if entries > 1:
+ raise ValueError("FASTA file contains multiple entries")
+ break
+ return label, bases
def qc_fasta(sequence):
schema_resource = pkg_resources.resource_stream(__name__, "validation/formats")
@@ -13,16 +32,44 @@ def qc_fasta(sequence):
sequence.seek(0)
if seq_type == "text/fasta":
# ensure that contains only one entry
- entries = 0
- for line in sequence:
- if line.startswith(">"):
- entries += 1
- if entries > 1:
- raise ValueError("FASTA file contains multiple entries")
- break
+ submitlabel, submitseq = read_fasta(sequence)
sequence.seek(0)
+
+ with tempfile.NamedTemporaryFile() as tmp1:
+ refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta")
+ tmp1.write(refstring)
+ tmp1.write(submitlabel.encode("utf8"))
+ tmp1.write(("".join(submitseq)).encode("utf8"))
+ tmp1.flush()
+ try:
+ cmd = ["clustalw", "-infile="+tmp1.name,
+ "-quicktree", "-iteration=none", "-type=DNA"]
+ print("QC checking similarity to reference")
+ print(" ".join(cmd))
+ result = subprocess.run(cmd, stdout=subprocess.PIPE)
+ res = result.stdout.decode("utf-8")
+ g1 = re.search(r"^Sequence 1: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE)
+ refbp = float(g1.group(1))
+ g2 = re.search(r"^Sequence 2: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE)
+ subbp = float(g2.group(1))
+ g3 = re.search(r"^Sequences \(1:2\) Aligned\. Score: (\d+(\.\d+)?)$", res, flags=re.MULTILINE)
+ similarity = float(g3.group(1))
+
+ print(g1.group(0))
+ print(g2.group(0))
+ print(g3.group(0))
+ except Exception as e:
+ logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e)
+
+ if (subbp/refbp) < .7:
+ raise ValueError("QC fail: submit sequence length is shorter than 70% reference")
+ if (subbp/refbp) > 1.3:
+ raise ValueError("QC fail: submit sequence length is greater than 130% reference")
+ if similarity < 70.0:
+ raise ValueError("QC fail: submit similarity is less than 70%")
+
return "sequence.fasta"
elif seq_type == "text/fastq":
return "reads.fastq"
else:
- raise ValueError("Sequence file does not look like FASTA or FASTQ")
+ raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ")
diff --git a/bh20simplewebuploader/__init__.py b/bh20simplewebuploader/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/bh20simplewebuploader/__init__.py
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index e88eb4c..126b8dd 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -8,7 +8,7 @@ import re
import string
import yaml
import pkg_resources
-from flask import Flask, request, redirect, send_file, send_from_directory, render_template
+from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
import os.path
import requests
@@ -197,6 +197,14 @@ def generate_form(schema, options):
record['type'] = 'number'
# Choose a reasonable precision for the control
record['step'] = '0.0001'
+
+ ### This is to fix the homepage for the moment ## needs more love though
+ # implementation of the [] stuff instead of just text fields
+ ## ToDo - implement lists
+ elif field_type == 'string[]':
+ record['type'] = 'text'
+ elif field_type == 'float[]':
+ record['type'] = 'text'
else:
raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name))
yield record
@@ -358,7 +366,8 @@ def getAllaccessions():
payload = {'query': query, 'format': 'json'}
r = requests.get(baseURL, params=payload)
result = r.json()['results']['bindings']
- return str(result)
+ return jsonify([{'uri': x['fasta']['value'],
+ 'value': x['value']['value']} for x in result])
# parameter must be encoded e.g. http://arvados.org/keep:6e6276698ed8b0e6cd21f523e4f91179+123/sequence.fasta must become
@@ -368,26 +377,69 @@ def getDetailsForSeq():
seq_id = request.args.get('seq')
query="""SELECT DISTINCT ?key ?value WHERE {<placeholder> ?x [?key ?value]}"""
query=query.replace("placeholder", seq_id)
- print(query)
+ payload = {'query': query, 'format': 'json'}
+ r = requests.get(baseURL, params=payload)
+ result = r.json()['results']['bindings']
+ return jsonify([{'uri': x['key']['value'],
+ 'value': x['value']['value']} for x in result])
+
+
+@app.route('/api/getSEQCountbytech', methods=['GET'])
+def getSEQCountbytech():
+ query="""SELECT ?tech ?tech_label (count(?fasta) as ?fastaCount) WHERE
+ {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> ?tech]
+ BIND (concat(?tech,"_label") as ?tech_label)}
+ GROUP BY ?tech ?tech_label ORDER BY DESC (?fastaCount)
+ """
+ payload = {'query': query, 'format': 'json'}
+ r = requests.get(baseURL, params=payload)
+ result = r.json()['results']['bindings']
+ return jsonify([{'Fasta Count': x['fastaCount']['value'],
+ 'tech': x['tech']['value'],
+ 'Label': x['tech_label']['value']} for x in result])
+
+## Is this one really necessary or should we just use getSEQCountbytech instead?
+@app.route('/api/getAvailableTech', methods=['GET'])
+def getAvailableTech():
+ query="""SELECT distinct ?tech ?tech_label WHERE
+ {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> ?tech]
+ BIND (concat(?tech,"_label") as ?tech_label)
+ } """
payload = {'query': query, 'format': 'json'}
r = requests.get(baseURL, params=payload)
result = r.json()['results']['bindings']
return str(result)
+## List all Sequences/submissions by a given tech, as example e.g. http://purl.obolibrary.org/obo/OBI_0000759
+## Has to be encoded again so should be --> http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FOBI_0000759
@app.route('/api/getSEQbytech', methods=['GET'])
def getSEQbytech():
- query="""SELECT ?specimen_source ?specimen_source_label (count(?fasta) as ?fastaCount) WHERE
- {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> ?specimen_source]
- BIND (concat(?specimen_source,"_label") as ?specimen_source_label)}
- GROUP BY ?specimen_source ?specimen_source_label ORDER BY DESC (?fastaCount)
+ query="""SELECT ?fasta WHERE
+ {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> <placeholder>] }
"""
+ tech = request.args.get('tech')
+ query=query.replace("placeholder", tech)
payload = {'query': query, 'format': 'json'}
r = requests.get(baseURL, params=payload)
result = r.json()['results']['bindings']
return str(result)
+
+## Example location, encoded http%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ1223
@app.route('/api/getSEQbyLocation', methods=['GET'])
def getSEQbyLocation():
+ query="""SELECT ?fasta WHERE {?fasta ?x[ <http://purl.obolibrary.org/obo/GAZ_00000448> <placeholder>]}"""
+ location=request.args.get('location')
+ query=query.replace("placeholder", location)
+ print(query)
+ payload = {'query': query, 'format': 'json'}
+ r = requests.get(baseURL, params=payload)
+ result = r.json()['results']['bindings']
+ return str(result)
+
+
+@app.route('/api/getSEQCountbyLocation', methods=['GET'])
+def getSEQCountbyLocation():
query="""SELECT ?geoLocation ?geoLocation_label (count(?fasta) as ?fastaCount) WHERE
{?fasta ?x [<http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation]
BIND (concat(?geoLocation,"_label") as ?geoLocation_label)}
@@ -396,10 +448,13 @@ def getSEQbyLocation():
payload = {'query': query, 'format': 'json'}
r = requests.get(baseURL, params=payload)
result = r.json()['results']['bindings']
- return str(result)
+ return jsonify([{'Fasta Count': x['fastaCount']['value'],
+ 'GeoLocation': x['geoLocation']['value'],
+ 'GeoLocation Label': x['geoLocation_label']['value']} for x in result])
-@app.route('/api/getSEQbySpecimenSource', methods=['GET'])
-def getSEQbySpecimenSource():
+
+@app.route('/api/getSEQCountbySpecimenSource', methods=['GET'])
+def getSEQCountbySpecimenSource():
query="""SELECT ?specimen_source ?specimen_source_label (count(?fasta) as ?fastaCount) WHERE
{?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0001479> ?specimen_source]
BIND (concat(?specimen_source,"_label") as ?specimen_source_label)}
@@ -409,11 +464,27 @@ def getSEQbySpecimenSource():
payload = {'query': query, 'format': 'json'}
r = requests.get(baseURL, params=payload)
result = r.json()['results']['bindings']
+ return jsonify([{'Fasta Count': x['fastaCount']['value'],
+ 'Specimen Source': x['specimen_source']['value'],
+ 'Label': x['specimen_source_label']['value']} for x in result])
+
+# Example specimen http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C155831
+@app.route('/api/getSEQbySpecimenSource', methods=['GET'])
+def getSEQBySpecimenSource():
+ query="""SELECT ?fasta ?specimen_source ?specimen_source_label WHERE
+ {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0001479> <placeholder>]
+ BIND (concat(?specimen_source,"_label") as ?specimen_source_label)}
+ """
+ specimen=request.args.get('specimen')
+ query = query.replace("placeholder", specimen)
+ payload = {'query': query, 'format': 'json'}
+ r = requests.get(baseURL, params=payload)
+ result = r.json()['results']['bindings']
return str(result)
#No data for this atm
-@app.route('/api/getSEQbyHostHealthStatus', methods=['GET'])
-def getSEQbyHostHealthStatus():
+@app.route('/api/getSEQCountbyHostHealthStatus', methods=['GET'])
+def getSEQCountbyHostHealthStatus():
query="""SELECT ?health_status ?health_status_label (count(?fasta) as ?fastaCount) WHERE
{?fasta ?x [<http://purl.obolibrary.org/obo/NCIT_C25688> ?health_status]
BIND (concat(?health_status,"_label") as ?health_status_label)}
@@ -423,4 +494,36 @@ def getSEQbyHostHealthStatus():
payload = {'query': query, 'format': 'json'}
r = requests.get(baseURL, params=payload)
result = r.json()['results']['bindings']
+ return str(result)
+
+@app.route('/api/getSEQbyLocationAndTech', methods=['GET'])
+def getSEQbyLocationAndTech():
+ query="""SELECT ?fasta WHERE { ?fasta ?x [
+ <http://purl.obolibrary.org/obo/GAZ_00000448> <placeholderLoc>; <http://purl.obolibrary.org/obo/OBI_0600047> <placeholderTech> ]}"""
+ location=request.args.get('location')
+ tech=request.args.get('tech')
+ query=query.replace("placeholderLoc", location)
+ query = query.replace("placeholderTech", tech)
+ print(query)
+ payload = {'query': query, 'format': 'json'}
+ r = requests.get(baseURL, params=payload)
+ result = r.json()['results']['bindings']
+ return str(result)
+
+
+# Example Location http%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ1223
+# Example specimen http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C155831
+@app.route('/api/getSEQbyLocationAndSpecimenSource', methods=['GET'])
+def getSEQbyLocationAndSpecimenSource():
+ query="""SELECT ?fasta WHERE { ?fasta ?x [
+ <http://purl.obolibrary.org/obo/GAZ_00000448> <placeholderLoc>; <http://purl.obolibrary.org/obo/OBI_0001479> <placeholderSpecimen> ]}
+ """
+ location = request.args.get('location')
+ specimen = request.args.get('specimen')
+ query = query.replace("placeholderLoc", location)
+ query = query.replace("placeholderSpecimen", specimen)
+ print(query)
+ payload = {'query': query, 'format': 'json'}
+ r = requests.get(baseURL, params=payload)
+ result = r.json()['results']['bindings']
return str(result) \ No newline at end of file
diff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css
new file mode 100644
index 0000000..57e29ef
--- /dev/null
+++ b/bh20simplewebuploader/static/main.css
@@ -0,0 +1,269 @@
+hr {
+ margin: auto 0;
+}
+
+body {
+ color: #101010;
+ background-color: #F5FFFF;
+ margin: 0;
+}
+
+h1, h2, h3, h4 {
+ font-family: 'Inter', sans-serif;
+ color: #0ED1CD;
+}
+
+h1 {
+ text-align: center;
+}
+
+.intro {
+ color: #505050;
+ font-weight: 300;
+}
+
+.header {
+ background-color: white;
+ margin: 0 auto;
+ padding: 20px;
+ text-align: center;
+ height: 150px;
+}
+
+h2 > svg {
+ position: relative;
+ top: 8px;
+}
+
+.logo {
+ float: right;
+}
+
+p, form, .about, .footer {
+ font-family: 'Inter', sans-serif;
+ line-height: 1.5;
+}
+
+form h4 {
+    /* text-transform takes a bare keyword; the quoted form is an invalid value and the declaration is dropped */
+    text-transform: uppercase;
+}
+
+.intro, form, .search {
+ padding: 20px;
+}
+
+.intro {
+ background-color: inherit;
+ margin: 0 auto;
+ padding: 20px;
+}
+
+.about {
+ background-color: #B2F8F8;
+ margin: 30px auto;
+ padding: 20px;
+ width: 95%;
+ border-radius: 20px;
+}
+
+.button {
+ border-radius: 5px;
+ background: #0ED1CD;
+ margin: 0.3em auto;
+ padding: 0.4em;
+}
+
+/* Page footer: dark teal band with white text */
+.footer {
+    background: #058280;
+    margin: 0 auto;
+    color: #fff;
+}
+
+.footer a {
+ color: #fff;
+}
+
+span.dropt {border-bottom: thin dotted; background: #ffeedd;}
+span.dropt:hover {text-decoration: none; background: #ffffff; z-index: 6; }
+
+.grid-container {
+ display: grid;
+ grid-template-columns: repeat(4, 1fr);
+ grid-template-rows: auto;
+ row-gap:5px;
+ grid-template-areas:
+ "a a b b"
+ "a a c c"
+ "a a d d"
+ "e e e e"
+ "f f f f";
+ grid-auto-flow: column;
+}
+
+.about {
+ display: grid;
+ grid-template-columns: repeat(2, 1fr);
+ grid-auto-flow: row;
+}
+
+.about h1 {
+ text-align: left;
+}
+
+.about p {
+ font-weight: 300;
+ color: #505050;
+}
+
+.intro {
+ grid-area: a;
+}
+
+.fasta-file-select {
+ padding: 1em;
+ grid-area: b;
+}
+
+a {
+ color: #40DBD8;
+ font-weight: 700;
+}
+
+.fasta-file-select label, .metadata label {
+ font-weight: 600;
+}
+
+.metadata {
+ padding: 1em;
+ grid-area: c;
+}
+.metadata_upload_form {
+ padding: 1em;
+ grid-area: c;
+}
+
+#metadata_upload_form_spot {
+ grid-area: d;
+}
+
+#metadata_fill_form_spot {
+ grid-area: e;
+}
+
+#metadata_fill_form {
+ column-count: 4;
+ margin-top: 0.5em;
+ column-width: 250px;
+}
+
+pre code {
+ background-color: #eee;
+ display: flex;
+ width: max-content;
+ margin: 0 auto;
+ overflow-y: scroll;
+ max-height: 300px;
+ padding: 10px;
+ border: solid 1px black;
+}
+
+.record {
+ display: flex;
+ flex-direction: column;
+ border: solid 1px #808080;
+ padding: 1em;
+ background: #F8F8F8;
+ margin-bottom: 1em;
+ -webkit-column-break-inside: avoid; /* Chrome, Safari, Opera */
+ page-break-inside: avoid; /* Firefox */
+ break-inside: avoid;
+}
+
+.record label {
+ font-size: small;
+ margin-top: 10px;
+}
+
+.search-section {
+ display: flex;
+ justify-content: space-between;
+}
+
+.search-section .filter-options {
+ display: flex;
+ flex-direction: column;
+ width: max-content;
+ padding: 20px;
+}
+
+.search-section p {
+ margin: 0;
+}
+
+.submit {
+ grid-area: f;
+ width: 17em;
+ justify-self: center;
+}
+
+footer {
+ display: block;
+ width: 100%;
+}
+
+.sponsors {
+ width: inherit;
+ display: flex;
+ flex-direction: row;
+ flex-wrap: wrap;
+ justify-content: space-evenly;
+ align-content: space-evenly;
+}
+
+.sponsors a {
+ flex-grow: 4;
+ height: 200px;
+ margin: 10px;
+ background: white;
+ display: flex;
+ flex-direction: column;
+ justify-content: center;
+}
+.sponsors img {
+ width: 100%;
+}
+.metadata input#metadata_upload:checked ~ #metadata_upload_form_spot {
+ display: block;
+}
+
+.metadata input#metadata_upload ~ #metadata_upload_form_spot {
+ display: none;
+}
+
+.loader {
+ display: block;
+ border: 5px solid #f3f3f3; /* Light grey */
+ border-top: 5px solid #3498db; /* Blue */
+ border-radius: 50%;
+ width: 20px;
+ height: 20px;
+ margin-right: auto;
+ margin-left: auto;
+ animation: spin 1.5s linear infinite;
+}
+
+.invisible {
+ display: none;
+}
+
+@keyframes spin {
+ 0% { transform: rotate(0deg); }
+ 100% { transform: rotate(360deg); }
+}
+
+@media only screen and (max-device-width: 480px) {
+ .grid-container {
+ display: flex;
+ flex-direction: column;
+ }
+}
diff --git a/bh20simplewebuploader/static/main.js b/bh20simplewebuploader/static/main.js
new file mode 100644
index 0000000..96199a0
--- /dev/null
+++ b/bh20simplewebuploader/static/main.js
@@ -0,0 +1,47 @@
+function fetchAPI(apiEndPoint) {
+ fetch(scriptRoot + apiEndPoint)
+ .then(response => {
+ return response.json();
+ })
+ .then(data => {
+ document.getElementById("json").textContent = JSON.stringify(data, undefined, 2);
+ document.getElementById("results").classList.remove("invisible");
+ document.getElementById("loader").classList.add("invisible");
+ });
+ document.getElementById("results").classList.add("invisible");
+ document.getElementById("loader").classList.remove("invisible");
+
+}
+
+let search = () => {
+ let m = document.getElementById('search-input').value;
+ fetchAPI(scriptRoot + "/api/getDetailsForSeq?seq=" + encodeURIComponent(m));
+}
+
+let fetchSEQBySpecimen = () => {
+ fetchAPI("/api/getSEQCountbySpecimenSource");
+}
+
+let fetchSEQByLocation = () => {
+ fetchAPI("/api/getSEQCountbyLocation");
+}
+
+let fetchSEQByTech = () => {
+ fetchAPI("/api/getSEQCountbytech");
+}
+
+let fetchAllaccessions = () => {
+ fetchAPI("/api/getAllaccessions");
+};
+
+/**
+ * Show form if checked
+ */
+let fillFormSpot = document.getElementById('metadata_fill_form_spot');
+function displayForm() {
+ if (document.getElementById('metadata_form').checked) {
+ fillFormSpot.classList.remove("invisible");
+ return;
+ }
+ fillFormSpot.classList.add("invisible");
+}
diff --git a/bh20simplewebuploader/templates/form.html b/bh20simplewebuploader/templates/form.html
index 02ae84d..ffd4158 100644
--- a/bh20simplewebuploader/templates/form.html
+++ b/bh20simplewebuploader/templates/form.html
@@ -1,152 +1,9 @@
<!DOCTYPE html>
<html>
- <style>
- hr {
- margin: auto 0;
- }
-
- body {
- color: #101010;
- background-color: #F9EDE1;
- }
-
- h1, h2, h3, h4 {
- font-family: 'Roboto Slab', serif;
- color: darkblue;
- }
-
- h1 {
- text-align: center;
- }
-
- p {
- color: #505050;
- font-style: italic;
- }
- .header {
- background-color: white;
- margin: 0 auto;
- padding: 20px;
- text-align: center;
- height: 150px;
- }
-
- .logo {
- float: right;
- }
-
- p, form, .about, .footer {
- font-family: 'Raleway', sans-serif;
- line-height: 1.5;
- }
-
- form h4 {
- text-transform: 'uppercase';
- }
-
- .intro, form {
- padding: 20px;
- }
-
- .intro {
- background-color: lightgrey;
- margin: 0 auto;
- padding: 20px;
- }
-
- .about {
- background-color: lightgrey;
- margin: 0 auto;
- padding: 20px;
- }
- .footer {
- background-color: white;
- margin: 0 auto;
- }
-
- span.dropt {border-bottom: thin dotted; background: #ffeedd;}
- span.dropt:hover {text-decoration: none; background: #ffffff; z-index: 6; }
-
- .grid-container {
- display: grid;
- grid-template-columns: repeat(4, 1fr);
- grid-template-rows: auto;
- row-gap:5px;
- grid-template-areas:
- "a a b b"
- "a a c c"
- "a a d d"
- "e e e e"
- "f f f f";
- grid-auto-flow: column;
- }
-
- .intro {
- grid-area: a;
- }
-
- .fasta-file-select {
- padding: 1em;
- grid-area: b;
- }
-
- .metadata {
- padding: 1em;
- grid-area: c;
- }
- .metadata_upload_form {
- padding: 1em;
- grid-area: c;
- }
-
- #metadata_upload_form_spot {
- grid-area: d;
- }
-
- #metadata_fill_form_spot {
- grid-area: e;
- }
-
- #metadata_fill_form {
- column-count: 4;
- margin-top: 0.5em;
- column-width: 250px;
- }
-
- .record {
- display: flex;
- flex-direction: column;
- border: solid 1px #808080;
- padding: 1em;
- background: #F8F8F8;
- margin-bottom: 1em;
- -webkit-column-break-inside: avoid; /* Chrome, Safari, Opera */
- page-break-inside: avoid; /* Firefox */
- break-inside: avoid;
- }
-
- .record label {
- font-size: small;
- margin-top: 10px;
- }
-
- .submit {
- grid-area: f;
- width: 17em;
- justify-self: center;
- }
-
- @media only screen and (max-device-width: 480px) {
- .grid-container {
- display: flex;
- flex-direction: column;
- }
- }
- </style>
-
<head>
<meta charset="UTF-8">
- <link href="https://fonts.googleapis.com/css2?family=Raleway:wght@500&family=Roboto+Slab&display=swap" rel="stylesheet">
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@100;200;300;400;500;600;700;800;900&display=swap" rel="stylesheet">
+ <link href="/static/main.css" rel="stylesheet" type="text/css">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Web uploader for Public SARS-CoV-2 Sequence Resource</title>
</head>
@@ -158,28 +15,67 @@
<small>Disabled until we got everything wired up</small>
</section>
- <hr>
+
+ <section class="search-section">
+ <div class="filter-options" action="#">
+ <p>[Demo] Display content sequences by: </p>
+
+ <div>
+ <button class="button" onclick="fetchSEQBySpecimen()">Specimen Source</button>
+ <button class="button" onclick="fetchSEQByLocation()">Location</button>
+ <button class="button" onclick="fetchSEQByTech()">Tech</button>
+ <button class="button" onclick="fetchAllaccessions()">Allaccessions</button>
+ </div>
+
+ </div>
+
+ <div class="search">
+ <input id="search-input" type="search" placeholder="FASTA uri" required>
+ <button class="button search-button" type="submit" onclick="search()">
+ <span class="icon ion-search">
+ <span class="sr-only">Search</span>
+ </span>
+ </button>
+ </div>
+ </section>
+
+ <div id="loader" class="loader invisible"></div>
+
+ <section id="results" class="invisible">
+ <pre><code id="json"></code></pre>
+ </section>
<section>
<form action="/submit" method="POST" enctype="multipart/form-data" id="main_form" class="grid-container">
- <p class="intro">
- Upload your SARS-CoV-2 sequence (FASTA or FASTQ formats) with metadata (JSONLD) to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">public sequence resource</a>. The upload will trigger a
- recompute with all available sequences into a Pangenome
- available for
- <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">download</a>!
- Your uploaded sequence will automatically be processed
- and incorporated into the public pangenome with
- metadata using worklows from the High Performance Open Biology Lab defined <a href="https://github.com/hpobio-lab/viral-analysis/tree/master/cwl/pangenome-generate">here</a>. All data is published under
- a <a href="https://creativecommons.org/licenses/by/4.0/">Creative
- Commons 4.0 attribution license</a> (CC-BY-4.0). You
- can take the published (GFA/RDF/FASTA) data and store it in
- a triple store for further processing. We also plan to
- combine identifiers with clinical data stored securely at <a href="https://redcap-covid19.elixir-luxembourg.org/redcap/">REDCap</a>.
- A free command line version of the uploader can be
- installed from <a href="https://github.com/arvados/bh20-seq-resource">source</a>.
- </p>
+ <div class="intro">
+ <p>
+ Upload your SARS-CoV-2 sequence (FASTA or FASTQ formats) with metadata (JSONLD) to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">public sequence resource</a>. The upload will trigger a
+ recompute with all available sequences into a Pangenome
+ available for
+ <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">download</a>!
+ </p>
+ <p>
+ Your uploaded sequence will automatically be processed
+ and incorporated into the public pangenome with
+ metadata using workflows from the High Performance Open Biology Lab defined <a href="https://github.com/hpobio-lab/viral-analysis/tree/master/cwl/pangenome-generate">here</a>. All data is published under
+ a <a href="https://creativecommons.org/licenses/by/4.0/">Creative
+ Commons 4.0 attribution license</a> (CC-BY-4.0). You
+ can take the published (GFA/RDF/FASTA) data and store it in
+ a triple store for further processing. We also plan to
+ combine identifiers with clinical data stored securely at <a href="https://redcap-covid19.elixir-luxembourg.org/redcap/">REDCap</a>.
+ A free command line version of the uploader can be
+ installed from <a href="https://github.com/arvados/bh20-seq-resource">source</a>.
+ </p>
+
+ </div>
<div class="fasta-file-select">
+ <h2><svg class="bi bi-cloud-upload" width="1.2em" height="1.2em" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
+ <path d="M4.887 6.2l-.964-.165A2.5 2.5 0 103.5 11H6v1H3.5a3.5 3.5 0 11.59-6.95 5.002 5.002 0 119.804 1.98A2.501 2.501 0 0113.5 12H10v-1h3.5a1.5 1.5 0 00.237-2.981L12.7 7.854l.216-1.028a4 4 0 10-7.843-1.587l-.185.96z"/>
+ <path fill-rule="evenodd" d="M5 8.854a.5.5 0 00.707 0L8 6.56l2.293 2.293A.5.5 0 1011 8.146L8.354 5.5a.5.5 0 00-.708 0L5 8.146a.5.5 0 000 .708z" clip-rule="evenodd"/>
+ <path fill-rule="evenodd" d="M8 6a.5.5 0 01.5.5v8a.5.5 0 01-1 0v-8A.5.5 0 018 6z" clip-rule="evenodd"/>
+ </svg> Upload SARS-CoV-2 Sequence</h2>
+
<label for="fasta">Select FASTA file of assembled genome (max 50K), or FASTQ of reads (<span class="dropt" title="For a larger fastq file you'll need to use a CLI uploader">max 150MB<span style="width:500px;"></span></span>) : </label>
<br>
<input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna,.fq" required>
@@ -189,16 +85,16 @@
</div>
<div class="metadata">
- <label>Select metadata submission method:</label>
- <br>
- <input type="radio" id="metadata_form" name="metadata_type" value="fill" onchange="setMode()" checked required>
- <label for="metadata_form">Fill in metadata manually</label>
- <input type="radio" id="metadata_upload" name="metadata_type" value="upload" onchange="setMode()" required>
- <label for="metadata_upload">Upload metadata file</label>
- <br>
- <small>Make sure the metadata has submitter attribution details.</small>
+ <label>Select metadata submission method:</label>
+ <br>
+ <input type="radio" id="metadata_form" name="metadata_type" value="fill" checked onchange="displayForm()" required>
+ <label for="metadata_form">Fill in metadata manually</label>
+ <input type="radio" id="metadata_upload" name="metadata_type" value="upload" onchange="displayForm()" required>
+ <label for="metadata_upload">Upload metadata file</label>
+ <br>
+ <small>Make sure the metadata has submitter attribution details.</small>
- <div id="metadata_upload_form_spot">
+ <div id="metadata_upload_form_spot">
<div id="metadata_upload_form">
<br>
<label for="metadata">Select JSON or YAML metadata file following <a href="https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml" target="_blank">this schema</a> and <a href="https://github.com/arvados/bh20-seq-resource/blob/master/example/metadata.yaml" target="_blank">example</a> (max 50K):</label>
@@ -206,9 +102,9 @@
<input type="file" id="metadata" name="metadata" accept=".json,.yml,.yaml" required>
<br>
</div>
- </div>
- </div>
+ </div>
+ </div>
<div id="metadata_fill_form_spot">
<div id="metadata_fill_form">
{% for record in fields %}
@@ -246,40 +142,44 @@
{% endif %}
{% endfor %}
</div>
+
</div>
<input class="submit" type="submit" value="Add to Pangenome">
</form>
</section>
-<hr>
<br>
<div class="about">
- <h3>ABOUT</h3>
- <p>
- This a public repository created at the COVID-19 BioHackathon
- that has a low barrier to entry for uploading sequence data using
- best practices. I.e., data is published with a creative commons
- 4.0 (CC-4.0) license with metadata using state-of-the art
- standards and, perhaps most importantly, providing standardized
- workflows that get triggered on upload, so that results are
- immediately available in standardized data formats. The repository
- will be maintained and expanded for the duration of the
- pandemic. To contribute data simply upload it! To contribute code
- and/or workflows see
- the <a href="https://github.com/arvados/bh20-seq-resource">project
- repository</a>. For more information see the <a href="https://github.com/arvados/bh20-seq-resource/blob/master/paper/paper.md">paper</a> (WIP).
- </p>
- <br>
+ <div>
+ <h1>ABOUT</h1>
+ <p>
+ This is a public repository created at the COVID-19 BioHackathon
+ that has a low barrier to entry for uploading sequence data using
+ best practices. I.e., data is published with a Creative Commons
+ 4.0 attribution (CC-BY-4.0) license with metadata using state-of-the-art
+ standards and, perhaps most importantly, providing standardized
+ workflows that get triggered on upload, so that results are
+ immediately available in standardized data formats. The repository
+ will be maintained and expanded for the duration of the
+ pandemic. To contribute data simply upload it! To contribute code
+ and/or workflows see
+ the <a href="https://github.com/arvados/bh20-seq-resource">project
+ repository</a>. For more information see the <a href="https://github.com/arvados/bh20-seq-resource/blob/master/paper/paper.md">paper</a> (WIP).
+ </p>
+
+ </div>
+ <div class="sponsors">
+ <a href="https://arvados.org/"><img src="static/image/arvados-logo.png"></a>
+ <a href="https://www.commonwl.org/"><img src="static/image/CWL-Logo-Header.png"></a>
+
+ <a href="https://github.com/virtual-biohackathons/covid-19-bh20">
+ <img src="static/image/covid19biohackathon.png"></a>
+ </div>
</div>
-
- <hr>
<div class="footer">
- <a href="https://arvados.org/"><img src="static/image/arvados-logo.png" align="top"></a>
- <a href="https://www.commonwl.org/"><img src="static/image/CWL-Logo-Header.png" height="70"></a>
+ <!-- Sponsors -->
- <a href="https://github.com/virtual-biohackathons/covid-19-bh20">
- <img src="static/image/covid19biohackathon.png" align="right" height="70"></a>
<center>
<small><a href="https://github.com/arvados/bh20-seq-resource">Source code</a> &middot; Powered by <a href="https://www.commonwl.org/">Common Workflow Language</a> &amp; <a href="https://arvados.org/">Arvados</a>; Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a>
@@ -289,35 +189,10 @@
</div>
- <script type="text/javascript">
- let uploadForm = document.getElementById('metadata_upload_form')
- let uploadFormSpot = document.getElementById('metadata_upload_form_spot')
- let fillForm = document.getElementById('metadata_fill_form')
- let fillFormSpot = document.getElementById('metadata_fill_form_spot')
-
- function setUploadMode() {
- // Make the upload form the one in use
- uploadFormSpot.appendChild(uploadForm)
- fillFormSpot.removeChild(fillForm)
- }
-
- function setFillMode() {
- // Make the fillable form the one in use
- uploadFormSpot.removeChild(uploadForm)
- fillFormSpot.appendChild(fillForm)
- }
-
- function setMode() {
- // Pick mode based on radio
- if (document.getElementById('metadata_upload').checked) {
- setUploadMode()
- } else {
- setFillMode()
- }
- }
+<script type="text/javascript">
+ let scriptRoot = {{ request.script_root|tojson|safe }};
+</script>
- // Start in mode appropriate to selected form item
- setMode()
- </script>
+<script type="text/javascript" src="/static/main.js"></script>
</body>
</html>
diff --git a/example/maximum_metadata_example.yaml b/example/maximum_metadata_example.yaml
new file mode 100644
index 0000000..1bc70d7
--- /dev/null
+++ b/example/maximum_metadata_example.yaml
@@ -0,0 +1,46 @@
+id: placeholder
+
+host:
+ host_id: XX1
+ host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+ host_sex: http://purl.obolibrary.org/obo/PATO_0000384
+ host_age: 20
+ host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
+ host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
+ host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
+ host_vaccination: [vaccines1,vaccine2]
+ ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010
+ additional_host_information: Optional free text field for additional information
+
+sample:
+ sample_id: Id of the sample as defined by the submitter
+ collector_name: Name of the person that took the sample
+ collecting_institution: Institute that was responsible of sampling
+ specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835]
+ collection_date: "2020-01-01"
+ collection_location: http://www.wikidata.org/entity/Q148
+ sample_storage_conditions: frozen specimen
+ source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
+ additional_collection_information: Optional free text field for additional information
+
+virus:
+ virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+ virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
+
+technology:
+ sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173]
+ sequence_assembly_method: Protocol used for assembly
+ sequencing_coverage: [70.0, 100.0]
+ additional_technology_information: Optional free text field for additional information
+
+submitter:
+ authors: [John Doe, Joe Boe, Jonny Oe]
+ submitter_name: [John Doe]
+ submitter_address: John Doe's address
+ originating_lab: John Doe kitchen
+ lab_address: John Doe's address
+ provider_sample_id: XXX1
+ submitter_sample_id: XXX2
+ publication: PMID00001113
+ submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
+ additional_submitter_information: Optional free text field for additional information \ No newline at end of file
diff --git a/example/metadata.yaml b/example/metadata.yaml
deleted file mode 100644
index a76616c..0000000
--- a/example/metadata.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-id: placeholder
-
-host:
- host_id: XX1
- host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
- host_sex: http://purl.obolibrary.org/obo/NCIT_C27993
- host_age: 20
- host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
- host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
- host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
- host_vaccination: List of vaccines given to the host (RRIDs?)
- additional_host_information: Field for additional host information
-
-sample:
- sample_id: Id of the sample as defined by the submitter
- collector_name: Name of the person that took the sample
- collecting_institution: Institute that was responsible of sampling
- specimen_source: http://purl.obolibrary.org/obo/NCIT_C155831
- specimen_source2: http://purl.obolibrary.org/obo/NCIT_C155835
- collection_date: "2020-01-01"
- collection_location: http://www.wikidata.org/entity/Q148
- sample_storage_conditions: XXX
- additional_collection_information: XXX
-
-virus:
- virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
- virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
-
-technology:
- sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173
- sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173
- sequence_assembly_method: Protocol used for assembly
- sequencing_coverage: 70
-
-submitter:
- submitter_name: John Doe
- submitter_address: John Doe's adress
- originating_lab: John Doe kitchen
- lab_address: John Doe's address
- provider_sample_id: HmX
- submitter_sample_id: xXx
- authors: John Doe et all
- submitter_orcid: https://orcid.org/0000-0000-0000-0000 \ No newline at end of file
diff --git a/example/minimal_example.yaml b/example/minimal_metadata_example.yaml
index 0e36a25..51f8a87 100644
--- a/example/minimal_example.yaml
+++ b/example/minimal_metadata_example.yaml
@@ -5,14 +5,14 @@ host:
sample:
sample_id: XX
- collection_date: 2020-01
+ collection_date: "2020-01-01"
collection_location: http://www.wikidata.org/entity/Q148
virus:
virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
technology:
- sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632
+ sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
submitter:
- authors: John Doe \ No newline at end of file
+ authors: [John Doe] \ No newline at end of file
diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv
index 20e8a9b..6b43137 100644
--- a/scripts/dict_ontology_standardization/ncbi_countries.csv
+++ b/scripts/dict_ontology_standardization/ncbi_countries.csv
@@ -39,6 +39,7 @@ Chad,http://www.wikidata.org/entity/Q657
Chile,http://www.wikidata.org/entity/Q298
China,http://www.wikidata.org/entity/Q148
China: Anhui,http://www.wikidata.org/entity/Q40956
+"China: Anhui, Fuyang",http://www.wikidata.org/entity/Q360584
China: Beijing,http://www.wikidata.org/entity/Q956
China: Chongqing,http://www.wikidata.org/entity/Q11725
China: Fujian,http://www.wikidata.org/entity/Q41705
@@ -48,6 +49,7 @@ China: Guangdong,http://www.wikidata.org/entity/Q15175
China: Guangxi Zhuang Autonomous Region,http://www.wikidata.org/entity/Q15176
China: Guangzhou,http://www.wikidata.org/entity/Q16572
China: Guizhou,http://www.wikidata.org/entity/Q47097
+China: Hangzhou,http://www.wikidata.org/entity/Q4970
China: Hainan,http://www.wikidata.org/entity/Q42200
China: Hebei,http://www.wikidata.org/entity/Q21208
China: Heilongjiang,http://www.wikidata.org/entity/Q19206
@@ -109,6 +111,7 @@ France,http://www.wikidata.org/entity/Q142
Gabon,http://www.wikidata.org/entity/Q1000
Georgia,http://www.wikidata.org/entity/Q230
Germany,http://www.wikidata.org/entity/Q183
+Germany: Dusseldorf,http://www.wikidata.org/entity/Q1718
Ghana,http://www.wikidata.org/entity/Q117
Greece,http://www.wikidata.org/entity/Q41
Grenada,http://www.wikidata.org/entity/Q769
@@ -123,6 +126,7 @@ Iceland,http://www.wikidata.org/entity/Q189
Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389
India,http://www.wikidata.org/entity/Q668
India: Kerala State,http://www.wikidata.org/entity/Q1186
+India: Rajkot,http://www.wikidata.org/entity/Q1815245
Indonesia,http://www.wikidata.org/entity/Q252
Iran,http://www.wikidata.org/entity/Q794
Iran: Qum,http://www.wikidata.org/entity/Q131664
@@ -172,6 +176,7 @@ Mozambique,http://www.wikidata.org/entity/Q1029
Myanmar,http://www.wikidata.org/entity/Q836
Namibia,http://www.wikidata.org/entity/Q1030
Nauru,http://www.wikidata.org/entity/Q697
+Netherlands: Milheeze,http://www.wikidata.org/entity/Q3314115
Nepal,http://www.wikidata.org/entity/Q837
New Zealand,http://www.wikidata.org/entity/Q664
Nicaragua,http://www.wikidata.org/entity/Q811
@@ -263,6 +268,7 @@ USA: CA,http://www.wikidata.org/entity/Q99
"USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143
USA: CO,http://www.wikidata.org/entity/Q1261
USA: CT,http://www.wikidata.org/entity/Q779
+USA: DC,http://www.wikidata.org/entity/Q3551781
USA: DE,http://www.wikidata.org/entity/Q1393
USA: FL,http://www.wikidata.org/entity/Q812
USA: GA,http://www.wikidata.org/entity/Q1428
@@ -293,6 +299,7 @@ USA: NM,http://www.wikidata.org/entity/Q1522
USA: North Carolina,http://www.wikidata.org/entity/Q1454
USA: NV,http://www.wikidata.org/entity/Q1227
USA: NY,http://www.wikidata.org/entity/Q1384
+USA: New York,http://www.wikidata.org/entity/Q1384
USA: OH,http://www.wikidata.org/entity/Q1397
USA: OK,http://www.wikidata.org/entity/Q1649
USA: OR,http://www.wikidata.org/entity/Q824
@@ -321,4 +328,4 @@ Viet Nam: Ho Chi Minh city,http://www.wikidata.org/entity/Q1854
Vietnam,http://www.wikidata.org/entity/Q881
Yemen,http://www.wikidata.org/entity/Q805
Zambia,http://www.wikidata.org/entity/Q953
-Zimbabwe,http://www.wikidata.org/entity/Q954
+Zimbabwe,http://www.wikidata.org/entity/Q954 \ No newline at end of file
diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
index f5aeaae..7fa67f8 100644
--- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
+++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
@@ -1,5 +1,6 @@
nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
+nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831
respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831
naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831
diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile
new file mode 100644
index 0000000..5bd38dd
--- /dev/null
+++ b/scripts/docker/Dockerfile
@@ -0,0 +1,10 @@
+FROM debian:10
+
+RUN apt-get update && \
+ apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \
+ python3 python3-pip python3-setuptools python3-dev python-pycurl \
+ clustalw python3-biopython libcurl4-openssl-dev build-essential \
+ libssl-dev && \
+ apt-get clean
+
+RUN pip3 install bh20-seq-uploader \ No newline at end of file
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 5257bd1..6f046ea 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -7,6 +7,8 @@ import xml.etree.ElementTree as ET
import json
import os
+from dateutil import parser
+
num_ids_for_request = 100
dir_metadata = 'metadata_from_nuccore'
@@ -37,20 +39,19 @@ if not os.path.exists(dir_metadata):
tmp_list = [x.split('.')[0] for x in tmp_list]
print(term, len(tmp_list))
- tmp_list=tmp_list
- # tmp_list = tmp_list[0:2] # restricting to small run
+ #tmp_list = tmp_list[0:2] # restricting to small run
id_set.update([x.split('.')[0] for x in tmp_list])
print(term_list, len(id_set))
- with open(path_ncbi_virus_accession) as f:
- tmp_list = [line.strip('\n') for line in f]
-
- print('NCBI Virus', len(tmp_list))
- id_set.update(tmp_list)
-
- print(term_list + ['NCBI Virus'], len(id_set))
+ if os.path.exists(path_ncbi_virus_accession):
+ with open(path_ncbi_virus_accession) as f:
+ tmp_list = [line.strip('\n') for line in f]
+ print('NCBI Virus', len(tmp_list))
+ id_set.update(tmp_list)
+ term_list.append('NCBI Virus')
+ print(term_list, len(id_set))
for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i))
@@ -86,7 +87,7 @@ if not os.path.exists(dir_fasta_and_yaml):
os.makedirs(dir_fasta_and_yaml)
missing_value_list = []
-
+
for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
tree = ET.parse(path_metadata_xxx_xml)
GBSet = tree.getroot()
@@ -110,23 +111,23 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
'submitter': {}
}
-
+
info_for_yaml_dict['sample']['sample_id'] = accession_version
- info_for_yaml_dict['sample']['source_database_accession'] = accession_version
-
-
+ info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now
+
+
# submitter info
GBSeq_references = GBSeq.find('GBSeq_references')
if GBSeq_references is not None:
- info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq_references.iter('GBAuthor')])
-
+ info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')]
+
GBReference = GBSeq_references.find('GBReference')
if GBReference is not None:
GBReference_journal = GBReference.find('GBReference_journal')
-
+
if GBReference_journal is not None and GBReference_journal.text != 'Unpublished':
if 'Submitted' in GBReference_journal.text:
- info_for_yaml_dict['submitter']['submitter_name'] = GBReference_journal.text.split(') ')[1].split(',')[0].strip()
+ info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())]
info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip()
else:
info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text
@@ -146,8 +147,9 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if field_in_yaml == 'sequencing_coverage':
# A regular expression would be better!
try:
- info_for_yaml_dict['technology'][field_in_yaml] = float(
- tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>'))
+ info_for_yaml_dict['technology'][field_in_yaml] = [
+ float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>'))
+ ]
except ValueError:
print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse)
pass
@@ -162,8 +164,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
new_seq_tec_list.append(seq_tec)
- for n, seq_tec in enumerate(new_seq_tec_list):
- info_for_yaml_dict['technology'][field_in_yaml + ('' if n == 0 else str(n + 1))] = seq_tec
+ info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list]
else:
info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse
@@ -199,7 +200,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if 'age' in GBQualifier_value_text:
info_for_yaml_dict['host']['host_age'] = int(GBQualifier_value_text_list[2].split('age ')[1])
- info_for_yaml_dict['host']['host_age_unit'] = 'year'
+ info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036'
elif GBQualifier_name_text == 'collected_by':
if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]):
info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text
@@ -208,24 +209,46 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
elif GBQualifier_name_text == 'isolation_source':
if GBQualifier_value_text.upper() in term_to_uri_dict:
GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa'
-
+
if GBQualifier_value_text in term_to_uri_dict:
- info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict[GBQualifier_value_text]
+ info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
else:
if GBQualifier_value_text in ['NP/OP swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'np/np swab', 'np/op']:
- info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab']
- info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['oropharyngeal swab']
- elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab']:
- info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab']
- info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab']
+ info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
+ elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab']:
+ info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']:
- info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal aspirate']
- info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab']
+ info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
else:
missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
elif GBQualifier_name_text == 'collection_date':
# TO_DO: which format we will use?
- info_for_yaml_dict['sample']['collection_date'] = GBQualifier_value_text
+ date_to_write = GBQualifier_value_text
+
+ if len(GBQualifier_value_text.split('-')) == 1:
+ if int(GBQualifier_value_text) < 2020:
+ date_to_write = "15 12 {}".format(GBQualifier_value_text)
+ else:
+ date_to_write = "15 01 {}".format(GBQualifier_value_text)
+
+ if 'additional_collection_information' in info_for_yaml_dict['sample']:
+ info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+ else:
+ info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+ elif len(GBQualifier_value_text.split('-')) == 2:
+ date_to_write += '-15'
+
+ if 'additional_collection_information' in info_for_yaml_dict['sample']:
+ info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+ else:
+ info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+ elif len(GBQualifier_value_text.split('-')) == 3:
+ GBQualifier_value_text_list = GBQualifier_value_text.split('-')
+
+ if GBQualifier_value_text_list[1].isalpha():
+ date_to_write = GBQualifier_value_text_list[1] + ' ' + GBQualifier_value_text_list[0] + ' ' + GBQualifier_value_text_list[2]
+
+ info_for_yaml_dict['sample']['collection_date'] = date_to_write
elif GBQualifier_name_text in ['lat_lon', 'country']:
if GBQualifier_value_text == 'Hong Kong':
GBQualifier_value_text = 'China: Hong Kong'
@@ -237,7 +260,10 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text
elif GBQualifier_name_text == 'note':
- info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text
+ if 'additional_collection_information' in info_for_yaml_dict['sample']:
+ info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text
+ else:
+ info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text
elif GBQualifier_name_text == 'isolate':
info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
elif GBQualifier_name_text == 'db_xref':
@@ -254,7 +280,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw:
json.dump(info_for_yaml_dict, fw, indent=2)
-
+
if len(missing_value_list) > 0:
with open('missing_terms.tsv', 'w') as fw:
fw.write('\n'.join(missing_value_list))
diff --git a/scripts/import.cwl b/scripts/import.cwl
new file mode 100644
index 0000000..d84516b
--- /dev/null
+++ b/scripts/import.cwl
@@ -0,0 +1,30 @@
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+inputs:
+ scripts:
+ type: File
+ default:
+ class: File
+ location: import_to_arvados.py
+ inputBinding: {position: 1}
+ importScript:
+ type: File
+ default:
+ class: File
+ location: from_genbank_to_fasta_and_yaml.py
+ inputBinding: {position: 2}
+ dict:
+ type: Directory
+ default:
+ class: Directory
+ location: dict_ontology_standardization
+ inputBinding: {position: 3}
+outputs: []
+requirements:
+ DockerRequirement:
+ dockerPull: bh20-seq-uploader/import
+ NetworkAccess:
+ networkAccess: true
+ WorkReuse:
+ enableReuse: false
diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py
new file mode 100644
index 0000000..78cd13d
--- /dev/null
+++ b/scripts/import_to_arvados.py
@@ -0,0 +1,14 @@
+import os
+import subprocess
+import glob
+import sys
+
+os.chdir(os.environ["TMPDIR"])
+os.symlink(sys.argv[2], "dict_ontology_standardization")
+subprocess.run(sys.argv[1])
+
+os.chdir("fasta_and_yaml")
+fasta_files = glob.glob("*.fasta")
+
+for f in fasta_files:
+ subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]])
diff --git a/scripts/sequences.acc b/scripts/sequences.acc
index a99c4e6..697d868 100644
--- a/scripts/sequences.acc
+++ b/scripts/sequences.acc
@@ -1,4 +1,299 @@
NC_045512
+MT394528
+MT394529
+MT394530
+MT394531
+MT394864
+MT396241
+MT396242
+MT396243
+MT396244
+MT396245
+MT396246
+MT396247
+MT396248
+MT396266
+MT380726
+MT380727
+MT380728
+MT380729
+MT380730
+MT380731
+MT380732
+MT380733
+MT380734
+MT385414
+MT385415
+MT385416
+MT385417
+MT385418
+MT385419
+MT385420
+MT385421
+MT385422
+MT385423
+MT385424
+MT385425
+MT385426
+MT385427
+MT385428
+MT385429
+MT385430
+MT385431
+MT385432
+MT385433
+MT385434
+MT385435
+MT385436
+MT385437
+MT385438
+MT385439
+MT385440
+MT385441
+MT385442
+MT385443
+MT385444
+MT385445
+MT385446
+MT385447
+MT385448
+MT385449
+MT385450
+MT385451
+MT385452
+MT385453
+MT385454
+MT385455
+MT385456
+MT385457
+MT385458
+MT385459
+MT385460
+MT385461
+MT385462
+MT385463
+MT385464
+MT385465
+MT385466
+MT385467
+MT385468
+MT385469
+MT385470
+MT385471
+MT385472
+MT385473
+MT385474
+MT385475
+MT385476
+MT385477
+MT385478
+MT385479
+MT385480
+MT385481
+MT385482
+MT385483
+MT385484
+MT385485
+MT385486
+MT385487
+MT385488
+MT385489
+MT385490
+MT385491
+MT385492
+MT385493
+MT385494
+MT385495
+MT385496
+MT385497
+MT186683
+MT252677
+MT252678
+MT252679
+MT252680
+MT252681
+MT252682
+MT252683
+MT252684
+MT252685
+MT252686
+MT252687
+MT252688
+MT252689
+MT252690
+MT252691
+MT252692
+MT252693
+MT252694
+MT252695
+MT252696
+MT252697
+MT252698
+MT252699
+MT252700
+MT252701
+MT252702
+MT252703
+MT252704
+MT252705
+MT252706
+MT252707
+MT252708
+MT252709
+MT252710
+MT252711
+MT252712
+MT252713
+MT252715
+MT252716
+MT252717
+MT252719
+MT252721
+MT252723
+MT252725
+MT252726
+MT252728
+MT252729
+MT252730
+MT252733
+MT252734
+MT252735
+MT252736
+MT252737
+MT252738
+MT252739
+MT252740
+MT252741
+MT252742
+MT252745
+MT252746
+MT252747
+MT252748
+MT252749
+MT252756
+MT252757
+MT252758
+MT252761
+MT252763
+MT252764
+MT252765
+MT252766
+MT252767
+MT252768
+MT252769
+MT252770
+MT252771
+MT252772
+MT252773
+MT252774
+MT252775
+MT252778
+MT252779
+MT252780
+MT252781
+MT252782
+MT252783
+MT252784
+MT252785
+MT252787
+MT252788
+MT252792
+MT252793
+MT252794
+MT252795
+MT252797
+MT252798
+MT252799
+MT252800
+MT252801
+MT252802
+MT252803
+MT252804
+MT252805
+MT252806
+MT252807
+MT252808
+MT252809
+MT252810
+MT252811
+MT252821
+MT252822
+MT252823
+MT252824
+MT339043
+MT365033
+MT374101
+MT374102
+MT374103
+MT374104
+MT374105
+MT374106
+MT374107
+MT374108
+MT374109
+MT374110
+MT374111
+MT374112
+MT374113
+MT374114
+MT374115
+MT374116
+MT375428
+MT375429
+MT375430
+MT375431
+MT375432
+MT375433
+MT375434
+MT375435
+MT375436
+MT375437
+MT375438
+MT375439
+MT375440
+MT375441
+MT375442
+MT375443
+MT375444
+MT375445
+MT375446
+MT375447
+MT375448
+MT375449
+MT375450
+MT375451
+MT375452
+MT375453
+MT375454
+MT375455
+MT375456
+MT375457
+MT375458
+MT375459
+MT375460
+MT375461
+MT375462
+MT375463
+MT375464
+MT375465
+MT375466
+MT375467
+MT375468
+MT375469
+MT375470
+MT375471
+MT375472
+MT375473
+MT375474
+MT375475
+MT375476
+MT375477
+MT375478
+MT375479
+MT375480
+MT375481
+MT375482
+MT375483
MT370516
MT370517
MT370518
@@ -225,6 +520,8 @@ MT372480
MT372481
MT372482
MT372483
+7BV2_P
+7BV2_T
LC542976
LC542809
MT114412
diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl
new file mode 100644
index 0000000..0f364fc
--- /dev/null
+++ b/semantic_enrichment/countries.ttl
@@ -0,0 +1,279 @@
+@prefix ns1: <http://www.wikidata.org/prop/direct/> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+<http://www.wikidata.org/entity/Q108143> rdfs:label "San Diego County" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-116.77 33.02)" .
+
+<http://www.wikidata.org/entity/Q110403> rdfs:label "Snohomish County" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-121.71 48.04)" .
+
+<http://www.wikidata.org/entity/Q1166> rdfs:label "Michigan" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-85.58 44.34)" .
+
+<http://www.wikidata.org/entity/Q11746> rdfs:label "Wuhan" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(114.288055555 30.587222222)" .
+
+<http://www.wikidata.org/entity/Q1186> rdfs:label "Kerala" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+ ns1:P625 "Point(76.972 8.5074)" .
+
+<http://www.wikidata.org/entity/Q1204> rdfs:label "Illinois" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-89.0 40.0)" .
+
+<http://www.wikidata.org/entity/Q1221> rdfs:label "Idaho" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-114.0 45.0)" .
+
+<http://www.wikidata.org/entity/Q1223> rdfs:label "Washington" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-120.5 47.5)" .
+
+<http://www.wikidata.org/entity/Q1227> rdfs:label "Nevada" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-117.0 39.0)" .
+
+<http://www.wikidata.org/entity/Q123304> rdfs:label "Antioquia Department" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q739> ;
+ ns1:P625 "Point(-75.566666666 6.216666666)" .
+
+<http://www.wikidata.org/entity/Q1370> rdfs:label "Virginia" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-79.0 37.5)" .
+
+<http://www.wikidata.org/entity/Q1384> rdfs:label "New York" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-75.0 43.0)" .
+
+<http://www.wikidata.org/entity/Q1387> rdfs:label "Rhode Island" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-71.5 41.7)" .
+
+<http://www.wikidata.org/entity/Q1391> rdfs:label "Maryland" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-76.7 39.0)" .
+
+<http://www.wikidata.org/entity/Q1397> rdfs:label "Ohio" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-82.5 40.5)" .
+
+<http://www.wikidata.org/entity/Q1400> rdfs:label "Pennsylvania" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-77.5 41.0)" .
+
+<http://www.wikidata.org/entity/Q1408> rdfs:label "New Jersey" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-74.5 40.0)" .
+
+<http://www.wikidata.org/entity/Q1415> rdfs:label "Indiana" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-86.216666666 39.933333333)" .
+
+<http://www.wikidata.org/entity/Q1428> rdfs:label "Georgia" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-83.5 33.0)" .
+
+<http://www.wikidata.org/entity/Q1439> rdfs:label "Texas" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-100.0 31.0)" .
+
+<http://www.wikidata.org/entity/Q1454> rdfs:label "North Carolina" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-80.0 35.5)" .
+
+<http://www.wikidata.org/entity/Q1456> rdfs:label "South Carolina" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-81.0 34.0)" .
+
+<http://www.wikidata.org/entity/Q15174> rdfs:label "Shenzhen" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(114.054 22.535)" .
+
+<http://www.wikidata.org/entity/Q1527> rdfs:label "Minnesota" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-94.0 46.0)" .
+
+<http://www.wikidata.org/entity/Q1537> rdfs:label "Wisconsin" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-89.5 44.5)" .
+
+<http://www.wikidata.org/entity/Q1546> rdfs:label "Iowa" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-93.0 42.0)" .
+
+<http://www.wikidata.org/entity/Q1553> rdfs:label "Nebraska" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-100.0 41.5)" .
+
+<http://www.wikidata.org/entity/Q1558> rdfs:label "Kansas" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-98.0 38.5)" .
+
+<http://www.wikidata.org/entity/Q1581> rdfs:label "Missouri" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-92.5 38.5)" .
+
+<http://www.wikidata.org/entity/Q1588> rdfs:label "Louisiana" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-92.0 31.0)" .
+
+<http://www.wikidata.org/entity/Q16572> rdfs:label "Guangzhou" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(113.258976 23.128795)" .
+
+<http://www.wikidata.org/entity/Q1854> rdfs:label "Ho Chi Minh City" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q881> ;
+ ns1:P625 "Point(106.62965 10.82302)",
+ "Point(106.633333333 10.816666666)" .
+
+<http://www.wikidata.org/entity/Q198244> rdfs:label "Zhuozhou" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(115.99176 39.48873)" .
+
+<http://www.wikidata.org/entity/Q36687> rdfs:label "Victoria" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q408> ;
+ ns1:P625 "Point(144.0 -37.0)" .
+
+<http://www.wikidata.org/entity/Q43194> rdfs:label "Yunnan" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(101.5 24.5)" .
+
+<http://www.wikidata.org/entity/Q4970> rdfs:label "Hangzhou" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(120.1675 30.25)" .
+
+<http://www.wikidata.org/entity/Q62> rdfs:label "San Francisco" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-122.416388888 37.7775)" .
+
+<http://www.wikidata.org/entity/Q759> rdfs:label "New Hampshire" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-71.5 44.0)" .
+
+<http://www.wikidata.org/entity/Q771> rdfs:label "Massachusetts" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-71.8 42.3)" .
+
+<http://www.wikidata.org/entity/Q779> rdfs:label "Connecticut" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-72.7 41.6)" .
+
+<http://www.wikidata.org/entity/Q782> rdfs:label "Hawaii" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-157.796388888 21.311388888)" .
+
+<http://www.wikidata.org/entity/Q812> rdfs:label "Florida" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-81.631666666 28.133333333)" .
+
+<http://www.wikidata.org/entity/Q816> rdfs:label "Arizona" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-111.656944 34.286667)" .
+
+<http://www.wikidata.org/entity/Q81725> rdfs:label "KwaZulu-Natal" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q258> ;
+ ns1:P625 "Point(31.0 -29.0)" .
+
+<http://www.wikidata.org/entity/Q824> rdfs:label "Oregon" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-120.575 43.935833)" .
+
+<http://www.wikidata.org/entity/Q829> rdfs:label "Utah" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-111.5 39.5)" .
+
+<http://www.wikidata.org/entity/Q8686> rdfs:label "Shanghai" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(121.466666666 31.166666666)" .
+
+<http://www.wikidata.org/entity/Q8818> rdfs:label "Valencia" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q29> ;
+ ns1:P625 "Point(-0.375 39.466666666)" .
+
+<http://www.wikidata.org/entity/Q956> rdfs:label "Beijing" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(116.391388888 39.905)" .
+
+<http://www.wikidata.org/entity/Q99> rdfs:label "California" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-120.0 37.0)" .
+
+<http://www.wikidata.org/entity/Q142> rdfs:label "France" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q142> ;
+ ns1:P625 "Point(2.0 47.0)" .
+
+<http://www.wikidata.org/entity/Q155> rdfs:label "Brazil" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q155> ;
+ ns1:P625 "Point(-53.0 -14.0)" .
+
+<http://www.wikidata.org/entity/Q258> rdfs:label "South Africa" .
+
+<http://www.wikidata.org/entity/Q29> rdfs:label "Spain" .
+
+<http://www.wikidata.org/entity/Q33> rdfs:label "Finland" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q33> ;
+ ns1:P625 "Point(27.0 65.0)" .
+
+<http://www.wikidata.org/entity/Q34> rdfs:label "Sweden" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q34> ;
+ ns1:P625 "Point(15.0 61.0)" .
+
+<http://www.wikidata.org/entity/Q38> rdfs:label "Italy" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q38> ;
+ ns1:P625 "Point(12.5 42.5)" .
+
+<http://www.wikidata.org/entity/Q408> rdfs:label "Australia" .
+
+<http://www.wikidata.org/entity/Q41> rdfs:label "Greece" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q41> ;
+ ns1:P625 "Point(23.0 38.5)" .
+
+<http://www.wikidata.org/entity/Q419> rdfs:label "Peru" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q419> ;
+ ns1:P625 "Point(-76.0 -9.4)" .
+
+<http://www.wikidata.org/entity/Q43> rdfs:label "Turkey" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q43> ;
+ ns1:P625 "Point(36.0 39.0)" .
+
+<http://www.wikidata.org/entity/Q668> rdfs:label "India" .
+
+<http://www.wikidata.org/entity/Q739> rdfs:label "Colombia" .
+
+<http://www.wikidata.org/entity/Q794> rdfs:label "Iran" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q794> ;
+ ns1:P625 "Point(53.0 32.0)" .
+
+<http://www.wikidata.org/entity/Q801> rdfs:label "Israel" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q801> ;
+ ns1:P625 "Point(35.0 31.0)" .
+
+<http://www.wikidata.org/entity/Q837> rdfs:label "Nepal" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q837> ;
+ ns1:P625 "Point(84.0 28.0)" .
+
+<http://www.wikidata.org/entity/Q865> rdfs:label "Taiwan" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q865> ;
+ ns1:P625 "Point(121.0 24.0)" .
+
+<http://www.wikidata.org/entity/Q881> rdfs:label "Vietnam" .
+
+<http://www.wikidata.org/entity/Q884> rdfs:label "South Korea" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q884> ;
+ ns1:P625 "Point(128.0 36.0)" .
+
+<http://www.wikidata.org/entity/Q148> rdfs:label "People's Republic of China" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q148> ;
+ ns1:P625 "Point(103.0 35.0)" .
+
+<http://www.wikidata.org/entity/Q30> rdfs:label "United States of America" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-77.036666666 38.895)" .
+
diff --git a/semantic_enrichment/labels.ttl b/semantic_enrichment/labels.ttl
new file mode 100644
index 0000000..b4e5d1f
--- /dev/null
+++ b/semantic_enrichment/labels.ttl
@@ -0,0 +1,24 @@
+<http://edamontology.org/data_1875> <http://www.w3.org/2000/01/rdf-schema#label> "NCBI taxon" .
+<http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.w3.org/2000/01/rdf-schema#label> "geographic location" .
+<http://purl.obolibrary.org/obo/FLU_0000848> <http://www.w3.org/2000/01/rdf-schema#label> "sequence coverage" .
+<http://purl.obolibrary.org/obo/NCIT_C41206> <http://www.w3.org/2000/01/rdf-schema#label> "Institution" .
+<http://purl.obolibrary.org/obo/NCIT_C42781> <http://www.w3.org/2000/01/rdf-schema#label> "Author" .
+<http://purl.obolibrary.org/obo/OBI_0001479> <http://www.w3.org/2000/01/rdf-schema#label> "specimen from organism" .
+<http://purl.obolibrary.org/obo/OBI_0600047> <http://www.w3.org/2000/01/rdf-schema#label> "sequencing assay" .
+<http://semanticscience.org/resource/SIO_000115> <http://www.w3.org/2000/01/rdf-schema#label> "identifier" .
+<http://www.ebi.ac.uk/efo/EFO_0000532> <http://www.w3.org/2000/01/rdf-schema#label> "host" .
+<http://semanticscience.org/resource/SIO_001167> <http://www.w3.org/2000/01/rdf-schema#label> "comment" .
+<http://www.ebi.ac.uk/efo/EFO_0002699> <http://www.w3.org/2000/01/rdf-schema#label> "high throughput sequencer" .
+<http://semanticscience.org/resource/SIO_010055> <http://www.w3.org/2000/01/rdf-schema#label> "strain" .
+<http://purl.obolibrary.org/obo/OBI_0001895> <http://www.w3.org/2000/01/rdf-schema#label> "specimen collector" .
+<http://edamontology.org/data_2091> <http://www.w3.org/2000/01/rdf-schema#label> "Accession" .
+<http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164> <http://www.w3.org/2000/01/rdf-schema#label> "collection date" .
+<http://purl.obolibrary.org/obo/NCIT_C3833> <http://www.w3.org/2000/01/rdf-schema#label> "Asymptomatic" .
+<http://purl.obolibrary.org/obo/NCIT_C25269> <http://www.w3.org/2000/01/rdf-schema#label> "Symptomatic" .
+<http://purl.obolibrary.org/obo/GENEPIO_0002020> <http://www.w3.org/2000/01/rdf-schema#label> "admitted to hospital" .
+<http://purl.obolibrary.org/obo/GENEPIO_0001849> <http://www.w3.org/2000/01/rdf-schema#label> "discharged from hospital" .
+<http://purl.obolibrary.org/obo/NCIT_C28554> <http://www.w3.org/2000/01/rdf-schema#label> "Dead" .
+<http://purl.obolibrary.org/obo/NCIT_C37987> <http://www.w3.org/2000/01/rdf-schema#label> "Alive" .
+<http://purl.obolibrary.org/obo/NCIT_C115935> <http://www.w3.org/2000/01/rdf-schema#label> "Healthy" .
+<http://purl.obolibrary.org/obo/PATO_0000384> <http://www.w3.org/2000/01/rdf-schema#label> "male" .
+<http://purl.obolibrary.org/obo/PATO_0000383> <http://www.w3.org/2000/01/rdf-schema#label> "female" .
diff --git a/setup.py b/setup.py
index 4ab6329..412c103 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,8 @@ try:
except ImportError:
tagger = egg_info_cmd.egg_info
-install_requires = ["arvados-python-client", "schema-salad", "python-magic", "pyshex"]
+install_requires = ["arvados-python-client", "schema-salad",
+ "python-magic", "pyshex", "py-dateutil"]
web_requires = ["flask", "pyyaml"]
needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv)
@@ -26,7 +27,7 @@ setup(
version="1.0",
description="Biohackathon sequence uploader",
long_description=open(README).read(),
- long_description_content_type="text/x-rst",
+ long_description_content_type="text/markdown",
author="Peter Amstutz",
author_email="peter.amstutz@curii.com",
license="Apache 2.0",
@@ -34,7 +35,8 @@ setup(
package_data={"bh20sequploader": ["bh20seq-schema.yml",
"bh20seq-options.yml",
"bh20seq-shex.rdf",
- "validation/formats"],
+ "validation/formats",
+ "SARS-CoV-2-reference.fasta",],
},
install_requires=install_requires,
extras_require={
diff --git a/workflows/pangenome-generate/minimap2.cwl b/workflows/pangenome-generate/minimap2.cwl
index 42d1dce..bf8eb4c 100644
--- a/workflows/pangenome-generate/minimap2.cwl
+++ b/workflows/pangenome-generate/minimap2.cwl
@@ -12,7 +12,7 @@ hints:
ResourceRequirement:
coresMin: 8
coresMax: 32
- ramMin: $(9 * 1024)
+ ramMin: $(15 * 1024)
outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
stdout: $(inputs.readsFA.nameroot).paf
baseCommand: minimap2
diff --git a/workflows/pangenome-generate/odgi_to_rdf.cwl b/workflows/pangenome-generate/odgi_to_rdf.cwl
index 079d6fb..e6a279b 100644
--- a/workflows/pangenome-generate/odgi_to_rdf.cwl
+++ b/workflows/pangenome-generate/odgi_to_rdf.cwl
@@ -3,10 +3,12 @@ class: CommandLineTool
cwlVersion: v1.1
hints:
DockerRequirement:
- dockerPull: spodgi/spodgi
+ dockerPull: jerven/spodgi:0.0.6
requirements:
InlineJavascriptRequirement: {}
ShellCommandRequirement: {}
+ ResourceRequirement:
+ ramMin: $((2 * 1024) + 1)
inputs:
- id: odgi
type: File