diff options
29 files changed, 1862 insertions, 410 deletions
@@ -1,4 +1,20 @@ *.py~ + +# Distribution / packaging build/ cache.txt metadata.ttl +__pycache__/ +eggs/ +.eggs/ +*.egg-info/ +*.egg + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ @@ -35,6 +35,14 @@ Note that you will need to repeat the `. venv/bin/activate` step from this direc 3. **Install the tool.** Once in your virtualenv, install this project: +Install from PyPI: + +```sh +pip3 install bh20-seq-uploader +``` + +Install from git: + ```sh pip3 install git+https://github.com/arvados/bh20-seq-resource.git@master ``` @@ -166,7 +174,7 @@ To run it locally: ``` virtualenv --python python3 venv . venv/bin/activate -pip install -e .[web] +pip install -e ".[web]" env FLASK_APP=bh20simplewebuploader/main.py flask run ``` diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 8d0f562..07e5f69 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -90,7 +90,6 @@ def run_workflow(api, parent_project, workflow_uuid, name, inputobj): cmd = ["arvados-cwl-runner", "--submit", "--no-wait", - "--debug", "--project-uuid=%s" % project["uuid"], "arvwf:%s" % workflow_uuid, tmp.name] @@ -137,6 +136,7 @@ def start_pangenome_analysis(api, "location": schema_ref } } + validated.sort(key=lambda v: v["portable_data_hash"]) for v in validated: inputobj["inputReads"].append({ "class": "File", diff --git a/bh20sequploader/SARS-CoV-2-reference.fasta b/bh20sequploader/SARS-CoV-2-reference.fasta new file mode 100644 index 0000000..b364687 --- /dev/null +++ b/bh20sequploader/SARS-CoV-2-reference.fasta @@ -0,0 +1,430 @@ +>NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA +CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC +TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG +TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC 
+CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC +GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG +CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT +GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC +GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT +TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA +GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG +TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG +CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTG +TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTG +CTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAA +ATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA +CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCAC +CAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA +GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACT +ACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAG +GACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCG +CACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCA +CGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACA +ACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA +GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGAT +TATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAG +GTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCG +TGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCC +GCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTG +ATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG +GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTT 
+AAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA +TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGT +AAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTA +GGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCC +TACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT +AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAA +GCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGT +ACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAA +GGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT +GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAA +ATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC +ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAA +TTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAG +AAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATT +TGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAA +CAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTC +AACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT +AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACA +GTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTA +CTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAG +TTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGT +GAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTAT +TATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA +TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGT +GAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA +AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAAC +TCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCA 
+GATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTG +ATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT +GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAAT +GGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTA +TTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGC +AGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA +TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAA +CAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA +TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTT +TCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAG +AACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACA +ACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCAC +CTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTA +AGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA +ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGT +AAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTG +ATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAA +TGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAA +ATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTA +ACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT +GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGT +GGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT +TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTC +ACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGT +GAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAG +ACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG +TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAG 
+TTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAAC +CATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAA +CCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT +GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAAC +CTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG +TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGA +ATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGA +AAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAA +TAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTT +ACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTG +CTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC +AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTA +TTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAG +CAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAA +TTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTAC +TCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAG +GCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT +TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAA +TGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT +ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTC +TTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATC +TTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTT +GTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG +GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGT +GATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAA +GACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCA +TCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC 
+AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAAT +GTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT +AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTT +AATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTG +AACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGT +TGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTT +ACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTG +GTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT +ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAG +AATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAG +CACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTT +TGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAA +ATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTA +ACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC +ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGC +ACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC +CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTT +TAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTAT +GAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACC +TTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC +AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCA +GGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTG +GTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTA +CTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC +CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTT +ACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT +CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGG 
+TTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTG +CGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTAC +GCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGC +TACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTC +TTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC +ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTT +GATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAG +ATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGG +ACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAG +TTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTT +ACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG +TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCAT +GCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA +CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTT +TCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTA +ACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTG +CTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA +TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACA +ATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTC +AATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTC +TGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC +ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATA +TGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT +AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTG +ACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCT +CTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTG +TGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTC 
+TTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTG +GTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA +GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTA +GCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAAC +TCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAA +AGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTA +GACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTA +GTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA +TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCA +GCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG +AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAA +TGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACA +ACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACAT +TTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG +TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCT +GCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTA +CACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACT +TGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC +TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTAT +ACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT +ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGAT +GCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGT +GTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGG +TGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTA +AAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAG +TCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA +GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCA 
+CAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAA +ATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTT +GTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTC +CAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCA +ACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC +ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATG +ATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT +AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAA +GATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTG +TAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGT +TGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA +AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATG +ACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGG +ACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAG +CTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT +ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTC +AGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT +GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTC +AGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAG +ACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCT +AACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGAC +TTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCC +TACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC +TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAG +GAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAG +TGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTT +AGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATA 
+GATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACC +AGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC +ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTAC +AACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC +ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTAT +GCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTA +TGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATAC +AATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC +GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTA +TAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATA +CATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGAT +AACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG +TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTT +ATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT +GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATT +GTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAA +TACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGT +GATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTG +AGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCT +TTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT +AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACC +GAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATT +AAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATC +TCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGG +GACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGT +GTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT +AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACAT 
+TAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA +AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTAC +ATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATT +TCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCC +TGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA +GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCAC +AAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTA +TAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGC +TCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA +ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTT +GCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC +TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACA +CTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACT +CATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAA +GAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTG +TTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTA +TGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA +CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAA +GTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATC +TATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTT +TCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTA +TGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCA +TGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT +AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAA +AGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA +CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGT +GACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTG 
+TATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAG +AGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC +ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTC +CATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTAT +AACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCT +TATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA +ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGG +ACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA +GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTA +AACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGA +CTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAA +CCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTAT +TTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCC +CAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG +AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTA +AACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATT +AGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTA +CTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTA +CAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTT +ATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG +ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAA +AATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT +ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTC +GCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTA +TACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTAC +GGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT +TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAA 
+ATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCT +AGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATG +GGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG +GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTG +GAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA +AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAG +GTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAA +CAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCA +ATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCA +GTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATG +TCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC +TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCC +CTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCAT +TTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGC +GAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTC +AAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTA +TTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT +TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCA +GGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA +ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTT +GAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATT +GTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTG +TTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC +ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTAT +GCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTG +ATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTC +TAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA 
+GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACT +TTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT +TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAAC +AAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTC +TGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGA +GATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAAC +CAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTA +CTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC +TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACT +CAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTG +GTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTAC +CACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCA +ACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAA +TAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC +AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCA +TTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT +GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACC +TTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGG +ACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTG +GAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA +AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCA +CAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATA +TCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAG +TTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT +ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTA +TGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA +GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTT 
+TCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACA +CATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACC +TGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTA +GGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTG +CCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC +ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGT +ATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACG +ACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGA +ATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTC +GCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCT +TGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT +GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTG +GCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT +AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTT +CTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTA +CTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATG +GGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA +ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGC +CTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAAT +TTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTAC +TCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT +TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGT +GAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT +CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGA +TTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTC +CTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTA +AGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAAT 
+AAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTC +ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC +TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGT +GATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAA +GAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTG +ACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAG +CAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTAC +TATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA +AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAAC +CAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA +GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACA +TACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAAT +TTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACT +GTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT +ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTG +CTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAA +GATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTG +TAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC +GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAA +TTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT +GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGA +AGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTG +ATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAG +TAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACT +GCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTC +CAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG +TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCT 
+GGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAA +AAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAAC +ATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGT +AGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCA +ATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG +TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGG +CAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC +AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACA +ATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACG +TGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGC +TGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC +TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGAT +TTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATG +CAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTT +GTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT +TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTAC +GATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT +TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAA + diff --git a/bh20sequploader/bh20seq-options.yml b/bh20sequploader/bh20seq-options.yml index 104ed6c..c553f41 100644 --- a/bh20sequploader/bh20seq-options.yml +++ b/bh20sequploader/bh20seq-options.yml @@ -35,38 +35,8 @@ sample_sequencing_technology: Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 -sample_sequencing_technology2: - Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 - Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566 - Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567 - Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 - Illumina: 
http://purl.obolibrary.org/obo/OBI_0000759 - IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 - Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894 - Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 - Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 - Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 - -sample_sequencing_technology3: - Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 - Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566 - Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567 - Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 - Illumina: http://purl.obolibrary.org/obo/OBI_0000759 - IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 - Ion Semiconductor Sequencing: http://purl.obolibraryorg/obo/NCIT_C125894 - Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 - Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 - Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 - specimen_source: nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 sputum: http://purl.obolibrary.org/obo/NCIT_C13278 bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 - -specimen_source2: - nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 - oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 - sputum: http://purl.obolibrary.org/obo/NCIT_C13278 - bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index ea813a0..99e1a11 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -48,6 +48,8 @@ $graph: type: string? 
jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C25688 + _type: "@id" + noLinkCheck: true host_treatment: doc: Process in which the act is intended to modify or alter host status type: string? @@ -55,9 +57,16 @@ $graph: _id: http://www.ebi.ac.uk/efo/EFO_0000727 host_vaccination: doc: List of vaccines given to the host - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/VO_0000002 + ethnicity: + doc: Ethinicity of the host e.g. http://purl.obolibrary.org/obo/HANCESTRO_0010 + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001014 + _type: "@id" + noLinkCheck: true additional_host_information: doc: Field for additional host information type: string? @@ -90,20 +99,13 @@ $graph: jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001895 collecting_institution: - doc: Institute that was responsible of sampeling + doc: Institute that was responsible for sampeling type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C41206 specimen_source: doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0001479 - _type: "@id" - noLinkCheck: true - specimen_source2: - doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb) - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001479 _type: "@id" @@ -119,16 +121,18 @@ $graph: jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 source_database_accession: - doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here - type: string? + doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here. Please use a resolveable URL (e.g. http://identifiers.org/insdc/LC522350.1#sequence) + type: string[]? 
jsonldPredicate: _id: http://edamontology.org/data_2091 + _type: "@id" + noLinkCheck: true - name: virusSchema type: record fields: virus_species: - doc: The name of a taxon from the NCBI taxonomy database + doc: The name of virus species from the NCBI taxonomy database, e.g. http://purl.obolibrary.org/obo/NCBITaxon_2697049 for Severe acute respiratory syndrome coronavirus 2 type: string jsonldPredicate: _id: http://edamontology.org/data_1875 @@ -145,21 +149,7 @@ $graph: fields: sample_sequencing_technology: doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0600047 - _type: "@id" - noLinkCheck: true - sample_sequencing_technology2: - doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/OBI_0600047 - _type: "@id" - noLinkCheck: true - sample_sequencing_technology3: - doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string? + type: string[]? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0600047 _type: "@id" @@ -170,13 +160,8 @@ $graph: jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: - doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) - type: float? - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/FLU_0000848 - sequencing_coverage2: - doc: If a second sequence technology was used you can submit its coverage here - type: float? + doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. [100]) - if multiple technologies were used multiple float values can be submitted e.g. [100, 20] + type: double[]? 
jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 additional_technology_information: @@ -189,13 +174,13 @@ $graph: type: record fields: authors: - doc: Name of the author(s) - type: string + doc: Name(s) of the author(s) + type: string[] jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C42781 submitter_name: - doc: Name of the submitter - type: string? + doc: Name of the submitter(s) + type: string[]? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000116 submitter_address: @@ -228,7 +213,7 @@ $graph: _id: http://purl.obolibrary.org/obo/NCIT_C19026 submitter_orcid: doc: ORCID of the submitter as a full URI, e.g. https://orcid.org/0000-0002-1825-0097 - type: string? + type: string[]? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000115 _type: "@id" diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index c3b0ae1..cdf2296 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -25,8 +25,9 @@ PREFIX wikidata: <http://www.wikidata.org/entity/> obo:NCIT_C42574 [ obo:UO_~ ] ?; obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ; efo:EFO_0000727 xsd:string ?; - obo:VO_0000002 xsd:string ?; + obo:VO_0000002 xsd:string {0,10}; sio:SIO_001167 xsd:string ?; + sio:SIO_001014 [ obo:HANCESTRO_~ ] ? 
; #ethnicity } :sampleShape { @@ -38,25 +39,26 @@ PREFIX wikidata: <http://www.wikidata.org/entity/> obo:OBI_0001479 IRI {0,2}; obo:OBI_0001472 xsd:string ?; sio:SIO_001167 xsd:string ?; + edam:data_2091 IRI {0,3}; } :submitterShape { - obo:NCIT_C42781 xsd:string ; - sio:SIO_000116 xsd:string ?; + obo:NCIT_C42781 xsd:string + ; + sio:SIO_000116 xsd:string *; sio:SIO_000172 xsd:string ?; obo:NCIT_C37984 xsd:string ?; obo:OBI_0600047 xsd:string ?; obo:NCIT_C37900 xsd:string ?; efo:EFO_0001741 xsd:string ?; obo:NCIT_C19026 xsd:string ?; - sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?; + sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/ {0,10}; sio:SIO_001167 xsd:string ?; } :technologyShape { obo:OBI_0600047 IRI {0,3} ; efo:EFO_0002699 xsd:string ?; - obo:FLU_0000848 xsd:double {0,2}; + obo:FLU_0000848 xsd:double OR xsd:integer {0,3}; sio:SIO_001167 xsd:string ?; } diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 4c4711d..10d1029 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -62,11 +62,16 @@ def main(): external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8') + try: + username = getpass.getuser() + except KeyError: + username = "unknown" + properties = { "sequence_label": seqlabel, "upload_app": "bh20-seq-uploader", "upload_ip": external_ip, - "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname()) + "upload_user": "%s@%s" % (username, socket.gethostname()) } col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" % diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index e47d66b..16cf2c9 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -1,6 +1,25 @@ import pkg_resources import tempfile import magic +import subprocess +import tempfile +import logging +import re + +def read_fasta(sequence): + entries = 0 + bases = [] + label = None + for line in sequence: + if line.startswith(">"): + 
label = line + entries += 1 + else: + bases.append(line) + if entries > 1: + raise ValueError("FASTA file contains multiple entries") + break + return label, bases def qc_fasta(sequence): schema_resource = pkg_resources.resource_stream(__name__, "validation/formats") @@ -13,16 +32,44 @@ def qc_fasta(sequence): sequence.seek(0) if seq_type == "text/fasta": # ensure that contains only one entry - entries = 0 - for line in sequence: - if line.startswith(">"): - entries += 1 - if entries > 1: - raise ValueError("FASTA file contains multiple entries") - break + submitlabel, submitseq = read_fasta(sequence) sequence.seek(0) + + with tempfile.NamedTemporaryFile() as tmp1: + refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta") + tmp1.write(refstring) + tmp1.write(submitlabel.encode("utf8")) + tmp1.write(("".join(submitseq)).encode("utf8")) + tmp1.flush() + try: + cmd = ["clustalw", "-infile="+tmp1.name, + "-quicktree", "-iteration=none", "-type=DNA"] + print("QC checking similarity to reference") + print(" ".join(cmd)) + result = subprocess.run(cmd, stdout=subprocess.PIPE) + res = result.stdout.decode("utf-8") + g1 = re.search(r"^Sequence 1: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE) + refbp = float(g1.group(1)) + g2 = re.search(r"^Sequence 2: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE) + subbp = float(g2.group(1)) + g3 = re.search(r"^Sequences \(1:2\) Aligned\. 
Score: (\d+(\.\d+)?)$", res, flags=re.MULTILINE) + similarity = float(g3.group(1)) + + print(g1.group(0)) + print(g2.group(0)) + print(g3.group(0)) + except Exception as e: + logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e) + + if (subbp/refbp) < .7: + raise ValueError("QC fail: submit sequence length is shorter than 70% reference") + if (subbp/refbp) > 1.3: + raise ValueError("QC fail: submit sequence length is greater than 130% reference") + if similarity < 70.0: + raise ValueError("QC fail: submit similarity is less than 70%") + return "sequence.fasta" elif seq_type == "text/fastq": return "reads.fastq" else: - raise ValueError("Sequence file does not look like FASTA or FASTQ") + raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ") diff --git a/bh20simplewebuploader/__init__.py b/bh20simplewebuploader/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/bh20simplewebuploader/__init__.py diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py index e88eb4c..126b8dd 100644 --- a/bh20simplewebuploader/main.py +++ b/bh20simplewebuploader/main.py @@ -8,7 +8,7 @@ import re import string import yaml import pkg_resources -from flask import Flask, request, redirect, send_file, send_from_directory, render_template +from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify import os.path import requests @@ -197,6 +197,14 @@ def generate_form(schema, options): record['type'] = 'number' # Choose a reasonable precision for the control record['step'] = '0.0001' + + ### This is to fix the homepage for the moment ## needs more love though + # implementation of the [] stuff instead of just text fields + ## ToDo - implement lists + elif field_type == 'string[]': + record['type'] = 'text' + elif field_type == 'float[]': + record['type'] = 'text' else: raise NotImplementedError('Unimplemented field type {} in {} in metadata 
schema'.format(field_type, type_name)) yield record @@ -358,7 +366,8 @@ def getAllaccessions(): payload = {'query': query, 'format': 'json'} r = requests.get(baseURL, params=payload) result = r.json()['results']['bindings'] - return str(result) + return jsonify([{'uri': x['fasta']['value'], + 'value': x['value']['value']} for x in result]) # parameter must be encoded e.g. http://arvados.org/keep:6e6276698ed8b0e6cd21f523e4f91179+123/sequence.fasta must become @@ -368,26 +377,69 @@ def getDetailsForSeq(): seq_id = request.args.get('seq') query="""SELECT DISTINCT ?key ?value WHERE {<placeholder> ?x [?key ?value]}""" query=query.replace("placeholder", seq_id) - print(query) + payload = {'query': query, 'format': 'json'} + r = requests.get(baseURL, params=payload) + result = r.json()['results']['bindings'] + return jsonify([{'uri': x['key']['value'], + 'value': x['value']['value']} for x in result]) + + +@app.route('/api/getSEQCountbytech', methods=['GET']) +def getSEQCountbytech(): + query="""SELECT ?tech ?tech_label (count(?fasta) as ?fastaCount) WHERE + {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> ?tech] + BIND (concat(?tech,"_label") as ?tech_label)} + GROUP BY ?tech ?tech_label ORDER BY DESC (?fastaCount) + """ + payload = {'query': query, 'format': 'json'} + r = requests.get(baseURL, params=payload) + result = r.json()['results']['bindings'] + return jsonify([{'Fasta Count': x['fastaCount']['value'], + 'tech': x['tech']['value'], + 'Label': x['tech_label']['value']} for x in result]) + +## Is this one really necessary or should we just use getSEQCountbytech instead? 
+@app.route('/api/getAvailableTech', methods=['GET']) +def getAvailableTech(): + query="""SELECT distinct ?tech ?tech_label WHERE + {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> ?tech] + BIND (concat(?tech,"_label") as ?tech_label) + } """ payload = {'query': query, 'format': 'json'} r = requests.get(baseURL, params=payload) result = r.json()['results']['bindings'] return str(result) +## List all Sequences/submissions by a given tech, as example e.g. http://purl.obolibrary.org/obo/OBI_0000759 +## Has to be encoded again so should be --> http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FOBI_0000759 @app.route('/api/getSEQbytech', methods=['GET']) def getSEQbytech(): - query="""SELECT ?specimen_source ?specimen_source_label (count(?fasta) as ?fastaCount) WHERE - {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> ?specimen_source] - BIND (concat(?specimen_source,"_label") as ?specimen_source_label)} - GROUP BY ?specimen_source ?specimen_source_label ORDER BY DESC (?fastaCount) + query="""SELECT ?fasta WHERE + {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0600047> <placeholder>] } """ + tech = request.args.get('tech') + query=query.replace("placeholder", tech) payload = {'query': query, 'format': 'json'} r = requests.get(baseURL, params=payload) result = r.json()['results']['bindings'] return str(result) + +## Example location, encoded http%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ1223 @app.route('/api/getSEQbyLocation', methods=['GET']) def getSEQbyLocation(): + query="""SELECT ?fasta WHERE {?fasta ?x[ <http://purl.obolibrary.org/obo/GAZ_00000448> <placeholder>]}""" + location=request.args.get('location') + query=query.replace("placeholder", location) + print(query) + payload = {'query': query, 'format': 'json'} + r = requests.get(baseURL, params=payload) + result = r.json()['results']['bindings'] + return str(result) + + +@app.route('/api/getSEQCountbyLocation', methods=['GET']) +def getSEQCountbyLocation(): query="""SELECT ?geoLocation ?geoLocation_label 
(count(?fasta) as ?fastaCount) WHERE {?fasta ?x [<http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] BIND (concat(?geoLocation,"_label") as ?geoLocation_label)} @@ -396,10 +448,13 @@ def getSEQbyLocation(): payload = {'query': query, 'format': 'json'} r = requests.get(baseURL, params=payload) result = r.json()['results']['bindings'] - return str(result) + return jsonify([{'Fasta Count': x['fastaCount']['value'], + 'GeoLocation': x['geoLocation']['value'], + 'GeoLocation Label': x['geoLocation_label']['value']} for x in result]) -@app.route('/api/getSEQbySpecimenSource', methods=['GET']) -def getSEQbySpecimenSource(): + +@app.route('/api/getSEQCountbySpecimenSource', methods=['GET']) +def getSEQCountbySpecimenSource(): query="""SELECT ?specimen_source ?specimen_source_label (count(?fasta) as ?fastaCount) WHERE {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0001479> ?specimen_source] BIND (concat(?specimen_source,"_label") as ?specimen_source_label)} @@ -409,11 +464,27 @@ def getSEQbySpecimenSource(): payload = {'query': query, 'format': 'json'} r = requests.get(baseURL, params=payload) result = r.json()['results']['bindings'] + return jsonify([{'Fasta Count': x['fastaCount']['value'], + 'Specimen Source': x['specimen_source']['value'], + 'Label': x['specimen_source_label']['value']} for x in result]) + +# Example specimen http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C155831 +@app.route('/api/getSEQbySpecimenSource', methods=['GET']) +def getSEQBySpecimenSource(): + query="""SELECT ?fasta ?specimen_source ?specimen_source_label WHERE + {?fasta ?x [<http://purl.obolibrary.org/obo/OBI_0001479> <placeholder>] + BIND (concat(?specimen_source,"_label") as ?specimen_source_label)} + """ + specimen=request.args.get('specimen') + query = query.replace("placeholder", specimen) + payload = {'query': query, 'format': 'json'} + r = requests.get(baseURL, params=payload) + result = r.json()['results']['bindings'] return str(result) #No data for this atm 
-@app.route('/api/getSEQbyHostHealthStatus', methods=['GET']) -def getSEQbyHostHealthStatus(): +@app.route('/api/getSEQCountbyHostHealthStatus', methods=['GET']) +def getSEQCountbyHostHealthStatus(): query="""SELECT ?health_status ?health_status_label (count(?fasta) as ?fastaCount) WHERE {?fasta ?x [<http://purl.obolibrary.org/obo/NCIT_C25688> ?health_status] BIND (concat(?health_status,"_label") as ?health_status_label)} @@ -423,4 +494,36 @@ def getSEQbyHostHealthStatus(): payload = {'query': query, 'format': 'json'} r = requests.get(baseURL, params=payload) result = r.json()['results']['bindings'] + return str(result) + +@app.route('/api/getSEQbyLocationAndTech', methods=['GET']) +def getSEQbyLocationAndTech(): + query="""SELECT ?fasta WHERE { ?fasta ?x [ + <http://purl.obolibrary.org/obo/GAZ_00000448> <placeholderLoc>; <http://purl.obolibrary.org/obo/OBI_0600047> <placeholderTech> ]}""" + location=request.args.get('location') + tech=request.args.get('tech') + query=query.replace("placeholderLoc", location) + query = query.replace("placeholderTech", tech) + print(query) + payload = {'query': query, 'format': 'json'} + r = requests.get(baseURL, params=payload) + result = r.json()['results']['bindings'] + return str(result) + + +# Example Location http%3A%2F%2Fwww.wikidata.org%2Fentity%2FQ1223 +# Example specimen http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCIT_C155831 +@app.route('/api/getSEQbyLocationAndSpecimenSource', methods=['GET']) +def getSEQbyLocationAndSpecimenSource(): + query="""SELECT ?fasta WHERE { ?fasta ?x [ + <http://purl.obolibrary.org/obo/GAZ_00000448> <placeholderLoc>; <http://purl.obolibrary.org/obo/OBI_0001479> <placeholderSpecimen> ]} + """ + location = request.args.get('location') + specimen = request.args.get('specimen') + query = query.replace("placeholderLoc", location) + query = query.replace("placeholderSpecimen", specimen) + print(query) + payload = {'query': query, 'format': 'json'} + r = requests.get(baseURL, params=payload) + result = 
r.json()['results']['bindings'] return str(result)
\ No newline at end of file diff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css new file mode 100644 index 0000000..57e29ef --- /dev/null +++ b/bh20simplewebuploader/static/main.css @@ -0,0 +1,269 @@ +hr { + margin: auto 0; +} + +body { + color: #101010; + background-color: #F5FFFF; + margin: 0; +} + +h1, h2, h3, h4 { + font-family: 'Inter', sans-serif; + color: #0ED1CD; +} + +h1 { + text-align: center; +} + +.intro { + color: #505050; + font-weight: 300; +} + +.header { + background-color: white; + margin: 0 auto; + padding: 20px; + text-align: center; + height: 150px; +} + +h2 > svg { + position: relative; + top: 8px; +} + +.logo { + float: right; +} + +p, form, .about, .footer { + font-family: 'Inter', sans-serif; + line-height: 1.5; +} + +form h4 { + text-transform: 'uppercase'; +} + +.intro, form, .search { + padding: 20px; +} + +.intro { + background-color: inherit; + margin: 0 auto; + padding: 20px; +} + +.about { + background-color: #B2F8F8; + margin: 30px auto; + padding: 20px; + width: 95%; + border-radius: 20px; +} + +.button { + border-radius: 5px; + background: #0ED1CD; + margin: 0.3em auto; + padding: 0.4em; +} + +.footer { + background: #058280;; + margin: 0 auto; + color: #fff; +} + +.footer a { + color: #fff; +} + +span.dropt {border-bottom: thin dotted; background: #ffeedd;} +span.dropt:hover {text-decoration: none; background: #ffffff; z-index: 6; } + +.grid-container { + display: grid; + grid-template-columns: repeat(4, 1fr); + grid-template-rows: auto; + row-gap:5px; + grid-template-areas: + "a a b b" + "a a c c" + "a a d d" + "e e e e" + "f f f f"; + grid-auto-flow: column; +} + +.about { + display: grid; + grid-template-columns: repeat(2, 1fr); + grid-auto-flow: row; +} + +.about h1 { + text-align: left; +} + +.about p { + font-weight: 300; + color: #505050; +} + +.intro { + grid-area: a; +} + +.fasta-file-select { + padding: 1em; + grid-area: b; +} + +a { + color: #40DBD8; + font-weight: 700; +} + 
+.fasta-file-select label, .metadata label { + font-weight: 600; +} + +.metadata { + padding: 1em; + grid-area: c; +} +.metadata_upload_form { + padding: 1em; + grid-area: c; +} + +#metadata_upload_form_spot { + grid-area: d; +} + +#metadata_fill_form_spot { + grid-area: e; +} + +#metadata_fill_form { + column-count: 4; + margin-top: 0.5em; + column-width: 250px; +} + +pre code { + background-color: #eee; + display: flex; + width: max-content; + margin: 0 auto; + overflow-y: scroll; + max-height: 300px; + padding: 10px; + border: solid 1px black; +} + +.record { + display: flex; + flex-direction: column; + border: solid 1px #808080; + padding: 1em; + background: #F8F8F8; + margin-bottom: 1em; + -webkit-column-break-inside: avoid; /* Chrome, Safari, Opera */ + page-break-inside: avoid; /* Firefox */ + break-inside: avoid; +} + +.record label { + font-size: small; + margin-top: 10px; +} + +.search-section { + display: flex; + justify-content: space-between; +} + +.search-section .filter-options { + display: flex; + flex-direction: column; + width: max-content; + padding: 20px; +} + +.search-section p { + margin: 0; +} + +.submit { + grid-area: f; + width: 17em; + justify-self: center; +} + +footer { + display: block; + width: 100%; +} + +.sponsors { + width: inherit; + display: flex; + flex-direction: row; + flex-wrap: wrap; + justify-content: space-evenly; + align-content: space-evenly; +} + +.sponsors a { + flex-grow: 4; + height: 200px; + margin: 10px; + background: white; + display: flex; + flex-direction: column; + justify-content: center; +} +.sponsors img { + width: 100%; +} +.metadata input#metadata_upload:checked ~ #metadata_upload_form_spot { + display: block; +} + +.metadata input#metadata_upload ~ #metadata_upload_form_spot { + display: none; +} + +.loader { + display: block; + border: 5px solid #f3f3f3; /* Light grey */ + border-top: 5px solid #3498db; /* Blue */ + border-radius: 50%; + width: 20px; + height: 20px; + margin-right: auto; + margin-left: 
auto; + animation: spin 1.5s linear infinite; +} + +.invisible { + display: none; +} + +@keyframes spin { + 0% { transform: rotate(0deg); } + 100% { transform: rotate(360deg); } +} + +@media only screen and (max-device-width: 480px) { + .grid-container { + display: flex; + flex-direction: column; + } +} diff --git a/bh20simplewebuploader/static/main.js b/bh20simplewebuploader/static/main.js new file mode 100644 index 0000000..96199a0 --- /dev/null +++ b/bh20simplewebuploader/static/main.js @@ -0,0 +1,47 @@ +function fetchAPI(apiEndPoint) { + fetch(scriptRoot + apiEndPoint) + .then(response => { + return response.json(); + }) + .then(data => { + document.getElementById("json").textContent = JSON.stringify(data, undefined, 2); + document.getElementById("results").classList.remove("invisible"); + document.getElementById("loader").classList.add("invisible"); + }); + document.getElementById("results").classList.add("invisible"); + document.getElementById("loader").classList.remove("invisible"); + +} + +let search = () => { + let m = document.getElementById('search-input').value; + fetchAPI(scriptRoot + "/api/getDetailsForSeq?seq=" + encodeURIComponent(m)); +} + +let fetchSEQBySpecimen = () => { + fetchAPI("/api/getSEQCountbySpecimenSource"); +} + +let fetchSEQByLocation = () => { + fetchAPI("/api/getSEQCountbyLocation"); +} + +let fetchSEQByTech = () => { + fetchAPI("/api/getSEQCountbytech"); +} + +let fetchAllaccessions = () => { + fetchAPI("/api/getAllaccessions"); +}; + +/** + * Show form if checked + */ +let fillFormSpot = document.getElementById('metadata_fill_form_spot'); +function displayForm() { + if (document.getElementById('metadata_form').checked) { + fillFormSpot.classList.remove("invisible"); + return; + } + fillFormSpot.classList.add("invisible"); +} diff --git a/bh20simplewebuploader/templates/form.html b/bh20simplewebuploader/templates/form.html index 02ae84d..ffd4158 100644 --- a/bh20simplewebuploader/templates/form.html +++ 
b/bh20simplewebuploader/templates/form.html @@ -1,152 +1,9 @@ <!DOCTYPE html> <html> - <style> - hr { - margin: auto 0; - } - - body { - color: #101010; - background-color: #F9EDE1; - } - - h1, h2, h3, h4 { - font-family: 'Roboto Slab', serif; - color: darkblue; - } - - h1 { - text-align: center; - } - - p { - color: #505050; - font-style: italic; - } - .header { - background-color: white; - margin: 0 auto; - padding: 20px; - text-align: center; - height: 150px; - } - - .logo { - float: right; - } - - p, form, .about, .footer { - font-family: 'Raleway', sans-serif; - line-height: 1.5; - } - - form h4 { - text-transform: 'uppercase'; - } - - .intro, form { - padding: 20px; - } - - .intro { - background-color: lightgrey; - margin: 0 auto; - padding: 20px; - } - - .about { - background-color: lightgrey; - margin: 0 auto; - padding: 20px; - } - .footer { - background-color: white; - margin: 0 auto; - } - - span.dropt {border-bottom: thin dotted; background: #ffeedd;} - span.dropt:hover {text-decoration: none; background: #ffffff; z-index: 6; } - - .grid-container { - display: grid; - grid-template-columns: repeat(4, 1fr); - grid-template-rows: auto; - row-gap:5px; - grid-template-areas: - "a a b b" - "a a c c" - "a a d d" - "e e e e" - "f f f f"; - grid-auto-flow: column; - } - - .intro { - grid-area: a; - } - - .fasta-file-select { - padding: 1em; - grid-area: b; - } - - .metadata { - padding: 1em; - grid-area: c; - } - .metadata_upload_form { - padding: 1em; - grid-area: c; - } - - #metadata_upload_form_spot { - grid-area: d; - } - - #metadata_fill_form_spot { - grid-area: e; - } - - #metadata_fill_form { - column-count: 4; - margin-top: 0.5em; - column-width: 250px; - } - - .record { - display: flex; - flex-direction: column; - border: solid 1px #808080; - padding: 1em; - background: #F8F8F8; - margin-bottom: 1em; - -webkit-column-break-inside: avoid; /* Chrome, Safari, Opera */ - page-break-inside: avoid; /* Firefox */ - break-inside: avoid; - } - - .record label { 
- font-size: small; - margin-top: 10px; - } - - .submit { - grid-area: f; - width: 17em; - justify-self: center; - } - - @media only screen and (max-device-width: 480px) { - .grid-container { - display: flex; - flex-direction: column; - } - } - </style> - <head> <meta charset="UTF-8"> - <link href="https://fonts.googleapis.com/css2?family=Raleway:wght@500&family=Roboto+Slab&display=swap" rel="stylesheet"> + <link href="https://fonts.googleapis.com/css2?family=Inter:wght@100;200;300;400;500;600;700;800;900&display=swap" rel="stylesheet"> + <link href="/static/main.css" rel="stylesheet" type="text/css"> <meta name="viewport" content="width=device-width, initial-scale=1"> <title>Web uploader for Public SARS-CoV-2 Sequence Resource</title> </head> @@ -158,28 +15,67 @@ <small>Disabled until we got everything wired up</small> </section> - <hr> + + <section class="search-section"> + <div class="filter-options" action="#"> + <p>[Demo] Display content sequences by: </p> + + <div> + <button class="button" onclick="fetchSEQBySpecimen()">Specimen Source</button> + <button class="button" onclick="fetchSEQByLocation()">Location</button> + <button class="button" onclick="fetchSEQByTech()">Tech</button> + <button class="button" onclick="fetchAllaccessions()">Allaccessions</button> + </div> + + </div> + + <div class="search"> + <input id="search-input" id="global-search" type="search" placeholder="FASTA uri" required> + <button class="button search-button" type="submit" onclick="search()"> + <span class="icon ion-search"> + <span class="sr-only">Search</span> + </span> + </button> + </div> + </section> + + <div id="loader" class="loader invisible"></div> + + <section id="results" class="invisible"> + <pre><code id="json"></code></pre> + </section> <section> <form action="/submit" method="POST" enctype="multipart/form-data" id="main_form" class="grid-container"> - <p class="intro"> - Upload your SARS-CoV-2 sequence (FASTA or FASTQ formats) with metadata (JSONLD) to the <a 
href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">public sequence resource</a>. The upload will trigger a - recompute with all available sequences into a Pangenome - available for - <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">download</a>! - Your uploaded sequence will automatically be processed - and incorporated into the public pangenome with - metadata using worklows from the High Performance Open Biology Lab defined <a href="https://github.com/hpobio-lab/viral-analysis/tree/master/cwl/pangenome-generate">here</a>. All data is published under - a <a href="https://creativecommons.org/licenses/by/4.0/">Creative - Commons 4.0 attribution license</a> (CC-BY-4.0). You - can take the published (GFA/RDF/FASTA) data and store it in - a triple store for further processing. We also plan to - combine identifiers with clinical data stored securely at <a href="https://redcap-covid19.elixir-luxembourg.org/redcap/">REDCap</a>. - A free command line version of the uploader can be - installed from <a href="https://github.com/arvados/bh20-seq-resource">source</a>. - </p> + <div class="intro"> + <p> + Upload your SARS-CoV-2 sequence (FASTA or FASTQ formats) with metadata (JSONLD) to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">public sequence resource</a>. The upload will trigger a + recompute with all available sequences into a Pangenome + available for + <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">download</a>! + </p> + <p> + Your uploaded sequence will automatically be processed + and incorporated into the public pangenome with + metadata using worklows from the High Performance Open Biology Lab defined <a href="https://github.com/hpobio-lab/viral-analysis/tree/master/cwl/pangenome-generate">here</a>. 
All data is published under + a <a href="https://creativecommons.org/licenses/by/4.0/">Creative + Commons 4.0 attribution license</a> (CC-BY-4.0). You + can take the published (GFA/RDF/FASTA) data and store it in + a triple store for further processing. We also plan to + combine identifiers with clinical data stored securely at <a href="https://redcap-covid19.elixir-luxembourg.org/redcap/">REDCap</a>. + A free command line version of the uploader can be + installed from <a href="https://github.com/arvados/bh20-seq-resource">source</a>. + </p> + + </div> <div class="fasta-file-select"> + <h2><svg class="bi bi-cloud-upload" width="1.2em" height="1.2em" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg"> + <path d="M4.887 6.2l-.964-.165A2.5 2.5 0 103.5 11H6v1H3.5a3.5 3.5 0 11.59-6.95 5.002 5.002 0 119.804 1.98A2.501 2.501 0 0113.5 12H10v-1h3.5a1.5 1.5 0 00.237-2.981L12.7 7.854l.216-1.028a4 4 0 10-7.843-1.587l-.185.96z"/> + <path fill-rule="evenodd" d="M5 8.854a.5.5 0 00.707 0L8 6.56l2.293 2.293A.5.5 0 1011 8.146L8.354 5.5a.5.5 0 00-.708 0L5 8.146a.5.5 0 000 .708z" clip-rule="evenodd"/> + <path fill-rule="evenodd" d="M8 6a.5.5 0 01.5.5v8a.5.5 0 01-1 0v-8A.5.5 0 018 6z" clip-rule="evenodd"/> + </svg> Upload SARS-CoV-2 Sequence</h2> + <label for="fasta">Select FASTA file of assembled genome (max 50K), or FASTQ of reads (<span class="dropt" title="For a larger fastq file you'll need to use a CLI uploader">max 150MB<span style="width:500px;"></span></span>) : </label> <br> <input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna,.fq" required> @@ -189,16 +85,16 @@ </div> <div class="metadata"> - <label>Select metadata submission method:</label> - <br> - <input type="radio" id="metadata_form" name="metadata_type" value="fill" onchange="setMode()" checked required> - <label for="metadata_form">Fill in metadata manually</label> - <input type="radio" id="metadata_upload" name="metadata_type" value="upload" onchange="setMode()" required> - 
<label for="metadata_upload">Upload metadata file</label> - <br> - <small>Make sure the metadata has submitter attribution details.</small> + <label>Select metadata submission method:</label> + <br> + <input type="radio" id="metadata_form" name="metadata_type" value="fill" checked onchange="displayForm()" required> + <label for="metadata_form">Fill in metadata manually</label> + <input type="radio" id="metadata_upload" name="metadata_type" value="upload" onchange="displayForm()" required> + <label for="metadata_upload">Upload metadata file</label> + <br> + <small>Make sure the metadata has submitter attribution details.</small> - <div id="metadata_upload_form_spot"> + <div id="metadata_upload_form_spot"> <div id="metadata_upload_form"> <br> <label for="metadata">Select JSON or YAML metadata file following <a href="https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml" target="_blank">this schema</a> and <a href="https://github.com/arvados/bh20-seq-resource/blob/master/example/metadata.yaml" target="_blank">example</a> (max 50K):</label> @@ -206,9 +102,9 @@ <input type="file" id="metadata" name="metadata" accept=".json,.yml,.yaml" required> <br> </div> - </div> - </div> + </div> + </div> <div id="metadata_fill_form_spot"> <div id="metadata_fill_form"> {% for record in fields %} @@ -246,40 +142,44 @@ {% endif %} {% endfor %} </div> + </div> <input class="submit" type="submit" value="Add to Pangenome"> </form> </section> -<hr> <br> <div class="about"> - <h3>ABOUT</h3> - <p> - This a public repository created at the COVID-19 BioHackathon - that has a low barrier to entry for uploading sequence data using - best practices. I.e., data is published with a creative commons - 4.0 (CC-4.0) license with metadata using state-of-the art - standards and, perhaps most importantly, providing standardized - workflows that get triggered on upload, so that results are - immediately available in standardized data formats. 
The repository - will be maintained and expanded for the duration of the - pandemic. To contribute data simply upload it! To contribute code - and/or workflows see - the <a href="https://github.com/arvados/bh20-seq-resource">project - repository</a>. For more information see the <a href="https://github.com/arvados/bh20-seq-resource/blob/master/paper/paper.md">paper</a> (WIP). - </p> - <br> + <div> + <h1>ABOUT</h1> + <p> + This a public repository created at the COVID-19 BioHackathon + that has a low barrier to entry for uploading sequence data using + best practices. I.e., data is published with a creative commons + 4.0 (CC-4.0) license with metadata using state-of-the art + standards and, perhaps most importantly, providing standardized + workflows that get triggered on upload, so that results are + immediately available in standardized data formats. The repository + will be maintained and expanded for the duration of the + pandemic. To contribute data simply upload it! To contribute code + and/or workflows see + the <a href="https://github.com/arvados/bh20-seq-resource">project + repository</a>. For more information see the <a href="https://github.com/arvados/bh20-seq-resource/blob/master/paper/paper.md">paper</a> (WIP). 
+ </p> + + </div> + <div class="sponsors"> + <a href="https://arvados.org/"><img src="static/image/arvados-logo.png"></a> + <a href="https://www.commonwl.org/"><img src="static/image/CWL-Logo-Header.png"></a> + + <a href="https://github.com/virtual-biohackathons/covid-19-bh20"> + <img src="static/image/covid19biohackathon.png"></a> + </div> </div> - - <hr> <div class="footer"> - <a href="https://arvados.org/"><img src="static/image/arvados-logo.png" align="top"></a> - <a href="https://www.commonwl.org/"><img src="static/image/CWL-Logo-Header.png" height="70"></a> + <!-- Sponsors --> - <a href="https://github.com/virtual-biohackathons/covid-19-bh20"> - <img src="static/image/covid19biohackathon.png" align="right" height="70"></a> <center> <small><a href="https://github.com/arvados/bh20-seq-resource">Source code</a> · Powered by <a href="https://www.commonwl.org/">Common Workflow Language</a> & <a href="https://arvados.org/">Arvados</a>; Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a> @@ -289,35 +189,10 @@ </div> - <script type="text/javascript"> - let uploadForm = document.getElementById('metadata_upload_form') - let uploadFormSpot = document.getElementById('metadata_upload_form_spot') - let fillForm = document.getElementById('metadata_fill_form') - let fillFormSpot = document.getElementById('metadata_fill_form_spot') - - function setUploadMode() { - // Make the upload form the one in use - uploadFormSpot.appendChild(uploadForm) - fillFormSpot.removeChild(fillForm) - } - - function setFillMode() { - // Make the fillable form the one in use - uploadFormSpot.removeChild(uploadForm) - fillFormSpot.appendChild(fillForm) - } - - function setMode() { - // Pick mode based on radio - if (document.getElementById('metadata_upload').checked) { - setUploadMode() - } else { - setFillMode() - } - } +<script type="text/javascript"> + let scriptRoot = {{ request.script_root|tojson|safe }}; +</script> - // Start in mode appropriate to 
selected form item - setMode() - </script> +<script type="text/javascript" src="/static/main.js"></script> </body> </html> diff --git a/example/maximum_metadata_example.yaml b/example/maximum_metadata_example.yaml new file mode 100644 index 0000000..1bc70d7 --- /dev/null +++ b/example/maximum_metadata_example.yaml @@ -0,0 +1,46 @@ +id: placeholder + +host: + host_id: XX1 + host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 + host_sex: http://purl.obolibrary.org/obo/PATO_0000384 + host_age: 20 + host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 + host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269 + host_treatment: Process in which the act is intended to modify or alter host status (Compounds) + host_vaccination: [vaccines1,vaccine2] + ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010 + additional_host_information: Optional free text field for addtional information + +sample: + sample_id: Id of the sample as defined by the submitter + collector_name: Name of the person that took the sample + collecting_institution: Institute that was responsible of sampling + specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835] + collection_date: "2020-01-01" + collection_location: http://www.wikidata.org/entity/Q148 + sample_storage_conditions: frozen specimen + source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] + additional_collection_information: Optional free text field for addtional information + +virus: + virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 + virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 + +technology: + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173] + sequence_assembly_method: Protocol used for assembly + sequencing_coverage: [70.0, 100.0] + additional_technology_information: Optional free text field for addtional information + +submitter: + authors: [John Doe, Joe Boe, Jonny 
Oe] + submitter_name: [John Doe] + submitter_address: John Doe's adress + originating_lab: John Doe kitchen + lab_address: John Doe's address + provider_sample_id: XXX1 + submitter_sample_id: XXX2 + publication: PMID00001113 + submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001] + additional_submitter_information: Optional free text field for addtional information
\ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml deleted file mode 100644 index a76616c..0000000 --- a/example/metadata.yaml +++ /dev/null @@ -1,43 +0,0 @@ -id: placeholder - -host: - host_id: XX1 - host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 - host_sex: http://purl.obolibrary.org/obo/NCIT_C27993 - host_age: 20 - host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 - host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269 - host_treatment: Process in which the act is intended to modify or alter host status (Compounds) - host_vaccination: List of vaccines given to the host (RRIDs?) - additional_host_information: Field for additional host information - -sample: - sample_id: Id of the sample as defined by the submitter - collector_name: Name of the person that took the sample - collecting_institution: Institute that was responsible of sampling - specimen_source: http://purl.obolibrary.org/obo/NCIT_C155831 - specimen_source2: http://purl.obolibrary.org/obo/NCIT_C155835 - collection_date: "2020-01-01" - collection_location: http://www.wikidata.org/entity/Q148 - sample_storage_conditions: XXX - additional_collection_information: XXX - -virus: - virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 - virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 - -technology: - sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173 - sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173 - sequence_assembly_method: Protocol used for assembly - sequencing_coverage: 70 - -submitter: - submitter_name: John Doe - submitter_address: John Doe's adress - originating_lab: John Doe kitchen - lab_address: John Doe's address - provider_sample_id: HmX - submitter_sample_id: xXx - authors: John Doe et all - submitter_orcid: https://orcid.org/0000-0000-0000-0000
\ No newline at end of file diff --git a/example/minimal_example.yaml b/example/minimal_metadata_example.yaml index 0e36a25..51f8a87 100644 --- a/example/minimal_example.yaml +++ b/example/minimal_metadata_example.yaml @@ -5,14 +5,14 @@ host: sample: sample_id: XX - collection_date: 2020-01 + collection_date: "2020-01-01" collection_location: http://www.wikidata.org/entity/Q148 virus: virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 technology: - sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632 + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] submitter: - authors: John Doe
\ No newline at end of file + authors: [John Doe]
\ No newline at end of file diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 20e8a9b..6b43137 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -39,6 +39,7 @@ Chad,http://www.wikidata.org/entity/Q657 Chile,http://www.wikidata.org/entity/Q298 China,http://www.wikidata.org/entity/Q148 China: Anhui,http://www.wikidata.org/entity/Q40956 +"China: Anhui, Fuyang":http://www.wikidata.org/entity/Q360584 China: Beijing,http://www.wikidata.org/entity/Q956 China: Chongqing,http://www.wikidata.org/entity/Q11725 China: Fujian,http://www.wikidata.org/entity/Q41705 @@ -48,6 +49,7 @@ China: Guangdong,http://www.wikidata.org/entity/Q15175 China: Guangxi Zhuang Autonomous Region,http://www.wikidata.org/entity/Q15176 China: Guangzhou,http://www.wikidata.org/entity/Q16572 China: Guizhou,http://www.wikidata.org/entity/Q47097 +China: Hangzhou,http://www.wikidata.org/entity/Q4970 China: Hainan,http://www.wikidata.org/entity/Q42200 China: Hebei,http://www.wikidata.org/entity/Q21208 China: Heilongjiang,http://www.wikidata.org/entity/Q19206 @@ -109,6 +111,7 @@ France,http://www.wikidata.org/entity/Q142 Gabon,http://www.wikidata.org/entity/Q1000 Georgia,http://www.wikidata.org/entity/Q230 Germany,http://www.wikidata.org/entity/Q183 +Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718 Ghana,http://www.wikidata.org/entity/Q117 Greece,http://www.wikidata.org/entity/Q41 Grenada,http://www.wikidata.org/entity/Q769 @@ -123,6 +126,7 @@ Iceland,http://www.wikidata.org/entity/Q189 Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389 India,http://www.wikidata.org/entity/Q668 India: Kerala State,http://www.wikidata.org/entity/Q1186 +India: Rajkot,http://www.wikidata.org/entity/Q1815245 Indonesia,http://www.wikidata.org/entity/Q252 Iran,http://www.wikidata.org/entity/Q794 Iran: Qum,http://www.wikidata.org/entity/Q131664 @@ 
-172,6 +176,7 @@ Mozambique,http://www.wikidata.org/entity/Q1029 Myanmar,http://www.wikidata.org/entity/Q836 Namibia,http://www.wikidata.org/entity/Q1030 Nauru,http://www.wikidata.org/entity/Q697 +Netherlands: Milheeze,https://www.wikidata.org/wiki/Q3314115 Nepal,http://www.wikidata.org/entity/Q837 New Zealand,http://www.wikidata.org/entity/Q664 Nicaragua,http://www.wikidata.org/entity/Q811 @@ -263,6 +268,7 @@ USA: CA,http://www.wikidata.org/entity/Q99 "USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143 USA: CO,http://www.wikidata.org/entity/Q1261 USA: CT,http://www.wikidata.org/entity/Q779 +USA: DC,http://www.wikidata.org/entity/Q3551781 USA: DE,http://www.wikidata.org/entity/Q1393 USA: FL,http://www.wikidata.org/entity/Q812 USA: GA,http://www.wikidata.org/entity/Q1428 @@ -293,6 +299,7 @@ USA: NM,http://www.wikidata.org/entity/Q1522 USA: North Carolina,http://www.wikidata.org/entity/Q1454 USA: NV,http://www.wikidata.org/entity/Q1227 USA: NY,http://www.wikidata.org/entity/Q1384 +USA: New York,http://www.wikidata.org/entity/Q1384 USA: OH,http://www.wikidata.org/entity/Q1397 USA: OK,http://www.wikidata.org/entity/Q1649 USA: OR,http://www.wikidata.org/entity/Q824 @@ -321,4 +328,4 @@ Viet Nam: Ho Chi Minh city,http://www.wikidata.org/entity/Q1854 Vietnam,http://www.wikidata.org/entity/Q881 Yemen,http://www.wikidata.org/entity/Q805 Zambia,http://www.wikidata.org/entity/Q953 -Zimbabwe,http://www.wikidata.org/entity/Q954 +Zimbabwe,http://www.wikidata.org/entity/Q954
\ No newline at end of file diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv index f5aeaae..7fa67f8 100644 --- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv +++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv @@ -1,5 +1,6 @@ nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 +nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831 respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831 naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831 diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile new file mode 100644 index 0000000..5bd38dd --- /dev/null +++ b/scripts/docker/Dockerfile @@ -0,0 +1,10 @@ +FROM debian:10 + +RUN apt-get update && \ + apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \ + python3 python3-pip python3-setuptools python3-dev python-pycurl \ + clustalw python3-biopython libcurl4-openssl-dev build-essential \ + libssl-dev && \ + apt-get clean + +RUN pip3 install bh20-seq-uploader
\ No newline at end of file diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 5257bd1..6f046ea 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -7,6 +7,8 @@ import xml.etree.ElementTree as ET import json import os +from dateutil import parser + num_ids_for_request = 100 dir_metadata = 'metadata_from_nuccore' @@ -37,20 +39,19 @@ if not os.path.exists(dir_metadata): tmp_list = [x.split('.')[0] for x in tmp_list] print(term, len(tmp_list)) - tmp_list=tmp_list - # tmp_list = tmp_list[0:2] # restricting to small run + #tmp_list = tmp_list[0:2] # restricting to small run id_set.update([x.split('.')[0] for x in tmp_list]) print(term_list, len(id_set)) - with open(path_ncbi_virus_accession) as f: - tmp_list = [line.strip('\n') for line in f] - - print('NCBI Virus', len(tmp_list)) - id_set.update(tmp_list) - - print(term_list + ['NCBI Virus'], len(id_set)) + if os.path.exists(path_ncbi_virus_accession): + with open(path_ncbi_virus_accession) as f: + tmp_list = [line.strip('\n') for line in f] + print('NCBI Virus', len(tmp_list)) + id_set.update(tmp_list) + term_list.append('NCBI Virus') + print(term_list, len(id_set)) for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i)) @@ -86,7 +87,7 @@ if not os.path.exists(dir_fasta_and_yaml): os.makedirs(dir_fasta_and_yaml) missing_value_list = [] - + for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: tree = ET.parse(path_metadata_xxx_xml) GBSet = tree.getroot() @@ -110,23 +111,23 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) 'submitter': {} } - + info_for_yaml_dict['sample']['sample_id'] = accession_version - info_for_yaml_dict['sample']['source_database_accession'] 
= accession_version - - + info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now + + # submitter info GBSeq_references = GBSeq.find('GBSeq_references') if GBSeq_references is not None: - info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq_references.iter('GBAuthor')]) - + info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] + GBReference = GBSeq_references.find('GBReference') if GBReference is not None: GBReference_journal = GBReference.find('GBReference_journal') - + if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': if 'Submitted' in GBReference_journal.text: - info_for_yaml_dict['submitter']['submitter_name'] = GBReference_journal.text.split(') ')[1].split(',')[0].strip() + info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip() else: info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text @@ -146,8 +147,9 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if field_in_yaml == 'sequencing_coverage': # A regular expression would be better! 
try: - info_for_yaml_dict['technology'][field_in_yaml] = float( - tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) + info_for_yaml_dict['technology'][field_in_yaml] = [ + float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) + ] except ValueError: print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse) pass @@ -162,8 +164,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) new_seq_tec_list.append(seq_tec) - for n, seq_tec in enumerate(new_seq_tec_list): - info_for_yaml_dict['technology'][field_in_yaml + ('' if n == 0 else str(n + 1))] = seq_tec + info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list] else: info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse @@ -199,7 +200,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if 'age' in GBQualifier_value_text: info_for_yaml_dict['host']['host_age'] = int(GBQualifier_value_text_list[2].split('age ')[1]) - info_for_yaml_dict['host']['host_age_unit'] = 'year' + info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036' elif GBQualifier_name_text == 'collected_by': if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]): info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text @@ -208,24 +209,46 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) elif GBQualifier_name_text == 'isolation_source': if GBQualifier_value_text.upper() in term_to_uri_dict: GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' - + if GBQualifier_value_text in term_to_uri_dict: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict[GBQualifier_value_text] + 
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] else: if GBQualifier_value_text in ['NP/OP swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'np/np swab', 'np/op']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['oropharyngeal swab'] - elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal swab'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab'] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] + elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal aspirate'] - info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab'] + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']] else: missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) elif GBQualifier_name_text == 'collection_date': # TO_DO: which format we will use? 
- info_for_yaml_dict['sample']['collection_date'] = GBQualifier_value_text + date_to_write = GBQualifier_value_text + + if len(GBQualifier_value_text.split('-')) == 1: + if int(GBQualifier_value_text) < 2020: + date_to_write = "15 12 {}".format(GBQualifier_value_text) + else: + date_to_write = "15 01 {}".format(GBQualifier_value_text) + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 2: + date_to_write += '-15' + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 3: + GBQualifier_value_text_list = GBQualifier_value_text.split('-') + + if GBQualifier_value_text_list[1].isalpha(): + date_to_write = GBQualifier_value_text_list[1] + ' ' + GBQualifier_value_text_list[0] + ' ' + GBQualifier_value_text_list[2] + + info_for_yaml_dict['sample']['collection_date'] = date_to_write elif GBQualifier_name_text in ['lat_lon', 'country']: if GBQualifier_value_text == 'Hong Kong': GBQualifier_value_text = 'China: Hong Kong' @@ -237,7 +260,10 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text elif GBQualifier_name_text == 'note': - 
info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text + else: + info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text elif GBQualifier_name_text == 'isolate': info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text elif GBQualifier_name_text == 'db_xref': @@ -254,7 +280,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) - + if len(missing_value_list) > 0: with open('missing_terms.tsv', 'w') as fw: fw.write('\n'.join(missing_value_list)) diff --git a/scripts/import.cwl b/scripts/import.cwl new file mode 100644 index 0000000..d84516b --- /dev/null +++ b/scripts/import.cwl @@ -0,0 +1,30 @@ +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 +inputs: + scripts: + type: File + default: + class: File + location: import_to_arvados.py + inputBinding: {position: 1} + importScript: + type: File + default: + class: File + location: from_genbank_to_fasta_and_yaml.py + inputBinding: {position: 2} + dict: + type: Directory + default: + class: Directory + location: dict_ontology_standardization + inputBinding: {position: 3} +outputs: [] +requirements: + DockerRequirement: + dockerPull: bh20-seq-uploader/import + NetworkAccess: + networkAccess: true + WorkReuse: + enableReuse: false diff --git a/scripts/import_to_arvados.py b/scripts/import_to_arvados.py new file mode 100644 index 0000000..78cd13d --- /dev/null +++ b/scripts/import_to_arvados.py @@ -0,0 +1,14 @@ +import os +import subprocess +import glob +import sys + +os.chdir(os.environ["TMPDIR"]) +os.symlink(sys.argv[2], "dict_ontology_standardization") +subprocess.run(sys.argv[1]) + 
+os.chdir("fasta_and_yaml") +fasta_files = glob.glob("*.fasta") + +for f in fasta_files: + subprocess.run(["bh20-seq-uploader", f, "%s.yaml" %f[:-6]]) diff --git a/scripts/sequences.acc b/scripts/sequences.acc index a99c4e6..697d868 100644 --- a/scripts/sequences.acc +++ b/scripts/sequences.acc @@ -1,4 +1,299 @@ NC_045512 +MT394528 +MT394529 +MT394530 +MT394531 +MT394864 +MT396241 +MT396242 +MT396243 +MT396244 +MT396245 +MT396246 +MT396247 +MT396248 +MT396266 +MT380726 +MT380727 +MT380728 +MT380729 +MT380730 +MT380731 +MT380732 +MT380733 +MT380734 +MT385414 +MT385415 +MT385416 +MT385417 +MT385418 +MT385419 +MT385420 +MT385421 +MT385422 +MT385423 +MT385424 +MT385425 +MT385426 +MT385427 +MT385428 +MT385429 +MT385430 +MT385431 +MT385432 +MT385433 +MT385434 +MT385435 +MT385436 +MT385437 +MT385438 +MT385439 +MT385440 +MT385441 +MT385442 +MT385443 +MT385444 +MT385445 +MT385446 +MT385447 +MT385448 +MT385449 +MT385450 +MT385451 +MT385452 +MT385453 +MT385454 +MT385455 +MT385456 +MT385457 +MT385458 +MT385459 +MT385460 +MT385461 +MT385462 +MT385463 +MT385464 +MT385465 +MT385466 +MT385467 +MT385468 +MT385469 +MT385470 +MT385471 +MT385472 +MT385473 +MT385474 +MT385475 +MT385476 +MT385477 +MT385478 +MT385479 +MT385480 +MT385481 +MT385482 +MT385483 +MT385484 +MT385485 +MT385486 +MT385487 +MT385488 +MT385489 +MT385490 +MT385491 +MT385492 +MT385493 +MT385494 +MT385495 +MT385496 +MT385497 +MT186683 +MT252677 +MT252678 +MT252679 +MT252680 +MT252681 +MT252682 +MT252683 +MT252684 +MT252685 +MT252686 +MT252687 +MT252688 +MT252689 +MT252690 +MT252691 +MT252692 +MT252693 +MT252694 +MT252695 +MT252696 +MT252697 +MT252698 +MT252699 +MT252700 +MT252701 +MT252702 +MT252703 +MT252704 +MT252705 +MT252706 +MT252707 +MT252708 +MT252709 +MT252710 +MT252711 +MT252712 +MT252713 +MT252715 +MT252716 +MT252717 +MT252719 +MT252721 +MT252723 +MT252725 +MT252726 +MT252728 +MT252729 +MT252730 +MT252733 +MT252734 +MT252735 +MT252736 +MT252737 +MT252738 +MT252739 +MT252740 +MT252741 +MT252742 +MT252745 
+MT252746 +MT252747 +MT252748 +MT252749 +MT252756 +MT252757 +MT252758 +MT252761 +MT252763 +MT252764 +MT252765 +MT252766 +MT252767 +MT252768 +MT252769 +MT252770 +MT252771 +MT252772 +MT252773 +MT252774 +MT252775 +MT252778 +MT252779 +MT252780 +MT252781 +MT252782 +MT252783 +MT252784 +MT252785 +MT252787 +MT252788 +MT252792 +MT252793 +MT252794 +MT252795 +MT252797 +MT252798 +MT252799 +MT252800 +MT252801 +MT252802 +MT252803 +MT252804 +MT252805 +MT252806 +MT252807 +MT252808 +MT252809 +MT252810 +MT252811 +MT252821 +MT252822 +MT252823 +MT252824 +MT339043 +MT365033 +MT374101 +MT374102 +MT374103 +MT374104 +MT374105 +MT374106 +MT374107 +MT374108 +MT374109 +MT374110 +MT374111 +MT374112 +MT374113 +MT374114 +MT374115 +MT374116 +MT375428 +MT375429 +MT375430 +MT375431 +MT375432 +MT375433 +MT375434 +MT375435 +MT375436 +MT375437 +MT375438 +MT375439 +MT375440 +MT375441 +MT375442 +MT375443 +MT375444 +MT375445 +MT375446 +MT375447 +MT375448 +MT375449 +MT375450 +MT375451 +MT375452 +MT375453 +MT375454 +MT375455 +MT375456 +MT375457 +MT375458 +MT375459 +MT375460 +MT375461 +MT375462 +MT375463 +MT375464 +MT375465 +MT375466 +MT375467 +MT375468 +MT375469 +MT375470 +MT375471 +MT375472 +MT375473 +MT375474 +MT375475 +MT375476 +MT375477 +MT375478 +MT375479 +MT375480 +MT375481 +MT375482 +MT375483 MT370516 MT370517 MT370518 @@ -225,6 +520,8 @@ MT372480 MT372481 MT372482 MT372483 +7BV2_P +7BV2_T LC542976 LC542809 MT114412 diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl new file mode 100644 index 0000000..0f364fc --- /dev/null +++ b/semantic_enrichment/countries.ttl @@ -0,0 +1,279 @@ +@prefix ns1: <http://www.wikidata.org/prop/direct/> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix xml: <http://www.w3.org/XML/1998/namespace> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . 
+ +<http://www.wikidata.org/entity/Q108143> rdfs:label "San Diego County" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-116.77 33.02)" . + +<http://www.wikidata.org/entity/Q110403> rdfs:label "Snohomish County" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-121.71 48.04)" . + +<http://www.wikidata.org/entity/Q1166> rdfs:label "Michigan" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-85.58 44.34)" . + +<http://www.wikidata.org/entity/Q11746> rdfs:label "Wuhan" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(114.288055555 30.587222222)" . + +<http://www.wikidata.org/entity/Q1186> rdfs:label "Kerala" ; + ns1:P17 <http://www.wikidata.org/entity/Q668> ; + ns1:P625 "Point(76.972 8.5074)" . + +<http://www.wikidata.org/entity/Q1204> rdfs:label "Illinois" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-89.0 40.0)" . + +<http://www.wikidata.org/entity/Q1221> rdfs:label "Idaho" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-114.0 45.0)" . + +<http://www.wikidata.org/entity/Q1223> rdfs:label "Washington" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-120.5 47.5)" . + +<http://www.wikidata.org/entity/Q1227> rdfs:label "Nevada" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-117.0 39.0)" . + +<http://www.wikidata.org/entity/Q123304> rdfs:label "Antioquia Department" ; + ns1:P17 <http://www.wikidata.org/entity/Q739> ; + ns1:P625 "Point(-75.566666666 6.216666666)" . + +<http://www.wikidata.org/entity/Q1370> rdfs:label "Virginia" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-79.0 37.5)" . + +<http://www.wikidata.org/entity/Q1384> rdfs:label "New York" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-75.0 43.0)" . 
+ +<http://www.wikidata.org/entity/Q1387> rdfs:label "Rhode Island" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-71.5 41.7)" . + +<http://www.wikidata.org/entity/Q1391> rdfs:label "Maryland" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-76.7 39.0)" . + +<http://www.wikidata.org/entity/Q1397> rdfs:label "Ohio" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-82.5 40.5)" . + +<http://www.wikidata.org/entity/Q1400> rdfs:label "Pennsylvania" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-77.5 41.0)" . + +<http://www.wikidata.org/entity/Q1408> rdfs:label "New Jersey" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-74.5 40.0)" . + +<http://www.wikidata.org/entity/Q1415> rdfs:label "Indiana" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-86.216666666 39.933333333)" . + +<http://www.wikidata.org/entity/Q1428> rdfs:label "Georgia" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-83.5 33.0)" . + +<http://www.wikidata.org/entity/Q1439> rdfs:label "Texas" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-100.0 31.0)" . + +<http://www.wikidata.org/entity/Q1454> rdfs:label "North Carolina" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-80.0 35.5)" . + +<http://www.wikidata.org/entity/Q1456> rdfs:label "South Carolina" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-81.0 34.0)" . + +<http://www.wikidata.org/entity/Q15174> rdfs:label "Shenzhen" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(114.054 22.535)" . + +<http://www.wikidata.org/entity/Q1527> rdfs:label "Minnesota" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-94.0 46.0)" . + +<http://www.wikidata.org/entity/Q1537> rdfs:label "Wisconsin" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-89.5 44.5)" . 
+ +<http://www.wikidata.org/entity/Q1546> rdfs:label "Iowa" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-93.0 42.0)" . + +<http://www.wikidata.org/entity/Q1553> rdfs:label "Nebraska" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-100.0 41.5)" . + +<http://www.wikidata.org/entity/Q1558> rdfs:label "Kansas" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-98.0 38.5)" . + +<http://www.wikidata.org/entity/Q1581> rdfs:label "Missouri" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-92.5 38.5)" . + +<http://www.wikidata.org/entity/Q1588> rdfs:label "Louisiana" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-92.0 31.0)" . + +<http://www.wikidata.org/entity/Q16572> rdfs:label "Guangzhou" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(113.258976 23.128795)" . + +<http://www.wikidata.org/entity/Q1854> rdfs:label "Ho Chi Minh City" ; + ns1:P17 <http://www.wikidata.org/entity/Q881> ; + ns1:P625 "Point(106.62965 10.82302)", + "Point(106.633333333 10.816666666)" . + +<http://www.wikidata.org/entity/Q198244> rdfs:label "Zhuozhou" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(115.99176 39.48873)" . + +<http://www.wikidata.org/entity/Q36687> rdfs:label "Victoria" ; + ns1:P17 <http://www.wikidata.org/entity/Q408> ; + ns1:P625 "Point(144.0 -37.0)" . + +<http://www.wikidata.org/entity/Q43194> rdfs:label "Yunnan" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(101.5 24.5)" . + +<http://www.wikidata.org/entity/Q4970> rdfs:label "Hangzhou" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(120.1675 30.25)" . + +<http://www.wikidata.org/entity/Q62> rdfs:label "San Francisco" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-122.416388888 37.7775)" . 
+ +<http://www.wikidata.org/entity/Q759> rdfs:label "New Hampshire" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-71.5 44.0)" . + +<http://www.wikidata.org/entity/Q771> rdfs:label "Massachusetts" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-71.8 42.3)" . + +<http://www.wikidata.org/entity/Q779> rdfs:label "Connecticut" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-72.7 41.6)" . + +<http://www.wikidata.org/entity/Q782> rdfs:label "Hawaii" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-157.796388888 21.311388888)" . + +<http://www.wikidata.org/entity/Q812> rdfs:label "Florida" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-81.631666666 28.133333333)" . + +<http://www.wikidata.org/entity/Q816> rdfs:label "Arizona" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-111.656944 34.286667)" . + +<http://www.wikidata.org/entity/Q81725> rdfs:label "KwaZulu-Natal" ; + ns1:P17 <http://www.wikidata.org/entity/Q258> ; + ns1:P625 "Point(31.0 -29.0)" . + +<http://www.wikidata.org/entity/Q824> rdfs:label "Oregon" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-120.575 43.935833)" . + +<http://www.wikidata.org/entity/Q829> rdfs:label "Utah" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-111.5 39.5)" . + +<http://www.wikidata.org/entity/Q8686> rdfs:label "Shanghai" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(121.466666666 31.166666666)" . + +<http://www.wikidata.org/entity/Q8818> rdfs:label "Valencia" ; + ns1:P17 <http://www.wikidata.org/entity/Q29> ; + ns1:P625 "Point(-0.375 39.466666666)" . + +<http://www.wikidata.org/entity/Q956> rdfs:label "Beijing" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(116.391388888 39.905)" . 
+ +<http://www.wikidata.org/entity/Q99> rdfs:label "California" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-120.0 37.0)" . + +<http://www.wikidata.org/entity/Q142> rdfs:label "France" ; + ns1:P17 <http://www.wikidata.org/entity/Q142> ; + ns1:P625 "Point(2.0 47.0)" . + +<http://www.wikidata.org/entity/Q155> rdfs:label "Brazil" ; + ns1:P17 <http://www.wikidata.org/entity/Q155> ; + ns1:P625 "Point(-53.0 -14.0)" . + +<http://www.wikidata.org/entity/Q258> rdfs:label "South Africa" . + +<http://www.wikidata.org/entity/Q29> rdfs:label "Spain" . + +<http://www.wikidata.org/entity/Q33> rdfs:label "Finland" ; + ns1:P17 <http://www.wikidata.org/entity/Q33> ; + ns1:P625 "Point(27.0 65.0)" . + +<http://www.wikidata.org/entity/Q34> rdfs:label "Sweden" ; + ns1:P17 <http://www.wikidata.org/entity/Q34> ; + ns1:P625 "Point(15.0 61.0)" . + +<http://www.wikidata.org/entity/Q38> rdfs:label "Italy" ; + ns1:P17 <http://www.wikidata.org/entity/Q38> ; + ns1:P625 "Point(12.5 42.5)" . + +<http://www.wikidata.org/entity/Q408> rdfs:label "Australia" . + +<http://www.wikidata.org/entity/Q41> rdfs:label "Greece" ; + ns1:P17 <http://www.wikidata.org/entity/Q41> ; + ns1:P625 "Point(23.0 38.5)" . + +<http://www.wikidata.org/entity/Q419> rdfs:label "Peru" ; + ns1:P17 <http://www.wikidata.org/entity/Q419> ; + ns1:P625 "Point(-76.0 -9.4)" . + +<http://www.wikidata.org/entity/Q43> rdfs:label "Turkey" ; + ns1:P17 <http://www.wikidata.org/entity/Q43> ; + ns1:P625 "Point(36.0 39.0)" . + +<http://www.wikidata.org/entity/Q668> rdfs:label "India" . + +<http://www.wikidata.org/entity/Q739> rdfs:label "Colombia" . + +<http://www.wikidata.org/entity/Q794> rdfs:label "Iran" ; + ns1:P17 <http://www.wikidata.org/entity/Q794> ; + ns1:P625 "Point(53.0 32.0)" . + +<http://www.wikidata.org/entity/Q801> rdfs:label "Israel" ; + ns1:P17 <http://www.wikidata.org/entity/Q801> ; + ns1:P625 "Point(35.0 31.0)" . 
+ +<http://www.wikidata.org/entity/Q837> rdfs:label "Nepal" ; + ns1:P17 <http://www.wikidata.org/entity/Q837> ; + ns1:P625 "Point(84.0 28.0)" . + +<http://www.wikidata.org/entity/Q865> rdfs:label "Taiwan" ; + ns1:P17 <http://www.wikidata.org/entity/Q865> ; + ns1:P625 "Point(121.0 24.0)" . + +<http://www.wikidata.org/entity/Q881> rdfs:label "Vietnam" . + +<http://www.wikidata.org/entity/Q884> rdfs:label "South Korea" ; + ns1:P17 <http://www.wikidata.org/entity/Q884> ; + ns1:P625 "Point(128.0 36.0)" . + +<http://www.wikidata.org/entity/Q148> rdfs:label "People's Republic of China" ; + ns1:P17 <http://www.wikidata.org/entity/Q148> ; + ns1:P625 "Point(103.0 35.0)" . + +<http://www.wikidata.org/entity/Q30> rdfs:label "United States of America" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-77.036666666 38.895)" . + diff --git a/semantic_enrichment/labels.ttl b/semantic_enrichment/labels.ttl new file mode 100644 index 0000000..b4e5d1f --- /dev/null +++ b/semantic_enrichment/labels.ttl @@ -0,0 +1,24 @@ +<http://edamontology.org/data_1875> <http://www.w3.org/2000/01/rdf-schema#label> "NCBI taxon" . +<http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.w3.org/2000/01/rdf-schema#label> "geographic location" . +<http://purl.obolibrary.org/obo/FLU_0000848> <http://www.w3.org/2000/01/rdf-schema#label> "sequence coverage" . +<http://purl.obolibrary.org/obo/NCIT_C41206> <http://www.w3.org/2000/01/rdf-schema#label> "Institution" . +<http://purl.obolibrary.org/obo/NCIT_C42781> <http://www.w3.org/2000/01/rdf-schema#label> "Author" . +<http://purl.obolibrary.org/obo/OBI_0001479> <http://www.w3.org/2000/01/rdf-schema#label> "specimen from organism" . +<http://purl.obolibrary.org/obo/OBI_0600047> <http://www.w3.org/2000/01/rdf-schema#label> "sequencing assay" . +<http://semanticscience.org/resource/SIO_000115> <http://www.w3.org/2000/01/rdf-schema#label> "identifier" . 
+<http://www.ebi.ac.uk/efo/EFO_0000532> <http://www.w3.org/2000/01/rdf-schema#label> "host" . +<http://semanticscience.org/resource/SIO_001167> <http://www.w3.org/2000/01/rdf-schema#label> "comment" . +<http://www.ebi.ac.uk/efo/EFO_0002699> <http://www.w3.org/2000/01/rdf-schema#label> "high throughput sequencer" . +<http://semanticscience.org/resource/SIO_010055> <http://www.w3.org/2000/01/rdf-schema#label> "strain" . +<http://purl.obolibrary.org/obo/OBI_0001895> <http://www.w3.org/2000/01/rdf-schema#label> "specimen collector" . +<http://edamontology.org/data_2091> <http://www.w3.org/2000/01/rdf-schema#label> "Accession" . +<http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164> <http://www.w3.org/2000/01/rdf-schema#label> "collection date" . +<http://purl.obolibrary.org/obo/NCIT_C3833> <http://www.w3.org/2000/01/rdf-schema#label> "Asymptomatic" . +<http://purl.obolibrary.org/obo/NCIT_C25269> <http://www.w3.org/2000/01/rdf-schema#label> "Symptomatic" . +<http://purl.obolibrary.org/obo/GENEPIO_0002020> <http://www.w3.org/2000/01/rdf-schema#label> "admitted to hospital" . +<http://purl.obolibrary.org/obo/GENEPIO_0001849> <http://www.w3.org/2000/01/rdf-schema#label> "discharged from hospital" . +<http://purl.obolibrary.org/obo/NCIT_C28554> <http://www.w3.org/2000/01/rdf-schema#label> "Dead" . +<http://purl.obolibrary.org/obo/NCIT_C37987> <http://www.w3.org/2000/01/rdf-schema#label> "Alive" . +<http://purl.obolibrary.org/obo/NCIT_C115935> <http://www.w3.org/2000/01/rdf-schema#label> "Healthy" . +<http://purl.obolibrary.org/obo/PATO_0000384> <http://www.w3.org/2000/01/rdf-schema#label> "male". +<http://purl.obolibrary.org/obo/PATO_0000383> <http://www.w3.org/2000/01/rdf-schema#label> "female" . 
@@ -15,7 +15,8 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client", "schema-salad", "python-magic", "pyshex"] +install_requires = ["arvados-python-client", "schema-salad", + "python-magic", "pyshex", "py-dateutil"] web_requires = ["flask", "pyyaml"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) @@ -26,7 +27,7 @@ setup( version="1.0", description="Biohackathon sequence uploader", long_description=open(README).read(), - long_description_content_type="text/x-rst", + long_description_content_type="text/markdown", author="Peter Amstutz", author_email="peter.amstutz@curii.com", license="Apache 2.0", @@ -34,7 +35,8 @@ setup( package_data={"bh20sequploader": ["bh20seq-schema.yml", "bh20seq-options.yml", "bh20seq-shex.rdf", - "validation/formats"], + "validation/formats", + "SARS-CoV-2-reference.fasta",], }, install_requires=install_requires, extras_require={ diff --git a/workflows/pangenome-generate/minimap2.cwl b/workflows/pangenome-generate/minimap2.cwl index 42d1dce..bf8eb4c 100644 --- a/workflows/pangenome-generate/minimap2.cwl +++ b/workflows/pangenome-generate/minimap2.cwl @@ -12,7 +12,7 @@ hints: ResourceRequirement: coresMin: 8 coresMax: 32 - ramMin: $(9 * 1024) + ramMin: $(15 * 1024) outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20)) stdout: $(inputs.readsFA.nameroot).paf baseCommand: minimap2 diff --git a/workflows/pangenome-generate/odgi_to_rdf.cwl b/workflows/pangenome-generate/odgi_to_rdf.cwl index 079d6fb..e6a279b 100644 --- a/workflows/pangenome-generate/odgi_to_rdf.cwl +++ b/workflows/pangenome-generate/odgi_to_rdf.cwl @@ -3,10 +3,12 @@ class: CommandLineTool cwlVersion: v1.1 hints: DockerRequirement: - dockerPull: spodgi/spodgi + dockerPull: jerven/spodgi:0.0.6 requirements: InlineJavascriptRequirement: {} ShellCommandRequirement: {} + ResourceRequirement: + ramMin: $((2 * 1024) + 1) inputs: - id: odgi type: File |