-rw-r--r--  bh20seqanalyzer/main.py | 71
-rw-r--r--  bh20sequploader/main.py | 58
-rw-r--r--  bh20sequploader/qc_fasta.py | 35
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_countries.csv | 32
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_host_species.csv | 2
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv | 7
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_speciesman_source.csv | 1
-rw-r--r--  scripts/docker/Dockerfile | 4
-rw-r--r--  workflows/fastq2fasta/bam2fasta.cwl | 2
-rw-r--r--  workflows/fastq2fasta/bcftools-consensus.cwl | 9
-rw-r--r--  workflows/fastq2fasta/bcftools-view-qc.cwl | 2
-rw-r--r--  workflows/fastq2fasta/fastq2fasta.cwl | 2
12 files changed, 161 insertions, 64 deletions
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 31ad4c4..9a36cae 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -30,6 +30,7 @@ def validate_upload(api, collection, validated_project,
         try:
             metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml"))
             metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
+            sample_id = metadata_content["sample"]["sample_id"]
             add_lc_filename(metadata_content, metadata_content["id"])
             valid = qc_metadata(metadata_content) and valid
         except Exception as e:
@@ -39,21 +40,25 @@ def validate_upload(api, collection, validated_project,
             logging.warn("Failed metadata qc")
 
     if valid:
-        tgt = None
-        for n in ("sequence.fasta", "reads.fastq"):
-            if n not in col:
-                continue
-            with col.open(n) as qf:
-                tgt = qc_fasta(qf)
-                if tgt != n:
-                    logging.info("Expected %s but magic says it should be %s", n, tgt)
-                    valid = False
-                elif tgt == "reads.fastq":
-                    start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid)
-                    return False
-        if tgt is None:
+        try:
+            tgt = None
+            paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
+            for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+                if n not in col:
+                    continue
+                with col.open(n, 'rb') as qf:
+                    tgt = qc_fasta(qf)[0]
+                    if tgt != n and tgt != paired.get(n):
+                        logging.info("Expected %s but magic says it should be %s", n, tgt)
+                        valid = False
+                    elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+                        start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid, n, sample_id)
+                        return False
+            if tgt is None:
+                valid = False
+                logging.warn("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"])
+        except ValueError as v:
             valid = False
-            logging.warn("Upload '%s' does not contain sequence.fasta or reads.fastq", collection["name"])
 
     dup = api.collections().list(filters=[["owner_uuid", "=", validated_project],
                                           ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
@@ -69,9 +74,8 @@ def validate_upload(api, collection, validated_project,
             "owner_uuid": validated_project,
             "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))}).execute()
     else:
-        pass
         # It is invalid, delete it.
-        #logging.warn("Deleting '%s'" % collection["name"])
+        logging.warn("Suggest deleting '%s'" % collection["name"])
         #api.collections().delete(uuid=collection["uuid"]).execute()
 
     return valid
@@ -95,6 +99,7 @@ def run_workflow(api, parent_project, workflow_uuid, name, inputobj):
                tmp.name]
         logging.info("Running %s" % ' '.join(cmd))
         comp = subprocess.run(cmd, capture_output=True)
+    logging.info("Submitted %s", comp.stdout)
     if comp.returncode != 0:
         logging.error(comp.stderr.decode('utf-8'))
 
@@ -103,12 +108,11 @@ def run_workflow(api, parent_project, workflow_uuid, name, inputobj):
 
 def start_fastq_to_fasta(api, collection,
                          analysis_project,
-                         fastq_workflow_uuid):
-    newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", {
-        "fastq_forward": {
-            "class": "File",
-            "location": "keep:%s/reads.fastq" % collection["portable_data_hash"]
-        },
+                         fastq_workflow_uuid,
+                         tgt,
+                         sample_id):
+
+    params = {
         "metadata": {
             "class": "File",
             "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"]
@@ -116,8 +120,26 @@ def start_fastq_to_fasta(api, collection,
         "ref_fasta": {
             "class": "File",
             "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta"
+        },
+        "sample_id": sample_id
+    }
+
+    if tgt.startswith("reads.fastq"):
+        params["fastq_forward"] = {
+            "class": "File",
+            "location": "keep:%s/%s" % (collection["portable_data_hash"], tgt)
+        }
+    elif tgt.startswith("reads_1.fastq"):
+        params["fastq_forward"] = {
+            "class": "File",
+            "location": "keep:%s/reads_1.%s" % (collection["portable_data_hash"], tgt[8:])
         }
-    })
+        params["fastq_reverse"] = {
+            "class": "File",
+            "location": "keep:%s/reads_2.%s" % (collection["portable_data_hash"], tgt[8:])
+        }
+
+    newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", params)
     api.collections().update(uuid=collection["uuid"],
                              body={"owner_uuid": newproject["uuid"]}).execute()
 
@@ -222,6 +244,7 @@ def main():
 
     parser.add_argument('--latest-result-collection', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='')
     parser.add_argument('--kickoff', action="store_true")
+    parser.add_argument('--once', action="store_true")
     args = parser.parse_args()
 
     api = arvados.api()
@@ -265,4 +288,6 @@ def main():
                                 args.pangenome_analysis_project,
                                 args.latest_result_collection)
 
+        if args.once:
+            break
         time.sleep(15)
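
The analyzer change above teaches validate_upload to accept gzipped and paired FASTQ uploads and to hand the matching Keep locations to the fastq2fasta workflow. A minimal sketch of that parameter assembly, with an illustrative portable data hash and sample id (names not taken from this diff; ref_fasta omitted for brevity):

def build_fastq_params(pdh, tgt, sample_id):
    # Inputs shared by every fastq2fasta run (metadata file plus sample id).
    params = {
        "metadata": {"class": "File", "location": "keep:%s/metadata.yaml" % pdh},
        "sample_id": sample_id,
    }
    if tgt.startswith("reads.fastq"):
        # Single-end reads, plain or gzipped: forward the file as uploaded.
        params["fastq_forward"] = {"class": "File", "location": "keep:%s/%s" % (pdh, tgt)}
    elif tgt.startswith("reads_1.fastq"):
        # Paired-end reads: tgt[8:] is "fastq" or "fastq.gz", so both mates share the suffix.
        params["fastq_forward"] = {"class": "File", "location": "keep:%s/reads_1.%s" % (pdh, tgt[8:])}
        params["fastq_reverse"] = {"class": "File", "location": "keep:%s/reads_2.%s" % (pdh, tgt[8:])}
    return params

# build_fastq_params("0123456789abcdef+123", "reads_1.fastq.gz", "sampleA") selects
# reads_1.fastq.gz as forward and reads_2.fastq.gz as reverse, mirroring start_fastq_to_fasta.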
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index a2e62fa..c442af0 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -22,18 +22,10 @@ ARVADOS_API_HOST='lugli.arvadosapi.com'
 ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462'
 UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa'
 
-def main():
-    parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
-    parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ')
-    parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
-    parser.add_argument("--validate", action="store_true", help="Dry run, validate only")
-    args = parser.parse_args()
-
-    api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
-
+def qa_stuff(metadata, sequence_p1, sequence_p2):
     try:
         log.debug("Checking metadata")
-        if not qc_metadata(args.metadata.name):
+        if not qc_metadata(metadata.name):
             log.warning("Failed metadata qc")
             exit(1)
     except ValueError as e:
@@ -42,29 +34,52 @@ def main():
         print(e)
         exit(1)
 
+    target = []
     try:
-        log.debug("Checking FASTA QC")
-        target = qc_fasta(args.sequence)
+        log.debug("Checking FASTA/FASTQ QC")
+        target.append(qc_fasta(sequence_p1))
+        if sequence_p2:
+            target.append(qc_fasta(sequence_p2))
+            target[0] = ("reads_1."+target[0][0][6:], target[0][1])
+            target[1] = ("reads_2."+target[1][0][6:], target[0][1])
     except ValueError as e:
         log.debug(e)
         log.debug("Failed FASTA qc")
         print(e)
         exit(1)
 
+    return target
+
+def upload_sequence(col, target, sequence):
+    with col.open(target[0], "wb") as f:
+        r = sequence.read(65536)
+        while r:
+            f.write(r)
+            r = sequence.read(65536)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
+    parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
+    parser.add_argument('sequence_p1', type=argparse.FileType('rb'), help='sequence FASTA/FASTQ')
+    parser.add_argument('sequence_p2', type=argparse.FileType('rb'), default=None, help='sequence FASTQ pair')
+    parser.add_argument("--validate", action="store_true", help="Dry run, validate only")
+    args = parser.parse_args()
+
+    api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
+
+    target = qa_stuff(args.metadata, args.sequence_p1, args.sequence_p2)
+    seqlabel = target[0][1]
+
     if args.validate:
         print("Valid")
         exit(0)
 
     col = arvados.collection.Collection(api_client=api)
 
-    with col.open(target, "w") as f:
-        r = args.sequence.read(65536)
-        seqlabel = r[1:r.index("\n")]
-        print(seqlabel)
-        while r:
-            f.write(r)
-            r = args.sequence.read(65536)
-    args.sequence.close()
+    upload_sequence(col, target[0], args.sequence_p1)
+    if args.sequence_p2:
+        upload_sequence(col, target[1], args.sequence_p2)
 
     print("Reading metadata")
     with col.open("metadata.yaml", "w") as f:
@@ -73,7 +88,6 @@ def main():
         while r:
             f.write(r)
             r = args.metadata.read(65536)
-    args.metadata.close()
 
     external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')
 
@@ -93,6 +107,8 @@ def main():
                  (seqlabel, properties['upload_user'], properties['upload_ip']),
                  properties=properties, ensure_unique_name=True)
 
+    print("Saved to %s" % col.manifest_locator())
+
     print("Done")
 
 if __name__ == "__main__":
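
The uploader now streams each sequence file into the Arvados collection in 64 KiB chunks through upload_sequence instead of reading it whole. A minimal usage sketch, assuming Arvados credentials are available in the environment and using an illustrative local file name:

import arvados
import arvados.collection
from bh20sequploader.main import upload_sequence, UPLOAD_PROJECT  # helper added in this commit

api = arvados.api('v1')  # reads ARVADOS_API_HOST / ARVADOS_API_TOKEN from the environment
col = arvados.collection.Collection(api_client=api)
with open("reads_1.fastq.gz", "rb") as fwd:  # illustrative path
    # target is a (filename-in-collection, sequence label) tuple, as returned by qc_fasta.
    upload_sequence(col, ("reads_1.fastq.gz", "example-label"), fwd)
col.save_new(name="example upload", owner_uuid=UPLOAD_PROJECT)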
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 5c8cf3a..e198430 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -5,6 +5,8 @@ import subprocess
 import tempfile
 import logging
 import re
+import io
+import gzip
 
 log = logging.getLogger(__name__ )
 
@@ -23,7 +25,7 @@ def read_fasta(sequence):
             raise ValueError("FASTA file contains multiple entries")
     return label, bases
 
-def qc_fasta(sequence):
+def qc_fasta(arg_sequence):
     log.debug("Starting qc_fasta")
     schema_resource = pkg_resources.resource_stream(__name__, "validation/formats")
     with tempfile.NamedTemporaryFile() as tmp:
@@ -31,12 +33,24 @@ def qc_fasta(sequence):
         tmp.flush()
         val = magic.Magic(magic_file=tmp.name,
                           uncompress=False, mime=True)
-    seq_type = val.from_buffer(sequence.read(4096)).lower()
+
+    gz = ""
+    if arg_sequence.name.endswith(".gz"):
+        sequence = gzip.GzipFile(fileobj=arg_sequence, mode='rb')
+        gz = ".gz"
+    else:
+        sequence = arg_sequence
+
+    sequence = io.TextIOWrapper(sequence)
+    r = sequence.read(4096)
     sequence.seek(0)
+
+    seqlabel = r[1:r.index("\n")]
+    seq_type = val.from_buffer(r).lower()
+
     if seq_type == "text/fasta":
         # ensure that contains only one entry
         submitlabel, submitseq = read_fasta(sequence)
-        sequence.seek(0)
 
         with tempfile.NamedTemporaryFile() as tmp1:
             refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta")
@@ -44,6 +58,9 @@ def qc_fasta(sequence):
             tmp1.write(submitlabel.encode("utf8"))
             tmp1.write(("".join(submitseq)).encode("utf8"))
             tmp1.flush()
+            subbp = 0
+            refbp = 0
+            similarity = 0
             try:
                 cmd = ["clustalw", "-infile="+tmp1.name,
                        "-quicktree", "-iteration=none", "-type=DNA"]
@@ -64,15 +81,17 @@ def qc_fasta(sequence):
             except Exception as e:
                 logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e)
 
-            if (subbp/refbp) < .7:
+            if refbp and (subbp/refbp) < .7:
                 raise ValueError("QC fail: submit sequence length is shorter than 70% reference")
-            if (subbp/refbp) > 1.3:
+            if refbp and (subbp/refbp) > 1.3:
                 raise ValueError("QC fail: submit sequence length is greater than 130% reference")
-            if similarity < 70.0:
+            if similarity and similarity < 70.0:
                 raise ValueError("QC fail: submit similarity is less than 70%")
+            if refbp == 0 or similarity == 0:
+                raise ValueError("QC fail")
 
-        return "sequence.fasta"
+        return ("sequence.fasta"+gz, seqlabel)
     elif seq_type == "text/fastq":
-        return "reads.fastq"
+        return ("reads.fastq"+gz, seqlabel)
     else:
         raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ")
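
qc_fasta now detects gzip-compressed uploads by file extension, wraps the byte stream for text sniffing, and returns a (target filename, sequence label) pair instead of a bare filename. A small sketch of the gzip-transparent read it relies on (file path is illustrative):

import gzip
import io

def open_sequence_text(path):
    # Mirror qc_fasta: open bytes, unwrap gzip when the name ends in .gz, decode as text.
    raw = open(path, "rb")
    if path.endswith(".gz"):
        raw = gzip.GzipFile(fileobj=raw, mode="rb")
    return io.TextIOWrapper(raw)

with open_sequence_text("example.fasta.gz") as seq:  # illustrative file
    head = seq.read(4096)
    label = head[1:head.index("\n")]  # first header line without the leading ">" or "@"
    print(label)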
diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv
index 7e83564..85d4e8a 100644
--- a/scripts/dict_ontology_standardization/ncbi_countries.csv
+++ b/scripts/dict_ontology_standardization/ncbi_countries.csv
@@ -12,6 +12,9 @@ Armenia,http://www.wikidata.org/entity/Q399
 Australia,http://www.wikidata.org/entity/Q408
 Australia: Queensland,http://www.wikidata.org/entity/Q36074
 Australia: Victoria,http://www.wikidata.org/entity/Q36687
+"Australia: Melbourne, Victoria",http://www.wikidata.org/entity/Q3141
+Australia: Northern Territory,http://www.wikidata.org/entity/Q3235
+Australia: NSW,http://www.wikidata.org/entity/Q3224
 Austria,http://www.wikidata.org/entity/Q40
 Azerbaijan,http://www.wikidata.org/entity/Q227
 Bahrain,http://www.wikidata.org/entity/Q398
@@ -31,8 +34,10 @@ Bulgaria,http://www.wikidata.org/entity/Q219
 Burkina Faso,http://www.wikidata.org/entity/Q965
 Burundi,http://www.wikidata.org/entity/Q967
 Cambodia,http://www.wikidata.org/entity/Q424
+Cambodia:Sihanoukville,http://www.wikidata.org/entity/Q18207676
 Cameroon,http://www.wikidata.org/entity/Q1009
 Canada,http://www.wikidata.org/entity/Q16
+Canada: Toronto,http://www.wikidata.org/entity/Q172
 Cape Verde,http://www.wikidata.org/entity/Q1011
 Central African Republic,http://www.wikidata.org/entity/Q929
 Chad,http://www.wikidata.org/entity/Q657
@@ -65,12 +70,13 @@ China: Jiangxi,http://www.wikidata.org/entity/Q57052
 China: Jilin,http://www.wikidata.org/entity/Q45208
 China: Liaoning,http://www.wikidata.org/entity/Q43934
 China: Macau,http://www.wikidata.org/entity/Q14773
-China: Nanchang,https://www.wikidata.org/wiki/Q171943
+China: Nanchang,http://www.wikidata.org/entity/Q171943
 China: Ningxia Hui Autonomous Region,http://www.wikidata.org/entity/Q57448
 China: Qinghai,http://www.wikidata.org/entity/Q45833
 China: Shaanxi,http://www.wikidata.org/entity/Q47974
 China: Shandong,http://www.wikidata.org/entity/Q43407
 China: Shanghai,http://www.wikidata.org/entity/Q8686
+China:Shanghai,http://www.wikidata.org/entity/Q8686
 China: Shanxi,http://www.wikidata.org/entity/Q46913
 China: Shenzhen,http://www.wikidata.org/entity/Q15174
 China: Sichuan,http://www.wikidata.org/entity/Q19770
@@ -111,11 +117,13 @@ France,http://www.wikidata.org/entity/Q142
 Gabon,http://www.wikidata.org/entity/Q1000
 Georgia,http://www.wikidata.org/entity/Q230
 Germany,http://www.wikidata.org/entity/Q183
-Germany: Bavaria,https://www.wikidata.org/wiki/Q980
-Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718
+Germany: Bavaria,http://www.wikidata.org/entity/Q980
+Germany: Dusseldorf,http://www.wikidata.org/entity/Q1718
+Germany: Heinsberg,http://www.wikidata.org/entity/Q14833
+Germany: Starnberg,http://www.wikidata.org/entity/Q61936
 Ghana,http://www.wikidata.org/entity/Q117
 Greece,http://www.wikidata.org/entity/Q41
-Greece: Athens,https://www.wikidata.org/wiki/Q1524
+Greece: Athens,http://www.wikidata.org/entity/Q1524
 Grenada,http://www.wikidata.org/entity/Q769
 Guatemala,http://www.wikidata.org/entity/Q774
 Guinea,http://www.wikidata.org/entity/Q1006
@@ -139,8 +147,8 @@ Ireland,http://www.wikidata.org/entity/Q27
 Israel,http://www.wikidata.org/entity/Q801
 Italy,http://www.wikidata.org/entity/Q38
 Italy: Cagliari,http://www.wikidata.org/entity/Q1897
-Italy: Lazio,https://www.wikidata.org/wiki/Q1282
-Italy: Palermo,https://www.wikidata.org/wiki/Q2656
+Italy: Lazio,http://www.wikidata.org/entity/Q1282
+Italy: Palermo,http://www.wikidata.org/entity/Q2656
 Italy: Rome,http://www.wikidata.org/entity/Q220
 Ivory Coast,http://www.wikidata.org/entity/Q1008
 Jamaica,http://www.wikidata.org/entity/Q766
@@ -181,7 +189,7 @@ Mozambique,http://www.wikidata.org/entity/Q1029
 Myanmar,http://www.wikidata.org/entity/Q836
 Namibia,http://www.wikidata.org/entity/Q1030
 Nauru,http://www.wikidata.org/entity/Q697
-Netherlands: Milheeze,https://www.wikidata.org/wiki/Q3314115
+Netherlands: Milheeze,http://www.wikidata.org/entity/Q3314115
 Nepal,http://www.wikidata.org/entity/Q837
 New Zealand,http://www.wikidata.org/entity/Q664
 Nicaragua,http://www.wikidata.org/entity/Q811
@@ -269,10 +277,14 @@ USA: AK,http://www.wikidata.org/entity/Q797
 USA: AL,http://www.wikidata.org/entity/Q173
 USA: AR,http://www.wikidata.org/entity/Q1612
 USA: AZ,http://www.wikidata.org/entity/Q816
+USA: Arizona,http://www.wikidata.org/entity/Q816
 USA: CA,http://www.wikidata.org/entity/Q99
+USA: California,http://www.wikidata.org/entity/Q99
+USA:California,http://www.wikidata.org/entity/Q99
 "USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143
 USA: CO,http://www.wikidata.org/entity/Q1261
 USA: CT,http://www.wikidata.org/entity/Q779
+USA: Connecticut,http://www.wikidata.org/entity/Q779
 USA: DC,http://www.wikidata.org/entity/Q3551781
 USA: DE,http://www.wikidata.org/entity/Q1393
 USA: FL,http://www.wikidata.org/entity/Q812
@@ -287,7 +299,7 @@ USA: IN,http://www.wikidata.org/entity/Q1415
 USA: KS,http://www.wikidata.org/entity/Q1558
 USA: KY,http://www.wikidata.org/entity/Q1603
 USA: LA,http://www.wikidata.org/entity/Q1588
-"USA: New Orleans, LA",https://www.wikidata.org/wiki/Q34404
+"USA: New Orleans, LA",http://www.wikidata.org/entity/Q34404
 USA: MA,http://www.wikidata.org/entity/Q771
 USA: Massachusetts,http://www.wikidata.org/entity/Q771
 USA: MD,http://www.wikidata.org/entity/Q1391
@@ -295,6 +307,8 @@ USA: ME,http://www.wikidata.org/entity/Q724
 USA: MI,http://www.wikidata.org/entity/Q1166
 USA: Michigan,http://www.wikidata.org/entity/Q1166
 USA: MN,http://www.wikidata.org/entity/Q1527
+USA:Minnesota,http://www.wikidata.org/entity/Q1527
+USA: Minnesota,http://www.wikidata.org/entity/Q1527
 USA: MO,http://www.wikidata.org/entity/Q1581
 USA: MS,http://www.wikidata.org/entity/Q1494
 USA: MT,http://www.wikidata.org/entity/Q1212
@@ -323,8 +337,10 @@ USA: TX,http://www.wikidata.org/entity/Q1439
 USA: UT,http://www.wikidata.org/entity/Q829
 USA: VA,http://www.wikidata.org/entity/Q1370
 USA: Virginia,http://www.wikidata.org/entity/Q1370
+USA:Virginia,http://www.wikidata.org/entity/Q1370
 USA: VT,http://www.wikidata.org/entity/Q16551
 USA: WA,http://www.wikidata.org/entity/Q1223
+USA: Washington,http://www.wikidata.org/entity/Q1223
 USA: WI,http://www.wikidata.org/entity/Q1537
 USA: Wisconsin,http://www.wikidata.org/entity/Q1537
 USA: WV,http://www.wikidata.org/entity/Q1371
diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv
index 0d2120c..102d458 100644
--- a/scripts/dict_ontology_standardization/ncbi_host_species.csv
+++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv
@@ -1,4 +1,6 @@
 Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
+human,http://purl.obolibrary.org/obo/NCBITaxon_9606
+Human,http://purl.obolibrary.org/obo/NCBITaxon_9606
 Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666
 Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974
 Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685
diff --git a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
index 49cb6b7..110e90b 100644
--- a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
+++ b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
@@ -1,6 +1,13 @@
 Illumian NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
 Illumina NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
 NextSeq500,http://www.ebi.ac.uk/efo/EFO_0009173
+NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
+Illumian NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566
+Illumina NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566
+NextSeq550,http://www.ebi.ac.uk/efo/EFO_0008566
+NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566
+Illumina MiniSeq,http://www.ebi.ac.uk/efo/EFO_0008636
+Illumina NovaSeq 6000,http://www.ebi.ac.uk/efo/EFO_0008637
 Nanopore MinION,http://www.ebi.ac.uk/efo/EFO_0008632
 Oxford Nanopore MinION,http://www.ebi.ac.uk/efo/EFO_0008632
 ONT (Oxford Nanopore Technologies),http://purl.obolibrary.org/obo/NCIT_C146818
diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
index 18b986c..0fa2219 100644
--- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
+++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
@@ -14,6 +14,7 @@ oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
 throat swab,http://purl.obolibrary.org/obo/NCIT_C155835
 oro-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
 Oropharyngal,http://purl.obolibrary.org/obo/NCIT_C155835
+oralpharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
 Oral-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
 Oro-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
 Oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
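
The dict_ontology_standardization CSVs map free-text metadata values (locations, hosts, instruments, specimen sources) onto ontology and Wikidata URIs; the new rows cover additional spellings seen in public records. A minimal lookup sketch over one of these tables (the actual consumer script is not part of this diff):

import csv

def load_mapping(path):
    # Each row is "free-text term,URI"; quoted terms may contain commas.
    with open(path, newline="") as fh:
        return dict(csv.reader(fh))

countries = load_mapping("scripts/dict_ontology_standardization/ncbi_countries.csv")
print(countries["USA: Minnesota"])  # http://www.wikidata.org/entity/Q1527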
diff --git a/scripts/docker/Dockerfile b/scripts/docker/Dockerfile
index 5bd38dd..9fb33d5 100644
--- a/scripts/docker/Dockerfile
+++ b/scripts/docker/Dockerfile
@@ -4,7 +4,7 @@ RUN apt-get update && \
     apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \
     python3 python3-pip python3-setuptools python3-dev python-pycurl \
     clustalw python3-biopython libcurl4-openssl-dev build-essential \
-    libssl-dev && \
+    libssl-dev libmagic-dev python3-magic && \
     apt-get clean
 
-RUN pip3 install bh20-seq-uploader
\ No newline at end of file
+RUN pip3 install bh20-seq-uploader
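
libmagic-dev and python3-magic are added so the uploader's format sniffing works inside the container. A quick self-check of that setup, assuming the bh20-seq-uploader package layout with its bundled validation/formats magic definitions (input path is illustrative):

import tempfile
import magic
import pkg_resources

# Load the uploader's custom MIME definitions (text/fasta, text/fastq) and sniff a file.
defs = pkg_resources.resource_string("bh20sequploader", "validation/formats")
with tempfile.NamedTemporaryFile() as tmp:
    tmp.write(defs)
    tmp.flush()
    sniffer = magic.Magic(magic_file=tmp.name, uncompress=False, mime=True)
    with open("example.fasta", "rb") as fh:  # illustrative path
        print(sniffer.from_buffer(fh.read(4096)))  # expected: text/fasta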
diff --git a/workflows/fastq2fasta/bam2fasta.cwl b/workflows/fastq2fasta/bam2fasta.cwl
index efe580f..dd4020b 100644
--- a/workflows/fastq2fasta/bam2fasta.cwl
+++ b/workflows/fastq2fasta/bam2fasta.cwl
@@ -15,6 +15,7 @@ inputs:
   threads:
     type: int
     default: 4
+  sample_id: string
 
 outputs:
   out_fasta:
@@ -61,5 +62,6 @@ steps:
     in:
       ref_fasta: fasta
       vcf: bcftools_index_after_qc/indexed
+      sample_id: sample_id
     out: [out_fasta]
     run: bcftools-consensus.cwl
diff --git a/workflows/fastq2fasta/bcftools-consensus.cwl b/workflows/fastq2fasta/bcftools-consensus.cwl
index c111792..dffdbe3 100644
--- a/workflows/fastq2fasta/bcftools-consensus.cwl
+++ b/workflows/fastq2fasta/bcftools-consensus.cwl
@@ -4,20 +4,27 @@ cwlVersion: v1.1
 hints:
   DockerRequirement:
     dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
+  ShellCommandRequirement: {}
 baseCommand: bcftools
 arguments:
   - consensus
-  - -i'QUAL > 1 && GT="A"'
+  - -i
+  - 'QUAL > 1 && GT="a"'
   - -Hla
   - -f
   - $(inputs.ref_fasta)
   - $(inputs.vcf)
+  - {shellQuote: false, valueFrom: "|"}
+  - sed
+  - "s/^>.*/>$(inputs.sample_id)/g"
 inputs:
   - id: ref_fasta
     type: File
   - id: vcf
     type: File
     secondaryFiles: [.csi]
+  - id: sample_id
+    type: string
 outputs:
   - id: out_fasta
     type: stdout
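
With ShellCommandRequirement enabled, the consensus step now pipes bcftools output through sed so the emitted FASTA header carries the sample id rather than the reference name. Roughly the shell pipeline the CWL arguments assemble, wrapped in Python for illustration (file names and sample id are placeholders):

import subprocess

sample_id = "sampleA"
cmd = (
    "bcftools consensus -i 'QUAL > 1 && GT=\"a\"' -Hla -f ref.fasta calls.vcf.gz"
    " | sed 's/^>.*/>%s/g' > %s.fasta" % (sample_id, sample_id)
)
# shell=True is needed for the pipe and redirection, matching the CWL shellQuote: false trick.
subprocess.run(cmd, shell=True, check=True)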
diff --git a/workflows/fastq2fasta/bcftools-view-qc.cwl b/workflows/fastq2fasta/bcftools-view-qc.cwl
index 477c596..336f455 100644
--- a/workflows/fastq2fasta/bcftools-view-qc.cwl
+++ b/workflows/fastq2fasta/bcftools-view-qc.cwl
@@ -8,7 +8,7 @@ baseCommand: bcftools
 arguments:
   - view
   - -i
-  - 'QUAL>1 && (GT="AA" || GT="Aa")'
+  - 'QUAL > 1 && GT="a"'
   - -Oz
   - --threads=$(inputs.threads)
   - $(inputs.bcf)
diff --git a/workflows/fastq2fasta/fastq2fasta.cwl b/workflows/fastq2fasta/fastq2fasta.cwl
index 0cf5c48..d529d99 100644
--- a/workflows/fastq2fasta/fastq2fasta.cwl
+++ b/workflows/fastq2fasta/fastq2fasta.cwl
@@ -22,6 +22,7 @@ inputs:
     type: int
     default: 4
   metadata: File?
+  sample_id: string
 
 outputs:
   out_fasta:
@@ -57,5 +58,6 @@ steps:
       bam: samtools-sort/sorted_bam
       fasta: ref_fasta
       threads: threads
+      sample_id: sample_id
     out: [out_fasta]
     run: bam2fasta.cwl