author    Peter Amstutz    2020-04-20 14:57:25 -0400
committer Peter Amstutz    2020-04-20 14:57:25 -0400
commit    9ddcfeacb3191638f42b08af999889d867f0f81c (patch)
tree      4cfe4c2b1df38bf6e5c79f5f8c0700407f76a472
parent    d29dfd593233541b85c1cefb239650279d57d59f (diff)
Better handling of duplicate sequences
Also save original fasta label in metadata
-rw-r--r--  bh20sequploader/bh20seq-schema.yml                    11
-rw-r--r--  workflows/pangenome-generate/merge-metadata.cwl        2
-rw-r--r--  workflows/pangenome-generate/merge-metadata.py        21
-rw-r--r--  workflows/pangenome-generate/pangenome-generate.cwl   10
-rw-r--r--  workflows/pangenome-generate/relabel-seqs.cwl         10
-rw-r--r--  workflows/pangenome-generate/relabel-seqs.py          12
-rw-r--r--  workflows/pangenome-generate/seqkit-rmdup.cwl          4
-rw-r--r--  workflows/pangenome-generate/testjob.yml              16
8 files changed, 69 insertions, 17 deletions
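
In short: relabel-seqs.py now records each sequence's original FASTA header, seqkit rmdup reports which relabeled sequences it removed as duplicates, and merge-metadata.py folds both back into the merged metadata. Using the subject IRIs from the test job added at the end of this commit, mergedmetadata.ttl would gain triples along these lines (the header text is a made-up placeholder; the predicates are the ones introduced below):

    <http://arvados.org/keep/seq2> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <http://arvados.org/keep/seq1> .
    <http://arvados.org/keep/seq1> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> "original header line from the uploaded FASTA" .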
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 64008f2..982447c 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -18,6 +18,7 @@ $graph:
jsonldPredicate:
_id: http://www.ebi.ac.uk/efo/EFO_0000532
_type: "@id"
+ identity: true
host_id:
doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples
type: string
@@ -29,6 +30,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/PATO_0000047
_type: "@id"
+ identity: true
host_age:
doc: Age of the host as number (e.g. 50)
type: int?
@@ -40,6 +42,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/NCIT_C42574
_type: "@id"
+ identity: true
host_health_status:
doc: A condition or state at a particular time
type: string?
@@ -79,12 +82,14 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
+ identity: true
specimen_source2:
doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155835 (=throat swabb)
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0001479
_type: "@id"
+ identity: true
collection_date:
doc: Date when the sample was taken
type: string
@@ -96,6 +101,7 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/GAZ_00000448
_type: "@id"
+ identity: true
sample_storage_conditions:
doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh ....
type: string?
@@ -126,6 +132,7 @@ $graph:
jsonldPredicate:
_id: http://edamontology.org/data_1875
_type: "@id"
+ identity: true
virus_strain:
doc: Name of the virus strain
type: string?
@@ -141,12 +148,14 @@ $graph:
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
+ identity: true
sample_sequencing_technology2:
doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION)
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/OBI_0600047
_type: "@id"
+ identity: true
sequence_assembly_method:
doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome
type: string?
@@ -215,7 +224,7 @@ $graph:
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
_type: "@id"
- noLinkCheck: true
+ identity: true
- name: MainSchema
type: record
diff --git a/workflows/pangenome-generate/merge-metadata.cwl b/workflows/pangenome-generate/merge-metadata.cwl
index 9164c09..fcefe32 100644
--- a/workflows/pangenome-generate/merge-metadata.cwl
+++ b/workflows/pangenome-generate/merge-metadata.cwl
@@ -7,6 +7,8 @@ inputs:
metadata: File[]
metadataSchema: File
subjects: string[]
+ dups: File?
+ originalLabels: File
outputs:
merged: stdout
stdout: mergedmetadata.ttl
diff --git a/workflows/pangenome-generate/merge-metadata.py b/workflows/pangenome-generate/merge-metadata.py
index 64275b1..bfec781 100644
--- a/workflows/pangenome-generate/merge-metadata.py
+++ b/workflows/pangenome-generate/merge-metadata.py
@@ -1,9 +1,13 @@
+import re
import schema_salad.schema
import schema_salad.jsonld_context
+import json
metadataSchema = '$(inputs.metadataSchema.path)'
metadata = $(inputs.metadata)
subjects = $(inputs.subjects)
+dups = json.loads('''$(inputs.dups)''')
+originalLabels = $(inputs.originalLabels)
(document_loader,
avsc_names,
@@ -11,7 +15,22 @@ subjects = $(inputs.subjects)
metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)
for i, m in enumerate(metadata):
- doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, m["path"], True)
+ doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, m["path"], False, False)
doc["id"] = subjects[i]
g = schema_salad.jsonld_context.makerdf(subjects[i], doc, document_loader.ctx)
print(g.serialize(format="ntriples").decode("utf-8"))
+
+import logging
+
+if dups:
+ sameseqs = open(dups["path"], "rt")
+ for d in sameseqs:
+ logging.warn(d)
+ g = re.match(r"\\d+\\t(.*)", d)
+ logging.warn("%s", g.group(1))
+ sp = g.group(1).split(",")
+ for n in sp[1:]:
+ print("<%s> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <%s> ." % (n.strip(), sp[0].strip()))
+
+orig = open(originalLabels["path"], "rt")
+print(orig.read())
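
The dups block above consumes the duplicates report that seqkit-rmdup.cwl (below) now exposes as an optional output. Here is a minimal standalone sketch of the same parsing outside the CWL $(...) templating (the doubled backslashes in the diff are presumably escapes for the embedded file literal), assuming each report line has the form "<count>\t<kept name>, <duplicate name>[, ...]"; the helper name dups_to_triples is mine:

    import re

    def dups_to_triples(dups_path):
        # Each report line: "<count>\t<name1>, <name2>, ..."; the first name is
        # taken as the sequence that was kept, the rest as removed duplicates.
        triples = []
        with open(dups_path, "rt") as sameseqs:
            for line in sameseqs:
                m = re.match(r"\d+\t(.*)", line)
                if not m:
                    continue
                names = [n.strip() for n in m.group(1).split(",")]
                kept, removed = names[0], names[1:]
                for n in removed:
                    triples.append(
                        "<%s> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <%s> ."
                        % (n, kept))
        return triples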
diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index 896f936..0cb1368 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -26,15 +26,11 @@ steps:
in:
readsFA: inputReads
subjects: subjects
- out: [relabeledSeqs]
+ out: [relabeledSeqs, originalLabels]
run: relabel-seqs.cwl
- common:
- in: {readsFA: relabel/relabeledSeqs}
- out: [duplicatedReads]
- run: seqkit-common.cwl
dedup:
in: {readsFA: relabel/relabeledSeqs}
- out: [readsMergeDedup]
+ out: [readsMergeDedup, dups]
run: seqkit-rmdup.cwl
overlapReads:
in: {readsFA: dedup/readsMergeDedup}
@@ -63,5 +59,7 @@ steps:
metadata: metadata
metadataSchema: metadataSchema
subjects: subjects
+ dups: dedup/dups
+ originalLabels: relabel/originalLabels
out: [merged]
run: merge-metadata.cwl
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
index b5b7231..2b780d4 100644
--- a/workflows/pangenome-generate/relabel-seqs.cwl
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -5,7 +5,13 @@ inputs:
subjects: string[]
outputs:
relabeledSeqs:
- type: stdout
+ type: File
+ outputBinding:
+ glob: relabeledSeqs.fasta
+ originalLabels:
+ type: File
+ outputBinding:
+ glob: originalLabels.ttl
requirements:
InlineJavascriptRequirement: {}
InitialWorkDirRequirement:
@@ -15,5 +21,5 @@ requirements:
hints:
DockerRequirement:
dockerPull: commonworkflowlanguage/cwltool_module
-stdout: relabeledSeqs.fasta
+stdout:
baseCommand: [python, relabel-seqs.py]
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 32f2386..b558fe2 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,13 +1,15 @@
-import sys
-
reads = $(inputs.readsFA)
subjects = $(inputs.subjects)
+relabeled_fasta = open("relabeledSeqs.fasta", "wt")
+original_labels = open("originalLabels.ttl", "wt")
+
for i, r in enumerate(reads):
with open(r["path"], "rt") as fa:
- fa.readline()
- print(">"+subjects[i])
+ label = fa.readline()
+ original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \\"%s\\" .\\n" % (subjects[i], label[1:].strip().replace('"', '\\\\"')))
+ relabeled_fasta.write(">"+subjects[i]+"\\n")
data = fa.read(8096)
while data:
- sys.stdout.write(data)
+ relabeled_fasta.write(data)
data = fa.read(8096)
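
For reference, an untemplated sketch of what the relabel script does once cwltool substitutes $(inputs.readsFA) and $(inputs.subjects): the first header line of each input FASTA is replaced with the corresponding subject IRI, and the original header is kept as an N-Triples literal with embedded double quotes escaped. Plain Python lists stand in for the CWL inputs, and the function name relabel is mine:

    def relabel(read_paths, subjects):
        # read_paths: FASTA file paths; subjects: matching list of subject IRIs.
        with open("relabeledSeqs.fasta", "wt") as relabeled_fasta, \
             open("originalLabels.ttl", "wt") as original_labels:
            for path, subject in zip(read_paths, subjects):
                with open(path, "rt") as fa:
                    label = fa.readline()  # original ">..." header line
                    original_labels.write(
                        '<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> "%s" .\n'
                        % (subject, label[1:].strip().replace('"', '\\"')))
                    relabeled_fasta.write(">" + subject + "\n")
                    # Stream the remaining sequence data through unchanged.
                    data = fa.read(8096)
                    while data:
                        relabeled_fasta.write(data)
                        data = fa.read(8096)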
diff --git a/workflows/pangenome-generate/seqkit-rmdup.cwl b/workflows/pangenome-generate/seqkit-rmdup.cwl
index 07184c3..071fa66 100644
--- a/workflows/pangenome-generate/seqkit-rmdup.cwl
+++ b/workflows/pangenome-generate/seqkit-rmdup.cwl
@@ -1,14 +1,14 @@
cwlVersion: v1.1
class: CommandLineTool
inputs:
- readsFA: File[]
+ readsFA: File
outputs:
readsMergeDedup:
type: File
outputBinding:
glob: readsMergeDedup.fasta
dups:
- type: File
+ type: File?
outputBinding:
glob: dups.txt
requirements:
diff --git a/workflows/pangenome-generate/testjob.yml b/workflows/pangenome-generate/testjob.yml
new file mode 100644
index 0000000..a48aff8
--- /dev/null
+++ b/workflows/pangenome-generate/testjob.yml
@@ -0,0 +1,16 @@
+inputReads:
+ - class: File
+ location: ../../example/sequence.fasta
+ - class: File
+ location: ../../example/sequence.fasta
+metadata:
+ - class: File
+ location: ../../example/metadata.yaml
+ - class: File
+ location: ../../example/metadata.yaml
+metadataSchema:
+ class: File
+ location: ../../bh20sequploader/bh20seq-schema.yml
+subjects:
+ - http://arvados.org/keep/seq1
+ - http://arvados.org/keep/seq2
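
The new testjob.yml feeds the same example sequence and metadata in twice under two placeholder subject IRIs, so the relabel, dedup, and merge steps can presumably be exercised locally with something like cwltool pangenome-generate.cwl testjob.yml (invocation shown for illustration; it is not part of this commit). Because both inputs point at the same example/sequence.fasta, the dedup step should collapse them and the merged metadata should contain a has_duplicate_sequence triple linking the two subjects.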