about | summary | refs | log | tree | commit | diff
path: root/bh20sequploader
diff options
context:
space:
mode:
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- bh20sequploader/bh20seq-schema.yml 15
-rw-r--r-- bh20sequploader/bh20seq-shex.rdf 25
-rw-r--r-- bh20sequploader/main.py 22
-rw-r--r-- bh20sequploader/qc_fasta.py 4
4 files changed, 40 insertions, 26 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index b3d4d12..0aead3b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -1,6 +1,7 @@
$base: http://biohackathon.org/bh20-seq-schema
$namespaces:
cc: http://creativecommons.org/ns#
+ dc: http://purl.org/metadata/dublin_core_elements#
sch: https://schema.org/
efo: http://www.ebi.ac.uk/efo/
obo: http://purl.obolibrary.org/obo/
@@ -15,24 +16,29 @@ $graph:
fields:
license_type:
doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
- type: string?
+ type: string
jsonldPredicate:
_id: https://creativecommons.org/ns#License
title:
doc: Attribution title related to data license
type: string?
jsonldPredicate:
- _id: http://semanticscience.org/resource/SIO_001167
+ _id: http://purl.org/metadata/dublin_core_elements#Title
+ attribution_name:
+ doc: Attribution NAME related to data license
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#attributionName
attribution_url:
doc: Attribution URL related to data license
type: string?
jsonldPredicate:
- _id: https://creativecommons.org/ns#Work
+ _id: https://creativecommons.org/ns#attributionURL
attribution_source:
doc: Attribution source URL related to data license
type: string?
jsonldPredicate:
- _id: https://creativecommons.org/ns#Work
+ _id: https://creativecommons.org/ns#attributionSource
- name: hostSchema
type: record
@@ -258,6 +264,7 @@ $graph:
virus: virusSchema
technology: technologySchema
submitter: submitterSchema
+ license: ["null", licenseSchema]
id:
doc: The subject (eg the fasta/fastq file) that the metadata describes
type: string
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 965229c..bbc7309 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -1,6 +1,8 @@
PREFIX : <https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#>
PREFIX MainSchema: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
PREFIX hostSchema: <http://biohackathon.org/bh20-seq-schema#hostSchema/>
+PREFIX cc: <http://creativecommons.org/ns#>
+PREFIX dc: <http://purl.org/metadata/dublin_core_elements#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX sio: <http://semanticscience.org/resource/>
@@ -15,10 +17,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
MainSchema:submitter @:submitterShape ;
MainSchema:technology @:technologyShape ;
MainSchema:virus @:virusShape;
+ MainSchema:license @:licenseShape ?;
}
:hostShape {
- efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
+ efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
sio:SIO_000115 xsd:string ?;
obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?;
obo:PATO_0000011 xsd:integer ?;
@@ -32,14 +35,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
:sampleShape {
sio:SIO_000115 xsd:string;
- evs:C25164 xsd:string;
- obo:GAZ_00000448 [wikidata:~] ;
+ evs:C25164 xsd:string;
+ obo:GAZ_00000448 [wikidata:~] ;
obo:OBI_0001895 xsd:string ?;
obo:NCIT_C41206 xsd:string ?;
obo:OBI_0001479 IRI {0,2};
obo:OBI_0001472 xsd:string ?;
sio:SIO_001167 xsd:string ?;
- edam:data_2091 IRI {0,3};
+ edam:data_2091 IRI {0,3};
}
:submitterShape {
@@ -47,7 +50,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
sio:SIO_000116 xsd:string *;
sio:SIO_000172 xsd:string ?;
obo:NCIT_C37984 xsd:string ?;
- obo:NCIT_C37900 xsd:string ?;
+ obo:NCIT_C37900 xsd:string ?;
efo:EFO_0001741 xsd:string ?;
obo:NCIT_C42781 xsd:string ?;
obo:NCIT_C19026 xsd:string ?;
@@ -63,6 +66,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
}
:virusShape{
- edam:data_1875 [ obo:NCBITaxon_~ ] ;
- sio:SIO_010055 xsd:string ?;
+ edam:data_1875 [ obo:NCBITaxon_~ ] ;
+ sio:SIO_010055 xsd:string ?;
}
+
+:licenseShape{
+ cc:License xsd:string ;
+ dc:Title xsd:string ?;
+ cc:attributionName xsd:string ?;
+ cc:attributionURL xsd:string ?;
+ cc:attributionSource xsd:string ?;
+}
\ No newline at end of file
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index f744a8c..6049bf9 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -29,11 +29,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
try:
log.debug("Checking metadata" if do_qc else "Skipping metadata check")
if do_qc and not qc_metadata(metadata.name):
- log.warning("Failed metadata qc")
+ log.warning("Failed metadata QC")
failed = True
except Exception as e:
- log.debug(e)
- print(e)
+ log.exception("Failed metadata QC")
failed = True
target = []
@@ -45,8 +44,7 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
target[0] = ("reads_1."+target[0][0][6:], target[0][1])
target[1] = ("reads_2."+target[1][0][6:], target[0][1])
except Exception as e:
- log.debug(e)
- print(e)
+ log.exception("Failed sequence QC")
failed = True
if failed:
@@ -82,7 +80,7 @@ def main():
seqlabel = target[0][1]
if args.validate:
- print("Valid")
+ log.info("Valid")
exit(0)
col = arvados.collection.Collection(api_client=api)
@@ -91,10 +89,10 @@ def main():
if args.sequence_p2:
upload_sequence(col, target[1], args.sequence_p2)
- print("Reading metadata")
+ log.info("Reading metadata")
with col.open("metadata.yaml", "w") as f:
r = args.metadata.read(65536)
- print(r[0:20])
+ log.info(r[0:20])
while r:
f.write(r)
r = args.metadata.read(65536)
@@ -118,7 +116,7 @@ def main():
["portable_data_hash", "=", col.portable_data_hash()]]).execute()
if dup["items"]:
# This exact collection has been uploaded before.
- print("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+ log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
exit(1)
if args.trusted:
@@ -131,9 +129,9 @@ def main():
(seqlabel, properties['upload_user'], properties['upload_ip']),
properties=properties, ensure_unique_name=True)
- print("Saved to %s" % col.manifest_locator())
-
- print("Done")
+ log.info("Saved to %s" % col.manifest_locator())
+ log.info("Done")
+ exit(0)
if __name__ == "__main__":
main()
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 37eb4e8..0c7e16d 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True):
except Exception as e:
logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e)
- if similarity and similarity < 70.0:
+ if similarity < 70.0:
raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity))
- if similarity == 0:
- raise ValueError("QC fail")
return ("sequence.fasta"+gz, seqlabel)
elif seq_type == "text/fastq":