diff options
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- | bh20sequploader/bh20seq-schema.yml | 15 | ||||
-rw-r--r-- | bh20sequploader/bh20seq-shex.rdf | 25 | ||||
-rw-r--r-- | bh20sequploader/main.py | 22 | ||||
-rw-r--r-- | bh20sequploader/qc_fasta.py | 4 |
4 files changed, 40 insertions, 26 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index b3d4d12..0aead3b 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,6 +1,7 @@ $base: http://biohackathon.org/bh20-seq-schema $namespaces: cc: http://creativecommons.org/ns# + dc: http://purl.org/metadata/dublin_core_elements# sch: https://schema.org/ efo: http://www.ebi.ac.uk/efo/ obo: http://purl.obolibrary.org/obo/ @@ -15,24 +16,29 @@ $graph: fields: license_type: doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf - type: string? + type: string jsonldPredicate: _id: https://creativecommons.org/ns#License title: doc: Attribution title related to data license type: string? jsonldPredicate: - _id: http://semanticscience.org/resource/SIO_001167 + _id: http://purl.org/metadata/dublin_core_elements#Title + attribution_name: + doc: Attribution NAME related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionName attribution_url: doc: Attribution URL related to data license type: string? jsonldPredicate: - _id: https://creativecommons.org/ns#Work + _id: https://creativecommons.org/ns#attributionURL attribution_source: doc: Attribution source URL related to data license type: string? jsonldPredicate: - _id: https://creativecommons.org/ns#Work + _id: https://creativecommons.org/ns#attributionSource - name: hostSchema type: record @@ -258,6 +264,7 @@ $graph: virus: virusSchema technology: technologySchema submitter: submitterSchema + license: ["null", licenseSchema] id: doc: The subject (eg the fasta/fastq file) that the metadata describes type: string diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 965229c..bbc7309 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -1,6 +1,8 @@ PREFIX : <https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#> PREFIX MainSchema: <http://biohackathon.org/bh20-seq-schema#MainSchema/> PREFIX hostSchema: <http://biohackathon.org/bh20-seq-schema#hostSchema/> +PREFIX cc: <http://creativecommons.org/ns#> +PREFIX dc: <http://purl.org/metadata/dublin_core_elements#> PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> PREFIX obo: <http://purl.obolibrary.org/obo/> PREFIX sio: <http://semanticscience.org/resource/> @@ -15,10 +17,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/> MainSchema:submitter @:submitterShape ; MainSchema:technology @:technologyShape ; MainSchema:virus @:virusShape; + MainSchema:license @:licenseShape ?; } :hostShape { - efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; + efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; sio:SIO_000115 xsd:string ?; obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?; obo:PATO_0000011 xsd:integer ?; @@ -32,14 +35,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/> :sampleShape { sio:SIO_000115 xsd:string; - evs:C25164 xsd:string; - obo:GAZ_00000448 [wikidata:~] ; + evs:C25164 xsd:string; + obo:GAZ_00000448 [wikidata:~] ; obo:OBI_0001895 xsd:string ?; obo:NCIT_C41206 xsd:string ?; obo:OBI_0001479 IRI {0,2}; obo:OBI_0001472 xsd:string ?; sio:SIO_001167 xsd:string ?; - edam:data_2091 IRI {0,3}; + edam:data_2091 IRI {0,3}; } :submitterShape { @@ -47,7 +50,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/> sio:SIO_000116 xsd:string *; sio:SIO_000172 xsd:string ?; obo:NCIT_C37984 xsd:string ?; - obo:NCIT_C37900 xsd:string ?; + obo:NCIT_C37900 xsd:string ?; efo:EFO_0001741 xsd:string ?; obo:NCIT_C42781 xsd:string ?; obo:NCIT_C19026 xsd:string ?; @@ -63,6 +66,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/> } :virusShape{ - edam:data_1875 [ obo:NCBITaxon_~ ] ; - sio:SIO_010055 xsd:string ?; + edam:data_1875 [ obo:NCBITaxon_~ ] ; + sio:SIO_010055 xsd:string ?; } + +:licenseShape{ + cc:License xsd:string ; + dc:Title xsd:string ?; + cc:attributionName xsd:string ?; + cc:attributionURL xsd:string ?; + cc:attributionSource xsd:string ?; +}
\ No newline at end of file diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index f744a8c..6049bf9 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -29,11 +29,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True): try: log.debug("Checking metadata" if do_qc else "Skipping metadata check") if do_qc and not qc_metadata(metadata.name): - log.warning("Failed metadata qc") + log.warning("Failed metadata QC") failed = True except Exception as e: - log.debug(e) - print(e) + log.exception("Failed metadata QC") failed = True target = [] @@ -45,8 +44,7 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True): target[0] = ("reads_1."+target[0][0][6:], target[0][1]) target[1] = ("reads_2."+target[1][0][6:], target[0][1]) except Exception as e: - log.debug(e) - print(e) + log.exception("Failed sequence QC") failed = True if failed: @@ -82,7 +80,7 @@ def main(): seqlabel = target[0][1] if args.validate: - print("Valid") + log.info("Valid") exit(0) col = arvados.collection.Collection(api_client=api) @@ -91,10 +89,10 @@ def main(): if args.sequence_p2: upload_sequence(col, target[1], args.sequence_p2) - print("Reading metadata") + log.info("Reading metadata") with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) - print(r[0:20]) + log.info(r[0:20]) while r: f.write(r) r = args.metadata.read(65536) @@ -118,7 +116,7 @@ def main(): ["portable_data_hash", "=", col.portable_data_hash()]]).execute() if dup["items"]: # This exact collection has been uploaded before. - print("Duplicate of %s" % ([d["uuid"] for d in dup["items"]])) + log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]])) exit(1) if args.trusted: @@ -131,9 +129,9 @@ def main(): (seqlabel, properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) - print("Saved to %s" % col.manifest_locator()) - - print("Done") + log.info("Saved to %s" % col.manifest_locator()) + log.info("Done") + exit(0) if __name__ == "__main__": main() diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index 37eb4e8..0c7e16d 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True): except Exception as e: logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e) - if similarity and similarity < 70.0: + if similarity < 70.0: raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity)) - if similarity == 0: - raise ValueError("QC fail") return ("sequence.fasta"+gz, seqlabel) elif seq_type == "text/fastq": |