From 88d81f853cf04b7f28681dd9cdee775b0422f252 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 12:53:19 -0400 Subject: Working on NCBI import Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- scripts/foreach.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 scripts/foreach.sh (limited to 'scripts/foreach.sh') diff --git a/scripts/foreach.sh b/scripts/foreach.sh new file mode 100755 index 0000000..35b07b8 --- /dev/null +++ b/scripts/foreach.sh @@ -0,0 +1,18 @@ +#!/bin/sh +rm -rf validated fasta_and_yaml_* +mkdir -p validated +./from_genbank_to_fasta_and_yaml.py +fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta") +for f in $fasta_files ; do + yaml=$(echo $f | rev | cut -c7- | rev).yaml + echo $f + echo $yaml + if bh20-seq-uploader --validate $f $yaml ; then + sz=$(stat --format=%s $f) + if test $sz -gt 20000 ; then + mv $f $yaml validated + else + echo "Fasta file too small" + fi + fi +done -- cgit v1.2.3 From a12fe94f174da766be612fbb2712b4db2ba98296 Mon Sep 17 00:00:00 2001 From: lltommy Date: Wed, 22 Apr 2020 19:41:27 +0200 Subject: Small changes all around, trying to make the importer/metadata better --- bh20sequploader/bh20seq-schema.yml | 4 ++-- bh20sequploader/bh20seq-shex.rdf | 25 +++++++++++++++---------- example/metadata.yaml | 8 ++++---- scripts/foreach.sh | 2 +- scripts/from_genbank_to_fasta_and_yaml.py | 12 ++++++++---- 5 files changed, 30 insertions(+), 21 deletions(-) (limited to 'scripts/foreach.sh') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 1ceebe2..80013c3 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -25,7 +25,7 @@ $graph: jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000115 host_sex: - doc: Sex of the host as defined in NCIT, IRI expected (http://purl.obolibrary.org/obo/NCIT_C20197 (Male), http://purl.obolibrary.org/obo/NCIT_C27993 (Female), http://purl.obolibrary.org/obo/NCIT_C45908 (Intersex), or http://purl.obolibrary.org/obo/NCIT_C17998 (Unknown)) + doc: Sex of the host as defined in PATO, expect male () or female () type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/PATO_0000047 @@ -144,7 +144,7 @@ $graph: fields: sample_sequencing_technology: doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) - type: string + type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0600047 _type: "@id" diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 31e714f..8d0055e 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -23,35 +23,40 @@ PREFIX wikidata: obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?; obo:PATO_0000011 xsd:integer ?; obo:NCIT_C42574 [ obo:UO_~ ] ?; - sio:SIO_001167 xsd:string ?; + obo:NCIT_C25688 xsd:string ? ; efo:EFO_0000727 xsd:string ?; + obo:VO_0000002 xsd:string ?; + sio:SIO_001167 xsd:string ?; } :sampleShape { sio:SIO_000115 xsd:string; - obo:GAZ_00000448 [wikidata:~] ; evs:C25164 xsd:string; + obo:GAZ_00000448 [wikidata:~] ; obo:OBI_0001895 xsd:string ?; - sio:SIO_001167 xsd:string ?; - obo:OBI_0001472 xsd:string ?; + obo:NCIT_C41206 xsd:string ?; obo:OBI_0001479 IRI {0,2}; + obo:OBI_0001472 xsd:string ?; + sio:SIO_001167 xsd:string ?; } :submitterShape { obo:NCIT_C42781 xsd:string ; - obo:NCIT_C37984 xsd:string ?; - obo:NCIT_C37900 xsd:string ?; sio:SIO_000116 xsd:string ?; - obo:OBI_0600047 xsd:string ?; - sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?; sio:SIO_000172 xsd:string ?; + obo:NCIT_C37984 xsd:string ?; + obo:OBI_0600047 xsd:string ?; + obo:NCIT_C37900 xsd:string ?; efo:EFO_0001741 xsd:string ?; + obo:NCIT_C19026 xsd:string ?; + sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?; } :technologyShape { - obo:OBI_0600047 IRI {0,2} ; - obo:FLU_0000848 xsd:double ?; + obo:OBI_0600047 IRI {0,2} ?; efo:EFO_0002699 xsd:string ?; + obo:FLU_0000848 xsd:double {0,2}; + sio:SIO_001167 xsd:string ?; } :virusShape{ diff --git a/example/metadata.yaml b/example/metadata.yaml index 57d90b5..d1b10c1 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -6,7 +6,7 @@ host: host_sex: http://purl.obolibrary.org/obo/NCIT_C27993 host_age: 20 host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 - host_health_status: A condition or state at a particular time (Disease ontology) + host_health_status: A condition or state at a particular time host_treatment: Process in which the act is intended to modify or alter host status (Compounds) host_vaccination: List of vaccines given to the host (RRIDs?) additional_host_information: Field for additional host information @@ -29,15 +29,15 @@ virus: technology: sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0009173 sample_sequencing_technology2: http://www.ebi.ac.uk/efo/EFO_0009173 - sequence_assembly_method: Protocol used for assembly (CWL, WDL, NF, BCO?) + sequence_assembly_method: Protocol used for assembly sequencing_coverage: 70 submitter: - submitter_name: John Doe (ORCID?) + submitter_name: John Doe submitter_address: John Doe's adress originating_lab: John Doe kitchen lab_address: John Doe's address provider_sample_id: HmX submitter_sample_id: xXx authors: John Doe et all - submitter_orcid: https://orcid.org/0000-0000-0000-0000 (if this is here, others can be optional?) + submitter_orcid: https://orcid.org/0000-0000-0000-0000 \ No newline at end of file diff --git a/scripts/foreach.sh b/scripts/foreach.sh index 35b07b8..ddc9387 100755 --- a/scripts/foreach.sh +++ b/scripts/foreach.sh @@ -2,7 +2,7 @@ rm -rf validated fasta_and_yaml_* mkdir -p validated ./from_genbank_to_fasta_and_yaml.py -fasta_files=$(find fasta_and_yaml_20200421/ -name "*.fasta") +fasta_files=$(find fasta_and_yaml/ -name "*.fasta") for f in $fasta_files ; do yaml=$(echo $f | rev | cut -c7- | rev).yaml echo $f diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 00c0012..096a6af 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -8,10 +8,11 @@ import json import os from datetime import date -today = date.today().strftime("%Y%m%d") +#today = date.today().strftime("%Y%m%d") -dir_metadata_today = 'metadata_from_nuccore_{}'.format(today) -dir_fasta_and_yaml_today = 'fasta_and_yaml_{}'.format(today) + +dir_metadata_today = 'metadata_from_nuccore' #_{}'.format(today) +dir_fasta_and_yaml_today = 'fasta_and_yaml' #'.format(today) dir_dict_ontology_standardization = 'dict_ontology_standardization/' @@ -177,7 +178,10 @@ if not os.path.exists(dir_fasta_and_yaml_today): if len(GBQualifier_value_text_list) > 1: if GBQualifier_value_text_list[1] in ['male', 'female']: - info_for_yaml_dict['host']['host_sex'] = GBQualifier_value_text_list[1] + if GBQualifier_value_text_list[1]=='male': + info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" + elif GBQualifier_value_text_list[1]=='female': + info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000383" else: info_for_yaml_dict['host']['host_health_status'] = GBQualifier_value_text_list[1] -- cgit v1.2.3