From 87e634040767373309fd7eb99784de0537f72059 Mon Sep 17 00:00:00 2001
From: Andrea Guarracino
Date: Fri, 10 Jul 2020 11:50:06 +0200
Subject: other term for Homo sapiens (for SRA samples)
---
scripts/dict_ontology_standardization/ncbi_host_species.csv | 1 +
1 file changed, 1 insertion(+)
diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv
index 40572a3..0bfc455 100644
--- a/scripts/dict_ontology_standardization/ncbi_host_species.csv
+++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv
@@ -2,6 +2,7 @@ Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
human,http://purl.obolibrary.org/obo/NCBITaxon_9606
Human,http://purl.obolibrary.org/obo/NCBITaxon_9606
sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
+homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666
Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974
Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685
--
cgit v1.2.3
From 1655762b516804dad3d71538e95d97d74653c3e9 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Fri, 10 Jul 2020 13:43:59 +0200
Subject: updated metadata source
---
.../SraExperimentPackage.2020.07.05.xml.gz | Bin 6502056 -> 0 bytes
.../SraExperimentPackage.2020.07.09.xml.gz | Bin 0 -> 9744133 bytes
2 files changed, 0 insertions(+), 0 deletions(-)
delete mode 100644 scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
create mode 100644 scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
deleted file mode 100644
index 88acb18..0000000
Binary files a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz and /dev/null differ
diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
new file mode 100644
index 0000000..93ef550
Binary files /dev/null and b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz differ
--
cgit v1.2.3
From 8cb542fdf60273aec7ec107f8bc4896375381263 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Fri, 10 Jul 2020 13:55:49 +0200
Subject: an output file is created with the accessions for which no YAML file
is created
---
scripts/create_sra_metadata/create_sra_metadata.py | 55 +++++++++++++---------
.../from_genbank_to_fasta_and_yaml.py | 10 +++-
2 files changed, 41 insertions(+), 24 deletions(-)
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index ef0d119..10ac85b 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -8,7 +8,7 @@ import gzip
dir_yaml = 'yaml'
-date = '2020.07.05'
+date = '2020.07.09'
# Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D)
# Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases)
@@ -50,13 +50,14 @@ sra_metadata_xml_file.close()
EXPERIMENT_PACKAGE_SET = tree.getroot()
missing_value_list = []
+not_created_accession_list = []
run_accession_set = set()
run_accession_to_downloadble_file_url_dict = {}
for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
#print(i, EXPERIMENT_PACKAGE)
-
+
# A general default-empty yaml could be read from the definitive one
info_for_yaml_dict = {
'id': 'placeholder',
@@ -74,17 +75,17 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
#print(accession)
info_for_yaml_dict['sample']['sample_id'] = accession
-
+
#SRAFiles = RUN.find('SRAFiles')
#if SRAFiles is not None:
# url = SRAFiles.find('SRAFile').attrib['url']
# if 'sra-download.ncbi.nlm.nih.gov' in url:
# run_accession_to_downloadble_file_url_dict[accession] = url
-
+
SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE')
SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE')
-
+
for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list:
VALUE = SAMPLE_ATTRIBUTE.find('VALUE')
if VALUE is not None:
@@ -101,7 +102,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text]))
elif TAG_text in ['host_health_status', 'host health state']:
if VALUE_text in term_to_uri_dict:
- info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
+ info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']:
missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text]))
elif TAG_text in ['strain', 'isolate']:
@@ -113,12 +114,12 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if value_to_insert in term_to_uri_dict:
value_to_insert = term_to_uri_dict[value_to_insert]
-
- if 'virus_strain' not in info_for_yaml_dict:
+
+ if 'virus_strain' not in info_for_yaml_dict:
info_for_yaml_dict['virus']['virus_strain'] = value_to_insert
else:
info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert
- elif TAG_text in ['isolation_source', 'isolation source host-associated']:
+ elif TAG_text in ['isolation_source', 'isolation source host-associated']:
if VALUE_text in term_to_uri_dict:
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
else:
@@ -145,7 +146,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
elif TAG_text == 'collected_by':
if VALUE_text.lower() not in ['not available', 'missing']:
name = VALUE_text in ['Dr. Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli']
-
+
info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text
elif TAG_text == 'collecting institution':
if VALUE_text.lower() not in ['not provided', 'na']:
@@ -154,11 +155,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if VALUE_text.lower() not in ['not applicable', 'missing', 'na']:
date_to_write = VALUE_text
date_is_estimated = True
-
+
VALUE_text_list = VALUE_text.split('-')
if len(VALUE_text_list) == 3:
date_is_estimated = False
-
+
if VALUE_text_list[1].isalpha():
date_to_write = parse(VALUE_text).strftime('%Y-%m-%d')
elif len(VALUE_text_list) == 2:
@@ -170,7 +171,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
date_to_write = "{}-01-15".format(VALUE_text)
info_for_yaml_dict['sample']['collection_date'] = date_to_write
-
+
if date_is_estimated:
if 'additional_collection_information' in info_for_yaml_dict['sample']:
info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
@@ -188,8 +189,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text
info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id
-
-
+
+
EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT')
INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0]
@@ -206,18 +207,18 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION')
info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession']
-
+
if SUBMISSION.attrib['lab_name'].lower() not in ['na']:
info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name']
- STUDY = EXPERIMENT_PACKAGE.find('STUDY')
+ STUDY = EXPERIMENT_PACKAGE.find('STUDY')
info_for_yaml_dict['submitter']['publication'] = STUDY.attrib['alias']
-
-
+
+
Organization = EXPERIMENT_PACKAGE.find('Organization')
Organization_Name = Organization.find('Name')
info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text]
-
+
Organization_Contact = Organization.find('Contact')
if Organization_Contact is not None:
Organization_Contact_Name = Organization_Contact.find('Name')
@@ -231,20 +232,28 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
Organization_Address = Organization.find('Address')
if Organization_Address is not None:
info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']])
-
+
if 'collection_date' not in info_for_yaml_dict['sample']:
info_for_yaml_dict['sample']['collection_date'] = '1970-01-01'
info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing"
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- print(accession, ' - technology not found')
+ #print(accession, ' - technology not found')
+ not_created_accession_list.append([accession, 'technology not found'])
continue
with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
json.dump(info_for_yaml_dict, fw, indent=2)
-
+
if len(missing_value_list) > 0:
path_missing_terms_tsv = 'missing_terms.tsv'
print('Written missing terms in {}'.format(path_missing_terms_tsv))
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
+
+if len(not_created_accession_list) > 0:
+ path_not_created_accession_tsv = 'not_created_accession.tsv'
+ print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+ with open(path_not_created_accession_tsv, 'w') as fw:
+ fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 39e401a..d5b0ffd 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -138,6 +138,7 @@ min_len_to_count = 27500
num_seq_with_len_ge_X_bp = 0
missing_value_list = []
+not_created_accession_list = []
accession_with_errors_list = []
for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- print(accession_version, ' - technology not found')
+ #print(accession_version, ' - technology not found')
+ not_created_accession_list.append([accession_version, 'technology not found'])
continue
with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
@@ -400,4 +402,10 @@ if len(accession_with_errors_list) > 0:
with open(path_accession_with_errors_tsv, 'w') as fw:
fw.write('\n'.join(accession_with_errors_list))
+if len(not_created_accession_list) > 0:
+ path_not_created_accession_tsv = 'not_created_accession.tsv'
+ print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+ with open(path_not_created_accession_tsv, 'w') as fw:
+ fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))
--
cgit v1.2.3
From 2eab71a70b8630649303a9319e1baf9fa06f8ab4 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Fri, 10 Jul 2020 15:39:02 +0200
Subject: metadata with missing host_species are not created
---
scripts/create_sra_metadata/create_sra_metadata.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index 10ac85b..a31bd36 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -242,6 +242,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
not_created_accession_list.append([accession, 'technology not found'])
continue
+ if 'host_species' not in info_for_yaml_dict['host']:
+ #print(accession, ' - missing host species')
+ not_created_accession_list.append([accession, 'missing host species'])
+ continue
+
with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
json.dump(info_for_yaml_dict, fw, indent=2)
--
cgit v1.2.3
From bb90f06da570624952d4b7001ee37fc7018e3a7d Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Sun, 12 Jul 2020 15:58:29 +0200
Subject: added a suffix to distinguish which script created the error/warning
files
---
scripts/create_sra_metadata/create_sra_metadata.py | 4 ++--
scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 6 +++---
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index a31bd36..352a30e 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -251,13 +251,13 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
json.dump(info_for_yaml_dict, fw, indent=2)
if len(missing_value_list) > 0:
- path_missing_terms_tsv = 'missing_terms.tsv'
+ path_missing_terms_tsv = 'missing_terms.sra.tsv'
print('Written missing terms in {}'.format(path_missing_terms_tsv))
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
if len(not_created_accession_list) > 0:
- path_not_created_accession_tsv = 'not_created_accession.tsv'
+ path_not_created_accession_tsv = 'not_created_accession.sra.tsv'
print('Written not created accession in {}'.format(path_not_created_accession_tsv))
with open(path_not_created_accession_tsv, 'w') as fw:
fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index d5b0ffd..dbebfbb 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -391,19 +391,19 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
continue
if len(missing_value_list) > 0:
- path_missing_terms_tsv = 'missing_terms.tsv'
+ path_missing_terms_tsv = 'missing_terms.genbank.tsv'
print('Written missing terms in {}'.format(path_missing_terms_tsv))
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
if len(accession_with_errors_list) > 0:
- path_accession_with_errors_tsv = 'accession_with_errors.tsv'
+ path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv'
print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
with open(path_accession_with_errors_tsv, 'w') as fw:
fw.write('\n'.join(accession_with_errors_list))
if len(not_created_accession_list) > 0:
- path_not_created_accession_tsv = 'not_created_accession.tsv'
+ path_not_created_accession_tsv = 'not_created_accession.genbank.tsv'
print('Written not created accession in {}'.format(path_not_created_accession_tsv))
with open(path_not_created_accession_tsv, 'w') as fw:
fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
--
cgit v1.2.3
From f44d555b788e29a1896a69d75401f0e145ad9299 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 15 Jul 2020 11:37:19 +0100
Subject: Update Guix install
---
doc/INSTALL.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 6dcd72b..3b270dd 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -42,7 +42,7 @@ repository.
### Using the Web Uploader
-To run the web uploader in a GNU Guix environment/container
+To run the web uploader in a GNU Guix environment/container, run it with something like
```
guix environment guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs --network openssl -- env FLASK_ENV=development PYTHONPATH=$PYTHONPATH:./bh20sequploader FLASK_APP=bh20simplewebuploader/main.py flask run
@@ -59,7 +59,7 @@ WIP: add gunicorn container
Currently the full webserver container deploy command looks like
```
-penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
-``
+penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
+```
Note: see above on GUIX_PACKAGE_PATH.
--
cgit v1.2.3
From 9c9512a7e040f8247d259bdc6f9cf55d5d276baf Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 15 Jul 2020 12:48:12 +0100
Subject: Load metadata locally without pkg_resources
---
bh20simplewebuploader/main.py | 9 +++++++--
bh20simplewebuploader/static/main.js | 2 +-
doc/blog/using-covid-19-pubseq-part5.org | 14 +++++++++++---
3 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index 77e345b..8b5781a 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -227,8 +227,13 @@ def generate_form(schema, options):
# At startup, we need to load the metadata schema from the uploader module, so we can make a form for it
-METADATA_SCHEMA = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
-METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
+if os.path.isfile("bh20sequploader/bh20seq-schema.yml"):
+ METADATA_SCHEMA = yaml.safe_load(open("bh20sequploader/bh20seq-schema.yml","r").read())
+ METADATA_OPTION_DEFINITIONS = yaml.safe_load(open("bh20sequploader/bh20seq-options.yml","r").read())
+else:
+ METADATA_SCHEMA = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
+ METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
+print(METADATA_SCHEMA,file=sys.stderr)
FORM_ITEMS = generate_form(METADATA_SCHEMA, METADATA_OPTION_DEFINITIONS)
@app.route('/')
diff --git a/bh20simplewebuploader/static/main.js b/bh20simplewebuploader/static/main.js
index 751e478..4703047 100644
--- a/bh20simplewebuploader/static/main.js
+++ b/bh20simplewebuploader/static/main.js
@@ -195,7 +195,7 @@ function addField(e) {
// Increment the number and use the keypath and number to set IDs and cross
// references.
// TODO: Heavily dependent on the form field HTML. Maybe we want custom
- // elements for the labeled controlsd that know how to be list items?
+ // elements for the labeled controls that know how to be list items?
fieldNumber++
newField.dataset.number = fieldNumber
let newID = keypath + '[' + fieldNumber + ']'
diff --git a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org
index 4b0ea64..aa06d5e 100644
--- a/doc/blog/using-covid-19-pubseq-part5.org
+++ b/doc/blog/using-covid-19-pubseq-part5.org
@@ -13,6 +13,7 @@
- [[#what-is-the-schema][What is the schema?]]
- [[#how-is-the-website-generated][How is the website generated?]]
- [[#modifying-the-schema][Modifying the schema]]
+ - [[#adding-fields-to-the-form][Adding fields to the form]]
* Modify Metadata
@@ -113,8 +114,15 @@ So, we'll add it simply as a title field. Now the draft schema is
_id: https://creativecommons.org/ns#Work
#+END_SRC
-Now, we are no ontology experts, right? So, next we submit a patch to our source tree and
-ask for feedback before wiring it up in the data entry form. The pull request was
-submitted here FIXME.
+Now, we are no ontology experts, right? So, next we submit a patch to
+our source tree and ask for feedback before wiring it up in the data
+entry form. The pull request was submitted [[https://github.com/arvados/bh20-seq-resource/pull/97][here]] and reviewed on the
+gitter channel and I merged it.
+
+* Adding fields to the form
+
+To add the new fields to the form we have to modify it a little. If we
+go to the upload form we need to add the license box. The schema is
+loaded in [[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229][main.py]] and passed to the 'generate_form' function.
/Note: work in progress/
--
cgit v1.2.3
From b9691c7deae30bd6422fb7b0681572b7b6f78ae3 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 15 Jul 2020 14:16:11 +0100
Subject: Web: add license to input form
---
bh20sequploader/bh20seq-schema.yml | 3 ++-
bh20simplewebuploader/main.py | 3 ++-
example/minimal_metadata_example.yaml | 6 +++++-
3 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index b3d4d12..29ac22c 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -15,7 +15,7 @@ $graph:
fields:
license_type:
doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
- type: string?
+ type: string
jsonldPredicate:
_id: https://creativecommons.org/ns#License
title:
@@ -258,6 +258,7 @@ $graph:
virus: virusSchema
technology: technologySchema
submitter: submitterSchema
+ license: licenseSchema
id:
doc: The subject (eg the fasta/fastq file) that the metadata describes
type: string
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index 8b5781a..8a6794e 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -47,6 +47,7 @@ def type_to_heading(type_name):
Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading.
"""
+ print(type_name,file=sys.stderr)
# Remove camel case
decamel = re.sub('([A-Z])', r' \1', type_name)
# Split
@@ -233,7 +234,7 @@ if os.path.isfile("bh20sequploader/bh20seq-schema.yml"):
else:
METADATA_SCHEMA = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
-print(METADATA_SCHEMA,file=sys.stderr)
+# print(METADATA_SCHEMA,file=sys.stderr)
FORM_ITEMS = generate_form(METADATA_SCHEMA, METADATA_OPTION_DEFINITIONS)
@app.route('/')
diff --git a/example/minimal_metadata_example.yaml b/example/minimal_metadata_example.yaml
index 51f8a87..1b46cc7 100644
--- a/example/minimal_metadata_example.yaml
+++ b/example/minimal_metadata_example.yaml
@@ -1,5 +1,9 @@
id: placeholder
+
+license:
+ license_type: http://creativecommons.org/licenses/by/4.0/
+
host:
host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
@@ -15,4 +19,4 @@ technology:
sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
submitter:
- authors: [John Doe]
\ No newline at end of file
+ authors: [John Doe]
--
cgit v1.2.3
From f4ed46dae20abe5147871495ede2d6ac2b0854bc Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 15 Jul 2020 14:30:56 +0100
Subject: Add RDF output
---
bh20sequploader/bh20seq-schema.yml | 9 +++++++--
bh20sequploader/bh20seq-shex.rdf | 24 +++++++++++++++++-------
doc/blog/using-covid-19-pubseq-part5.org | 2 ++
3 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 29ac22c..c690e8a 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -23,16 +23,21 @@ $graph:
type: string?
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_001167
+ attribution_name:
+ doc: Attribution NAME related to data license
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#attributionName
attribution_url:
doc: Attribution URL related to data license
type: string?
jsonldPredicate:
- _id: https://creativecommons.org/ns#Work
+ _id: https://creativecommons.org/ns#attributionURL
attribution_source:
doc: Attribution source URL related to data license
type: string?
jsonldPredicate:
- _id: https://creativecommons.org/ns#Work
+ _id: https://creativecommons.org/ns#attributionSource
- name: hostSchema
type: record
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 965229c..c48267d 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -1,6 +1,7 @@
PREFIX :
- Click here to try again. + Click here to try again.