From 1b7199abd2d7f410a46158f9c66b8b373d3574f9 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 25 Aug 2020 11:55:11 +0100 Subject: Using Arvados --- doc/blog/using-covid-19-pubseq-part2.html | 161 +++++++++++++++++++++++++----- 1 file changed, 134 insertions(+), 27 deletions(-) (limited to 'doc/blog/using-covid-19-pubseq-part2.html') diff --git a/doc/blog/using-covid-19-pubseq-part2.html b/doc/blog/using-covid-19-pubseq-part2.html index c041ebe..b124c89 100644 --- a/doc/blog/using-covid-19-pubseq-part2.html +++ b/doc/blog/using-covid-19-pubseq-part2.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
- +Arvados has the web server, but it also has a REST API and associated command line tools. We are already using the API to upload data. If @@ -322,13 +322,11 @@ Now, this is a public instance so we can use the tokens from the uploader.
--export ARVADOSAPIHOST='lugli.arvadosapi.com' -export ARVADOSAPITOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' +
export ARVADOS_API_HOST='lugli.arvadosapi.com' +export ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' arv-ls lugli-4zz18-z513nlpqm03hpca - - +
@@ -336,13 +334,11 @@ will list all files (the UUID we got from the Arvados results page). To get the UUID of the files
--curl https://lugli.arvadosapi.com/arvados/v1/config | jq .Users.AnonymousUserToken -env ARVADOSAPITOKEN=5o42qdxpxp5cj15jqjf7vnxx5xduhm4ret703suuoa3ivfglfh \ +
curl https://lugli.arvadosapi.com/arvados/v1/config | jq .Users.AnonymousUserToken +env ARVADOS_API_TOKEN=5o42qdxpxp5cj15jqjf7vnxx5xduhm4ret703suuoa3ivfglfh \ arv-get lugli-4zz18-z513nlpqm03hpca - - +
@@ -356,12 +352,123 @@ arv-get 2be6af7b4741f2a5c5f8ff2bc6152d73+1955623+Ab9ad65d7fe958a053b3a57d545839d
+When you log in to Arvados (you can request permission from us) it is +possible to upload an ssh key in your profile and get a shell prompt +with +
+ ++ssh pjotrpbl@shell.lugli.arvadosapi.com +Linux ip-10-255-0-202 4.19.0-9-cloud-amd64 #1 SMP Debian 4.19.118-2+deb10u1 (2020-06-07) x86_64 ++ + + +
+It is a small Debian VM hosted on AWS somewhere. The PubSeq material
+is mounted on /data/pubseq
. The log is in nohup.out
. Update/edit
+the code (bh20-seq-resource git checkout) and restart the service (the
+run script). The log says
+
+you should have permission to read the log (nohup.out) update / edit the code (bh20-seq-resource git checkout) and restart the service (the run script) ++ + +
+which means it will trigger the run on upload. The service is running in a +Python virtualenv: +
+ ++/data/pubseq/bh20-seq-resource/venv3/bin/python3 /data/pubseq/bh20-seq-resource/venv3/bin/bh20-seq-analyzer --no-start-analysis ++ + +
+and is restarted by a run
script:
+
+/data/pubseq/run [options] ++ + +
+The run script kills the old process, sets up the API tokens, pulls +the git repo and starts a new run calling into +/data/pubseq/bh20-seq-resource/venv3/bin/bh20-seq-analyzer which is +essentially monitoring for uploads. +
+
+In the above script bh20-seq-analyzer
you can see that the Common
+Workflow Language (CWL) gets triggered; for example fastq2fasta which
+is part of the main repo. The actual script is in fastq2fasta.cwl and
+runs the following tools in sequence: bwa-mem, samtools-view,
+samtools-sort, and bam2fasta.
+
+It probably pays to familiarize yourself with CWL and its concepts. We +believe it has a lot going for it though CWL is some steps removed +from traditional shell scripts for running workflows. The main thing to +understand is that CWL is a separation of concerns, i.e., +
+ ++and each of these is described separately. This contrasts largely +with shell scripts (though you can invoke shell scripts from CWL). +Also, CWL is written in JSON/YAML, which means everything can be parsed +as a tree and you can easily get visualisations such as +
+ ++ + +
+ ++For more see Creating a reproducible workflow with CWL by Pjotr Prins. +
++Arvados provides a rich API for accessing internals of the Cloud +infrastructure. +
+ +
+In the above script bh20-seq-analyzer
there are examples of querying the
+Arvados API using the Python Arvados client and libraries. For example,
+getting a list of projects in Arvados. The main thing is to get the
+ARVADOS_API_HOST
 and ARVADOS_API_TOKEN
 right, as shown above.
+