From adba766908bc34dfb8e236ea978b6e87ccdfbc4b Mon Sep 17 00:00:00 2001 From: Nick Guenther Date: Fri, 24 Feb 2023 11:48:23 -0500 Subject: [PATCH 1/5] Place results in subdirs Subdirs are simply named by taking the prefix of the input UUID. This is a common technique to prevent the chance of overflowing filesystem limits on the number of files allowed in a single directory. e.g. Before - /srv/gitea/custom/public/90fa9a89-5e50-4bd4-834d-d42655e1ee8e.html - http://127.0.0.1:3000/assets/90fa9a89-5e50-4bd4-834d-d42655e1ee8e.html After: - /srv/gitea/custom/public/bids-validator/84/b1/84b10e1c-b188-41e2-a92d-fb6ded9c6889.html - http://127.0.0.1:3000/assets/bids-validator/84/b1/84b10e1c-b188-41e2-a92d-fb6ded9c6889.html --- bids-hook.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/bids-hook.go b/bids-hook.go index a792ec6..f32815b 100644 --- a/bids-hook.go +++ b/bids-hook.go @@ -279,13 +279,13 @@ type job struct { // web link to the results page for this job // see also j.resultPath() func (j job) resultUrl() string { - return giteaRootUrl.JoinPath("assets", fmt.Sprintf("%s.html", j.uuid)).String() + return giteaRootUrl.JoinPath("assets", "bids-validator", j.uuid[:2], j.uuid[2:4], fmt.Sprintf("%s.html", j.uuid)).String() } // file path to the results page for this job // see also j.resultUrl() func (j job) resultPath() string { - return filepath.Join(giteaCustom, "public", fmt.Sprintf("%s.html", j.uuid)) + return filepath.Join(giteaCustom, "public", "bids-validator", j.uuid[:2], j.uuid[2:4], fmt.Sprintf("%s.html", j.uuid)) } // file path to the log file for this job @@ -360,7 +360,12 @@ func (j job) run() (state string, _ error) { ) // redirect stdout to the result file - stdout, err := os.OpenFile(j.resultPath(), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + resultPath := j.resultPath() + err := os.MkdirAll(filepath.Dir(resultPath), 0750) + if err != nil { + return stateError, err + } + stdout, err := os.OpenFile(resultPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0640) if err != nil { return stateError, err } @@ -464,7 +469,7 @@ func readConfig() { if err != nil { log.Fatalf("invalid GITEA_CUSTOM: %v", err) } - err = os.MkdirAll(filepath.Join(giteaCustom, "public"), 0750) + err = os.MkdirAll(filepath.Join(giteaCustom, "public", "bids-validator"), 0750) if err != nil { log.Fatalf("error creating output folder: %v", err) } From 9c70be47f7a1d58e11294881bad50ee534e16a4c Mon Sep 17 00:00:00 2001 From: Nick Guenther Date: Mon, 20 Feb 2023 17:00:17 -0500 Subject: [PATCH 2/5] Run bids-validator To test: - set up a gitea install at ../gitea/ and run `./start` (if elsewhere, run `GITEA_APP_PATH=path/to/gitea ./start`) - make a repo in it; upload some git-annex (or not git-annex) files - commit and push to the test repo --- worker | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/worker b/worker index 5992a77..0b7d356 100755 --- a/worker +++ b/worker @@ -1,6 +1,5 @@ #!/bin/bash set -o nounset -o pipefail -trap 'exit 3' ERR (exec 1>&2 date @@ -12,13 +11,62 @@ BH_UUID=${BH_UUID} EOF ) +GITEA_REPO="$GITEA_REPOSITORY_ROOT"/"${BH_USER}"/"${BH_REPO}".git +#echo $GITEA_REPO #DEBUG + sleep 5 cat <<"EOF" a title -

a paragraph +

a paragraph

+
+EOF
+
+(
+WORKDIR="$(mktemp -d --tmpdir bids-hook."$BH_UUID".XXXXXXXXXXX)"
+cd "$WORKDIR"
+
+#git clone "$GITEA_REPO" dataset && cd dataset # this is standard but clones a full copy of the repo which might be expensive
+#git clone -b "${BH_COMMIT}" --depth 1 "$GITEA_REPO" dataset && cd dataset # this doesn't work unless BH_COMMIT is a branch name
+# this is more complicated but it allows to us to --depth 1 with branches OR commit IDs:
+(mkdir dataset && cd dataset && git init && git remote add origin "$GITEA_REPO" && git fetch --depth 1 origin "${BH_COMMIT}")
+#trap 'echo -n "cleaning "; pwd; chmod -R +w "$WORKDIR" && rm -rf "$WORKDIR"' EXIT
+(
+cd dataset
+git checkout "$BH_COMMIT"
+set -e
+
+if git ls-remote --exit-code "$GITEA_REPO" "refs/heads/git-annex" >/dev/null; then
+  ANNEXED=1
+else
+  ANNEXED=""
+fi
+
+if [ -n "$ANNEXED" ]; then
+  set -x
+  # this reduces copies; always overrides annex.hardlink even if that is set system-wide
+  git config annex.thin true
+  # make sure we don't corrupt origin accidentally
+  git config remote.origin.annex-readonly true
+  git config annex.private true # XXX this doesn't do anything until git-annex 10
+  git annex init
+  git annex dead here # this is like annex.private, but has to be run
+  git annex sync --only-annex --no-content # grab the git-annex branch (since we did a shallow clone above)
+  git annex copy --from origin  # NB: using copy --from origin and not git annex get to ensure we're validating the contents of origin and not any special remotes
+else
+  set -x
+fi
+
+bids-validator --verbose .
+)
+) 2>&1
+R=$?
+
+cat <<"EOF"
+
+ EOF -exit 0 +exit $R From f6b5cd04eaf2d12828e228d87c5fe2b6cfd81bf6 Mon Sep 17 00:00:00 2001 From: Mathieu Guay-Paquet Date: Thu, 2 Mar 2023 15:10:54 -0500 Subject: [PATCH 3/5] Place log files in subdirs Similar to results, see 5d3f078520aea0cb0629afa6c8a05c694eb84f6a. * While we're at it, create the log folder if need be, instead of erroring out if it doesn't exist already. * As a result, we no longer need to import io/fs. * Uniformize permissions to 0750 and 0640 too. --- bids-hook.go | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bids-hook.go b/bids-hook.go index f32815b..fbcefb0 100644 --- a/bids-hook.go +++ b/bids-hook.go @@ -10,7 +10,6 @@ import ( "errors" "fmt" "io" - "io/fs" "log" "net/http" "net/url" @@ -66,13 +65,13 @@ var ( // * 0 = "success" (green checkmark) // * 1 = "failure" (red "X" mark) // * 2 = "warning" (yellow "!" mark) - // stdout will be saved to the Gitea url "/assets/${BH_UUID}.html" and linked from the commit status - // stderr will be appended to the log file "{{workerLogPath}}/${BH_UUID}.log" + // * 3+ = "error" (red "!" mark, no link to the result page) + // stdout will be saved to the Gitea url "/assets/bids-validator/XX/YY/${BH_UUID}.html" and linked from the commit status + // stderr will be appended to the log file "{{workerLogPath}}/XX/YY/${BH_UUID}.log" workerScript string // the path to a log directory for worker stderr output // read from environment variable WORKER_LOG_PATH - // it should already exist workerLogPath string // channel used to ferry jobs from the server to the worker @@ -290,7 +289,7 @@ func (j job) resultPath() string { // file path to the log file for this job func (j job) logPath() string { - return filepath.Join(workerLogPath, fmt.Sprintf("%s.log", j.uuid)) + return filepath.Join(workerLogPath, j.uuid[:2], j.uuid[2:4], fmt.Sprintf("%s.log", j.uuid)) } // postStatus posts a commit status to Gitea @@ -378,7 +377,12 @@ func (j job) run() (state string, _ error) { cmd.Stdout = stdout // redirect stderr to the log file - stderr, err := os.OpenFile(j.logPath(), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + logPath := j.logPath() + err = os.MkdirAll(filepath.Dir(logPath), 0750) + if err != nil { + return stateError, err + } + stderr, err := os.OpenFile(j.logPath(), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0640) if err != nil { return stateError, err } @@ -425,10 +429,9 @@ func worker() { // readConfig sets up global variables from environment values func readConfig() { var ( - val string - ok bool - err error - info fs.FileInfo + val string + ok bool + err error ) val, ok = os.LookupEnv("BIDS_HOOK_URL") @@ -491,12 +494,9 @@ func readConfig() { if err != nil { log.Fatalf("invalid WORKER_LOG_PATH: %v", err) } - info, err = os.Stat(workerLogPath) + err = os.MkdirAll(workerLogPath, 0750) if err != nil { - log.Fatalf("error opening WORKER_LOG_PATH: %v", err) - } - if !info.IsDir() { - log.Fatal("WORKER_LOG_PATH is not a directory") + log.Fatalf("error creating log folder: %v", err) } val, ok = os.LookupEnv("WORKER_QUEUE_CAPACITY") From 709929eaec704af664816b2faeca26fb7a2eec29 Mon Sep 17 00:00:00 2001 From: Mathieu Guay-Paquet Date: Thu, 2 Mar 2023 15:20:27 -0500 Subject: [PATCH 4/5] Revamp the worker script * I added documentation for the script's contract/API in the script itself, for people who want to modify it. * I restored the ERR trap, but managed to still save an exit status using the following idiom: ```sh some_command && STATUS=$? || STATUS=$? ``` * I added a temp dir cleanup trap on EXIT, and checked that the commands in it don't cause ERR traps, and don't affect the script's exit status. * I removed the $BH_UUID from the temp dir name, because that's mildly sensitive info, and the temp dir name is world-readable. * I removed `--verbose` from `bids-validator .` because it can generate so many lines that it's hard to find the actual warning types in the output. * With an extra dependency on `jq`, I added a way to check whether `bids-validator` returns success with or without warnings. * I added an `ansifilter` step to turn the terminal color escape sequences and characters like `<`, `&` into valid HTML. (I guess we should similarly sanitize $BH_USER, etc.?) * I made the script exit status conform to its contract/API. --- worker | 157 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 41 deletions(-) diff --git a/worker b/worker index 0b7d356..3035b81 100755 --- a/worker +++ b/worker @@ -1,7 +1,48 @@ #!/bin/bash +# ------------------------------------------------------------------- +# Default bids-hook worker script +# ------------------------------------------------------------------- +# +# If bids-hook is started with the environment variable WORKER_SCRIPT +# pointing to this script, then it will be executed by the worker +# once for each accepted job. +# +# In addition to inheriting all environment variables used to launch +# bids-hook, the script will be given the details of the job in the +# following environment variables: +# +# BH_USER, BH_REPO, BH_COMMIT, BH_UUID +# +# Any output generated by the script on stdout (file descriptor 1) +# is saved as a result page, overwriting the output of any previous +# run for the same UUID. The result page is linked from the commit +# status posted on Gitea, and will be visible to anyone who has a +# link. It should be a complete, properly formatted HTML document. +# +# Any output generated by the script on stderr (file descriptor 2) +# is appended to a log file, after the log output of any previous +# run for the same UUID, visible at the filesystem path: +# +# "${WORKER_LOG_PATH}/ab/cd/${BH_UUID}.log" +# +# where "abcd" are the first four characters of BH_UUID. +# +# The exit code of the script is interpreted as follows: +# +# 0 = "success" (green checkmark) +# 1 = "failure" (red "X" mark) +# 2 = "warning" (yellow "!" mark) +# 3+ = "internal error" (red "!" mark, no link to the result page) +# +# ------------------------------------------------------------------- + +# Exit with the "internal error" status if anything goes wrong. set -o nounset -o pipefail +trap 'exit 3' ERR +# Log the job metadata to stderr. (exec 1>&2 +echo '=== job start ===' date cat <&2 "GITEA_REPO=${GITEA_REPO}" -sleep 5 +# Do all the work in a private temporary directory. +WORKDIR=$(mktemp --directory --tmpdir bids-hook.XXXXXXXXXX) +echo 1>&2 "WORKDIR=${WORKDIR}" +# We have an ERR trap to account for this: +# shellcheck disable=SC2164 +cd 1>&2 "$WORKDIR" +# We have an EXIT trap which makes this code reachable: +# shellcheck disable=SC2317 +function cleanup() { + echo "# cleaning ${WORKDIR}" + chmod --recursive u+w "$WORKDIR" + rm --recursive "$WORKDIR" +} 1>&2 +trap cleanup EXIT -cat <<"EOF" - - -a title -

a paragraph

-
-EOF
-
-(
-WORKDIR="$(mktemp -d --tmpdir bids-hook."$BH_UUID".XXXXXXXXXXX)"
-cd "$WORKDIR"
-
-#git clone "$GITEA_REPO" dataset && cd dataset # this is standard but clones a full copy of the repo which might be expensive
-#git clone -b "${BH_COMMIT}" --depth 1 "$GITEA_REPO" dataset && cd dataset # this doesn't work unless BH_COMMIT is a branch name
-# this is more complicated but it allows to us to --depth 1 with branches OR commit IDs:
-(mkdir dataset && cd dataset && git init && git remote add origin "$GITEA_REPO" && git fetch --depth 1 origin "${BH_COMMIT}")
-#trap 'echo -n "cleaning "; pwd; chmod -R +w "$WORKDIR" && rm -rf "$WORKDIR"' EXIT
-(
-cd dataset
+# Check out the repository, redirecting all output to stderr.
+(exec 1>&2
+# We'd like to do 'git clone -b "$BH_COMMIT" --depth 1 "$GITEA_REPO"'
+# but maybe "$BH_COMMIT" is a commit ID instead of a branch name, so
+# we do it this way instead.
+echo "# cloning ${GITEA_REPO}"
+git init
+git remote add origin "$GITEA_REPO"
+git fetch --depth 1 origin "$BH_COMMIT"
 git checkout "$BH_COMMIT"
-set -e
-
-if git ls-remote --exit-code "$GITEA_REPO" "refs/heads/git-annex" >/dev/null; then
-  ANNEXED=1
-else
-  ANNEXED=""
-fi
 
-if [ -n "$ANNEXED" ]; then
-  set -x
+# If this is a git-annex repository, we need to get the contents.
+if git ls-remote --exit-code origin refs/heads/git-annex >/dev/null; then
+  echo '# getting git-annexed files'
   # this reduces copies; always overrides annex.hardlink even if that is set system-wide
   git config annex.thin true
   # make sure we don't corrupt origin accidentally
@@ -53,20 +94,54 @@ if [ -n "$ANNEXED" ]; then
   git config annex.private true # XXX this doesn't do anything until git-annex 10
   git annex init
   git annex dead here # this is like annex.private, but has to be run
-  git annex sync --only-annex --no-content # grab the git-annex branch (since we did a shallow clone above)
-  git annex copy --from origin  # NB: using copy --from origin and not git annex get to ensure we're validating the contents of origin and not any special remotes
-else
-  set -x
+  # grab the git-annex branch (since we did a shallow clone above)
+  git annex sync --only-annex --no-content
+  # NB: using 'copy --from origin' and not 'git annex get; to ensure we're
+  # validating the contents of origin and not any special remotes
+  git annex copy --from origin
 fi
-
-bids-validator --verbose .
 )
-) 2>&1
-R=$?
 
-cat <<"EOF"
+echo 1>&2 '# running bids-validator'
+OUTPUT=$(bids-validator .) && STATUS=$? || STATUS=$?
+WARNINGS=$(bids-validator . --json | jq '.issues.warnings | length') || true
+cat 1>&2 <&2 '# formatting output'
+HTML=$(echo "$OUTPUT" | ansifilter --html --fragment --line-numbers --anchors=self)
+
+# Produce the HTML result page on stdout.
+cat <
+
+bids-validator results for ${BH_USER}/${BH_REPO}@${BH_COMMIT}
+

Here are the bids-validator results for ${BH_USER}/${BH_REPO}@${BH_COMMIT}:

+
+${HTML}
 
EOF -exit $R +# Our exit code means: +# 0 = "success" (green checkmark) +# 1 = "failure" (red "X" mark) +# 2 = "warning" (yellow "!" mark) +# 3+ = "internal error" (red "!" mark, no link to the result page) +case $STATUS in +(0) + if ((WARNINGS)); then + exit 2 + else + exit 0 + fi + ;; +(1) + exit 1 + ;; +(*) + exit 3 + ;; +esac From 3312fd01639234dc5d2fcfda5ee2ac57af85f7fb Mon Sep 17 00:00:00 2001 From: Mathieu Guay-Paquet Date: Thu, 2 Mar 2023 15:37:17 -0500 Subject: [PATCH 5/5] Rename the worker script Hopefully this name is more inviting for people who want to mod it. --- start | 2 +- worker => worker-script | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename worker => worker-script (100%) diff --git a/start b/start index 89be690..992da9f 100755 --- a/start +++ b/start @@ -22,7 +22,7 @@ export BIDS_HOOK_SECRET='blabla' export GITEA_ROOT_URL='http://127.0.0.1:3000' export GITEA_TOKEN='69e45fa9cfa75a7497633c6be8dd2347226e2f62' -export WORKER_SCRIPT='./worker' +export WORKER_SCRIPT='./worker-script' export WORKER_LOG_PATH='./log' export WORKER_QUEUE_CAPACITY=20 diff --git a/worker b/worker-script similarity index 100% rename from worker rename to worker-script