[ENH] Add single docker compose recipe for all deployment flavours (#32)

* create full-stack Docker Compose file

Co-authored-by: Alyssa Dai <[email protected]>
Co-authored-by: Sebastian Urchs <[email protected]>

* Added `template.env` file

* Wait specifically for the graphDB process ID

* Use Docker service name for local node

* Add profiles

- "dev" replaces the old behaviour
- other profile names replace the files in the other subdirectories
- start with `docker compose --profile <NAME> up`

* Make the local data directory configurable

* upload vocab from separate directory + more progress messages

* add data-config.ttl to gitignore

* fix default value and mounting of data/

* replace hardcoded params inside setup.sh with env vars

* set default profile in .env

* write output of setup.sh to log

* rename dev profile and update README

* write log to script dir rather than working dir

---------

Co-authored-by: rmanaem <[email protected]>
Co-authored-by: Sebastian Urchs <[email protected]>
Co-authored-by: Sebastian Urchs <[email protected]>
4 people authored Apr 11, 2024
1 parent 9c52651 commit 546f3ee
Showing 11 changed files with 1,065 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -143,6 +143,11 @@ venv.bak/
.dmypy.json
dmypy.json

# Neurobagel
# RDF data configuration file, created automatically during setup from data-config_template.ttl based on .env values
data-config.ttl
local_nb_nodes.json

# Pyre type checker
.pyre/

50 changes: 50 additions & 0 deletions README.md
@@ -4,6 +4,56 @@ Configuration files for a Neurobagel deployment.
## How to use
For detailed instructions on the deployment options for Neurobagel, see the official Neurobagel documentation on [setting up a local knowledge graph (node)](https://neurobagel.org/infrastructure/) and [local query federation](https://neurobagel.org/federate/).

### Using the full-stack Docker Compose file

1. Clone the repository
```bash
git clone https://github.com/neurobagel/recipes.git
```

2. `cd` into the directory containing the Neurobagel deployment recipe
<!---
TODO: Change once we rename this directory for production!
-->
```bash
cd recipes/dev
```

3. Copy and rename template files in the directory
```bash
cp template.env .env

# if also setting up local federation
cp local_nb_nodes.template.json local_nb_nodes.json
```
Make sure to edit the file(s) according to your deployment; a purely illustrative sketch follows.
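
The actual variables are defined in `template.env`; the entries below are hypothetical stand-ins for the kind of values to review (the real names may differ):

```bash
# Hypothetical .env entries -- consult template.env for the actual variable names
COMPOSE_PROFILES=local_node   # default profile used when none is passed on the CLI
LOCAL_DATA_DIR=./data         # host directory of .jsonld/.ttl files to upload
```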

4. Start the Docker Compose stack and specify your desired deployment profile

**To set up only a local node:**
```bash
docker compose up -d
```
or
```bash
docker compose --profile local_node up -d
```

**To set up a local node with a graphical query tool:**
```bash
docker compose --profile local_node_query up -d
```

**To set up a local node and local federation (including a graphical query tool):**
```bash
docker compose --profile full_stack up -d
```
A log file `DEPLOY.log` will be created automatically in the current directory, containing a copy of the stdout from the deployment process.
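
After startup, one can sanity-check the deployment by listing the running services and following the setup log (a minimal sketch; adjust the profile name to the one used above):

```bash
# List the containers started for the chosen profile
docker compose --profile full_stack ps

# Follow the deployment log as setup progresses
tail -f DEPLOY.log
```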

### Using deployment type-specific Docker Compose files

**Note: These instructions will soon be superseded by the full-stack Docker Compose file instructions.**

1. Clone the repository
```bash
git clone https://github.com/neurobagel/recipes.git
```
244 changes: 244 additions & 0 deletions dev/add_data_to_graph.sh
@@ -0,0 +1,244 @@
#!/bin/bash
#
# ARG_HELP([Upload JSONLD and Turtle data to a Neurobagel graph])
# ARG_POSITIONAL_SINGLE([dir],[Path to directory containing .jsonld and/or .ttl files. ALL .jsonld and .ttl files in this directory will be uploaded.])
# ARG_POSITIONAL_SINGLE([graph-url],[Host and port at which to access the graph database to add data to (e.g., localhost:7200)])
# ARG_POSITIONAL_SINGLE([graph-database],[Name of graph database to add data to])
# ARG_POSITIONAL_SINGLE([user],[Username for graph database access])
# ARG_POSITIONAL_SINGLE([password],[Password for graph database user])
# ARG_OPTIONAL_BOOLEAN([clear-data],[],[Whether or not to first clear all existing data from the graph database],[off])
# ARG_OPTIONAL_BOOLEAN([use-stardog-syntax],[],[Set to use Stardog API endpoints to update the specified graph database. If unset, assumes the graph database is a GraphDB database.],[off])
# ARG_OPTIONAL_BOOLEAN([log-output],[],[Whether or not to write the output to a log file],[off])
# ARG_OPTIONAL_SINGLE([log-file],[],[Path to the log file],[LOG.txt])
# ARGBASH_GO()
# needed because of Argbash --> m4_ignore([
### START OF CODE GENERATED BY Argbash v2.9.0 one line above ###
# Argbash is a bash code generator used to get arguments parsing right.
# Argbash is FREE SOFTWARE, see https://argbash.io for more info
# Generated online by https://argbash.io/generate


die()
{
local _ret="${2:-1}"
test "${_PRINT_HELP:-no}" = yes && print_help >&2
echo "$1" >&2
exit "${_ret}"
}


begins_with_short_option()
{
local first_option all_short_options='h'
first_option="${1:0:1}"
test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
}

# THE DEFAULTS INITIALIZATION - POSITIONALS
_positionals=()
# THE DEFAULTS INITIALIZATION - OPTIONALS
_arg_clear_data="off"
_arg_use_stardog_syntax="off"

_arg_log_output="off"
_arg_log_file="LOG.txt"

print_help()
{
printf '%s\n' "Upload JSONLD and Turtle data to a Neurobagel graph"
printf 'Usage: %s [-h|--help] [--(no-)clear-data] [--(no-)use-stardog-syntax] [--(no-)log-output] [--log-file <arg>] <dir> <graph-url> <graph-database> <user> <password>\n' "$0"
printf '\t%s\n' "<dir>: Path to directory containing .jsonld and/or .ttl files. ALL .jsonld and .ttl files in this directory will be uploaded."
printf '\t%s\n' "<graph-url>: Host and port at which to access the graph database to add data to (e.g., localhost:7200)"
printf '\t%s\n' "<graph-database>: Name of graph database to add data to"
printf '\t%s\n' "<user>: Username for graph database access"
printf '\t%s\n' "<password>: Password for graph database user"
printf '\t%s\n' "-h, --help: Prints help"
printf '\t%s\n' "--clear-data, --no-clear-data: Whether or not to first clear all existing data from the graph database (off by default)"
printf '\t%s\n' "--use-stardog-syntax, --no-use-stardog-syntax: Set to use Stardog API endpoints to update the specified graph database. If unset, assumes the graph database is a GraphDB database. (off by default)"
printf '\t%s\n' "--log-output, --no-log-output: Whether or not to write the output to a log file (off by default)"
printf '\t%s\n' "--log-file: Path to the log file (default: 'LOG.txt')"
}


parse_commandline()
{
_positionals_count=0
while test $# -gt 0
do
_key="$1"
case "$_key" in
-h|--help)
print_help
exit 0
;;
-h*)
print_help
exit 0
;;
--no-clear-data|--clear-data)
_arg_clear_data="on"
test "${1:0:5}" = "--no-" && _arg_clear_data="off"
;;
--no-use-stardog-syntax|--use-stardog-syntax)
_arg_use_stardog_syntax="on"
test "${1:0:5}" = "--no-" && _arg_use_stardog_syntax="off"
;;
--no-log-output|--log-output)
_arg_log_output="on"
test "${1:0:5}" = "--no-" && _arg_log_output="off"
;;
--log-file)
test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
_arg_log_file="$2"
shift
;;
--log-file=*)
_arg_log_file="${_key##--log-file=}"
;;
*)
_last_positional="$1"
_positionals+=("$_last_positional")
_positionals_count=$((_positionals_count + 1))
;;
esac
shift
done
}


handle_passed_args_count()
{
local _required_args_string="'dir', 'graph-url', 'graph-database', 'user' and 'password'"
test "${_positionals_count}" -ge 5 || _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 5 (namely: $_required_args_string), but got only ${_positionals_count}." 1
test "${_positionals_count}" -le 5 || _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 5 (namely: $_required_args_string), but got ${_positionals_count} (the last one was: '${_last_positional}')." 1
}


assign_positional_args()
{
local _positional_name _shift_for=$1
_positional_names="_arg_dir _arg_graph_url _arg_graph_database _arg_user _arg_password "

shift "$_shift_for"
for _positional_name in ${_positional_names}
do
test $# -gt 0 || break
eval "$_positional_name=\${1}" || die "Error during argument parsing, possibly an Argbash bug." 1
shift
done
}

parse_commandline "$@"
handle_passed_args_count
assign_positional_args 1 "${_positionals[@]}"

# OTHER STUFF GENERATED BY Argbash

### END OF CODE GENERATED BY Argbash (sortof) ### ])
# [ <-- needed because of Argbash
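# NOTE: if the ARG_ directives at the top of this file are changed, the generated
# parsing section above can presumably be regenerated with the Argbash CLI, e.g.:
#   argbash add_data_to_graph.sh -o add_data_to_graph.sh
# (an assumption -- this copy was generated online at https://argbash.io/generate)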


# Reassign positional args to more readable named variables (https://argbash.readthedocs.io/en/latest/guide.html#using-parsing-results)
jsonld_dir=$_arg_dir
user=$_arg_user
password=$_arg_password
graph_db=$_arg_graph_database
graph_url=$_arg_graph_url
clear_data=$_arg_clear_data # value is either on or off (https://argbash.readthedocs.io/en/stable/guide.html#optional-arguments)
use_stardog_syntax=$_arg_use_stardog_syntax

log_output=$_arg_log_output
log_file=$_arg_log_file
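
# SPARQL update that matches and removes every triple in the graph database;
# it is sent only when --clear-data is on.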

DELETE_TRIPLES_QUERY="
DELETE {
?s ?p ?o .
} WHERE {
?s ?p ?o .
}"

# Depending on the graph backend used, set URLs for uploading data to and clearing data in graph database
base_url="http://${graph_url}/${graph_db}"
if [ "$use_stardog_syntax" = "on" ]; then
upload_data_url=$base_url
clear_data_url="${base_url}/update"
else
upload_data_url="${base_url}/statements"
clear_data_url=$upload_data_url
fi

# Main logic
main() {
# Clear existing data in graph database if requested
if [ "$clear_data" = "on" ]; then
echo -e "\nCLEARING EXISTING DATA FROM ${graph_db}..."

response=$(curl -u "${user}:${password}" -s -S -i -w "\n%{http_code}\n" \
-X POST "$clear_data_url" \
-H "Content-Type: application/sparql-update" \
--data-binary "${DELETE_TRIPLES_QUERY}")

# Extract and check the status code output as the final line of the response
httpcode=$(tail -n1 <<< "$response")
if (( $httpcode < 200 || $httpcode >= 300 )); then
echo -e "\nERROR: Failed to clear ${graph_db}:"
echo "$(sed '$d' <<< "$response")"
echo -e "\nEXITING..."
exit 1
fi
fi

# Add data to specified graph database
echo -e "\nUPLOADING DATA FROM ${jsonld_dir} TO ${graph_db}...\n"

upload_failed=()

for db in "${jsonld_dir}"/*.jsonld; do
# Prevent edge case where no matching files are present in directory and so loop executes once with glob pattern string itself
[ -e "$db" ] || continue

echo "$(basename ${db}):"
response=$(curl -u "${user}:${password}" -s -S -i -w "\n%{http_code}\n" \
-X POST $upload_data_url \
-H "Content-Type: application/ld+json" \
--data-binary @${db})

httpcode=$(tail -n1 <<< "$response")
if (( $httpcode < 200 || $httpcode >= 300 )); then
upload_failed+=("${db}")
fi
# Print rest of response to stdout
echo -e "$(sed '$d' <<< "$response")\n"
done

for file in "${jsonld_dir}"/*.ttl; do
[ -e "$file" ] || continue

echo "$(basename ${file}):"
response=$(curl -u "${user}:${password}" -s -S -i -w "\n%{http_code}\n" \
-X POST $upload_data_url \
-H "Content-Type: text/turtle" \
--data-binary @${file})

httpcode=$(tail -n1 <<< "$response")
if (( $httpcode < 200 || $httpcode >= 300 )); then
upload_failed+=("${file}")
fi
echo -e "$(sed '$d' <<< "$response")\n"
done

echo "FINISHED UPLOADING DATA FROM ${jsonld_dir} TO ${graph_db}."

if (( ${#upload_failed[@]} != 0 )); then
echo -e "\nERROR: Upload failed for these files:"
printf '%s\n' "${upload_failed[@]}"
fi
}

# Call the main logic function with or without output redirection
if [ "$log_output" = "on" ]; then
main > "$log_file"
else
main
fi
# ] <-- needed because of Argbash
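
For reference, a hypothetical standalone invocation of this script; all argument values below are placeholders, and in the Compose stack the arguments are presumably supplied by the setup process from `.env`:

```bash
# Placeholders throughout: upload every .jsonld/.ttl file in ./data to the
# graph database "my_db" served at localhost:7200, clearing existing data
# first and writing the output to a log file.
./add_data_to_graph.sh ./data localhost:7200 my_db DBUSER DBPASSWORD \
    --clear-data --log-output --log-file UPLOAD.log
```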
49 changes: 49 additions & 0 deletions dev/data-config_template.ttl
@@ -0,0 +1,49 @@
#
# RDF4J configuration template for a GraphDB repository
#
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix rep: <http://www.openrdf.org/config/repository#>.
@prefix sr: <http://www.openrdf.org/config/repository/sail#>.
@prefix sail: <http://www.openrdf.org/config/sail#>.
@prefix graphdb: <http://www.ontotext.com/config/graphdb#>.

[] a rep:Repository ;
rep:repositoryID "my_db" ;
rdfs:label "" ;
rep:repositoryImpl [
rep:repositoryType "graphdb:SailRepository" ;
sr:sailImpl [
sail:sailType "graphdb:Sail" ;

graphdb:read-only "false" ;

# Inference and Validation
graphdb:ruleset "rdfsplus-optimized" ;
graphdb:disable-sameAs "true" ;
graphdb:check-for-inconsistencies "false" ;

# Indexing
graphdb:entity-id-size "32" ;
graphdb:enable-context-index "false" ;
graphdb:enablePredicateList "true" ;
graphdb:enable-fts-index "false" ;
graphdb:fts-indexes ("default" "iri") ;
graphdb:fts-string-literals-index "default" ;
graphdb:fts-iris-index "none" ;

# Queries and Updates
graphdb:query-timeout "0" ;
graphdb:throw-QueryEvaluationException-on-timeout "false" ;
graphdb:query-limit-results "0" ;

# Settable in the file but otherwise hidden in the UI and in the RDF4J console
graphdb:base-URL "http://example.org/owlim#" ;
graphdb:defaultNS "" ;
graphdb:imports "" ;
graphdb:repository-type "file-repository" ;
graphdb:storage-folder "storage" ;
graphdb:entity-index-size "10000000" ;
graphdb:in-memory-literal-properties "true" ;
graphdb:enable-literal-index "true" ;
]
].
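
This template is filled in automatically during setup to produce `data-config.ttl` (hence the new `.gitignore` entry above). As a hedged sketch of how such a config file is typically consumed, GraphDB can create a repository from it via its REST API; the host, port, and file location below are assumptions:

```bash
# Sketch: create a GraphDB repository from the generated config file,
# assuming GraphDB listens on localhost:7200 and data-config.ttl is in the
# working directory.
curl -X POST http://localhost:7200/rest/repositories \
    -H "Content-Type: multipart/form-data" \
    -F "config=@data-config.ttl"
```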
