Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Make data upload script backend responsive #205

Merged
merged 6 commits into from
Oct 22, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 74 additions & 19 deletions add_data_to_graph.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
# ARG_HELP([Upload JSONLD and Turtle data to a Neurobagel graph])
# ARG_POSITIONAL_SINGLE([dir],[Path to directory containing .jsonld and/or .ttl files. ALL .jsonld and .ttl files in this directory will be uploaded.])
# ARG_POSITIONAL_SINGLE([graph-url],[Host and port at which to access the graph database to add data to (e.g., localhost:7200)])
# ARG_POSITIONAL_SINGLE([graph-db],[Name of graph database to add data to])
# ARG_POSITIONAL_SINGLE([graph-database],[Name of graph database to add data to])
# ARG_POSITIONAL_SINGLE([user],[Username for graph database access])
# ARG_POSITIONAL_SINGLE([password],[Password for graph database user])
# ARG_OPTIONAL_BOOLEAN([clear-data],[],[Whether or not to first clear all existing data from the graph database],[off])
# ARG_OPTIONAL_BOOLEAN([use-graphdb-syntax],[],[Whether or not to use GraphDB API endpoints to update the specified graph database. If off, assumes the graph database is a Stardog database.],[off])
# ARGBASH_GO()
# needed because of Argbash --> m4_ignore([
### START OF CODE GENERATED BY Argbash v2.9.0 one line above ###
Expand Down Expand Up @@ -35,19 +36,21 @@ begins_with_short_option()
_positionals=()
# THE DEFAULTS INITIALIZATION - OPTIONALS
_arg_clear_data="off"
_arg_use_graphdb_syntax="off"


# Print the script description, the usage synopsis, and a one-line description
# of every positional argument and option to stdout.
# Only the current argument names are listed: the positional argument is
# <graph-database> and the usage line includes --use-graphdb-syntax.
print_help()
{
    printf '%s\n' "Upload JSONLD and Turtle data to a Neurobagel graph"
    printf 'Usage: %s [-h|--help] [--(no-)clear-data] [--(no-)use-graphdb-syntax] <dir> <graph-url> <graph-database> <user> <password>\n' "$0"
    printf '\t%s\n' "<dir>: Path to directory containing .jsonld and/or .ttl files. ALL .jsonld and .ttl files in this directory will be uploaded."
    printf '\t%s\n' "<graph-url>: Host and port at which to access the graph database to add data to (e.g., localhost:7200)"
    printf '\t%s\n' "<graph-database>: Name of graph database to add data to"
    printf '\t%s\n' "<user>: Username for graph database access"
    printf '\t%s\n' "<password>: Password for graph database user"
    printf '\t%s\n' "-h, --help: Prints help"
    printf '\t%s\n' "--clear-data, --no-clear-data: Whether or not to first clear all existing data from the graph database (off by default)"
    printf '\t%s\n' "--use-graphdb-syntax, --no-use-graphdb-syntax: Whether or not to use GraphDB API endpoints to update the specified graph database. If off, assumes the graph database is a Stardog database. (off by default)"
}


Expand All @@ -70,6 +73,10 @@ parse_commandline()
_arg_clear_data="on"
test "${1:0:5}" = "--no-" && _arg_clear_data="off"
;;
--no-use-graphdb-syntax|--use-graphdb-syntax)
_arg_use_graphdb_syntax="on"
test "${1:0:5}" = "--no-" && _arg_use_graphdb_syntax="off"
;;
*)
_last_positional="$1"
_positionals+=("$_last_positional")
Expand All @@ -83,7 +90,7 @@ parse_commandline()

# Check that exactly five positional arguments were collected.
# Reads the globals _positionals_count and _last_positional.
# On a mismatch, calls die (defined outside this excerpt) with _PRINT_HELP=yes,
# an explanatory message and the status 1.
handle_passed_args_count()
{
    local _required_args_string="'dir', 'graph-url', 'graph-database', 'user' and 'password'"
    test "${_positionals_count}" -ge 5 || _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 5 (namely: $_required_args_string), but got only ${_positionals_count}." 1
    test "${_positionals_count}" -le 5 || _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 5 (namely: $_required_args_string), but got ${_positionals_count} (the last one was: '${_last_positional}')." 1
}
Expand All @@ -92,7 +99,7 @@ handle_passed_args_count()
assign_positional_args()
{
local _positional_name _shift_for=$1
_positional_names="_arg_dir _arg_graph_url _arg_graph_db _arg_user _arg_password "
_positional_names="_arg_dir _arg_graph_url _arg_graph_database _arg_user _arg_password "

shift "$_shift_for"
for _positional_name in ${_positional_names}
Expand All @@ -116,9 +123,10 @@ assign_positional_args 1 "${_positionals[@]}"
# Short aliases for the values parsed by Argbash, used by the rest of the script.
jsonld_dir=$_arg_dir
user=$_arg_user
password=$_arg_password
graph_db=$_arg_graph_database
graph_url=$_arg_graph_url
clear_data=$_arg_clear_data # value is either on or off (https://argbash.readthedocs.io/en/stable/guide.html#optional-arguments)
use_graphdb_syntax=$_arg_use_graphdb_syntax # likewise either on or off

DELETE_TRIPLES_QUERY="
DELETE {
Expand All @@ -127,34 +135,81 @@ DELETE {
?s ?p ?o .
}"

# Choose the endpoints for uploading and for clearing data according to the graph backend.
# Both are derived from the same base URL built from the host:port and the database name.
base_url="http://${graph_url}/${graph_db}"
case "${use_graphdb_syntax}" in
    on)
        # GraphDB syntax: uploads and SPARQL updates both go to <base>/statements
        upload_data_url="${base_url}/statements"
        clear_data_url="${upload_data_url}"
        ;;
    *)
        # Otherwise (Stardog assumed): upload to <base>, send updates to <base>/update
        upload_data_url="${base_url}"
        clear_data_url="${base_url}/update"
        ;;
esac


# Clear existing data in graph database if requested

#######################################
# Delete all existing data by POSTing DELETE_TRIPLES_QUERY as a SPARQL update.
# Globals:
#   user, password, clear_data_url, graph_db, DELETE_TRIPLES_QUERY (all read)
# Outputs:
#   A progress line and, on failure, the server response (headers and body)
#   to stdout.
# Returns:
#   0 when the server answers with a 2xx status. Otherwise the shell exits
#   with status 1, so no upload is attempted on top of stale data.
#######################################
clear_graph_data() {
    local response httpcode

    echo -e "\nCLEARING EXISTING DATA FROM ${graph_db}..."

    # -i keeps the response headers; -w appends the status code as the final line
    response=$(curl -u "${user}:${password}" --no-progress-meter -i -w "\n%{http_code}\n" \
        -X POST "${clear_data_url}" \
        -H "Content-Type: application/sparql-update" \
        --data-binary "${DELETE_TRIPLES_QUERY}")

    # Extract and check status code outputted as final line of response.
    # Anything that is not a three-digit 2xx code is a failure, including an
    # empty value when curl itself could not run or produced no output.
    httpcode=$(tail -n1 <<< "$response")
    if [[ ! "$httpcode" =~ ^2[0-9][0-9]$ ]]; then
        echo -e "\nERROR: Failed to clear ${graph_db}:"
        printf '%s\n' "$(sed '$d' <<< "$response")"
        echo -e "\nEXITING..."
        exit 1
    fi
}

if [ "$clear_data" = "on" ]; then
    clear_graph_data
fi


# Add data to specified graph database

#######################################
# POST every file with the given extension in a directory to upload_data_url.
# A failed upload does not stop the loop; the file is recorded and the
# remaining files are still attempted.
# Globals:
#   user, password, upload_data_url (read)
#   upload_failed (array; paths of files whose upload failed are appended)
# Arguments:
#   $1 - directory to look in (not searched recursively)
#   $2 - file extension without the leading dot, e.g. jsonld
#   $3 - value of the Content-Type request header
# Outputs:
#   For each file, its basename followed by the server response (headers and
#   body) and a blank line, to stdout.
#######################################
upload_files() {
    local dir=$1 extension=$2 content_type=$3
    local file response httpcode

    for file in "${dir}"/*."${extension}"; do
        # Prevent edge case where no matching files are present in directory and so loop executes once with glob pattern string itself
        [ -e "$file" ] || continue

        printf '%s:\n' "${file##*/}"
        # -i keeps the response headers; -w appends the status code as the final line
        response=$(curl -u "${user}:${password}" --no-progress-meter -i -w "\n%{http_code}\n" \
            -X POST "${upload_data_url}" \
            -H "Content-Type: ${content_type}" \
            --data-binary @"${file}")

        # Anything that is not a three-digit 2xx code is a failure, including
        # an empty value when curl itself could not run or produced no output.
        httpcode=$(tail -n1 <<< "$response")
        if [[ ! "$httpcode" =~ ^2[0-9][0-9]$ ]]; then
            upload_failed+=("${file}")
        fi
        # Print rest of response to stdout, verbatim (no backslash-escape processing)
        printf '%s\n\n' "$(sed '$d' <<< "$response")"
    done
}

echo -e "\nUPLOADING DATA FROM ${jsonld_dir} TO ${graph_db}...\n"

upload_failed=()
upload_files "${jsonld_dir}" jsonld "application/ld+json"
upload_files "${jsonld_dir}" ttl "text/turtle"

echo "FINISHED UPLOADING DATA FROM ${jsonld_dir} TO ${graph_db}."

# NOTE(review): the exit status is left unchanged here even when uploads failed;
# consider exiting non-zero so that callers can detect partial uploads.
if (( ${#upload_failed[@]} != 0 )); then
    echo -e "\nERROR: Upload failed for these files:"
    printf '%s\n' "${upload_failed[@]}"
fi

# ] <-- needed because of Argbash