Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2195 Add s3a wrapper around run scripts #2198

Merged
merged 16 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ insert_final_newline = true
[*.properties]
insert_final_newline = true

[*.{java,scala,js,json,css}]
[*.{java,scala,js,json,css,sh}]
indent_size = 2
indent_style = space
insert_final_newline = true
Expand Down
7 changes: 7 additions & 0 deletions scripts/bash/enceladus_env.template.sh
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,10 @@ ADDITIONAL_JVM_EXECUTOR_CONF_CLUSTER="$KRB5_CONF_CLUSTER $TRUST_STORE_CLUSTER $T
# Switch that tells the script if it should exit if it encounters unrecognized options.
# On true it prints an Error and exits with 127, on false it only prints a warning
EXIT_ON_UNRECOGNIZED_OPTIONS="true"

# Variables for the s3a wrapper implementation
MENAS_API="http://localhost:8080/menas/api"
ECS_API_BASE="https://localhost"
ECS_API_KK="$ECS_API_BASE/kk"
ECS_API_BUCKET="$ECS_API_BASE/bucket"
ECS_API_KEY="MY_SECRET_KEY"
8 changes: 8 additions & 0 deletions scripts/bash/run_enceladus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ AUTOCLEAN_STD_FOLDER=""
PERSIST_STORAGE_LEVEL=""
HELP_CALL="0"
ASYNCHRONOUSMODE="0"
JCEKS_PATH=""

# Spark configuration options
CONF_SPARK_EXECUTOR_MEMORY_OVERHEAD=""
Expand Down Expand Up @@ -326,6 +327,10 @@ case $key in
ASYNCHRONOUSMODE="1"
shift
;;
--jceks-path)
JCEKS_PATH="$2"
shift 2 # past argument and value
;;
*) # unknown option
OTHER_PARAMETERS+=("$1") # save it in an array for later
shift # past argument
Expand Down Expand Up @@ -520,6 +525,9 @@ fi
CMD_LINE="${CMD_LINE} ${ADDITIONAL_SPARK_CONF} ${SPARK_CONF}"
CMD_LINE="${CMD_LINE} --conf \"${JVM_CONF} ${ADDITIONAL_JVM_CONF}\""
CMD_LINE="${CMD_LINE} --conf \"spark.executor.extraJavaOptions=${ADDITIONAL_JVM_EXECUTOR_CONF}\""
if [[ -n $JCEKS_PATH ]]; then
CMD_LINE="${CMD_LINE} --conf \"${JCEKS_PATH}\""
fi
CMD_LINE="${CMD_LINE} --class ${CLASS} ${JAR}"

# Adding command line parameters that go AFTER the jar file
Expand Down
155 changes: 155 additions & 0 deletions scripts/bash/s3a_wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/bin/bash

# Copyright 2018 ABSA Group Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is a wrapper script for Enceladus run scripts.
# It is used to resolve the paths to the JCEKS files and
# to clean up the versions in Dell ECS after the run.
# To use it, just put this script in front of your Enceladus run script command.
# Example:
# ./s3a_wrapper.sh run_standardization_conformance.sh \
# --dataset-name "MyDataset" \
# --dataset-version 1 \
# --report-date "2018-01-01" \
# --report-version 1
#
# If the dataset is published to S3A, the script will find the JCEKS file on its own.
# If the dataset is not published to S3A, the script will just pass the arguments to the enceladus run script.
# If the dataset is published to S3A and you pass --jceks-path argument, the script will use the provided JCEKS conf.
# Format of the --jceks-path argument is:
# "spark.hadoop.fs.s3a.bucket.$BUCKET_NAME.security.credential.provider.path=jceks:$JCEKS_PATH"
#
# Requirements:
# - jq
# - curl
# - klist

# Abort on the first command that fails without being explicitly handled.
set -e

# Run klist to check for a current Kerberos ticket.
# The failure is reported inline on purpose: the error_exit helper is defined
# further down in this script and does not exist yet when this check runs.
if klist -s; then
  echo "Kerberos ticket found."
else
  echo "Error: No Kerberos ticket found or ticket is expired. Please run kinit." >&2
  exit 1
fi

# Source environment variables
source "$(dirname "$0")/enceladus_env.sh"

# The first argument is the name of the original script. Without it the
# 'shift' below fails and 'set -e' ends the run without any message, so the
# problem is reported explicitly here.
if [[ $# -eq 0 ]]; then
  echo "Error: Missing the name of the Enceladus run script to wrap." >&2
  exit 1
fi
# Only the base name of the argument is used; the script is looked up in the
# directory this wrapper lives in.
original_script="$(dirname "$0")/$(basename "$1")"
# Shift the first argument so we can process the rest
shift

# Globals filled in later by get_dataset_info and handle_jceks_path
hdfsPublishPath=""
hdfsPath=""
jceks_flag=""

# Report a failure on stderr, prefixed with "Error: ", and stop the script
# with exit status 1.
#   $1 - message to print
function error_exit() {
  local message=$1
  printf 'Error: %s\n' "$message" >&2
  exit 1
}

# Function to get dataset information
# Requests the dataset definition from Menas and stores its paths in global
# variables.
# Globals read:    MENAS_API, dataset_name, dataset_version
# Globals written: hdfsPublishPath, hdfsPath
# Leaves through error_exit when the request fails or either path is "null".
function get_dataset_info() {
  local response=""

  # The failure handler hangs directly off the assignment: under 'set -e' a
  # failing command substitution ends the script on the spot, so a separate
  # "$?" test on the next line would never get to print the message.
  response=$(curl --negotiate -s -u : "$MENAS_API/dataset/$dataset_name/$dataset_version") \
    || error_exit "Could not load dataset info - $dataset_name v $dataset_version from Menas at $MENAS_API"

  hdfsPublishPath=$(echo "$response" | jq -r '.hdfsPublishPath')
  hdfsPath=$(echo "$response" | jq -r '.hdfsPath')
  # jq -r prints the literal string "null" for a field that is missing or null
  if [[ $hdfsPublishPath == "null" || $hdfsPath == "null" ]]; then
    error_exit "Could not find the required paths in the response."
  fi
  return 0
}

# Function to handle JCEKS and set jceks_flag if need be
# Does nothing unless hdfsPublishPath starts with "s3a://". For such a path it
# sets jceks_flag either from the user supplied jceks_path or, when that is
# empty, from the readwrite_jceks value returned by the ECS bucket API.
# Globals read:    hdfsPublishPath, jceks_path, ECS_API_BUCKET
# Globals written: jceks_flag
# Leaves through error_exit when no readwrite_jceks value can be obtained.
function handle_jceks_path() {
  local readwrite_jceks=""
  local bucket_name=""

  if [[ $hdfsPublishPath =~ ^s3a://.* ]]; then
    echo "hdfsPublishPath starts with s3a://. Using JCEKS file."
    if [[ -z $jceks_path ]]; then
      # Report a failed lookup instead of letting 'set -e' end the script silently
      readwrite_jceks=$(curl -s -X GET -d "{\"ecs_path\":\"$hdfsPublishPath\"}" "$ECS_API_BUCKET" | jq -r '.readwrite_jceks') \
        || error_exit "Could not find readwrite_jceks in the response."
      # jq -r prints the string "null" for a missing field, so an empty check
      # alone would let "jceks:null" slip into the flag
      if [[ -z $readwrite_jceks || $readwrite_jceks == "null" ]]; then
        error_exit "Could not find readwrite_jceks in the response."
      fi
      # Third '/'-separated field of s3a://<bucket>/... is the bucket name
      bucket_name=$(echo "$hdfsPublishPath" | cut -d'/' -f3)
      # The embedded quotes are part of the value on purpose: the flag is
      # later inserted into a command line that bash parses again
      jceks_flag="--jceks-path \"spark.hadoop.fs.s3a.bucket.$bucket_name.security.credential.provider.path=jceks:$readwrite_jceks\""
    else
      echo "--jceks-path argument is set by user"
      jceks_flag="--jceks-path $jceks_path"
    fi
  fi
  return 0
}

# Function to clean up versions
# Sends a GET request followed by a DELETE request for the given path to the
# given API endpoint and prints whatever curl outputs for each of them.
#   $1 - path to clean; a leading "s3a://" is stripped before it is sent
#   $2 - URL of the API endpoint
# Globals read: ECS_API_KEY (sent as the x-api-key header)
function cleanup_versions() {
  local target_path=$1
  local endpoint=$2
  local payload="{\"ecs_path\":\"${target_path#s3a://}\"}"
  local http_method

  echo "Cleaning versions for $target_path"
  for http_method in GET DELETE; do
    curl -s -X "$http_method" --header "x-api-key: $ECS_API_KEY" -d "$payload" "$endpoint"
    # curl output carries no trailing newline of its own here
    echo
  done
  echo "Versions cleaned"
  return 0
}

# Parse command line arguments
# Options the wrapper needs itself are captured in variables; every other
# argument is collected in other_args. The collected arguments are later
# written into a command line that a second bash parses again, so each one is
# stored shell-quoted with printf %q. That keeps the empty string, '' and |
# intact and equally protects spaces, quotes, globs and any other shell
# metacharacter.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --menas-auth-keytab | --dataset-name | --dataset-version | --report-date | --report-version | --jceks-path)
      # 'shift 2' cannot succeed without a value; fail with a clear message
      # instead of letting 'set -e' end the script silently
      [[ $# -ge 2 ]] || error_exit "Option $1 requires a value."
      case "$1" in
        --menas-auth-keytab) keytab="$2" ;;
        --dataset-name) dataset_name="$2" ;;
        --dataset-version) dataset_version="$2" ;;
        --report-date) report_date="$2" ;;
        --report-version) report_version="$2" ;;
        --jceks-path) jceks_path="$2" ;;
      esac
      shift 2 ;;
    *)
      # %q yields '' for an empty argument, \'\' for a literal '' and \| for a pipe
      printf -v quoted_arg '%q' "$1"
      other_args+=("$quoted_arg")
      shift ;;
  esac
done

# The run script is looked up under the path computed in original_script
if [[ ! -f "$original_script" ]]; then
  error_exit "The script '$original_script' does not exist in the current directory."
fi

# Main script execution
get_dataset_info
handle_jceks_path

# Run the original script
# The command line is printed and piped into a fresh bash, which parses it
# again. other_args and jceks_flag are prepared by the code above and go in
# unchanged; the remaining values are quoted with printf %q so that a value
# containing spaces, or an empty value, reaches the run script as exactly one
# argument.
printf -v quoted_script '%q' "$original_script"
printf -v quoted_keytab '%q' "$keytab"
printf -v quoted_dataset_name '%q' "$dataset_name"
printf -v quoted_dataset_version '%q' "$dataset_version"
printf -v quoted_report_date '%q' "$report_date"
printf -v quoted_report_version '%q' "$report_version"

# 'set -e' would end the wrapper right at a failing pipeline and skip the
# clean-up below; collecting the status through '||' keeps the script running.
exit_code=0
echo "$quoted_script" "${other_args[@]}" \
  --menas-auth-keytab "$quoted_keytab" \
  --dataset-name "$quoted_dataset_name" \
  --dataset-version "$quoted_dataset_version" \
  --report-date "$quoted_report_date" \
  --report-version "$quoted_report_version" \
  "$jceks_flag" | bash || exit_code=$?

# Clean up versions if necessary
if [[ $hdfsPublishPath == s3a://* ]]; then
  cleanup_versions "$hdfsPublishPath" "$ECS_API_KK"
fi
if [[ $STD_HDFS_PATH == s3a://* ]]; then
  cleanup_versions "$STD_HDFS_PATH" "$ECS_API_KK"
fi

# Hand the run script's status on to the caller
exit "$exit_code"
Loading