Merge pull request #1 from rearc-data/adu-md5_compare
Adds an MD5 comparison to avoid re-publishing unmodified files
Showing 5 changed files with 154 additions and 11 deletions.
@@ -0,0 +1,46 @@
# Compare the md5 of a file to the s3 etag md5
# Source: li xin on StackOverflow
# https://stackoverflow.com/questions/1775816/how-to-get-the-md5sum-of-a-file-on-amazons-s3

import hashlib
import botocore.exceptions

def md5_checksum(filename):
    m = hashlib.md5()
    with open(filename, 'rb') as f:
        for data in iter(lambda: f.read(1024 * 1024), b''):
            m.update(data)
    return m.hexdigest()

def etag_checksum(filename, chunk_size=8 * 1024 * 1024):
    # Multipart ETags are the md5 of the concatenated per-part md5 digests,
    # suffixed with "-<number of parts>"
    md5s = []
    with open(filename, 'rb') as f:
        for data in iter(lambda: f.read(chunk_size), b''):
            md5s.append(hashlib.md5(data).digest())
    m = hashlib.md5(b"".join(md5s))
    return '{}-{}'.format(m.hexdigest(), len(md5s))

def etag_compare(filename, etag):
    # Returns True when the local file differs from the S3 object
    et = etag[1:-1]  # strip quotes
    if '-' in et and et == etag_checksum(filename):
        return False
    if '-' not in et and et == md5_checksum(filename):
        return False
    return True

def md5_compare(s3, bucket_name, s3_key, filename):
    # Get the object metadata from s3.
    # If the object does not exist, return True for changes found.
    try:
        obj_dict = s3.head_object(Bucket=bucket_name, Key=s3_key)
    except botocore.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            return True
        raise  # any other error should surface rather than fall through

    etag = obj_dict['ETag']

    changes_found = etag_compare(filename, etag)

    return changes_found
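For context, a minimal sketch of how a caller might gate an upload on this helper. It is illustrative only: the bucket, key, and local path are placeholders, and the import path is assumed since the diff does not show the new file's name.

import boto3
from md5_compare import md5_compare  # module name assumed, not shown in this diff

s3 = boto3.client('s3')
bucket, key, local_file = 'example-bucket', 'example-dataset/data.csv', '/tmp/data.csv'
if md5_compare(s3, bucket, key, local_file):
    # True means the object is missing or its contents differ, so publish
    s3.upload_file(local_file, bucket, key)
else:
    print('No changes found, skipping re-publish')

One caveat: for multipart uploads the comparison only matches when the local chunk_size (8 MiB here) equals the part size used for the original upload, since a multipart ETag is derived from the per-part digests.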
@@ -0,0 +1,81 @@
#!/usr/bin/env bash

# Exit on error. Append "|| true" if you expect an error.
set -o errexit
# Exit on error inside any functions or subshells.
set -o errtrace
# Do not allow use of undefined vars. Use ${VAR:-} to use an undefined VAR
#set -o nounset
# Catch the error in case mysqldump fails (but gzip succeeds) in `mysqldump | gzip`
set -o pipefail
# Turn on traces, useful while debugging but commented out by default
# set -o xtrace

# Set the profile variable to an empty value by default; the while loop below reassigns it if it was passed as a parameter
PROFILE=""

while [[ $# -gt 0 ]]; do
  opt="${1}"
  shift
  case ${opt} in
    "-d"|"--dataset-name") export DATASET_NAME="$1"; shift;;
    "-r"|"--region") export REGION="$1"; shift;;
    "-f"|"--profile") PROFILE=" --profile $1"; shift;;
    *) echo "ERROR: Invalid option: \"${opt}\"" >&2; exit 1;;
  esac
done

while [[ ${#DATASET_NAME} -gt 53 ]]; do
  echo "dataset-name must be under 53 characters in length, enter a shorter name:"
  read -p "New dataset-name: " DATASET_NAME
  case ${#DATASET_NAME} in
    [1-9]|[1-4][0-9]|5[0-3]) break;;
    *) echo "Enter a shorter dataset-name";;
  esac
done

# Get the existing CloudFormation stack's parameters
echo "getting existing CFN parameters"
CFN_STACK_NAME="producer-${DATASET_NAME}-preprocessing"
while read parameter_name parameter_value; do
  echo "$parameter_name: $parameter_value"
  case ${parameter_name} in
    "S3Bucket") export S3_BUCKET="$parameter_value";;
    "DataSetArn") export DATASET_ARN="$parameter_value";;
    "ProductId") export PRODUCT_ID="$parameter_value";;
    # Ignore these two because they were already set manually
    "Region") ;;
    "DataSetName") ;;
    *) echo "ERROR: Invalid parameter found: \"${parameter_name}\", please update manually" >&2; exit 1;;
  esac
done < <(aws cloudformation describe-stacks --stack-name $CFN_STACK_NAME --query 'Stacks[0].Parameters' --output text$PROFILE)

# Create the pre-processing zip package; these commands may need to be adjusted depending on folder structure and dependencies
(cd pre-processing/pre-processing-code && zip -r pre-processing-code.zip . -x "*.dist-info/*" -x "bin/*" -x "**/__pycache__/*")

# Upload pre-processing-code.zip to S3
echo "uploading pre-processing-code.zip to s3"
aws s3 cp pre-processing/pre-processing-code/pre-processing-code.zip s3://$S3_BUCKET/$DATASET_NAME/automation/pre-processing-code.zip --region $REGION$PROFILE

# Update the pre-processing Lambda function to use the new package
echo "updating the pre-processing lambda function code"
LAMBDA_FUNCTION_NAME="source-for-${DATASET_NAME}"
# AWS CLI version 2 requires explicitly declaring `--cli-binary-format raw-in-base64-out` for the format of the `--payload` when invoking a function
aws lambda update-function-code --function-name $LAMBDA_FUNCTION_NAME --s3-bucket $S3_BUCKET --s3-key $DATASET_NAME/automation/pre-processing-code.zip$PROFILE
echo "updated lambda function code to use latest pre-processing-code.zip"

# Update the pre-processing CloudFormation stack
echo "updating pre-processing cloudformation stack"
CFN_STACK_NAME="producer-${DATASET_NAME}-preprocessing"
aws cloudformation update-stack --stack-name $CFN_STACK_NAME --template-body file://pre-processing/pre-processing-cfn.yaml --parameters ParameterKey=S3Bucket,ParameterValue=$S3_BUCKET ParameterKey=DataSetName,ParameterValue=$DATASET_NAME ParameterKey=DataSetArn,ParameterValue=$DATASET_ARN ParameterKey=ProductId,ParameterValue=$PRODUCT_ID ParameterKey=Region,ParameterValue=$REGION --region $REGION --capabilities "CAPABILITY_AUTO_EXPAND" "CAPABILITY_NAMED_IAM" "CAPABILITY_IAM"$PROFILE

echo "waiting for cloudformation stack update to complete"
# `break` is invalid outside a loop; test the wait's exit status directly so errexit does not skip the check
if ! aws cloudformation wait stack-update-complete --stack-name $CFN_STACK_NAME --region $REGION$PROFILE; then
  echo "Cloudformation stack update failed"
  exit 1
fi
echo "cloudformation stack update completed"