From f36422d5b015ec996eaa204c1734ffb6f381cb4a Mon Sep 17 00:00:00 2001
From: Scott Barnhart
Date: Tue, 17 Mar 2020 06:48:02 -0700
Subject: [PATCH] If "apiary-data-lake" enables S3 inventory, create Hive
 tables on top of bucket data. (#70)

* Create hive tables on top of S3 inventory bucket if feature is enabled.

* Update CHANGELOG and README

* cleanup

* PR comments

* update CHANGELOG

* Refactor inventory table logic to separate script.

* minor cleanup

Co-authored-by: Scott Barnhart
Co-authored-by: Adrian Woodhead
---
 CHANGELOG.md                 |  7 ++++
 Dockerfile                   |  2 ++
 README.md                    |  3 ++
 files/s3_inventory_repair.sh | 64 ++++++++++++++++++++++++++++++++++++
 files/s3inventory.tpl        | 19 +++++++++++
 files/startup.sh             | 59 +++++++++++++++++++--------------
 6 files changed, 130 insertions(+), 24 deletions(-)
 create mode 100755 files/s3_inventory_repair.sh
 create mode 100644 files/s3inventory.tpl

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c909bb..16a5d86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,13 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [1.11.0] - TBD
+### Added
+- If S3 Inventory is enabled in `apiary-data-lake`, create the Hive `s3_inventory` database on startup.
+- Add script `/s3_inventory_repair.sh`, which can be used as the entrypoint of this Docker image to create and repair S3
+  inventory tables in the inventory database (if S3 inventory is enabled). The intent is to run the image this way on a
+  scheduled basis in Kubernetes after AWS creates new inventory partition files in S3 each day.
+
 ## [1.10.0] - 2020-03-16
 ### Changed

diff --git a/Dockerfile b/Dockerfile
index cf4a2ff..d317915 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -76,5 +76,7 @@ COPY files/jmx-exporter.yaml /etc/hive/conf/jmx-exporter.yaml
 EXPOSE 9083
 
 COPY files/update_property.py /bin/update_property.py
+COPY files/s3inventory.tpl /s3inventory.tpl
 COPY files/startup.sh /startup.sh
+COPY files/s3_inventory_repair.sh /s3_inventory_repair.sh
 CMD /startup.sh

diff --git a/README.md b/README.md
index da0f05c..cbb4b5f 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,13 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 ## Environment Variables
 |Environment Variable|Required|Description|
 |----|----|----|
+|APIARY_S3_INVENTORY_PREFIX|No (defaults to `EntireBucketDaily`)|Prefix used by S3 Inventory when creating data in the inventory bucket.|
+|APIARY_S3_INVENTORY_TABLE_FORMAT|No (defaults to `ORC`)|Format of the S3 inventory data: `ORC`, `Parquet`, or `CSV`.|
 |AWS_REGION|Yes|AWS region to configure various AWS clients.|
 |ATLAS_KAFKA_BOOTSTRAP_SERVERS|No|Atlas hive-bridge kafka bootstrap servers.|
 |ENABLE_METRICS|No|Option to enable sending Hive Metastore metrics to CloudWatch.|
 |ENABLE_GLUESYNC|No|Option to turn on GlueSync Hive Metastore listener.|
+|ENABLE_S3_INVENTORY|No|Option to create Hive tables on top of S3 inventory data if enabled in `apiary-data-lake`. Enabled if the value is not null/empty.|
 |EXTERNAL_DATABASE|No|Option to enable external database mode, when specified it disables managing Hive Metastore MySQL database schema.|
 |GLUE_PREFIX|No|Prefix added to Glue databases to handle database name collisions when synchronizing multiple Hive Metastores to the Glue catalog.|
 |HADOOP_HEAPSIZE|No|Hive Metastore Java process heapsize.|
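
For reference, a minimal sketch of exercising the new variables by running the image with the repair script as its entrypoint; the image name/tag and all values below are illustrative, not part of this patch:

    # Hypothetical image name and settings -- adjust for your deployment.
    docker run \
        -e AWS_REGION=us-west-2 \
        -e INSTANCE_NAME=apiary \
        -e HIVE_DB_NAMES=db1,db2 \
        -e ENABLE_S3_INVENTORY=true \
        -e APIARY_S3_INVENTORY_PREFIX=EntireBucketDaily \
        -e APIARY_S3_INVENTORY_TABLE_FORMAT=ORC \
        --entrypoint /s3_inventory_repair.sh \
        apiary-metastore:latest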

diff --git a/files/s3_inventory_repair.sh b/files/s3_inventory_repair.sh
new file mode 100755
index 0000000..3996c8a
--- /dev/null
+++ b/files/s3_inventory_repair.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Copyright (C) 2018-2020 Expedia, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+
+[[ -z ${ENABLE_S3_INVENTORY} ]] && exit 0
+[[ -z ${HIVE_DB_NAMES} ]] && exit 1
+
+AWS_ACCOUNT=`aws sts get-caller-identity|jq -r .Account`
+
+[[ -z ${APIARY_S3_INVENTORY_SCHEMA} ]] && APIARY_S3_INVENTORY_SCHEMA=s3_inventory
+[[ -z ${APIARY_S3_INVENTORY_PREFIX} ]] && APIARY_S3_INVENTORY_PREFIX=EntireBucketDaily
+[[ -z ${APIARY_S3_INVENTORY_TABLE_FORMAT} ]] && APIARY_S3_INVENTORY_TABLE_FORMAT=ORC
+[[ -z ${APIARY_RW_METASTORE_URI} ]] && APIARY_RW_METASTORE_URI=thrift://hms-readwrite.apiary-${AWS_REGION}.lcl:9083
+
+[[ -z $HIVE_METASTORE_LOG_LEVEL ]] && HIVE_METASTORE_LOG_LEVEL="INFO"
+sed "s/HIVE_METASTORE_LOG_LEVEL/$HIVE_METASTORE_LOG_LEVEL/" -i /etc/hive/conf/hive-log4j2.properties
+
+#
+# S3 Inventory is enabled - need to create and repair inventory tables on top of the S3 inventory data that AWS wrote.
+#
+
+# Hive CLI won't run unless /tmp/hive exists and is writable.
+su hive -s /bin/bash -c "mkdir /tmp/hive && chmod 777 /tmp/hive"
+
+# Create and repair S3 inventory tables
+APIARY_S3_INVENTORY_TEMPLATE_FILE=s3inventory.tpl
+APIARY_S3_INVENTORY_BUCKET=$(echo "${INSTANCE_NAME}-${AWS_ACCOUNT}-${AWS_REGION}-${APIARY_S3_INVENTORY_SCHEMA}"|tr "_" "-")
+APIARY_S3_INVENTORY_TABLE_FORMAT=`echo $APIARY_S3_INVENTORY_TABLE_FORMAT | tr "[:upper:]" "[:lower:]"`
+
+if [ "${APIARY_S3_INVENTORY_TABLE_FORMAT}" = "parquet" ]; then
+    APIARY_S3_INVENTORY_TABLE_SERDE=org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
+elif [ "${APIARY_S3_INVENTORY_TABLE_FORMAT}" = "csv" ] ; then
+    APIARY_S3_INVENTORY_TABLE_SERDE=org.apache.hadoop.hive.serde2.OpenCSVSerde
+else
+    APIARY_S3_INVENTORY_TABLE_SERDE=org.apache.hadoop.hive.ql.io.orc.OrcSerde
+fi
+APIARY_S3_INVENTORY_TABLE_HQL_FILE="/CreateInventoryTables.hql"
+
+> ${APIARY_S3_INVENTORY_TABLE_HQL_FILE}
+for HIVE_DB in `echo $HIVE_DB_NAMES|tr "," "\n"`
+do
+    APIARY_SCHEMA_BUCKET=$(echo "${INSTANCE_NAME}-${AWS_ACCOUNT}-${AWS_REGION}-${HIVE_DB}"|tr "_" "-")
+    APIARY_S3_INVENTORY_TABLE=$(echo "${APIARY_SCHEMA_BUCKET}"|tr "-" "_")
+
+    # Check if AWS has written the Hive data files at least once. If not, skip trying to create/repair the table.
+    HIVEDIR=s3://${APIARY_S3_INVENTORY_BUCKET}/${APIARY_SCHEMA_BUCKET}/${APIARY_S3_INVENTORY_PREFIX}/hive
+
+    # "wc -l" will return 1 if the dir exists, 0 otherwise. "xargs" is used here to trim the whitespace-padded output of "wc -l" down to just the number.
+    HIVEDIREXISTS=`aws s3 ls ${HIVEDIR} | wc -l | xargs`
+    if [ "$HIVEDIREXISTS" -eq "0" ] ; then
+        echo "S3 Inventory Hive data for ${APIARY_SCHEMA_BUCKET} doesn't exist yet, skipping create/repair"
+    else
+        echo "Writing S3 inventory table create/repair statements for schema: $HIVE_DB"
+        # Format the template file with the environment variable values defined above. Unsetting IFS preserves newlines.
+        IFS= HQL_STMT=`eval "echo \"$(cat "${APIARY_S3_INVENTORY_TEMPLATE_FILE}")\""`
+
+        echo ${HQL_STMT} >> ${APIARY_S3_INVENTORY_TABLE_HQL_FILE}
+    fi
+done
+
+# Run the create/repair statements that we wrote to the .hql file
+echo "Creating and repairing S3 inventory tables..."
+su hive -s /bin/bash -c "/usr/lib/hive/bin/hive --hiveconf hive.metastore.uris=${APIARY_RW_METASTORE_URI} -f ${APIARY_S3_INVENTORY_TABLE_HQL_FILE}"
+echo "Done creating and repairing S3 inventory tables."
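
As the CHANGELOG notes, the intended deployment is a daily scheduled run in Kubernetes. One possible wiring, assuming the image is published as `apiary-metastore:latest` (hypothetical) and the required environment variables are added to the generated job spec afterwards, since `kubectl create cronjob` cannot set them:

    # S3 Inventory delivers new partition files daily, so schedule the repair
    # for shortly after the expected delivery time (schedule is illustrative).
    kubectl create cronjob s3-inventory-repair \
        --image=apiary-metastore:latest \
        --schedule="0 9 * * *" \
        -- /s3_inventory_repair.sh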

diff --git a/files/s3inventory.tpl b/files/s3inventory.tpl
new file mode 100644
index 0000000..1dd519a
--- /dev/null
+++ b/files/s3inventory.tpl
@@ -0,0 +1,19 @@
+CREATE EXTERNAL TABLE IF NOT EXISTS ${APIARY_S3_INVENTORY_SCHEMA}.${APIARY_S3_INVENTORY_TABLE}(
+        bucket string,
+        key string,
+        version_id string,
+        is_latest boolean,
+        is_delete_marker boolean,
+        size bigint,
+        last_modified_date timestamp,
+        e_tag string,
+        storage_class string,
+        intelligent_tiering_access_tier string
+  )
+PARTITIONED BY (dt string)
+ROW FORMAT SERDE '${APIARY_S3_INVENTORY_TABLE_SERDE}'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
+LOCATION 's3://${APIARY_S3_INVENTORY_BUCKET}/${APIARY_SCHEMA_BUCKET}/${APIARY_S3_INVENTORY_PREFIX}/hive/';
+
+MSCK REPAIR TABLE ${APIARY_S3_INVENTORY_SCHEMA}.${APIARY_S3_INVENTORY_TABLE};
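
Once created and repaired, an inventory table can be queried like any other Hive table. A sketch using the same Hive CLI invocation as the repair script; the table name and `dt` partition value are hypothetical (the script derives the table name from the schema bucket, e.g. `apiary-123456789012-us-west-2-db1` becomes `apiary_123456789012_us_west_2_db1`):

    # Summarize object counts and bytes per storage class for one inventory delivery.
    /usr/lib/hive/bin/hive --hiveconf hive.metastore.uris=${APIARY_RW_METASTORE_URI} -e "
        SELECT storage_class, count(*) AS objects, sum(size) AS total_bytes
        FROM s3_inventory.apiary_123456789012_us_west_2_db1
        WHERE dt = '2020-03-16-00-00'
        GROUP BY storage_class;"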

diff --git a/files/startup.sh b/files/startup.sh
index 37d71bd..ff67114 100755
--- a/files/startup.sh
+++ b/files/startup.sh
@@ -1,7 +1,9 @@
 #!/bin/bash
-# Copyright (C) 2018-2019 Expedia, Inc.
+# Copyright (C) 2018-2020 Expedia, Inc.
 # Licensed under the Apache License, Version 2.0 (the "License");
 
+set -x
+
 [[ -z "$MYSQL_DB_USERNAME" ]] && export MYSQL_DB_USERNAME=$(aws secretsmanager get-secret-value --secret-id ${MYSQL_SECRET_ARN}|jq .SecretString -r|jq .username -r)
 [[ -z "$MYSQL_DB_PASSWORD" ]] && export MYSQL_DB_PASSWORD=$(aws secretsmanager get-secret-value --secret-id ${MYSQL_SECRET_ARN}|jq .SecretString -r|jq .password -r)
 
@@ -83,34 +85,43 @@ if [[ ! -z $KAFKA_BOOTSTRAP_SERVERS ]]; then
     [[ -n $KAFKA_CLIENT_ID ]] && sed "s/KAFKA_CLIENT_ID/$KAFKA_CLIENT_ID/" -i /etc/hive/conf/hive-site.xml
 fi
 
+APIARY_S3_INVENTORY_SCHEMA=s3_inventory
+
 #check if database is initialized, test only from rw instances and only if DB is managed by apiary
 if [ -z $EXTERNAL_DATABASE ] && [ "$HIVE_METASTORE_ACCESS_MODE" = "readwrite" ]; then
-MYSQL_OPTIONS="-h$MYSQL_DB_HOST -u$MYSQL_DB_USERNAME -p$MYSQL_DB_PASSWORD $MYSQL_DB_NAME -N"
-schema_version=`echo "select SCHEMA_VERSION from VERSION"|mysql $MYSQL_OPTIONS`
-if [ "$schema_version" != "2.3.0" ]; then
-    cd /usr/lib/hive/scripts/metastore/upgrade/mysql
-    cat hive-schema-2.3.0.mysql.sql|mysql $MYSQL_OPTIONS
-    cd /
-fi
+    MYSQL_OPTIONS="-h$MYSQL_DB_HOST -u$MYSQL_DB_USERNAME -p$MYSQL_DB_PASSWORD $MYSQL_DB_NAME -N"
+    schema_version=`echo "select SCHEMA_VERSION from VERSION"|mysql $MYSQL_OPTIONS`
+    if [ "$schema_version" != "2.3.0" ]; then
+        cd /usr/lib/hive/scripts/metastore/upgrade/mysql
+        cat hive-schema-2.3.0.mysql.sql|mysql $MYSQL_OPTIONS
+        cd /
+    fi
 
-#create hive databases
-if [ ! -z $HIVE_DB_NAMES ]; then
-    for HIVE_DB in `echo $HIVE_DB_NAMES|tr "," "\n"`
-    do
-        echo "creating hive database $HIVE_DB"
-        DB_ID=`echo "select MAX(DB_ID)+1 from DBS"|mysql $MYSQL_OPTIONS`
-        AWS_ACCOUNT=`aws sts get-caller-identity|jq -r .Account`
-        BUCKET_NAME=$(echo "${INSTANCE_NAME}-${AWS_ACCOUNT}-${AWS_REGION}-${HIVE_DB}"|tr "_" "-")
-        echo "insert into DBS(DB_ID,DB_LOCATION_URI,NAME,OWNER_NAME,OWNER_TYPE) values(\"$DB_ID\",\"s3://${BUCKET_NAME}/\",\"${HIVE_DB}\",\"root\",\"USER\") on duplicate key update DB_LOCATION_URI=\"s3://${BUCKET_NAME}/\";"|mysql $MYSQL_OPTIONS
-        #create glue database
-        if [ ! -z $ENABLE_GLUESYNC ]; then
-            echo "creating glue database $HIVE_DB"
-            aws --region=${AWS_REGION} glue create-database --database-input Name=${GLUE_PREFIX}${HIVE_DB},LocationUri=s3://${BUCKET_NAME}/ &> /dev/null
-            aws --region=${AWS_REGION} glue update-database --name=${GLUE_PREFIX}${HIVE_DB} --database-input "Name=${GLUE_PREFIX}${HIVE_DB},LocationUri=s3://${BUCKET_NAME}/,Description=Managed by ${INSTANCE_NAME} datalake."
+    #create hive databases
+    if [ ! -z $HIVE_DB_NAMES ]; then
+        if [ ! -z $ENABLE_S3_INVENTORY ]; then
+            HIVE_APIARY_DB_NAMES="${HIVE_DB_NAMES},${APIARY_S3_INVENTORY_SCHEMA}"
+        else
+            HIVE_APIARY_DB_NAMES="${HIVE_DB_NAMES}"
         fi
-    done
-fi
+
+        AWS_ACCOUNT=`aws sts get-caller-identity|jq -r .Account`
+        for HIVE_DB in `echo ${HIVE_APIARY_DB_NAMES}|tr "," "\n"`
+        do
+            echo "creating hive database $HIVE_DB"
+            DB_ID=`echo "select MAX(DB_ID)+1 from DBS"|mysql $MYSQL_OPTIONS`
+            BUCKET_NAME=$(echo "${INSTANCE_NAME}-${AWS_ACCOUNT}-${AWS_REGION}-${HIVE_DB}"|tr "_" "-")
+            echo "insert into DBS(DB_ID,DB_LOCATION_URI,NAME,OWNER_NAME,OWNER_TYPE) values(\"$DB_ID\",\"s3://${BUCKET_NAME}/\",\"${HIVE_DB}\",\"root\",\"USER\") on duplicate key update DB_LOCATION_URI=\"s3://${BUCKET_NAME}/\";"|mysql $MYSQL_OPTIONS
+            #create glue database
+            if [ ! -z $ENABLE_GLUESYNC ]; then
+                echo "creating glue database $HIVE_DB"
+                aws --region=${AWS_REGION} glue create-database --database-input Name=${GLUE_PREFIX}${HIVE_DB},LocationUri=s3://${BUCKET_NAME}/ &> /dev/null
+                aws --region=${AWS_REGION} glue update-database --name=${GLUE_PREFIX}${HIVE_DB} --database-input "Name=${GLUE_PREFIX}${HIVE_DB},LocationUri=s3://${BUCKET_NAME}/,Description=Managed by ${INSTANCE_NAME} datalake."
+            fi
+        done
+    fi
 fi
+
 #pre event listener to restrict hive database access in read-only metastores
 [[ ! -z $HIVE_DB_WHITELIST ]] && export METASTORE_PRELISTENERS="${METASTORE_PRELISTENERS},com.expediagroup.apiary.extensions.readonlyauth.listener.ApiaryReadOnlyAuthPreEventListener"
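
A quick way to confirm that startup registered the inventory database is to reuse the `MYSQL_OPTIONS` string the script builds above; this sketch assumes it runs in the same shell environment:

    # Should print one row with NAME=s3_inventory and its S3 location.
    echo "select NAME, DB_LOCATION_URI from DBS where NAME='s3_inventory'" | mysql $MYSQL_OPTIONS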