If "apiary-data-lake" enables S3 inventory, create Hive tables on top…
Browse files Browse the repository at this point in the history
… of bucket data. (#70)

* Create hive tables on top of S3 inventory bucket if feature is enabled.

* Update CHANGELOG and README

* cleanup

* PR comments

* update CHANGELOG

* Refactor inventory table logic to separate script.

* minor cleanup

Co-authored-by: Scott Barnhart <[email protected]>
Co-authored-by: Adrian Woodhead <[email protected]>
3 people authored Mar 17, 2020
1 parent e61dd5b commit f36422d
Showing 6 changed files with 130 additions and 24 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [1.11.0] - TBD
### Added
- If S3 Inventory is enabled in `apiary-data-lake`, create Hive `s3_inventory` database on startup.
- Add script `/s3_inventory_repair.sh` which can be used as the entrypoint of this Docker image to create and repair S3
  inventory tables in the inventory database (if S3 inventory is enabled). The intent is to run the image this way on a
  scheduled basis in Kubernetes after AWS creates new inventory partition files in S3 each day (see the example invocation below).
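For example, a run of the image with this alternate entrypoint might look like the following. This is a sketch only: the image name, tag, and all environment values are hypothetical, and it assumes AWS credentials are available to the container; in Kubernetes the equivalent would be a `CronJob` that overrides the container command.

    docker run --rm \
        --entrypoint /s3_inventory_repair.sh \
        -e ENABLE_S3_INVENTORY=true \
        -e HIVE_DB_NAMES=db1,db2 \
        -e INSTANCE_NAME=mydatalake \
        -e AWS_REGION=us-east-1 \
        mycompany/apiary-metastore:1.11.0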

## [1.10.0] - 2020-03-16

### Changed
2 changes: 2 additions & 0 deletions Dockerfile
@@ -76,5 +76,7 @@ COPY files/jmx-exporter.yaml /etc/hive/conf/jmx-exporter.yaml

EXPOSE 9083
COPY files/update_property.py /bin/update_property.py
COPY files/s3inventory.tpl /s3inventory.tpl
COPY files/startup.sh /startup.sh
COPY files/s3_inventory_repair.sh /s3_inventory_repair.sh
CMD /startup.sh
3 changes: 3 additions & 0 deletions README.md
@@ -6,10 +6,13 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
## Environment Variables
|Environment Variable|Required|Description|
|----|----|----|
|APIARY_S3_INVENTORY_PREFIX|No (defaults to `EntireBucketDaily`)|Prefix used by S3 Inventory when creating data in the inventory bucket.|
|APIARY_S3_INVENTORY_TABLE_FORMAT|No (defaults to `ORC`)|Format of S3 inventory data: `ORC`, `Parquet`, or `CSV`.|
|AWS_REGION|Yes|AWS region to configure various AWS clients.|
|ATLAS_KAFKA_BOOTSTRAP_SERVERS|No|Atlas hive-bridge kafka bootstrap servers.|
|ENABLE_METRICS|No|Option to enable sending Hive Metastore metrics to CloudWatch.|
|ENABLE_GLUESYNC|No|Option to turn on GlueSync Hive Metastore listener.|
|ENABLE_S3_INVENTORY|No|Option to create Hive tables on top of S3 inventory data if enabled in `apiary-data-lake`. Enabled if value is not null/empty.|
|EXTERNAL_DATABASE|No|Option to enable external database mode, when specified it disables managing Hive Metastore MySQL database schema.|
|GLUE_PREFIX|No|Prefix added to Glue databases to handle database name collisions when synchronizing multiple Hive Metastores to the Glue catalog.|
|HADOOP_HEAPSIZE|No|Hive Metastore Java process heapsize.|
64 changes: 64 additions & 0 deletions files/s3_inventory_repair.sh
@@ -0,0 +1,64 @@
#!/bin/bash
# Copyright (C) 2018-2020 Expedia, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");

# Exit quietly if S3 inventory is not enabled; fail if no Hive schemas were configured.
[[ -z ${ENABLE_S3_INVENTORY} ]] && exit 0
[[ -z ${HIVE_DB_NAMES} ]] && exit 1

AWS_ACCOUNT=`aws sts get-caller-identity|jq -r .Account`

[[ -z ${APIARY_S3_INVENTORY_SCHEMA} ]] && APIARY_S3_INVENTORY_SCHEMA=s3_inventory
[[ -z ${APIARY_S3_INVENTORY_PREFIX} ]] && APIARY_S3_INVENTORY_PREFIX=EntireBucketDaily
[[ -z ${APIARY_S3_INVENTORY_TABLE_FORMAT} ]] && APIARY_S3_INVENTORY_TABLE_FORMAT=ORC
[[ -z ${APIARY_RW_METASTORE_URI} ]] && APIARY_RW_METASTORE_URI=thrift://hms-readwrite.apiary-${AWS_REGION}.lcl:9083

[[ -z $HIVE_METASTORE_LOG_LEVEL ]] && HIVE_METASTORE_LOG_LEVEL="INFO"
sed "s/HIVE_METASTORE_LOG_LEVEL/$HIVE_METASTORE_LOG_LEVEL/" -i /etc/hive/conf/hive-log4j2.properties

#
# S3 Inventory is enabled - need to create and repair inventory tables on top of S3 inventory data that AWS wrote.
#

# Hive CLI won't run unless /tmp/hive exists and is writable
su hive -s /bin/bash -c "mkdir -p /tmp/hive && chmod 777 /tmp/hive"

# Create and repair S3 inventory tables
APIARY_S3_INVENTORY_TEMPLATE_FILE=s3inventory.tpl
APIARY_S3_INVENTORY_BUCKET=$(echo "${INSTANCE_NAME}-${AWS_ACCOUNT}-${AWS_REGION}-${APIARY_S3_INVENTORY_SCHEMA}"|tr "_" "-")
APIARY_S3_INVENTORY_TABLE_FORMAT=`echo $APIARY_S3_INVENTORY_TABLE_FORMAT | tr "[:upper:]" "[:lower:]"`

if [ "${APIARY_S3_INVENTORY_TABLE_FORMAT}" = "parquet" ]; then
APIARY_S3_INVENTORY_TABLE_SERDE=org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
elif [ "${APIARY_S3_INVENTORY_TABLE_FORMAT}" = "csv" ] ; then
APIARY_S3_INVENTORY_TABLE_SERDE=org.apache.hadoop.hive.serde2.OpenCSVSerde
else
APIARY_S3_INVENTORY_TABLE_SERDE=org.apache.hadoop.hive.ql.io.orc.OrcSerde
fi
APIARY_S3_INVENTORY_TABLE_HQL_FILE="/CreateInventoryTables.hql"

# Start with an empty HQL file; statements for each schema are appended below.
> ${APIARY_S3_INVENTORY_TABLE_HQL_FILE}
for HIVE_DB in `echo $HIVE_DB_NAMES|tr "," "\n"`
do
    APIARY_SCHEMA_BUCKET=$(echo "${INSTANCE_NAME}-${AWS_ACCOUNT}-${AWS_REGION}-${HIVE_DB}"|tr "_" "-")
    APIARY_S3_INVENTORY_TABLE=$(echo "${APIARY_SCHEMA_BUCKET}"|tr "-" "_")
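    # Example (hypothetical values): INSTANCE_NAME=mydatalake, AWS_ACCOUNT=123456789012, AWS_REGION=us-east-1,
    # HIVE_DB=my_db yields bucket "mydatalake-123456789012-us-east-1-my-db" and table "mydatalake_123456789012_us_east_1_my_db".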

    # Check if AWS has written the Hive data files at least once. If not, skip trying to create/repair the table.
    HIVEDIR=s3://${APIARY_S3_INVENTORY_BUCKET}/${APIARY_SCHEMA_BUCKET}/${APIARY_S3_INVENTORY_PREFIX}/hive

    # wc -l will return 1 if dir exists, 0 otherwise. xargs is used here to trim the output of "wc -l" from " <number>" to just "number"
    HIVEDIREXISTS=`aws s3 ls ${HIVEDIR} | wc -l | xargs`
    if [ "$HIVEDIREXISTS" -eq "0" ] ; then
        echo "S3 Inventory Hive data for ${APIARY_SCHEMA_BUCKET} doesn't exist yet, skipping create/repair"
    else
        echo "Writing S3 inventory table create/repair statements for schema: $HIVE_DB"
        # Format the template file with environment variable values defined above. Unsetting IFS preserves newlines.
        IFS= HQL_STMT=`eval "echo \"$(cat "${APIARY_S3_INVENTORY_TEMPLATE_FILE}")\""`

        echo ${HQL_STMT} >> ${APIARY_S3_INVENTORY_TABLE_HQL_FILE}
    fi
done

# Run the create/repair statements that we wrote to the .hql file
echo "Creating and repairing S3 inventory tables..."
su hive -s /bin/bash -c "/usr/lib/hive/bin/hive --hiveconf hive.metastore.uris=${APIARY_RW_METASTORE_URI} -f ${APIARY_S3_INVENTORY_TABLE_HQL_FILE}"
echo "Done creating and repairing S3 inventory tables."
19 changes: 19 additions & 0 deletions files/s3inventory.tpl
@@ -0,0 +1,19 @@
CREATE EXTERNAL TABLE IF NOT EXISTS ${APIARY_S3_INVENTORY_SCHEMA}.${APIARY_S3_INVENTORY_TABLE}(
bucket string,
key string,
version_id string,
is_latest boolean,
is_delete_marker boolean,
size bigint,
last_modified_date timestamp,
e_tag string,
storage_class string,
intelligent_tiering_access_tier string
)
PARTITIONED BY (dt string)
ROW FORMAT SERDE '${APIARY_S3_INVENTORY_TABLE_SERDE}'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION 's3://${APIARY_S3_INVENTORY_BUCKET}/${APIARY_SCHEMA_BUCKET}/${APIARY_S3_INVENTORY_PREFIX}/hive/';

MSCK REPAIR TABLE ${APIARY_S3_INVENTORY_SCHEMA}.${APIARY_S3_INVENTORY_TABLE};
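
Once created and repaired, these tables can be queried like any other Hive table. A hypothetical example (the table name is made up; the `dt` partition values follow the `YYYY-MM-DD-HH-MM` layout that S3 Inventory writes under the `hive/` prefix):

    SELECT storage_class, count(*) AS objects, sum(size) AS total_bytes
    FROM s3_inventory.mydatalake_123456789012_us_east_1_my_db
    WHERE dt = '2020-03-16-00-00'
    GROUP BY storage_class;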
59 changes: 35 additions & 24 deletions files/startup.sh
@@ -1,7 +1,9 @@
#!/bin/bash
# Copyright (C) 2018-2020 Expedia, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");

set -x

[[ -z "$MYSQL_DB_USERNAME" ]] && export MYSQL_DB_USERNAME=$(aws secretsmanager get-secret-value --secret-id ${MYSQL_SECRET_ARN}|jq .SecretString -r|jq .username -r)
[[ -z "$MYSQL_DB_PASSWORD" ]] && export MYSQL_DB_PASSWORD=$(aws secretsmanager get-secret-value --secret-id ${MYSQL_SECRET_ARN}|jq .SecretString -r|jq .password -r)

@@ -83,34 +85,43 @@ if [[ ! -z $KAFKA_BOOTSTRAP_SERVERS ]]; then
[[ -n $KAFKA_CLIENT_ID ]] && sed "s/KAFKA_CLIENT_ID/$KAFKA_CLIENT_ID/" -i /etc/hive/conf/hive-site.xml
fi

APIARY_S3_INVENTORY_SCHEMA=s3_inventory

#check if database is initialized, test only from rw instances and only if DB is managed by apiary
if [ -z $EXTERNAL_DATABASE ] && [ "$HIVE_METASTORE_ACCESS_MODE" = "readwrite" ]; then
    MYSQL_OPTIONS="-h$MYSQL_DB_HOST -u$MYSQL_DB_USERNAME -p$MYSQL_DB_PASSWORD $MYSQL_DB_NAME -N"
    schema_version=`echo "select SCHEMA_VERSION from VERSION"|mysql $MYSQL_OPTIONS`
    if [ "$schema_version" != "2.3.0" ]; then
        cd /usr/lib/hive/scripts/metastore/upgrade/mysql
        cat hive-schema-2.3.0.mysql.sql|mysql $MYSQL_OPTIONS
        cd /
    fi

    #create hive databases
    if [ ! -z $HIVE_DB_NAMES ]; then
        if [ ! -z $ENABLE_S3_INVENTORY ]; then
            HIVE_APIARY_DB_NAMES="${HIVE_DB_NAMES},${APIARY_S3_INVENTORY_SCHEMA}"
        else
            HIVE_APIARY_DB_NAMES="${HIVE_DB_NAMES}"
        fi

        AWS_ACCOUNT=`aws sts get-caller-identity|jq -r .Account`
        for HIVE_DB in `echo ${HIVE_APIARY_DB_NAMES}|tr "," "\n"`
        do
            echo "creating hive database $HIVE_DB"
            DB_ID=`echo "select MAX(DB_ID)+1 from DBS"|mysql $MYSQL_OPTIONS`
            BUCKET_NAME=$(echo "${INSTANCE_NAME}-${AWS_ACCOUNT}-${AWS_REGION}-${HIVE_DB}"|tr "_" "-")
            echo "insert into DBS(DB_ID,DB_LOCATION_URI,NAME,OWNER_NAME,OWNER_TYPE) values(\"$DB_ID\",\"s3://${BUCKET_NAME}/\",\"${HIVE_DB}\",\"root\",\"USER\") on duplicate key update DB_LOCATION_URI=\"s3://${BUCKET_NAME}/\";"|mysql $MYSQL_OPTIONS
            #create glue database
            if [ ! -z $ENABLE_GLUESYNC ]; then
                echo "creating glue database $HIVE_DB"
                aws --region=${AWS_REGION} glue create-database --database-input Name=${GLUE_PREFIX}${HIVE_DB},LocationUri=s3://${BUCKET_NAME}/ &> /dev/null
                aws --region=${AWS_REGION} glue update-database --name=${GLUE_PREFIX}${HIVE_DB} --database-input "Name=${GLUE_PREFIX}${HIVE_DB},LocationUri=s3://${BUCKET_NAME}/,Description=Managed by ${INSTANCE_NAME} datalake."
            fi
        done
    fi
fi

#pre event listener to restrict hive database access in read-only metastores
[[ ! -z $HIVE_DB_WHITELIST ]] && export METASTORE_PRELISTENERS="${METASTORE_PRELISTENERS},com.expediagroup.apiary.extensions.readonlyauth.listener.ApiaryReadOnlyAuthPreEventListener"

