From b35fbe7cec9c1238807963c722d8cd7eab8d92e3 Mon Sep 17 00:00:00 2001
From: Scott Barnhart
Date: Fri, 20 May 2022 15:09:38 -0700
Subject: [PATCH] Configure MySQL thread-pool size and stats auto-gathering.
 (#101)

* Make size of MySQL connection pool configurable.

* fix table header

* configure hive.stats.autogather

* update changelog

Co-authored-by: Scott Barnhart
---
 CHANGELOG.md        |  4 ++++
 README.md           |  6 ++++--
 files/hive-site.xml | 10 ++++++++++
 files/startup.sh    | 11 +++++++++++
 4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c3a0cef..cf1584d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [3.0.3] - 2022-05-20
+### Added
+- Add ability to configure size of HMS MySQL connection pool, and configure stats computation on table/partition creation.
+
 ## [3.0.2] - 2022-03-29
 ### Changed
 - Upgrade EMR repository to version `5.31.0` (was `5.30.2`) so `AWS SDK for Java` library is upgraded to `1.11.852` that enables AWS web identity token file file authentication using hadoop and public constructors.
diff --git a/README.md b/README.md
index cf64cb4..7ee5d0f 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |APIARY_S3_INVENTORY_TABLE_FORMAT|No (defaults to `ORC`)|Format of S3 inventory data - `ORC`, `Parquet`, or `CSV`|
 |APIARY_SYSTEM_SCHEMA|No (defaults to `apiary_system`)|Name for internal system database.|
 |AWS_REGION|Yes|AWS region to configure various AWS clients.|
+|AWS_WEB_IDENTITY_TOKEN_FILE|No|Path of the AWS Web Identity Token File for IRSA/OIDC AWS authentication.|
+|DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES|No|`true`/`false` value for hive.metastore.disallow.incompatible.col.type.changes, default `true`.|
 |ENABLE_GLUESYNC|No|Option to turn on GlueSync Hive Metastore listener.|
 |ENABLE_METRICS|No|Option to enable sending Hive Metastore metrics to CloudWatch.|
 |ENABLE_S3_INVENTORY|No|Option to create Hive tables on top of S3 inventory data if enabled in `apiary-data-lake`. Enabled if value is not null/empty.|
@@ -17,6 +19,7 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |EXTERNAL_DATABASE|No|Option to enable external database mode, when specified it disables managing Hive Metastore MySQL database schema.|
 |GLUE_PREFIX|No|Prefix added to Glue databases to handle database name collisions when synchronizing multiple Hive Metastores to the Glue catalog.|
 |HADOOP_HEAPSIZE|No|Hive Metastore Java process heapsize.|
+|HMS_AUTOGATHER_STATS|No (default is `true`)|Whether or not to create basic statistics on table/partition creation. Valid values are `true` or `false`.|
 |HIVE_METASTORE_ACCESS_MODE|No|Hive Metastore access mode, applicable values are: readwrite, readonly|
 |HIVE_DB_NAMES|No|comma separated list of Hive database names, when specified Hive databases will be created and mapped to corresponding S3 buckets.|
 |HIVE_METASTORE_LOG_LEVEL|No|Hive Metastore service Log4j log level.|
@@ -27,6 +30,7 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |LDAP_CA_CERT|Base64 encoded Certificate Authority Bundle to validate LDAP SSL connection.|
 |LDAP_SECRET_ARN|No|LDAP bind DN SecretsManager secret ARN.|
 |LDAP_URL|No|Active Directory URL to enable group mapping in metastore.|
+|MYSQL_CONNECTION_POOL_SIZE|No (defaults to `10`)|MySQL Connection pool size for Hive Metastore. See [here](https://github.com/apache/hive/blob/master/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L1181) for more info.|
 |MYSQL_DB_HOST|Yes|Hive Metastore MySQL database hostname.|
 |MYSQL_DB_NAME|Yes|Hive Metastore MySQL database name.|
 |MYSQL_SECRET_ARN|Yes|Hive Metastore MySQL SecretsManager secret ARN.|
@@ -37,8 +41,6 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |RANGER_SERVICE_NAME|No|Ranger service name used to configure RangerAuth plugin.|
 |SNS_ARN|No|The SNS topic ARN to which metadata updates will be sent.|
 |TABLE_PARAM_FILTER|No|A regular expression for selecting necessary table parameters. If the value isn't set, then no table parameters are selected.|
-|DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES|No|`true`/`false` value for hive.metastore.disallow.incompatible.col.type.changes, default `true`.|
-|AWS_WEB_IDENTITY_TOKEN_FILE|No|Path of the AWS Web Identity Token File for IRSA/OIDC AWS authentication.|
 
 # Contact
 
diff --git a/files/hive-site.xml b/files/hive-site.xml
index 443f619..c21ed31 100644
--- a/files/hive-site.xml
+++ b/files/hive-site.xml
@@ -28,6 +28,11 @@
     <value>true</value>
   </property>
 
+  <property>
+    <name>datanucleus.connectionPool.maxPoolSize</name>
+    <value>10</value>
+  </property>
+
   <property>
     <name>hive.metastore.uris</name>
     <value>thrift://localhost:9083</value>
@@ -88,4 +93,9 @@
     <value>true</value>
   </property>
 
+  <property>
+    <name>hive.stats.autogather</name>
+    <value>true</value>
+  </property>
+
 </configuration>
diff --git a/files/startup.sh b/files/startup.sh
index 21df902..4c4ce0e 100755
--- a/files/startup.sh
+++ b/files/startup.sh
@@ -16,10 +16,21 @@ if [[ -n ${HMS_MAX_THREADS} ]]; then
     update_property.py hive.metastore.server.max.threads "${HMS_MAX_THREADS}" /etc/hive/conf/hive-site.xml
 fi
 
+# config size of MySQL connection pool.
+# See https://github.com/apache/hive/blob/master/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L1181
+# and also make sure 2 * MYSQL_CONNECTION_POOL_SIZE * numHmsContainers is less than max connections for your MySQL instance.
+if [[ -n ${MYSQL_CONNECTION_POOL_SIZE} ]]; then
+    update_property.py datanucleus.connectionPool.maxPoolSize "${MYSQL_CONNECTION_POOL_SIZE}" /etc/hive/conf/hive-site.xml
+fi
+
 if [[ -n ${DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES} ]]; then
     update_property.py hive.metastore.disallow.incompatible.col.type.changes "${DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES}" /etc/hive/conf/hive-site.xml
 fi
 
+if [[ -n ${HMS_AUTOGATHER_STATS} ]]; then
+    update_property.py hive.stats.autogather "${HMS_AUTOGATHER_STATS}" /etc/hive/conf/hive-site.xml
+fi
+
 #configure LDAP group mapping, required for ranger authorization
 if [[ -n $LDAP_URL ]] ; then
     update_property.py hadoop.security.group.mapping.ldap.bind.user "$(aws secretsmanager get-secret-value --secret-id ${LDAP_SECRET_ARN}|jq .SecretString -r|jq .username -r)" /etc/hadoop/conf/core-site.xml