From b35fbe7cec9c1238807963c722d8cd7eab8d92e3 Mon Sep 17 00:00:00 2001
From: Scott Barnhart
Date: Fri, 20 May 2022 15:09:38 -0700
Subject: [PATCH] Configure MySQL thread-pool size and stats auto-gathering.
 (#101)

* Make size of MySQL connection pool configurable.

* fix table header

* configure hive.stats.autogather

* update changelog

Co-authored-by: Scott Barnhart
---
 CHANGELOG.md        |  4 ++++
 README.md           |  6 ++++--
 files/hive-site.xml | 10 ++++++++++
 files/startup.sh    | 11 +++++++++++
 4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c3a0cef..cf1584d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [3.0.3] - 2022-05-20
+### Added
+- Add ability to configure size of HMS MySQL connection pool, and configure stats computation on table/partition creation.
+
 ## [3.0.2] - 2022-03-29
 ### Changed
 - Upgrade EMR repository to version `5.31.0` (was `5.30.2`) so `AWS SDK for Java` library is upgraded to `1.11.852` that enables AWS web identity token file file authentication using hadoop and public constructors.
diff --git a/README.md b/README.md
index cf64cb4..7ee5d0f 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |APIARY_S3_INVENTORY_TABLE_FORMAT|No (defaults to `ORC`)|Format of S3 inventory data - `ORC`, `Parquet`, or `CSV`|
 |APIARY_SYSTEM_SCHEMA|No (defaults to `apiary_system`)|Name for internal system database.|
 |AWS_REGION|Yes|AWS region to configure various AWS clients.|
+|AWS_WEB_IDENTITY_TOKEN_FILE|No|Path of the AWS Web Identity Token File for IRSA/OIDC AWS authentication.|
+|DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES|No|`true`/`false` value for hive.metastore.disallow.incompatible.col.type.changes, default `true`.|
 |ENABLE_GLUESYNC|No|Option to turn on GlueSync Hive Metastore listener.|
 |ENABLE_METRICS|No|Option to enable sending Hive Metastore metrics to CloudWatch.|
 |ENABLE_S3_INVENTORY|No|Option to create Hive tables on top of S3 inventory data if enabled in `apiary-data-lake`. Enabled if value is not null/empty.|
@@ -17,6 +19,7 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |EXTERNAL_DATABASE|No|Option to enable external database mode, when specified it disables managing Hive Metastore MySQL database schema.|
 |GLUE_PREFIX|No|Prefix added to Glue databases to handle database name collisions when synchronizing multiple Hive Metastores to the Glue catalog.|
 |HADOOP_HEAPSIZE|No|Hive Metastore Java process heapsize.|
+|HMS_AUTOGATHER_STATS|No (default is `true`)|Whether or not to create basic statistics on table/partition creation. Valid values are `true` or `false`.|
 |HIVE_METASTORE_ACCESS_MODE|No|Hive Metastore access mode, applicable values are: readwrite, readonly|
 |HIVE_DB_NAMES|No|comma separated list of Hive database names, when specified Hive databases will be created and mapped to corresponding S3 buckets.|
 |HIVE_METASTORE_LOG_LEVEL|No|Hive Metastore service Log4j log level.|
@@ -27,6 +30,7 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |LDAP_CA_CERT|Base64 encoded Certificate Authority Bundle to validate LDAP SSL connection.|
 |LDAP_SECRET_ARN|No|LDAP bind DN SecretsManager secret ARN.|
 |LDAP_URL|No|Active Directory URL to enable group mapping in metastore.|
+|MYSQL_CONNECTION_POOL_SIZE|No (defaults to `10`)|MySQL Connection pool size for Hive Metastore. See [here](https://github.com/apache/hive/blob/master/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L1181) for more info.|
 |MYSQL_DB_HOST|Yes|Hive Metastore MySQL database hostname.|
 |MYSQL_DB_NAME|Yes|Hive Metastore MySQL database name.|
 |MYSQL_SECRET_ARN|Yes|Hive Metastore MySQL SecretsManager secret ARN.|
@@ -37,8 +41,6 @@ For more information please refer to the main [Apiary](https://github.com/Expedi
 |RANGER_SERVICE_NAME|No|Ranger service name used to configure RangerAuth plugin.|
 |SNS_ARN|No|The SNS topic ARN to which metadata updates will be sent.|
 |TABLE_PARAM_FILTER|No|A regular expression for selecting necessary table parameters. If the value isn't set, then no table parameters are selected.|
-|DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES|No|`true`/`false` value for hive.metastore.disallow.incompatible.col.type.changes, default `true`.|
-|AWS_WEB_IDENTITY_TOKEN_FILE|No|Path of the AWS Web Identity Token File for IRSA/OIDC AWS authentication.|
 
 # Contact
 
diff --git a/files/hive-site.xml b/files/hive-site.xml
index 443f619..c21ed31 100644
--- a/files/hive-site.xml
+++ b/files/hive-site.xml
@@ -28,6 +28,11 @@
     <value>true</value>
   </property>
 
+  <property>
+    <name>datanucleus.connectionPool.maxPoolSize</name>
+    <value>10</value>
+  </property>
+
   <property>
     <name>hive.metastore.uris</name>
     <value>thrift://localhost:9083</value>
@@ -88,4 +93,9 @@
     <value>true</value>
   </property>
 
+  <property>
+    <name>hive.stats.autogather</name>
+    <value>true</value>
+  </property>
+
 </configuration>
diff --git a/files/startup.sh b/files/startup.sh
index 21df902..4c4ce0e 100755
--- a/files/startup.sh
+++ b/files/startup.sh
@@ -16,10 +16,21 @@ if [[ -n ${HMS_MAX_THREADS} ]]; then
     update_property.py hive.metastore.server.max.threads "${HMS_MAX_THREADS}" /etc/hive/conf/hive-site.xml
 fi
 
+# config size of MySQL connection pool.
+# See https://github.com/apache/hive/blob/master/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L1181
+# and also make sure 2 * MYSQL_CONNECTION_POOL_SIZE * numHmsContainers is less than max connections for your MySQL instance.
+if [[ -n ${MYSQL_CONNECTION_POOL_SIZE} ]]; then
+    update_property.py datanucleus.connectionPool.maxPoolSize "${MYSQL_CONNECTION_POOL_SIZE}" /etc/hive/conf/hive-site.xml
+fi
+
 if [[ -n ${DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES} ]]; then
     update_property.py hive.metastore.disallow.incompatible.col.type.changes "${DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES}" /etc/hive/conf/hive-site.xml
 fi
 
+if [[ -n ${HMS_AUTOGATHER_STATS} ]]; then
+    update_property.py hive.stats.autogather "${HMS_AUTOGATHER_STATS}" /etc/hive/conf/hive-site.xml
+fi
+
 #configure LDAP group mapping, required for ranger authorization
 if [[ -n $LDAP_URL ]] ; then
     update_property.py hadoop.security.group.mapping.ldap.bind.user "$(aws secretsmanager get-secret-value --secret-id ${LDAP_SECRET_ARN}|jq .SecretString -r|jq .username -r)" /etc/hadoop/conf/core-site.xml