From caf3d13a797c78f47817bdf1a53c583ba51623d2 Mon Sep 17 00:00:00 2001 From: Georgi Ivanov Date: Fri, 21 Jun 2024 13:06:28 +0100 Subject: [PATCH 1/3] feat: make the connection pooling selection configurable --- CHANGELOG.md | 6 ++++++ files/hive-site.xml | 5 +++++ files/startup.sh | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2feab79..deaeeba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [5.0.2] - 2024-06-21 +### Added +- Added `datanucleus.connectionPoolingType` to hive-site.xml, defaults: `BoneCP` +- Added `DATANUCLEUS_CONNECTION_POOLING_HIKARICP` to support using HikariCP for database connection pooling. +- Added `DATANUCLEUS_CONNECTION_POOLING_MAXLIFETIME` to support configuring the max life time of a connection in the pool before it's recycled. The value is in miliseconds, defaults: `180000`. + ## [5.0.1] - 2024-06-19 ### Fixed - Added `MYSQL_DRIVER_JAR` to add the driver connector JAR to the system classpath. By default it is now using `/usr/share/java/mysql-connector-java.jar`. 
diff --git a/files/hive-site.xml b/files/hive-site.xml index 9a8350f..b2ae80f 100644 --- a/files/hive-site.xml +++ b/files/hive-site.xml @@ -28,6 +28,11 @@ true + + datanucleus.connectionPoolingType + BoneCP + + datanucleus.connectionPool.maxPoolSize 10 diff --git a/files/startup.sh b/files/startup.sh index 66d301f..d89fd96 100755 --- a/files/startup.sh +++ b/files/startup.sh @@ -27,6 +27,47 @@ if [[ -n ${MYSQL_CONNECTION_POOL_SIZE} ]]; then update_property.py datanucleus.connectionPool.maxPoolSize "${MYSQL_CONNECTION_POOL_SIZE}" /etc/hive/conf/hive-site.xml fi +# configure Connection Pooling +# valid options are BoneCP, DBCP, DBCP2, C3P0, HikariCP +if [ ! -z ${DATANUCLEUS_CONNECTION_POOLING_TYPE} ]; then + update_property.py datanucleus.connectionPoolingType "${DATANUCLEUS_CONNECTION_POOLING_TYPE}" /etc/hive/conf/hive-site.xml + + if [[ ${DATANUCLEUS_CONNECTION_POOLING_TYPE,,} == 'bonecp' ]]; then + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.maxPoolSize "${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.minPoolSize "${DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_IDLE} ]] && update_property.py datanucleus.connectionPool.maxIdle "${DATANUCLEUS_CONNECTION_POOL_MAX_IDLE}" /etc/hive/conf/hive-site.xml + fi + + if [[ ${DATANUCLEUS_CONNECTION_POOLING_TYPE,,} == 'dbcp' || ${DATANUCLEUS_CONNECTION_POOLING_TYPE,,} == 'dbcp2' ]]; then + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_IDLE} ]] && update_property.py datanucleus.connectionPool.maxIdle "${DATANUCLEUS_CONNECTION_POOL_MAX_IDLE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE} ]] && update_property.py datanucleus.connectionPool.minIdle "${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE}" /etc/hive/conf/hive-site.xml + [[ ! 
-z ${DATANUCLEUS_CONNECTION_POOL_MIN_ACTIVE} ]] && update_property.py datanucleus.connectionPool.maxActive "${DATANUCLEUS_CONNECTION_POOL_MIN_ACTIVE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_WAIT} ]] && update_property.py datanucleus.connectionPool.maxWait "${DATANUCLEUS_CONNECTION_POOL_MAX_WAIT}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_VALIDATION_TIMEOUT} ]] && update_property.py datanucleus.connectionPool.validationTimeout "${DATANUCLEUS_CONNECTION_POOL_VALIDATION_TIMEOUT}" /etc/hive/conf/hive-site.xml + fi + + if [[ ${DATANUCLEUS_CONNECTION_POOLING_TYPE,,} == 'c3p0' ]]; then + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.maxPoolSize "${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.minPoolSize "${DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.initialPoolSize "${DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE}" /etc/hive/conf/hive-site.xml + fi + + if [[ ${DATANUCLEUS_CONNECTION_POOLING_TYPE,,} == 'hikaricp' ]]; then + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.maxPoolSize "${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE} ]] && update_property.py datanucleus.connectionPool.minIdle "${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_LEAK_DETECTION_THRESHOLD} ]] && update_property.py datanucleus.connectionPool.leakThreshold "${DATANUCLEUS_CONNECTION_POOL_LEAK_DETECTION_THRESHOLD}" /etc/hive/conf/hive-site.xml + [[ ! 
-z ${DATANUCLEUS_CONNECTION_POOL_LEAK_MAX_LIFETIME} ]] && update_property.py datanucleus.connectionPool.maxLifetime "${DATANUCLEUS_CONNECTION_POOL_LEAK_MAX_LIFETIME}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_AUTO_COMMIT} ]] && update_property.py datanucleus.connectionPool.autoCommit "${DATANUCLEUS_CONNECTION_POOL_AUTO_COMMIT}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_IDLE_TIMEOUT} ]] && update_property.py datanucleus.connectionPool.idleTimeout "${DATANUCLEUS_CONNECTION_POOL_IDLE_TIMEOUT}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_CONNECTION_WAIT_TIMEOUT} ]] && update_property.py datanucleus.connectionPool.connectionWaitTimeout "${DATANUCLEUS_CONNECTION_POOL_CONNECTION_WAIT_TIMEOUT}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_VALIDATION_TIMEOUT} ]] && update_property.py datanucleus.connectionPool.validationTimeout "${DATANUCLEUS_CONNECTION_POOL_VALIDATION_TIMEOUT}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_READ_ONLY} ]] && update_property.py datanucleus.connectionPool.readOnly "${DATANUCLEUS_CONNECTION_POOL_READ_ONLY}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_NAME} ]] && update_property.py datanucleus.connectionPool.name "${DATANUCLEUS_CONNECTION_POOL_NAME}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_CATALOG} ]] && update_property.py datanucleus.connectionPool.catalog "${DATANUCLEUS_CONNECTION_POOL_CATALOG}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_REGISTER_MBEANS} ]] && update_property.py datanucleus.connectionPool.registerMbeans "${DATANUCLEUS_CONNECTION_POOL_REGISTER_MBEANS}" /etc/hive/conf/hive-site.xml + fi +fi + if [[ -n ${DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES} ]]; then update_property.py hive.metastore.disallow.incompatible.col.type.changes "${DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES}" /etc/hive/conf/hive-site.xml fi @@ -208,6 +249,7 @@ if [ ! 
-z ${ENABLE_HIVE_LOCK_HOUSE_KEEPER} ]; then update_property.py hive.compactor.worker.threads 1 /etc/hive/conf/hive-site.xml update_property.py hive.txn.strict.locking.mode false /etc/hive/conf/hive-site.xml fi + #auto configure heapsize if [ ! -z ${ECS_CONTAINER_METADATA_URI} ]; then export MEM_LIMIT=$(wget -q -O - ${ECS_CONTAINER_METADATA_URI}/task|jq -r .Limits.Memory) From 4be83a29939691a247ca0e77fd366547477c5536 Mon Sep 17 00:00:00 2001 From: Georgi Ivanov Date: Fri, 21 Jun 2024 16:25:55 +0100 Subject: [PATCH 2/3] update CHANGELOG and README --- CHANGELOG.md | 22 ++++++++-- README.md | 112 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 84 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index deaeeba..e3944af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,27 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## [5.0.2] - 2024-06-21 +## [5.1.0] - 2024-06-2x ### Added - Added `datanucleus.connectionPoolingType` to hive-site.xml, defaults: `BoneCP` -- Added `DATANUCLEUS_CONNECTION_POOLING_HIKARICP` to support using HikariCP for database connection pooling. -- Added `DATANUCLEUS_CONNECTION_POOLING_MAXLIFETIME` to support configuring the max life time of a connection in the pool before it's recycled. The value is in miliseconds, defaults: `180000`. +- Added `DATANUCLEUS_CONNECTION_POOLING_TYPE` to support changing the database connection pooling. Valid options are `BoneCP`, `DBCP`, `DBCP2`, `C3P0`, `HikariCP`. +- Added `DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE` - Maximum pool size for the connection pool. +- Added `DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE` - Minimum pool size for the connection pool. +- Added `DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE` - Initial pool size for the connection pool (C3P0 only). 
+- Added `DATANUCLEUS_CONNECTION_POOL_MAX_IDLE` - Maximum idle connections for the connection pool. +- Added `DATANUCLEUS_CONNECTION_POOL_MIN_IDLE` - Minimum idle connections for the connection pool. +- Added `DATANUCLEUS_CONNECTION_POOL_MIN_ACTIVE` - Maximum active connections for the connection pool (DBCP/DBCP2 only). +- Added `DATANUCLEUS_CONNECTION_POOL_MAX_WAIT` - Maximum wait time for the connection pool (DBCP/DBCP2 only). +- Added `DATANUCLEUS_CONNECTION_POOL_VALIDATION_TIMEOUT` - Validation timeout for the connection pool (DBCP/DBCP2/HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_LEAK_DETECTION_THRESHOLD` - Leak detection threshold for the connection pool (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_LEAK_MAX_LIFETIME` - Maximum lifetime for the connection pool (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_AUTO_COMMIT` - Auto commit for the connection pool (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_IDLE_TIMEOUT` - Idle timeout for the connection pool (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_CONNECTION_WAIT_TIMEOUT` - Connection wait timeout for the connection pool (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_READ_ONLY` - Read only mode for the connection pool (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_NAME` - Connection pool name (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_CATALOG` - Connection pool catalog (HikariCP only). +- Added `DATANUCLEUS_CONNECTION_POOL_REGISTER_MBEANS` - Register MBeans for the connection pool (HikariCP only). ## [5.0.1] - 2024-06-19 ### Fixed diff --git a/README.md b/README.md index 02bf4b6..5744743 100644 --- a/README.md +++ b/README.md @@ -4,53 +4,71 @@ For more information please refer to the main [Apiary](https://github.com/ExpediaGroup/apiary) project page. 
## Environment Variables -|Environment Variable|Required| Description | -|----|----|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -|APIARY_S3_INVENTORY_PREFIX|No (defaults to `EntireBucketDaily`)| Prefix used by S3 Inventory when creating data in the inventory bucket. | -|APIARY_S3_INVENTORY_TABLE_FORMAT|No (defaults to `ORC`)| Format of S3 inventory data - `ORC`, `Parquet`, or `CSV` | -|APIARY_SYSTEM_SCHEMA|No (defaults to `apiary_system`)| Name for internal system database. | -|AWS_REGION|Yes| AWS region to configure various AWS clients. | -|AWS_WEB_IDENTITY_TOKEN_FILE|No| Path of the AWS Web Identity Token File for IRSA/OIDC AWS authentication. | -|DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES|No| `true`/`false` value for hive.metastore.disallow.incompatible.col.type.changes, default `true`. | -|ENABLE_GLUESYNC|No| Option to turn on GlueSync Hive Metastore listener. | -|ENABLE_HIVE_LOCK_HOUSE_KEEPER|No| Option to turn on Hive Metastore Hive Lock House Keeper. | -|ENABLE_METRICS|No| Option to enable sending Hive Metastore and JMX metrics to Prometheus. | -|ENABLE_S3_INVENTORY|No| Option to create Hive tables on top of S3 inventory data if enabled in `apiary-data-lake`. Enabled if value is not null/empty. | -|ENABLE_S3_LOGS|No| Option to create Hive tables on top of S3 access logs data if enabled in `apiary-data-lake`. Enabled if value is not null/empty. | -|EXTERNAL_DATABASE|No| Option to enable external database mode, when specified it disables managing Hive Metastore MySQL database schema. | -|GLUE_PREFIX|No| Prefix added to Glue databases to handle database name collisions when synchronizing multiple Hive Metastores to the Glue catalog. | -|HADOOP_HEAPSIZE|No| Hive Metastore Java process heapsize. 
| -|HMS_AUTOGATHER_STATS|No (default is `true`)| Whether or not to create basic statistics on table/partition creation. Valid values are `true` or `false`. | -|LIMIT_PARTITION_REQUEST_NUMBER|No (default is `-1`)| To protect the cluster, this controls how many partitions can be scanned for each partitioned table. The default value "-1" means no limit. The limit on partitions does not affect metadata-only queries. | -|HIVE_METASTORE_ACCESS_MODE|No| Hive Metastore access mode, applicable values are: readwrite, readonly | -|HIVE_DB_NAMES|No| comma separated list of Hive database names, when specified Hive databases will be created and mapped to corresponding S3 buckets. | -|HIVE_METASTORE_LOG_LEVEL|No| Hive Metastore service Log4j log level. | -|HMS_MIN_THREADS|No (defaults to `200`)| Minimum size of the Hive metastore thread pool. | -|HMS_MAX_THREADS|No (defaults to `1000`)| Maximum size of the Hive metastore thread pool. | -|INSTANCE_NAME|Yes| Apiary instance name, will be used as prefix on most AWS resources to allow multiple Apiary instance deployments. | -|KAFKA_BOOTSTRAP_SERVERS|No| Kafka Bootstrap Servers to enable Kafka Metastore listener and send Metastore events to Kafka. | -|KAFKA_CLIENT_ID|No| Kafka label you define that names the Kafka producer. | -|KAFKA_COMPRESSION_TYPE|No (defaults to `1048576`)| The maximum size of a request in bytes. This setting will limit the number of record batches the producer will send in a single request to avoid sending huge requests. This is also effectively a cap on the maximum uncompressed record batch size. | -|KAFKA_MAX_REQUEST_SIZE|No| Kafka Compression type, if none is specified there is no compression enabled. Values available are gzip, lz4 and snappy. | -|LDAP_BASE|No| LDAP base DN used to search for user groups. | -|LDAP_CA_CERT|Base64 encoded Certificate Authority Bundle to validate LDAP SSL connection.| -|LDAP_SECRET_ARN|No| LDAP bind DN SecretsManager secret ARN. 
| -|LDAP_URL|No| Active Directory URL to enable group mapping in metastore. | -|MYSQL_CONNECTION_DRIVER_NAME|No (defaults to `com.mysql.jdbc.Driver`)| Hive Metastore MySQL database JDBC connection Driver Name. | -|MYSQL_CONNECTION_POOL_SIZE|No (defaults to `10`)| MySQL Connection pool size for Hive Metastore. See [here](https://github.com/apache/hive/blob/master/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L1181) for more info. | -|MYSQL_DB_HOST|Yes| Hive Metastore MySQL database hostname. | -|MYSQL_DB_NAME|Yes| Hive Metastore MySQL database name. | -|MYSQL_SECRET_ARN|Yes| Hive Metastore MySQL SecretsManager secret ARN. | -|MYSQL_SECRET_USERNAME_KEY|No (defaults to `username`)| Hive Metastore MySQL SecretsManager secret username key. | -|MYSQL_TYPE|No (defaults to `mysql`)| Hive Metastore MySQL database Type (mariadb, mysql). | -|MYSQL_DRIVER_JAR|No (defaults to `/usr/share/java/mysql-connector-java.jar`)| Hive Metastore MySQL connector JAR location | -|RANGER_AUDIT_DB_URL|No| Ranger audit database JDBC URL. | -|RANGER_AUDIT_SECRET_ARN|No| Ranger audit database secret ARN. | -|RANGER_AUDIT_SOLR_URL|No| Ranger Solr audit URL. | -|RANGER_POLICY_MANAGER_URL|No| Ranger admin URL from where policies will be downloaded. | -|RANGER_SERVICE_NAME|No| Ranger service name used to configure RangerAuth plugin. | -|SNS_ARN|No| The SNS topic ARN to which metadata updates will be sent. | -|TABLE_PARAM_FILTER|No| A regular expression for selecting necessary table parameters. If the value isn't set, then no table parameters are selected. 
| +| Environment Variable | Required | Description | +|------------------------------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| APIARY_S3_INVENTORY_PREFIX | No | Prefix used by S3 Inventory when creating data in the inventory bucket. Default is `EntireBucketDaily`. | +| APIARY_S3_INVENTORY_TABLE_FORMAT | No | Format of S3 inventory data. Valid options are `ORC`, `Parquet`, or `CSV`. Default is `ORC`. | +| APIARY_SYSTEM_SCHEMA | No | Name for internal system database. Default is `apiary_system`. | +| AWS_REGION | Yes | AWS region to configure various AWS clients. | +| AWS_WEB_IDENTITY_TOKEN_FILE | No | Path of the AWS Web Identity Token File for IRSA/OIDC AWS authentication. | +| DATANUCLEUS_CONNECTION_POOLING_TYPE | No | Type of connection pooling. Valid options are `BoneCP`, `DBCP`, `DBCP2`, `C3P0`, `HikariCP`. | +| DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE | No | Maximum pool size for the connection pool. | +| DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE | No | Minimum pool size for the connection pool. | +| DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE | No | Initial pool size for the connection pool (C3P0 only). | +| DATANUCLEUS_CONNECTION_POOL_MAX_IDLE | No | Maximum idle connections for the connection pool. | +| DATANUCLEUS_CONNECTION_POOL_MIN_IDLE | No | Minimum idle connections for the connection pool. | +| DATANUCLEUS_CONNECTION_POOL_MIN_ACTIVE | No | Maximum active connections for the connection pool (DBCP/DBCP2 only). | +| DATANUCLEUS_CONNECTION_POOL_MAX_WAIT | No | Maximum wait time for the connection pool (DBCP/DBCP2 only). | +| DATANUCLEUS_CONNECTION_POOL_VALIDATION_TIMEOUT | No | Validation timeout for the connection pool (DBCP/DBCP2/HikariCP only). 
| +| DATANUCLEUS_CONNECTION_POOL_LEAK_DETECTION_THRESHOLD | No | Leak detection threshold for the connection pool (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_LEAK_MAX_LIFETIME | No | Maximum lifetime for the connection pool (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_AUTO_COMMIT | No | Auto commit for the connection pool (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_IDLE_TIMEOUT | No | Idle timeout for the connection pool (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_CONNECTION_WAIT_TIMEOUT | No | Connection wait timeout for the connection pool (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_READ_ONLY | No | Read only mode for the connection pool (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_NAME | No | Connection pool name (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_CATALOG | No | Connection pool catalog (HikariCP only). | +| DATANUCLEUS_CONNECTION_POOL_REGISTER_MBEANS | No | Register MBeans for the connection pool (HikariCP only). | +| DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES | No | `true`/`false` value for hive.metastore.disallow.incompatible.col.type.changes, default `true`. | +| ENABLE_GLUESYNC | No | Option to turn on GlueSync Hive Metastore listener. | +| ENABLE_HIVE_LOCK_HOUSE_KEEPER | No | Option to turn on Hive Metastore Hive Lock House Keeper. | +| ENABLE_METRICS | No | Option to enable sending Hive Metastore and JMX metrics to Prometheus. | +| ENABLE_S3_INVENTORY | No | Option to create Hive tables on top of S3 inventory data if enabled in `apiary-data-lake`. Enabled if value is not null/empty. | +| ENABLE_S3_LOGS | No | Option to create Hive tables on top of S3 access logs data if enabled in `apiary-data-lake`. Enabled if value is not null/empty. | +| EXTERNAL_DATABASE | No | Option to enable external database mode, when specified it disables managing Hive Metastore MySQL database schema. 
| +| GLUE_PREFIX | No | Prefix added to Glue databases to handle database name collisions when synchronizing multiple Hive Metastores to the Glue catalog. | +| HADOOP_HEAPSIZE | No | Hive Metastore Java process heapsize. Default is `1024`. | +| HMS_AUTOGATHER_STATS | No | Whether or not to create basic statistics on table/partition creation. Valid values are `true` or `false`. Default is `true`. | +| LIMIT_PARTITION_REQUEST_NUMBER | No | To protect the cluster, this controls how many partitions can be scanned for each partitioned table. The default value `-1` means no limit. The limit on partitions does not affect metadata-only queries. | +| HIVE_METASTORE_ACCESS_MODE | No | Hive Metastore access mode, applicable values are: readwrite, readonly. | +| HIVE_DB_NAMES | No | Comma separated list of Hive database names, when specified Hive databases will be created and mapped to corresponding S3 buckets. | +| HIVE_METASTORE_LOG_LEVEL | No | Hive Metastore service Log4j log level. Default is `INFO`. | +| HMS_MIN_THREADS | No | Minimum size of the Hive metastore thread pool. Default is `200`. | +| HMS_MAX_THREADS | No | Maximum size of the Hive metastore thread pool. Default is `1000`. | +| INSTANCE_NAME | Yes | Apiary instance name, will be used as prefix on most AWS resources to allow multiple Apiary instance deployments. | +| KAFKA_BOOTSTRAP_SERVERS | No | Kafka Bootstrap Servers to enable Kafka Metastore listener and send Metastore events to Kafka. | +| KAFKA_CLIENT_ID | No | Kafka label you define that names the Kafka producer. | +| KAFKA_COMPRESSION_TYPE | No | Kafka Compression type, if none is specified there is no compression enabled. Values available are gzip, lz4 and snappy. | +| KAFKA_MAX_REQUEST_SIZE | No | The maximum size of a request in bytes. This setting will limit the number of record batches the producer will send in a single request to avoid sending huge requests.
This is also effectively a cap on the maximum uncompressed record batch size. Default is `1048576`. | +| LDAP_BASE | No | LDAP base DN used to search for user groups. | +| LDAP_CA_CERT | No | Base64 encoded Certificate Authority Bundle to validate LDAP SSL connection. | +| LDAP_SECRET_ARN | No | LDAP bind DN SecretsManager secret ARN. | +| LDAP_URL | No | Active Directory URL to enable group mapping in metastore. | +| MYSQL_CONNECTION_DRIVER_NAME | No | Hive Metastore MySQL database JDBC connection Driver Name. Default is `com.mysql.jdbc.Driver`. | +| MYSQL_CONNECTION_POOL_SIZE | No | MySQL Connection pool size for Hive Metastore. Default is `10`. See [here](https://github.com/apache/hive/blob/master/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L1181) for more info. | +| MYSQL_DB_HOST | Yes | Hive Metastore MySQL database hostname. | +| MYSQL_DB_NAME | Yes | Hive Metastore MySQL database name. | +| MYSQL_SECRET_ARN | Yes | Hive Metastore MySQL SecretsManager secret ARN. | +| MYSQL_SECRET_USERNAME_KEY | No | Hive Metastore MySQL SecretsManager secret username key. Default is `username`. | +| MYSQL_TYPE | No | Hive Metastore MySQL database Type (mariadb, mysql). Default is `mysql`. | +| MYSQL_DRIVER_JAR | No | Hive Metastore MySQL connector JAR location. Default is `/usr/share/java/mysql-connector-java.jar`. | +| RANGER_AUDIT_DB_URL | No | Ranger audit database JDBC URL. | +| RANGER_AUDIT_SECRET_ARN | No | Ranger audit database secret ARN. | +| RANGER_AUDIT_SOLR_URL | No | Ranger Solr audit URL. | +| RANGER_POLICY_MANAGER_URL | No | Ranger admin URL from where policies will be downloaded. | +| RANGER_SERVICE_NAME | No | Ranger service name used to configure RangerAuth plugin.
| +| SNS_ARN | No | The SNS topic ARN to which metadata updates will be sent. | +| TABLE_PARAM_FILTER | No | A regular expression for selecting necessary table parameters. If the value isn't set, then no table parameters are selected. | # Contact From 8375ba11c08ee3c1d832d3d813c33d77a4140845 Mon Sep 17 00:00:00 2001 From: Georgi Ivanov Date: Fri, 21 Jun 2024 16:28:15 +0100 Subject: [PATCH 3/3] update CHANGELOG and README --- files/startup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/files/startup.sh b/files/startup.sh index d89fd96..725be4f 100755 --- a/files/startup.sh +++ b/files/startup.sh @@ -49,12 +49,12 @@ if [ ! -z ${DATANUCLEUS_CONNECTION_POOLING_TYPE} ]; then if [[ ${DATANUCLEUS_CONNECTION_POOLING_TYPE,,} == 'c3p0' ]]; then [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.maxPoolSize "${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE}" /etc/hive/conf/hive-site.xml [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.minPoolSize "${DATANUCLEUS_CONNECTION_POOL_MIN_POOLSIZE}" /etc/hive/conf/hive-site.xml - [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.initialPoolSize "${DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE}" /etc/hive/conf/hive-site.xml + [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.initialPoolSize "${DATANUCLEUS_CONNECTION_POOL_INITIAL_POOLSIZE}" /etc/hive/conf/hive-site.xml fi if [[ ${DATANUCLEUS_CONNECTION_POOLING_TYPE,,} == 'hikaricp' ]]; then [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE} ]] && update_property.py datanucleus.connectionPool.maxPoolSize "${DATANUCLEUS_CONNECTION_POOL_MAX_POOLSIZE}" /etc/hive/conf/hive-site.xml - [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE} ]] && update_property.py datanucleus.connectionPool.minIdle "${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE}" /etc/hive/conf/hive-site.xml + [[ !
-z ${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE} ]] && update_property.py datanucleus.connectionPool.minIdle "${DATANUCLEUS_CONNECTION_POOL_MIN_IDLE}" /etc/hive/conf/hive-site.xml [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_LEAK_DETECTION_THRESHOLD} ]] && update_property.py datanucleus.connectionPool.leakThreshold "${DATANUCLEUS_CONNECTION_POOL_LEAK_DETECTION_THRESHOLD}" /etc/hive/conf/hive-site.xml [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_LEAK_MAX_LIFETIME} ]] && update_property.py datanucleus.connectionPool.maxLifetime "${DATANUCLEUS_CONNECTION_POOL_LEAK_MAX_LIFETIME}" /etc/hive/conf/hive-site.xml [[ ! -z ${DATANUCLEUS_CONNECTION_POOL_AUTO_COMMIT} ]] && update_property.py datanucleus.connectionPool.autoCommit "${DATANUCLEUS_CONNECTION_POOL_AUTO_COMMIT}" /etc/hive/conf/hive-site.xml