diff --git a/.gitignore b/.gitignore index 636179f..caab972 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ ansible/*-hosts ansible/*-hosts test/* +TODO diff --git a/CHANGELOG.md b/CHANGELOG.md index 36ea998..e6b9c5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [3.3.0] - 2021-03-18 +### Added +- Add support for remote region metastores and alluxio cache. Access to tables in remote region s3 can be transparently redirected to alluxio using waggle-dance and hive hook. + ## [3.2.3] - 2021-03-15 ### Changed - Option to disable metastore to mitigate issues. diff --git a/README.md b/README.md index e2d26f7..c53769f 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ For more information please refer to the main [Apiary](https://github.com/Expedi ## Variables | Name | Description | Type | Default | Required | |------|-------------|:----:|:-----:|:-----:| +| alluxio_endpoints | List of Alluxio endpoints(map of root url and s3 buckets) used to replace s3 paths with alluxio paths. See section [`Usage`](#Usage)| list | `` | no | | aws_region | AWS region to use for resources. | string | - | yes | | bastion_ssh_key_secret_name | Secret name in AWS Secrets Manager which stores the private key used to log in to bastions. The secret's key should be `private_key` and the value should be stored as a base64 encoded string. Max character limit for a secret's value is 4096. | string | `` | no | | cpu | The number of CPU units to reserve for the Waggle Dance container. Valid values can be 256, 512, 1024, 2048 and 4096. 
Reference: https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html | string | `1024` | no | @@ -27,6 +28,7 @@ For more information please refer to the main [Apiary](https://github.com/Expedi | primary_metastore_port | Primary Hive Metastore port | string | `9083` | no | | primary_metastore_whitelist | List of Hive databases to whitelist on primary Metastore. | list | `` | no | | remote_metastores | List of VPC endpoint services to federate Metastores in other accounts. See section [`remote_metastores`](#remote_metastores) for more info.| list | `` | no | +| remote_region_metastores | List of VPC endpoint services to federate Metastores in other region,other accounts. The actual data from tables in these metastores can be accessed using Alluxio caching instead of reading the data from S3 directly. See section [`remote_region_metastores`](#remote_region_metastores) for more info.| list | `` | no | | secondary_vpcs | List of VPCs to associate with Service Discovery namespace. | list | `` | no | | ssh_metastores | List of federated Metastores to connect to over SSH via bastion. See section [`ssh_metastores`](#ssh_metastores) for more info.| list | `` | no | | subnets | ECS container subnets. | list | - | yes | @@ -44,6 +46,12 @@ Example module invocation: ``` module "apiary-waggledance" { source = "git::https://github.com/ExpediaGroup/apiary-federation.git?ref=master" + + #required for creating VPC endpoints in remote region + providers = { + aws.remote = aws.remote + } + instance_name = "waggledance-test" wd_ecs_task_count = "1" aws_region = "us-west-2" @@ -79,6 +87,32 @@ module "apiary-waggledance" { enabled = false //option to enable/disable metastore in waggle-dance without removing vpc endpoint. 
}, ] + remote_region_metastores = [ + { + endpoint = "com.amazonaws.vpce.us-west-2.vpce-svc-1" + port = "9083" + prefix = "metastore1" + mapped-databases = "default,test" + database-name-mapping = "test:test_alias,default:default_alias" + writable-whitelist = "test" + vpc_id = "vpc-123456" + subnets = "subnet-1,subnet-2" + security_group_id = "sg1" + }, + ] + + alluxio_endpoints = [ + { + root_url = "alluxio://alluxio1:19998/" + s3_buckets = "bucket1,bucket2" + } + , + { + root_url = "alluxio://alluxio2:19998/" + s3_buckets = "bucket3,bucket4" + } + ] + } ``` @@ -142,6 +176,43 @@ Name | Description | Type | Default | Required | See [Waggle Dance README](https://github.com/HotelsDotCom/waggle-dance/README.md) for more information on all these parameters. +### remote_region_metastores + +A list of maps. Each map entry describes a federated metastore endpoint accessible via an AWS VPC endpoint. The actual data for these metastores will be accessed using Alluxio caching instead of reading the data from S3 directly. + +An example entry looks like: +``` +remote_region_metastores = [ + { + endpoint = "com.amazonaws.vpce.us-west-2.vpce-svc-1" + port = "9083" + prefix = "remote1" + mapped-databases = "default,test" + database-name-mapping = "test:test_alias,default:default_alias" + writable-whitelist = ".*" + vpc_id = "vpc-123456" + subnets = "subnet-1,subnet-2" + security_group_id = "sg1" + } +] +``` +`remote_region_metastores` map entry fields: + +Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| endpoint | AWS VPC endpoint service name that is connected to the remote Hive metastore. | string | - | yes | +| port | IP port that the Thrift server of the remote Hive metastore listens on. | string | `"9083"` | no | +| prefix | Prefix added to the database names from this metastore. Must be unique among all local, remote, and SSH federated metastores in this Waggle Dance instance. 
| string | - | yes | +| mapped-databases | Comma-separated list of databases from this metastore to expose to federation. If not specified, *all* databases are exposed.| string | `""` | no | +| database-name-mapping | Comma-separated list of `database:alias` key/value pairs to add aliases for the given databases. Default is no aliases. This is used primarily in migration scenarios where a database has been renamed/relocated. See [Waggle Dance Database Name Mapping](https://github.com/HotelsDotCom/waggle-dance#database-name-mapping) for more information. | string | `""` | no | +| writable-whitelist | Comma-separated list of databases from this metastore that can be in read-write mode. If not specified, all databases are read-only. Use `.*` to allow all databases to be written to. | string | `""` | no | +| vpc_id | Remote region AWS VPC id. | string | - | yes | +| subnets | AWS VPC subnets in remote region. | string | - | yes | +| security_group_id | AWS EC2 security group in remote region. | string | - | yes | + +See [Waggle Dance README](https://github.com/HotelsDotCom/waggle-dance/README.md) for more information on all these parameters. + + ### ssh_metastores A list of maps. Each map entry describes a federated metastore endpoint connected via an SSH bastion host. 
diff --git a/endpoints.tf b/endpoints.tf index e80d907..977d36b 100644 --- a/endpoints.tf +++ b/endpoints.tf @@ -41,6 +41,20 @@ resource "aws_vpc_endpoint" "remote_metastores" { tags = merge(map("Name", "${var.remote_metastores[count.index].prefix}_metastore"), var.tags) } +# Interface VPC endpoints for remote region metastores, created through the aliased aws.remote provider and keyed by endpoint service name. +resource "aws_vpc_endpoint" "remote_region_metastores" { + for_each = { + for metastore in var.remote_region_metastores : metastore["endpoint"] => metastore + } + provider = aws.remote + vpc_id = each.value["vpc_id"] + vpc_endpoint_type = "Interface" + service_name = each.value["endpoint"] + subnet_ids = split(",", each.value["subnets"]) + security_group_ids = [each.value["security_group_id"]] + tags = merge(map("Name", "${each.value["prefix"]}_metastore"), var.tags) +} + resource "aws_route53_zone" "remote_metastore" { count = var.enable_remote_metastore_dns == "" ? 0 : 1 name = "${local.remote_metastore_zone_prefix}-${var.aws_region}.${var.domain_extension}" diff --git a/hive-site.tf b/hive-site.tf new file mode 100644 index 0000000..fe35079 --- /dev/null +++ b/hive-site.tf @@ -0,0 +1,29 @@ +# Renders hive-site.xml: enables Apiary path replacement and emits one regex/value/capturegroups property triple per S3 bucket of each Alluxio endpoint. +data "template_file" "hive_site_xml" { + template = <<EOF +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> +<configuration> + <property> + <name>apiary.path.replacement.enabled</name> + <value>true</value> + </property> +%{for alluxio_endpoint in var.alluxio_endpoints} +%{for s3_bucket in split(",", alluxio_endpoint.s3_buckets)} + <property> + <name>apiary.path.replacement.regex.alluxio-${s3_bucket}</name> + <value>^(s3://)${s3_bucket}/.*</value> + </property> + <property> + <name>apiary.path.replacement.value.alluxio-${s3_bucket}</name> + <value>${alluxio_endpoint.root_url}</value> + </property> + <property> + <name>apiary.path.replacement.capturegroups.alluxio-${s3_bucket}</name> + <value>1</value> + </property> +%{endfor} +%{endfor} +</configuration> +EOF +} diff --git a/k8s.tf b/k8s.tf index 7c9ecee..b3a49ac 100644 --- a/k8s.tf +++ b/k8s.tf @@ -54,6 +54,11 @@ resource "kubernetes_deployment" "waggle_dance" { name = "FEDERATION_YAML" value = base64encode(data.template_file.federation_yaml.rendered) } + env { + name = "HIVE_SITE_XML" + # length() instead of == []: a list(map(string)) never compares equal to the empty tuple []. + value = length(var.alluxio_endpoints) == 0 ? 
"" : base64encode(data.template_file.hive_site_xml.rendered) + } resources { limits { memory = "${local.memory_limit}Mi" diff --git a/provider_proxy.tf b/provider_proxy.tf new file mode 100644 index 0000000..ad43096 --- /dev/null +++ b/provider_proxy.tf @@ -0,0 +1,3 @@ +provider "aws" { + alias = "remote" +} diff --git a/templates.tf b/templates.tf index 539ad8e..3d0ae81 100644 --- a/templates.tf +++ b/templates.tf @@ -50,13 +50,14 @@ data "template_file" "local_metastores_yaml" { template = file("${path.module}/templates/waggle-dance-federation-local.yml.tmpl") vars = { - prefix = var.local_metastores[count.index].prefix - metastore_host = var.local_metastores[count.index].host - metastore_port = lookup(var.local_metastores[count.index], "port", "9083") - mapped_databases = lookup(var.local_metastores[count.index], "mapped-databases", "") - database_name_mapping = lookup(var.local_metastores[count.index], "database-name-mapping", "") - writable_whitelist = lookup(var.local_metastores[count.index], "writable-whitelist", "") - metastore_enabled = lookup(var.local_metastores[count.index], "enabled", true) + prefix = var.local_metastores[count.index].prefix + metastore_host = var.local_metastores[count.index].host + metastore_port = lookup(var.local_metastores[count.index], "port", "9083") + mapped_databases = lookup(var.local_metastores[count.index], "mapped-databases", "") + database_name_mapping = lookup(var.local_metastores[count.index], "database-name-mapping", "") + writable_whitelist = lookup(var.local_metastores[count.index], "writable-whitelist", "") + enable_path_conversion = lookup(var.local_metastores[count.index], "enable_path_conversion", false) + metastore_enabled = lookup(var.local_metastores[count.index], "enabled", true) } } @@ -65,16 +66,34 @@ data "template_file" "remote_metastores_yaml" { template = file("${path.module}/templates/waggle-dance-federation-remote.yml.tmpl") vars = { - prefix = var.remote_metastores[count.index].prefix - 
metastore_host = aws_vpc_endpoint.remote_metastores[count.index].dns_entry[0].dns_name - metastore_port = lookup(var.remote_metastores[count.index], "port", "9083") - mapped_databases = lookup(var.remote_metastores[count.index], "mapped-databases", "") - database_name_mapping = lookup(var.remote_metastores[count.index], "database-name-mapping", "") - writable_whitelist = lookup(var.remote_metastores[count.index], "writable-whitelist", "") - metastore_enabled = lookup(var.remote_metastores[count.index], "enabled", true) + prefix = var.remote_metastores[count.index].prefix + metastore_host = aws_vpc_endpoint.remote_metastores[count.index].dns_entry[0].dns_name + metastore_port = lookup(var.remote_metastores[count.index], "port", "9083") + mapped_databases = lookup(var.remote_metastores[count.index], "mapped-databases", "") + database_name_mapping = lookup(var.remote_metastores[count.index], "database-name-mapping", "") + writable_whitelist = lookup(var.remote_metastores[count.index], "writable-whitelist", "") + enable_path_conversion = lookup(var.remote_metastores[count.index], "enable_path_conversion", false) + metastore_enabled = lookup(var.remote_metastores[count.index], "enabled", true) } } +data "template_file" "remote_region_metastores_yaml" { + count = length(var.remote_region_metastores) + template = file("${path.module}/templates/waggle-dance-federation-remote.yml.tmpl") + + vars = { + prefix = var.remote_region_metastores[count.index].prefix + metastore_host = aws_vpc_endpoint.remote_region_metastores[var.remote_region_metastores[count.index]["endpoint"]].dns_entry[0].dns_name + metastore_port = lookup(var.remote_region_metastores[count.index], "port", "9083") + mapped_databases = lookup(var.remote_region_metastores[count.index], "mapped-databases", "") + database_name_mapping = lookup(var.remote_region_metastores[count.index], "database-name-mapping", "") + writable_whitelist = lookup(var.remote_region_metastores[count.index], "writable-whitelist", "") + 
enable_path_conversion = lookup(var.remote_region_metastores[count.index], "enable_path_conversion", true) + metastore_enabled = lookup(var.remote_region_metastores[count.index], "enabled", true) + } +} + + data "template_file" "ssh_metastores_yaml" { count = length(var.ssh_metastores) template = file("${path.module}/templates/waggle-dance-federation-ssh.yml.tmpl") @@ -102,6 +121,7 @@ data "template_file" "federation_yaml" { primary_metastore_whitelist = join("", data.template_file.primary_metastore_whitelist.*.rendered) local_metastores = join("", data.template_file.local_metastores_yaml.*.rendered) remote_metastores = join("", data.template_file.remote_metastores_yaml.*.rendered) + remote_region_metastores = join("", data.template_file.remote_region_metastores_yaml.*.rendered) ssh_metastores = join("", data.template_file.ssh_metastores_yaml.*.rendered) } } @@ -117,6 +137,8 @@ data "template_file" "waggledance" { loggroup = var.wd_instance_type == "ecs" ? join("", aws_cloudwatch_log_group.waggledance_ecs.*.name) : "" server_yaml = base64encode(data.template_file.server_yaml.rendered) federation_yaml = base64encode(data.template_file.federation_yaml.rendered) + # Empty when no Alluxio endpoints are configured; length() is used because a list(map(string)) never compares equal to the empty tuple []. + hive_site_xml = length(var.alluxio_endpoints) == 0 ? "" : base64encode(data.template_file.hive_site_xml.rendered) bastion_ssh_key_arn = var.bastion_ssh_key_secret_name == "" ? "" : join("", data.aws_secretsmanager_secret.bastion_ssh_key.*.arn) docker_auth = var.docker_registry_auth_secret_name == "" ? "" : format("\"repositoryCredentials\" :{\n \"credentialsParameter\":\"%s\"\n},", join("\",\"", concat(data.aws_secretsmanager_secret.docker_registry.*.arn))) } diff --git a/templates/waggle-dance-federation-local.yml.tmpl b/templates/waggle-dance-federation-local.yml.tmpl index 3f0bae5..fb11ed5 100644 --- a/templates/waggle-dance-federation-local.yml.tmpl +++ b/templates/waggle-dance-federation-local.yml.tmpl @@ -3,6 +3,9 @@ access-control-type: ${ writable_whitelist == "" ? 
"READ_ONLY" : "READ_AND_WRITE_ON_DATABASE_WHITELIST" } database-prefix: ${prefix}_ remote-meta-store-uris: thrift://${metastore_host}:${metastore_port} +%{if enable_path_conversion ~} + hive-metastore-filter-hook: com.expediagroup.apiary.extensions.hooks.filters.ApiaryMetastoreFilter +%{~endif} ${ mapped_databases == "" ? "" : " mapped-databases:" } ${ mapped_databases == "" ? "" : join("\n",formatlist(" - %s",split(",",mapped_databases))) } ${ database_name_mapping == "" ? "" : " database-name-mapping:" } diff --git a/templates/waggle-dance-federation-remote.yml.tmpl b/templates/waggle-dance-federation-remote.yml.tmpl index 3f0bae5..fb11ed5 100644 --- a/templates/waggle-dance-federation-remote.yml.tmpl +++ b/templates/waggle-dance-federation-remote.yml.tmpl @@ -3,6 +3,9 @@ access-control-type: ${ writable_whitelist == "" ? "READ_ONLY" : "READ_AND_WRITE_ON_DATABASE_WHITELIST" } database-prefix: ${prefix}_ remote-meta-store-uris: thrift://${metastore_host}:${metastore_port} +%{if enable_path_conversion ~} + hive-metastore-filter-hook: com.expediagroup.apiary.extensions.hooks.filters.ApiaryMetastoreFilter +%{~endif} ${ mapped_databases == "" ? "" : " mapped-databases:" } ${ mapped_databases == "" ? "" : join("\n",formatlist(" - %s",split(",",mapped_databases))) } ${ database_name_mapping == "" ? 
"" : " database-name-mapping:" } diff --git a/templates/waggle-dance-federation.yml.tmpl b/templates/waggle-dance-federation.yml.tmpl index 29cc464..4bea7b5 100644 --- a/templates/waggle-dance-federation.yml.tmpl +++ b/templates/waggle-dance-federation.yml.tmpl @@ -8,4 +8,5 @@ ${primary_metastore_whitelist} federated-meta-stores: ${local_metastores} ${remote_metastores} +${remote_region_metastores} ${ssh_metastores} diff --git a/templates/waggledance.json b/templates/waggledance.json index 5a89436..fab571c 100644 --- a/templates/waggledance.json +++ b/templates/waggledance.json @@ -31,6 +31,10 @@ "name": "FEDERATION_YAML", "value": "${federation_yaml}" }, + { + "name": "HIVE_SITE_XML", + "value": "${hive_site_xml}" + }, { "name": "BASTION_SSH_KEY_ARN", "value": "${bastion_ssh_key_arn}" diff --git a/variables.tf b/variables.tf index d3606e7..2ceddcf 100644 --- a/variables.tf +++ b/variables.tf @@ -165,6 +165,13 @@ variable "remote_metastores" { default = [] } +#list of maps, example: [ {endpoint="vpce1", port="9083", prefix="pre1", writable-whitelist="db1,test", vpc_id = "vpc-123456", subnets = "subnet1,subnet2", security_group_id="sg1" } ] +variable "remote_region_metastores" { + description = "List of VPC endpoint services to federate Metastores in other region,other accounts. The actual data from tables in these metastores can be accessed using Alluxio caching instead of reading the data from S3 directly." + type = list(map(string)) + default = [] +} + #list of maps, example: [ {bastion-host="10.x.x.x", metastore-host="10.x.x.x", port="9083", prefix="pre1", user="my-unix-user", mapped-databases="test1,test2"}, {bastion-host="10.x.x.x", metastore-host="10.x.x.x", port="9083", prefix="pre1", user="my-unix-user", writable-whitelist="db1,test", mapped-databases="test1,test2"} ] variable "ssh_metastores" { description = "List of federated Metastores to connect to over SSH via bastion." 
@@ -212,3 +219,11 @@ variable "prometheus_enabled" { default = false type = bool } + +#list of maps, example: [ { root_url = "alluxio://alluxio1:19998/", s3_buckets = "bucket1,bucket2" }, { root_url = "alluxio://alluxio2:19998/", s3_buckets = "bucket3,bucket4" } ] +#root_url must end with / so that the hive hook creates a valid url after replacement +variable "alluxio_endpoints" { + description = "List of Alluxio endpoints (map of root url and s3 buckets) used to replace s3 paths with alluxio paths." + type = list(map(string)) + default = [] +}