diff --git a/terraform/data/amazon-emr/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh b/terraform/data/amazon-emr/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh
new file mode 100644
index 0000000000..ec26ca742f
--- /dev/null
+++ b/terraform/data/amazon-emr/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Amazon EMR bootstrap action for the Apache Sedona cluster: authorizes an SSH
+# key, builds CPython from source under /usr/local, and installs the Sedona
+# jars plus the pinned Python packages.
+# -u and pipefail make an unset variable or a failed download in the
+# curl | tar pipeline abort the bootstrap instead of being masked.
+set -euo pipefail
+
+echo "# Add SSH keys"
+echo ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPWhR5NV13iw0X8lKqsrSRqbcIJcA5AVMjyfJjOrplwH hongbo-miao >> /home/hadoop/.ssh/authorized_keys
+
+echo "# Install Python"
+# https://github.com/aws-samples/aws-emr-utilities/blob/main/utilities/emr-ec2-custom-python3/README.md
+PYTHON_VERSION=3.11.6
+# Quote the glob so yum, not the shell, expands it against package names.
+sudo yum --assumeyes remove 'openssl-devel*'
+sudo yum --assumeyes install gcc openssl11-devel bzip2-devel libffi-devel tar gzip wget make expat-devel
+curl --silent --fail --show-error --location "https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz" | tar -x -J -v
+cd "Python-${PYTHON_VERSION}"
+./configure \
+  --enable-loadable-sqlite-extensions \
+  --with-dtrace \
+  --with-lto \
+  --enable-optimizations \
+  --with-system-expat \
+  --prefix="/usr/local/python${PYTHON_VERSION}"
+sudo make altinstall
+# ${PYTHON_VERSION%.*} drops the patch component (3.11.6 -> 3.11) to form the interpreter file name.
+sudo "/usr/local/python${PYTHON_VERSION}/bin/python${PYTHON_VERSION%.*}" -m pip install --upgrade pip
+
+echo "# Install dependencies"
+sudo curl --silent --fail --show-error --location --remote-name --output-dir /usr/lib/spark/jars/ https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-3.4_2.12/1.5.0/sedona-spark-shaded-3.4_2.12-1.5.0.jar
+sudo curl --silent --fail --show-error --location --remote-name --output-dir /usr/lib/spark/jars/ https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/1.5.0-28.2/geotools-wrapper-1.5.0-28.2.jar
+"/usr/local/python${PYTHON_VERSION}/bin/python${PYTHON_VERSION%.*}" -m pip install \
+  apache-sedona==1.5.0 \
+  attrs==23.1.0 \
+  descartes==1.1.0 \
+  geopandas==0.14.0 \
+  matplotlib==3.8.0 \
+  pandas==2.1.2 \
+  shapely==2.0.2
diff --git a/terraform/main.tf b/terraform/main.tf
index a4dd61786f..ef47a1cab5 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -101,6 +101,62 @@ module "hm_route_53_record" {
   amazon_route_53_record_values = [data.aws_instance.hm_trino_primary_node_ec2_instance.private_ip]
 }
 
+# Amazon EMR - Apache Sedona
+module "hm_sedona_s3_set_up_script" {
+  source           = "./modules/hm_amazon_s3_object"
+  amazon_s3_bucket = "hongbomiao-bucket"
+  amazon_s3_key    = "amazon-emr/clusters/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"
+  local_file_path  = "./data/amazon-emr/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"
+}
+module "hm_sedona_emr" {
+  source                         = "./modules/hm_amazon_emr_cluster"
+  amazon_emr_cluster_name        = "hm-sedona"
+  amazon_emr_version             = "emr-6.14.0"
+  applications                   = ["Hadoop", "Hive", "JupyterEnterpriseGateway", "Spark"]
+  primary_instance_type          = "r7a.2xlarge"
+  core_instance_type             = "r7a.2xlarge"
+  core_target_on_demand_capacity = 1
+  # Keep in sync with amazon_s3_bucket / amazon_s3_key of hm_sedona_s3_set_up_script above.
+  bootstrap_set_up_script_s3_uri = "s3://hongbomiao-bucket/amazon-emr/clusters/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"
+  configurations = [
+    {
+      "Classification" : "delta-defaults",
+      "Properties" : {
+        "delta.enabled" : "true"
+      }
+    },
+    {
+      "Classification" : "spark-hive-site",
+      "Properties" : {
+        "hive.metastore.client.factory.class" : "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
+      }
+    },
+    {
+      "Classification" : "spark-defaults",
+      "Properties" : {
+        "spark.yarn.dist.jars" : "/usr/lib/spark/jars/sedona-spark-shaded-3.4_2.12-1.5.0.jar,/usr/lib/spark/jars/geotools-wrapper-1.5.0-28.2.jar",
+        "spark.serializer" : "org.apache.spark.serializer.KryoSerializer",
+        "spark.kryo.registrator" : "org.apache.sedona.core.serde.SedonaKryoRegistrator",
+        "spark.sql.extensions" : "org.apache.sedona.viz.sql.SedonaVizExtensions,org.apache.sedona.sql.SedonaSqlExtensions"
+      }
+    }
+  ]
+  aws_iam_role = "arn:aws:iam::272394222652:role/service-role/AmazonEMR-ServiceRole-hm"
+  environment  = var.environment
+  team         = var.team
+}
+module "hm_sedona_emr_task_instance_fleet" {
+  source                    = "./modules/hm_amazon_emr_cluster_task_instance_fleet"
+  amazon_emr_cluster_id     = module.hm_sedona_emr.id
+  task_instance_type        = "r7a.2xlarge"
+  task_target_spot_capacity = 1
+}
+module "hm_sedona_emr_managed_scaling_policy" {
+  source                = "./modules/hm_amazon_emr_managed_scaling_policy"
+  amazon_emr_cluster_id = module.hm_sedona_emr.id
+  max_capacity_units    = 10
+}
+
 # AWS Glue DataBrew job
 # AWS Glue DataBrew job - ADS-B 2x Flight Trace
 module "hm_glue_databrew_job_write_csv_to_parquet_adsb_2x_flight_trace_data" {
diff --git a/terraform/modules/hm_amazon_emr_managed_scaling_policy/main.tf b/terraform/modules/hm_amazon_emr_managed_scaling_policy/main.tf
new file mode 100644
index 0000000000..a248eb0f1e
--- /dev/null
+++ b/terraform/modules/hm_amazon_emr_managed_scaling_policy/main.tf
@@ -0,0 +1,11 @@
+# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/emr_managed_scaling_policy
+resource "aws_emr_managed_scaling_policy" "hm_amazon_emr_managed_scaling_policy" {
+  cluster_id = var.amazon_emr_cluster_id
+  compute_limits {
+    unit_type                       = "InstanceFleetUnits"
+    minimum_capacity_units          = 2
+    maximum_capacity_units          = var.max_capacity_units
+    maximum_ondemand_capacity_units = 1
+    maximum_core_capacity_units     = 1
+  }
+}
diff --git a/terraform/modules/hm_amazon_emr_managed_scaling_policy/variables.tf b/terraform/modules/hm_amazon_emr_managed_scaling_policy/variables.tf
new file mode 100644
index 0000000000..a4ccf7a4e0
--- /dev/null
+++ b/terraform/modules/hm_amazon_emr_managed_scaling_policy/variables.tf
@@ -0,0 +1,6 @@
+variable "amazon_emr_cluster_id" {
+  type = string
+}
+variable "max_capacity_units" {
+  type = number
+}