Skip to content

Commit

Permalink
feat(terraform): add apache sedona (#12433)
Browse files Browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Nov 7, 2023
1 parent 348b801 commit 458aa97
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Amazon EMR bootstrap action for the Apache Sedona cluster.
#
# Steps:
#   1. Authorize an SSH public key for the hadoop user.
#   2. Build CPython ${PYTHON_VERSION} from source into /usr/local/python${PYTHON_VERSION}.
#   3. Download the Sedona / GeoTools jars into Spark's jar directory and
#      install the matching Python packages into the interpreter built above.
#
# errexit:  stop on the first failing command.
# nounset:  treat a misspelled/unset variable as an error instead of an empty string.
# pipefail: make `curl ... | tar ...` fail when curl fails, not only when tar fails.
set -euo pipefail

echo "# Add SSH keys"
echo "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPWhR5NV13iw0X8lKqsrSRqbcIJcA5AVMjyfJjOrplwH hongbo-miao" >> /home/hadoop/.ssh/authorized_keys

echo "# Install Python"
# https://github.com/aws-samples/aws-emr-utilities/blob/main/utilities/emr-ec2-custom-python3/README.md
PYTHON_VERSION=3.11.6
PYTHON_PREFIX="/usr/local/python${PYTHON_VERSION}"
# `make altinstall` names the interpreter after major.minor (e.g. python3.11),
# so strip the patch component from the version.
PYTHON_BIN="${PYTHON_PREFIX}/bin/python${PYTHON_VERSION%.*}"

# Quote the glob so yum receives the pattern; unquoted, the shell could expand
# it against files in the current directory.
sudo yum --assumeyes remove 'openssl-devel*'
sudo yum --assumeyes install gcc openssl11-devel bzip2-devel libffi-devel tar gzip wget make expat-devel
curl --silent --fail --show-error --location "https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz" | tar -x -J -v
cd "Python-${PYTHON_VERSION}"
./configure \
  --enable-loadable-sqlite-extensions \
  --with-dtrace \
  --with-lto \
  --enable-optimizations \
  --with-system-expat \
  --prefix="${PYTHON_PREFIX}"
# Compile as the current user on all cores; only the install step needs root.
make --jobs="$(nproc)"
sudo make altinstall
sudo "${PYTHON_BIN}" -m pip install --upgrade pip

echo "# Install dependencies"
SEDONA_VERSION=1.5.0
GEOTOOLS_WRAPPER_VERSION=1.5.0-28.2
SPARK_JARS_DIR=/usr/lib/spark/jars/
sudo curl --silent --fail --show-error --location --remote-name --output-dir "${SPARK_JARS_DIR}" "https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-3.4_2.12/${SEDONA_VERSION}/sedona-spark-shaded-3.4_2.12-${SEDONA_VERSION}.jar"
sudo curl --silent --fail --show-error --location --remote-name --output-dir "${SPARK_JARS_DIR}" "https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/${GEOTOOLS_WRAPPER_VERSION}/geotools-wrapper-${GEOTOOLS_WRAPPER_VERSION}.jar"
# The interpreter prefix was installed with sudo, so an unprivileged pip cannot
# write to its site-packages and falls back to a per-user install. Install with
# sudo (as the pip upgrade above does) so the packages land in the interpreter's
# own site-packages.
sudo "${PYTHON_BIN}" -m pip install \
  "apache-sedona==${SEDONA_VERSION}" \
  attrs==23.1.0 \
  descartes==1.1.0 \
  geopandas==0.14.0 \
  matplotlib==3.8.0 \
  pandas==2.1.2 \
  shapely==2.0.2
55 changes: 55 additions & 0 deletions terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,61 @@ module "hm_route_53_record" {
amazon_route_53_record_values = [data.aws_instance.hm_trino_primary_node_ec2_instance.private_ip]
}

# Amazon EMR - Apache Sedona
# Uploads the local Sedona bootstrap script to S3 via the hm_amazon_s3_object module.
module "hm_sedona_s3_set_up_script" {
  source = "./modules/hm_amazon_s3_object"

  # Destination object
  amazon_s3_bucket = "hongbomiao-bucket"
  amazon_s3_key    = "amazon-emr/clusters/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"

  # Source file, relative to the Terraform root
  local_file_path = "./data/amazon-emr/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"
}
# Amazon EMR cluster (Hadoop, Hive, JupyterEnterpriseGateway, Spark) configured
# for Apache Sedona.
module "hm_sedona_emr" {
  source                         = "./modules/hm_amazon_emr_cluster"
  amazon_emr_cluster_name        = "hm-sedona"
  amazon_emr_version             = "emr-6.14.0"
  applications                   = ["Hadoop", "Hive", "JupyterEnterpriseGateway", "Spark"]
  primary_instance_type          = "r7a.2xlarge"
  core_instance_type             = "r7a.2xlarge"
  core_target_on_demand_capacity = 1

  # Must be the same bucket/key that module.hm_sedona_s3_set_up_script uploads
  # to ("amazon-emr/clusters/..."); the URI previously omitted the "clusters/"
  # segment and therefore pointed at an object that is never uploaded.
  bootstrap_set_up_script_s3_uri = "s3://hongbomiao-bucket/amazon-emr/clusters/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"

  configurations = [
    {
      "Classification" : "delta-defaults",
      "Properties" : {
        "delta.enabled" : "true"
      }
    },
    {
      # Use the AWS Glue Data Catalog as the Hive metastore for Spark.
      "Classification" : "spark-hive-site",
      "Properties" : {
        "hive.metastore.client.factory.class" : "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
      }
    },
    {
      # Sedona settings. The jar paths are the locations the bootstrap
      # set_up.sh script downloads the Sedona and GeoTools jars to.
      "Classification" : "spark-defaults",
      "Properties" : {
        "spark.yarn.dist.jars" : "/usr/lib/spark/jars/sedona-spark-shaded-3.4_2.12-1.5.0.jar,/usr/lib/spark/jars/geotools-wrapper-1.5.0-28.2.jar",
        "spark.serializer" : "org.apache.spark.serializer.KryoSerializer",
        "spark.kryo.registrator" : "org.apache.sedona.core.serde.SedonaKryoRegistrator",
        "spark.sql.extensions" : "org.apache.sedona.viz.sql.SedonaVizExtensions,org.apache.sedona.sql.SedonaSqlExtensions"
      }
    }
  ]

  aws_iam_role = "arn:aws:iam::272394222652:role/service-role/AmazonEMR-ServiceRole-hm"
  environment  = var.environment
  team         = var.team

  # The bootstrap script is referenced only by a literal S3 URI, so Terraform
  # cannot infer the ordering; make sure the script is uploaded before the
  # cluster is created.
  depends_on = [module.hm_sedona_s3_set_up_script]
}
# Task instance fleet attached to the Sedona EMR cluster.
module "hm_sedona_emr_task_instance_fleet" {
  source = "./modules/hm_amazon_emr_cluster_task_instance_fleet"

  # Cluster this fleet belongs to
  amazon_emr_cluster_id = module.hm_sedona_emr.id

  # Fleet sizing
  task_instance_type        = "r7a.2xlarge"
  task_target_spot_capacity = 1
}
# Managed scaling policy for the Sedona EMR cluster.
module "hm_sedona_emr_managed_scaling_policy" {
  source = "./modules/hm_amazon_emr_managed_scaling_policy"

  # Cluster the policy is attached to
  amazon_emr_cluster_id = module.hm_sedona_emr.id

  # Passed through to the policy's maximum_capacity_units
  max_capacity_units = 10
}

# AWS Glue DataBrew job
# AWS Glue DataBrew job - ADS-B 2x Flight Trace
module "hm_glue_databrew_job_write_csv_to_parquet_adsb_2x_flight_trace_data" {
Expand Down
11 changes: 11 additions & 0 deletions terraform/modules/hm_amazon_emr_managed_scaling_policy/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/emr_managed_scaling_policy
# Attaches a managed scaling policy to the EMR cluster identified by
# var.amazon_emr_cluster_id. Only the overall maximum is configurable by the
# caller; the remaining limits are fixed in this module.
resource "aws_emr_managed_scaling_policy" "hm_amazon_emr_managed_scaling_policy" {
  cluster_id = var.amazon_emr_cluster_id

  compute_limits {
    # Limits below are expressed in instance fleet units.
    unit_type = "InstanceFleetUnits"

    # Overall lower and upper bounds
    minimum_capacity_units = 2
    maximum_capacity_units = var.max_capacity_units

    # Caps on on-demand and core capacity
    maximum_ondemand_capacity_units = 1
    maximum_core_capacity_units     = 1
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ID of the EMR cluster the managed scaling policy is attached to
# (used as the policy's cluster_id).
variable "amazon_emr_cluster_id" {
  type = string
}
# Upper bound passed to compute_limits.maximum_capacity_units of the policy.
variable "max_capacity_units" {
  type = number
}

0 comments on commit 458aa97

Please sign in to comment.