added azure terraform deployment module

Updated as per pull request comments fixed dns prefix
BigDataBoutique · Feb 23, 2021 · 4d449c0 · 4d449c0
1 parent 3463915
commit 4d449c0
Show file tree

Hide file tree

Showing 15 changed files with 1,036 additions and 1 deletion.
diff --git a/assets/azure/az.user_data.sh b/assets/azure/az.user_data.sh
@@ -0,0 +1,199 @@
+#!/usr/bin/env bash
+set -ex
+
+exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
+
+function setup_hive_metastore {
+
+  echo "setup_hive_metastore stub.."
+
+  # Mount persistent storage and apply Hive Metastore schema if needed
+  DEVICE_NAME="/dev/disk/azure/scsi1/lun0"
+  MOUNT_PATH=/var/lib/mysql
+
+
+  sudo mv $MOUNT_PATH /tmp/mysql.backup
+  sudo mkdir -p $MOUNT_PATH
+
+  if sudo mount -o defaults -t ext4 "$DEVICE_NAME" $MOUNT_PATH; then
+    echo 'Successfully mounted existing disk'
+  else
+    echo 'Trying to mount a fresh disk'
+    sudo mkfs.ext4 -m 0 -F -E lazy_itable_init=0,lazy_journal_init=0,discard "$DEVICE_NAME"
+    sudo mount -o defaults -t ext4 "$DEVICE_NAME" $MOUNT_PATH && echo 'Successfully mounted a fresh disk'
+    sudo cp -ar /tmp/mysql.backup/* $MOUNT_PATH/
+  fi
+
+  sudo chown mysql:mysql -R $MOUNT_PATH
+  sudo chmod 700 $MOUNT_PATH
+
+  service mysql start
+  systemctl enable mysql
+
+  . /etc/environment
+  export HADOOP_HOME=$HADOOP_HOME
+
+  if ! "$HIVE_HOME"/bin/schematool -validate -dbType mysql; then
+    echo "Mysql schema is not valid"
+    "$HIVE_HOME"/bin/schematool -dbType mysql -initSchema
+  fi
+
+  echo "Initializing Hive Metastore ($HIVE_HOME)..."
+  service hive-metastore start
+  systemctl enable hive-metastore
+}
+
+
+
+# Raise the open-file limit for the presto user.
+cat > /etc/security/limits.d/100-presto-nofile.conf <<'EOF'
+presto soft nofile 16384
+presto hard nofile 16384
+EOF
+
+# Node identity. Written with a here-document rather than printf so the
+# substituted values are never interpreted as a printf format string
+# (a stray % or backslash would otherwise corrupt the file).
+cat > /etc/presto/node.properties <<EOF
+
+node.environment=${environment_name}
+node.id=$(hostname)
+node.data-dir=/var/lib/presto/
+EOF
+
+# JVM flags. A here-document also avoids handing printf a format that starts
+# with "-", which only works with some printf implementations.
+cat > /etc/presto/jvm.config <<EOF
+-server
+-Xmx${heap_size}G
+-XX:-UseBiasedLocking
+-XX:+UseG1GC
+-XX:G1HeapRegionSize=32M
+-XX:+ExplicitGCInvokesConcurrent
+-XX:+HeapDumpOnOutOfMemoryError
+-XX:+ExitOnOutOfMemoryError
+-XX:+UseGCOverheadLimit
+-XX:ReservedCodeCacheSize=512M
+-Djdk.attach.allowAttachSelf=true
+-Djdk.nio.maxCachedBufferSize=2000000
+-Duser.timezone=UTC
+EOF
+
+
+#
+# Configure as COORDINATOR
+#
+if [[ "${mode_presto}" == "coordinator" ]]; then
+  echo "Configuring node as a [${mode_presto}]..."
+
+  # Written with a here-document rather than printf: the extra config lines
+  # are free-form text, and as a printf format any % or backslash sequence in
+  # them would be interpreted (and a double quote would end the argument).
+  cat > /etc/presto/config.properties <<PRESTO_CONFIG
+
+#
+# coordinator
+#
+coordinator=true
+discovery-server.enabled=true
+discovery.uri=http://localhost:${http_port}
+node-scheduler.include-coordinator=false
+
+http-server.http.port=${http_port}
+# query.max-memory-per-node has to be <= query.max-total-memory-per-node
+#query.max-memory-per-node=${query_max_memory_per_node}GB
+#query.max-total-memory-per-node=${query_max_total_memory_per_node}GB
+query.max-memory=${query_max_memory}GB
+# query.max-total-memory defaults to query.max-memory * 2 so we are good
+${extra_worker_configs}
+PRESTO_CONFIG
+
+  # The coordinator also hosts MySQL and the Hive Metastore
+  setup_hive_metastore
+fi
+
+
+
+#
+# Configure as WORKER
+#
+if [[ "${mode_presto}" == "worker" ]]; then
+  echo "Configuring node as a [${mode_presto}]..."
+
+  # Written with a here-document rather than printf: the extra config lines
+  # are free-form text, and as a printf format any % or backslash sequence in
+  # them would be interpreted (and a double quote would end the argument).
+  cat > /etc/presto/config.properties <<PRESTO_CONFIG
+
+#
+# worker
+#
+coordinator=false
+discovery.uri=http://${address_presto_coordinator}:${http_port}
+node-scheduler.include-coordinator=false
+
+http-server.http.port=${http_port}
+# query.max-memory-per-node has to be <= query.max-total-memory-per-node
+#query.max-memory-per-node=${query_max_memory_per_node}GB
+#query.max-total-memory-per-node=${query_max_total_memory_per_node}GB
+query.max-memory=${query_max_memory}GB
+# query.max-total-memory defaults to query.max-memory * 2 so we are good
+${extra_worker_configs}
+PRESTO_CONFIG
+fi
+
+#
+# Configure as BOTH coordinator and worker
+#
+if [[ "${mode_presto}" == "coordinator-worker" ]]; then
+  echo "Configuring node as a [${mode_presto}]..."
+
+  # Written with a here-document rather than printf: the extra config lines
+  # are free-form text, and as a printf format any % or backslash sequence in
+  # them would be interpreted (and a double quote would end the argument).
+  cat > /etc/presto/config.properties <<PRESTO_CONFIG
+
+#
+# coordinator-worker
+#
+coordinator=true
+discovery-server.enabled=true
+discovery.uri=http://localhost:${http_port}
+node-scheduler.include-coordinator=true
+
+http-server.http.port=${http_port}
+# query.max-memory-per-node has to be <= query.max-total-memory-per-node
+#query.max-memory-per-node=${query_max_memory_per_node}GB
+#query.max-total-memory-per-node=${query_max_total_memory_per_node}GB
+query.max-memory=${query_max_memory}GB
+# query.max-total-memory defaults to query.max-memory * 2 so we are good
+${extra_worker_configs}
+PRESTO_CONFIG
+
+  # A combined node also hosts MySQL and the Hive Metastore
+  setup_hive_metastore
+fi
+
+if [[ "${mode_presto}" == "worker" ]]; then
+  echo "Waiting for Presto Coordinator to come online at: http://${address_presto_coordinator}:${http_port}"
+  # Probe the coordinator's HTTP port every 5 seconds until it accepts connections
+  until nc -z ${address_presto_coordinator} ${http_port}; do
+    sleep 5
+  done
+fi
+
+
+AZURE_ACCOUNT="${az_account_name}"
+AZURE_KEY="${az_access_key}"
+
+if [ ! -z "$AZURE_ACCOUNT" ] && [ ! -z "$AZURE_KEY" ]; then
+  # Update hive-site.xml
+  /usr/bin/printf "<configuration>
+  <property>
+    <name>fs.azure.account.key.$AZURE_ACCOUNT.blob.core.windows.net</name>
+    <value>$AZURE_KEY</value>
+  </property>
+  " > /tmp/hive-site-partial.txt
+  sudo sed -i "s/<configuration>/$(sed 's@[/\&]@\\&@g;$!s/$/\\/' /tmp/hive-site-partial.txt)/g" /usr/local/apache-hive-*-bin/conf/hive-site.xml
+  rm /tmp/hive-site-partial.txt
+
+  # Update hive.properties
+  /usr/bin/printf "\nhive.allow-drop-table=true" >> /etc/presto/catalog/hive.properties
+  /usr/bin/printf "\nhive.non-managed-table-writes-enabled=true" >> /etc/presto/catalog/hive.properties
+  /usr/bin/printf "\n#hive.time-zone=UTC" >> /etc/presto/catalog/hive.properties
+  /usr/bin/printf "\nhive.hive.azure.wasb-storage-account=$AZURE_ACCOUNT" >> /etc/presto/catalog/hive.properties
+  /usr/bin/printf "\nhive.hive.azure.wasb-access-key=$AZURE_KEY" >> /etc/presto/catalog/hive.properties
+  /usr/bin/printf "\n" >> /etc/presto/catalog/hive.properties
+fi
+
+
+echo "Starting presto..."
+systemctl enable presto.service
+systemctl start presto.service
+
+if [[ "${mode_presto}" == "coordinator" ]] || [[ "${mode_presto}" == "coordinator-worker" ]]; then
+    echo "Waiting for Presto Coordinator to start"
+    while ! presto --execute='select * from system.runtime.nodes'; do
+      sleep 10
+    done
+    echo "Presto Coordinator is now online"
+fi
+
+
diff --git a/packer/README.md b/packer/README.md
@@ -94,6 +94,9 @@ Building the AMIs is done using the following commands:
 
 ```bash
 packer build -only=amazon-ebs -var-file=variables.json presto.json
+
+packer build -only=azure-arm -var-file=variables.json presto.json
+
 ```
 
 Override the aws_region and aws_az variables to change the target region and

diff --git a/packer/prestoclients.json b/packer/prestoclients.json
@@ -36,6 +36,25 @@
         },
         "spot_price_auto_product": "Linux/UNIX (Amazon VPC)",
         "spot_price": "auto"
+      },
+      {
+        "type": "azure-arm",
+
+        "client_id": "{{user `azure_client_id`}}",
+        "client_secret": "{{user `azure_client_secret`}}",
+        "tenant_id": "{{user `azure_tenant_id`}}",
+        "subscription_id": "{{user `azure_subscription_id`}}",
+
+        "managed_image_resource_group_name": "{{user `azure_resource_group_name`}}",
+        "managed_image_name": "prestoclients-{{isotime \"2006-01-02T030405\"}}",
+
+        "os_type": "Linux",
+        "image_publisher": "Canonical",
+        "image_offer": "UbuntuServer",
+        "image_sku": "18.04-LTS",
+
+        "location": "{{user `azure_location`}}",
+        "vm_size": "Standard_DS2_v2"
       }
     ],
     "provisioners": [

diff --git a/packer/prestoclients/update-machine.sh b/packer/prestoclients/update-machine.sh
@@ -26,7 +26,7 @@ cd /opt/certs
 openssl genrsa -des3 -passout pass:xxxx -out keypair 2048
 openssl rsa -passin pass:xxxx -in keypair -out server.key
 rm keypair
-touch /home/ubuntu/.rnd
+touch ~/.rnd
 openssl req -new -key server.key -out server.csr -subj "/CN=*"
 openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt
 rm server.csr

diff --git a/terraform-azure/README.md b/terraform-azure/README.md
@@ -0,0 +1,88 @@
+# Azure deployment
+
+## Create the machine images with Packer
+
+Go to the packer folder and see the README there. Once you have the machine image IDs, return here and continue with the next steps.
+
+## Create key-pair or use your own
+
+This deployment is configured to use your default SSH keys as machine credentials. If you want to use other keys, change the path to the keys you want to use (look for `key_path` in variables.tf). Use [this guide](https://help.github.com/articles/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent/) to generate new keys if needed.
+
+## Configurations
+
+Edit `variables.tf` or create a separate `terraform.tfvars` file to specify the following:
+
+* `azure_location` - the Azure location in which to launch the cluster.
+* `azure_subscription_id`, `azure_client_id`, `azure_client_secret`, `azure_tenant_id` - the same credentials used in the Packer step. See the README there for instructions on how to retrieve them.
+* `presto_cluster` - the name of the Presto cluster to launch.
+* `key_path` - the filesystem path to the SSH key to use as virtual machines login credentials.
+* `coordinator_instance_type`, `worker_instance_type`, `client_instance_type` - Azure machine instance types to use for each machine type in the cluster.
+
+The rest of the configurations are mostly around cluster topology and machine types and sizes.
+
+### Cluster topology
+
+Two modes of deployment are supported:
+
+* A recommended configuration, with dedicated coordinator node, worker nodes and client nodes. This is a production-ready and best-practice configuration.
+* Coordinator+Worker node mode - mostly useful for experimentation
+
+The default mode is the single-node (Coordinator+Worker) mode. To change it to the recommended configuration, edit `variables.tf` and set the number of worker and client nodes to at least 1.
+
+All nodes with the `coordinator` and `client` role will be attached to an Azure load balancer, so access to all client nodes can be done via the DNS it exposes.
+
+## Launch the cluster with Terraform
+
+```bash
+terraform plan
+terraform apply
+```
+
+When terraform is done, you should see a lot of output ending with something like this:
+
+```
+Apply complete! Resources: 14 added, 0 changed, 0 destroyed.
+
+The state of your infrastructure has been saved to the path
+below. This state is required to modify and destroy your
+infrastructure, so keep it safe. To inspect the complete state
+use the `terraform show` command.
+
+State path: terraform.tfstate
+
+Outputs:
+
+public_dns = presto-cluster-foo.eastus.cloudapp.azure.com
+vm_password = rBTKoLsf7x8ODZVd
+```
+
+Note `clients_lb_public_ipaddress` and `vm-password` - that's your entry point to the cluster and the password for the `exampleuser` default user.
+
+### Look around
+
+The client nodes are the ones exposed to external networks. They provide endpoints for Redash, Superset, Zeppelin and direct Presto access. By default client nodes are accessible via their public IPs and the DNS of the load balancer they are attached to (see above).
+
+Client nodes listen on port 8080 and are password protected. Access is managed by nginx which is expecting a username and password pair. Default user name is exampleuser and the password is generated automatically when deploying. You can change those defaults by editing [this file](https://github.com/synhershko/presto-cloud-deploy/blob/master/packer/install-nginx.sh) and running Packer again.
+
+On client nodes you will find:
+
+* [Redash](https://redash.io) (data query / visualisation tool) is available on http://host:8500
+* [Superset](https://superset.apache.org/) (data exploration platform) is available on http://host:8600
+* [Zeppelin](https://zeppelin.apache.org/) (web-based data notebook) is available on http://host:8700
+
+The default credentials are `admin` or `admin@redash` as username, and password as generated by Terraform during the deployment (will show up as `clients_password` after deployment when you run `terraform output`).
+
+To ssh to one of the instances:
+
+```bash
+ssh ubuntu@{public IP / DNS of the instance or load balancer}
+```
+
+## Backups
+
+The Azure repository plugin is installed on the cluster and ready to be used for index snapshots and (should you ever need) a restore.
+
+### Automatic and manual scale-out
+
+The entire stack is deployed using Azure scale-sets, which are easy to scale up and down manually (from the Azure portal, from the command line, or using the same Terraform scripts), or automatically based on host metrics and application metrics using [Azure scale-set features](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/virtual-machine-scale-sets-autoscale-overview).
+