From 63e67fa17abe124943d0f4361bc0b2f595f5f950 Mon Sep 17 00:00:00 2001
From: Ilias Xenogiannis
Date: Tue, 29 Oct 2024 11:23:29 +0200
Subject: [PATCH 1/3] Test IAM connection to spark

---
 .github/workflows/pr_tests.yml            |  37 +++++--
 .../spark_deployment/docker-compose.yml    | 104 +++++++++++-------
 .../spark_deployment/spark-defaults.conf   |  79 ++++++-------
 3 files changed, 134 insertions(+), 86 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 69122d12..8f782271 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -50,6 +50,17 @@
   DATABRICKS_TEST_TOKEN: ${{ secrets.DATABRICKS_TEST_TOKEN }}
   DATABRICKS_TEST_ENDPOINT: ${{ secrets.DATABRICKS_TEST_ENDPOINT }}
 
+
+  AWS_REGION: eu-west-1
+  AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
+  S3_BUCKET: "dbt-testing-automation"
+
+permissions:
+  id-token: write
+  contents: read
+  packages: read
+
+
 jobs:
   pr_tests:
     name: pr_tests
@@ -81,19 +92,29 @@
         - 5432:5432
 
     steps:
-      - name: Check out
-        uses: actions/checkout@v3
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ env.AWS_ROLE_ARN }}
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Store AWS credentials
+        run: |
+          mkdir -p ~/.aws
+          aws configure set aws_access_key_id $(aws configure get aws_access_key_id)
+          aws configure set aws_secret_access_key $(aws configure get aws_secret_access_key)
+          aws configure set aws_session_token $(aws configure get aws_session_token)
+          aws configure set region ${{ env.AWS_REGION }}
+
       - name: Configure Docker credentials
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_SNOWPLOWCI_READ_USERNAME }}
           password: ${{ secrets.DOCKERHUB_SNOWPLOWCI_READ_PASSWORD }}
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-1
+
       - name: Set warehouse variables
         id: set_warehouse
         run: |
diff --git a/.github/workflows/spark_deployment/docker-compose.yml b/.github/workflows/spark_deployment/docker-compose.yml
index 2e8077ba..22d9b039 100644
--- a/.github/workflows/spark_deployment/docker-compose.yml
+++ b/.github/workflows/spark_deployment/docker-compose.yml
@@ -1,66 +1,88 @@
 version: '3'
-
-networks:
-  spark-network:
-    driver: bridge
-
 services:
   spark-master:
-    image: snowplow/spark-s3-iceberg:latest
-    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
+    image: snowplow/spark-s3-iceberg:latest-iam
+    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
     hostname: spark-master
     ports:
       - '8080:8080'
      - '7077:7077'
-    environment:
-      - SPARK_LOCAL_IP=spark-master
-      - SPARK_MASTER_HOST=spark-master
-      - SPARK_MASTER_PORT=7077
-      - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
-      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-      - AWS_REGION=eu-west-1
-      - AWS_DEFAULT_REGION=eu-west-1
-    volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+    environment: &spark-env
+      SPARK_LOCAL_IP: spark-master
+      SPARK_MASTER_HOST: spark-master
+      SPARK_MASTER_PORT: 7077
+      AWS_REGION: eu-west-1
+      AWS_DEFAULT_REGION: eu-west-1
+      SOFTWARE_AMAZON_AWSSDK_HTTP_SERVICE_IMPL: "software.amazon.awssdk.http.apache.ApacheSdkHttpService"
+    volumes: &spark-volumes
+      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf:ro
+      - ./logs:/spark/logs
+      - ./events:/tmp/spark-events
+      - ~/.aws:/root/.aws:ro
     networks:
       - spark-network
+    deploy:
+      resources:
+        limits:
+          memory: 2G
+          cpus: '1'
 
   spark-worker:
-    image: snowplow/spark-s3-iceberg:latest
-    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
+    image: snowplow/spark-s3-iceberg:latest-iam
+    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
     depends_on:
       - spark-master
     environment:
-      - SPARK_WORKER_CORES=2
-      - SPARK_WORKER_MEMORY=4G
-      - SPARK_EXECUTOR_MEMORY=3G
-      - SPARK_LOCAL_IP=spark-worker
-      - SPARK_MASTER=spark://spark-master:7077
-      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-      - AWS_REGION=eu-west-1
-      - AWS_DEFAULT_REGION=eu-west-1
-    volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+      <<: *spark-env
+      SPARK_LOCAL_IP: spark-worker
+      SPARK_MASTER: spark://spark-master:7077
+      SPARK_WORKER_CORES: 2
+      SPARK_WORKER_MEMORY: 2g
+    volumes: *spark-volumes
     networks:
       - spark-network
+    deploy:
+      resources:
+        limits:
+          memory: 2.5G
+          cpus: '2'
 
   thrift-server:
-    image: snowplow/spark-s3-iceberg:latest
-    command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
+    image: snowplow/spark-s3-iceberg:latest-iam
+    command: >
+      /bin/bash -c '
+      sleep 30 &&
+      /spark/sbin/start-thriftserver.sh
+      --master spark://spark-master:7077
+      --driver-memory 1g
+      --executor-memory 2g
+      --conf "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
+      --conf "spark.sql.catalog.spark_catalog.type=hive"
+      --conf "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
+      --conf "spark.sql.defaultCatalog=iceberg_catalog"
+      --hiveconf hive.server2.thrift.port=10000
+      --hiveconf hive.server2.thrift.bind.host=0.0.0.0 &&
+      tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out'
     ports:
       - '10000:10000'
+      - '4040:4040'
     depends_on:
       - spark-master
       - spark-worker
     environment:
-      - SPARK_LOCAL_IP=thrift-server
-      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-      - AWS_REGION=eu-west-1
-      - AWS_DEFAULT_REGION=eu-west-1
-    volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+      <<: *spark-env
+      SPARK_LOCAL_IP: thrift-server
+      HIVE_SERVER2_THRIFT_PORT: 10000
+      HIVE_SERVER2_THRIFT_BIND_HOST: 0.0.0.0
+    volumes: *spark-volumes
     networks:
-      - spark-network
\ No newline at end of file
+      - spark-network
+    deploy:
+      resources:
+        limits:
+          memory: 2G
+          cpus: '1'
+
+networks:
+  spark-network:
+    driver: bridge
\ No newline at end of file
diff --git a/.github/workflows/spark_deployment/spark-defaults.conf b/.github/workflows/spark_deployment/spark-defaults.conf
index 9052a056..91628abe 100644
--- a/.github/workflows/spark_deployment/spark-defaults.conf
+++ b/.github/workflows/spark_deployment/spark-defaults.conf
@@ -1,44 +1,49 @@
+# Core Infrastructure
 spark.master spark://spark-master:7077
+spark.eventLog.enabled true
+spark.eventLog.dir /tmp/spark-events
 
-spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
-spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
-spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
-spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
-spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
-spark.sql.defaultCatalog glue
-spark.sql.catalog.glue.database dbt-spark-iceberg
+# Core Spark SQL Settings
+spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
+spark.sql.catalog.spark_catalog.type hive
+spark.sql.defaultCatalog iceberg_catalog
 
-spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
-spark.hadoop.fs.s3a.access.key
-spark.hadoop.fs.s3a.secret.key
-spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
-spark.hadoop.fs.s3a.path.style.access true
-spark.hadoop.fs.s3a.region eu-west-1
-spark.hadoop.fs.s3a.aws.region eu-west-1
+# Hive and Thrift Server Settings
+spark.sql.hive.metastore.version 2.3.9
+spark.sql.hive.metastore.jars builtin
+spark.sql.warehouse.dir /tmp/spark-warehouse
+spark.sql.hive.thriftServer.singleSession true
+hive.server2.thrift.port 10000
+hive.server2.thrift.bind.host 0.0.0.0
 
-# Enabling AWS SDK V4 signing (required for regions launched after January 2014)
-spark.hadoop.com.amazonaws.services.s3.enableV4 true
-spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
+# Memory and Performance Settings (Optimized for GitHub runner)
+spark.memory.fraction 0.6
+spark.memory.storageFraction 0.3
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.sql.adaptive.enabled true
+spark.sql.shuffle.partitions 100
+spark.default.parallelism 100
+spark.driver.memory 1g
+spark.executor.memory 2g
+spark.executor.cores 1
 
-# Hive Metastore Configuration (using AWS Glue)
-spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory
+# AWS Integration
+spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.aws.credentials.provider com.amazonaws.auth.DefaultAWSCredentialsProviderChain
+spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
 
-# Thrift Server Configuration for better performance in concurrent environments
-spark.sql.hive.thriftServer.singleSession false
-spark.sql.hive.thriftServer.async true
-# spark.sql.hive.thriftServer.maxWorkerThreads 100
-# spark.sql.hive.thriftServer.minWorkerThreads 50
-# spark.sql.hive.thriftServer.workerQueue.size 2000
+# S3 Performance Settings
+spark.hadoop.fs.s3a.connection.maximum 50
+spark.hadoop.fs.s3a.connection.timeout 300000
+spark.hadoop.fs.s3a.threads.max 10
+spark.hadoop.fs.s3a.multipart.size 32M
+spark.hadoop.fs.s3a.fast.upload true
+spark.hadoop.fs.s3a.fast.upload.buffer disk
+spark.hadoop.fs.s3a.path.style.access true
 
-# Memory and Performance Tuning
-# spark.driver.memory 2g
-# spark.executor.memory 3g
-# spark.worker.memory 4g
-spark.network.timeout 600s
-spark.sql.broadcastTimeout 600s
-spark.sql.adaptive.enabled true
-spark.serializer org.apache.spark.serializer.KryoSerializer
-
-# Logging and Debugging
-spark.eventLog.enabled true
-spark.eventLog.dir /tmp/spark-events
+# Iceberg Catalog Configuration
+spark.sql.catalog.iceberg_catalog org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.iceberg_catalog.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
+spark.sql.catalog.iceberg_catalog.io-impl org.apache.iceberg.aws.s3.S3FileIO
+spark.sql.catalog.iceberg_catalog.warehouse s3a://dbt-testing-automation/warehouse
\ No newline at end of file

From 2d0bdb1f3879acd04410ce501b10b4656e32d2d9 Mon Sep 17 00:00:00 2001
From: Ilias Xenogiannis
Date: Tue, 29 Oct 2024 11:27:02 +0200
Subject: [PATCH 2/3] Update pr_tests.yml

---
 .github/workflows/pr_tests.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 8f782271..36282188 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -104,10 +104,12 @@
 
       - name: Store AWS credentials
         run: |
           mkdir -p ~/.aws
-          aws configure set aws_access_key_id $(aws configure get aws_access_key_id)
-          aws configure set aws_secret_access_key $(aws configure get aws_secret_access_key)
-          aws configure set aws_session_token $(aws configure get aws_session_token)
-          aws configure set region ${{ env.AWS_REGION }}
+          aws configure get aws_access_key_id > /dev/null 2>&1 && \
+          aws configure set aws_access_key_id "$(aws configure get aws_access_key_id)" && \
+          aws configure set aws_secret_access_key "$(aws configure get aws_secret_access_key)" && \
+          aws configure set aws_session_token "$(aws configure get aws_session_token)" && \
+          aws configure set region "${{ env.AWS_REGION }}"
+
 
       - name: Configure Docker credentials

From 279db14f678e21b264ea554839568a80d2e92053 Mon Sep 17 00:00:00 2001
From: Ilias Xenogiannis
Date: Tue, 29 Oct 2024 11:49:18 +0200
Subject: [PATCH 3/3] Update pr_tests.yml

---
 .github/workflows/pr_tests.yml | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 36282188..43d826a3 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -103,12 +103,19 @@
 
       - name: Store AWS credentials
         run: |
-          mkdir -p ~/.aws
-          aws configure get aws_access_key_id > /dev/null 2>&1 && \
-          aws configure set aws_access_key_id "$(aws configure get aws_access_key_id)" && \
-          aws configure set aws_secret_access_key "$(aws configure get aws_secret_access_key)" && \
-          aws configure set aws_session_token "$(aws configure get aws_session_token)" && \
-          aws configure set region "${{ env.AWS_REGION }}"
+          mkdir -p ~/.aws
+          cat > ~/.aws/credentials << EOF
+          [default]
+          aws_access_key_id=${{ env.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key=${{ env.AWS_SECRET_ACCESS_KEY }}
+          aws_session_token=${{ env.AWS_SESSION_TOKEN }}
+          EOF
+
+          cat > ~/.aws/config << EOF
+          [default]
+          region=${{ env.AWS_REGION }}
+          output=json
+          EOF
 
 
       - name: Configure Docker credentials
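
Reviewer note, not part of the series: the credential hand-off above can be
smoke tested before any dbt step runs. The sketch below is illustrative
only; the first command relies on the AWS CLI preinstalled on GitHub-hosted
runners, and the second assumes the snowplow/spark-s3-iceberg:latest-iam
image also bundles the CLI, which these patches do not confirm.

    # On the runner: should report the role assumed from AWS_ROLE_ARN via OIDC.
    aws sts get-caller-identity

    # Inside the master container: exercises the read-only ~/.aws mount through
    # the DefaultAWSCredentialsProviderChain configured in spark-defaults.conf.
    docker compose -f .github/workflows/spark_deployment/docker-compose.yml \
      exec spark-master aws sts get-caller-identity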
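
A second sketch, also illustrative: once the thrift-server container is up
(its command sleeps 30s before starting), the Glue-backed Iceberg catalog can
be probed over the published port. This assumes the Spark distribution inside
the image ships beeline at /spark/bin/beeline; the SHOW NAMESPACES statement
is an arbitrary read-only probe, not something the workflow itself runs.

    # List namespaces in the Glue-backed catalog defined in spark-defaults.conf.
    docker compose -f .github/workflows/spark_deployment/docker-compose.yml \
      exec thrift-server /spark/bin/beeline \
      -u jdbc:hive2://localhost:10000 \
      -e 'SHOW NAMESPACES IN iceberg_catalog;'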