Test IAM connection to spark #187

Closed · wants to merge 3 commits into from

Changes from all commits
46 changes: 38 additions & 8 deletions .github/workflows/pr_tests.yml
@@ -50,6 +50,17 @@ env:
   DATABRICKS_TEST_TOKEN: ${{ secrets.DATABRICKS_TEST_TOKEN }}
   DATABRICKS_TEST_ENDPOINT: ${{ secrets.DATABRICKS_TEST_ENDPOINT }}
 
+  AWS_REGION: eu-west-1
+  AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
+  S3_BUCKET: "dbt-testing-automation"
+
+permissions:
+  id-token: write
+  contents: read
+  packages: read
+
 jobs:
   pr_tests:
     name: pr_tests
@@ -81,19 +92,38 @@ jobs:
           - 5432:5432
 
     steps:
-      - name: Check out
-        uses: actions/checkout@v3
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ env.AWS_ROLE_ARN }}
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Store AWS credentials
+        run: |
+          mkdir -p ~/.aws
+          cat > ~/.aws/credentials << EOF
+          [default]
+          aws_access_key_id=${{ env.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key=${{ env.AWS_SECRET_ACCESS_KEY }}
+          aws_session_token=${{ env.AWS_SESSION_TOKEN }}
+          EOF
+
+          cat > ~/.aws/config << EOF
+          [default]
+          region=${{ env.AWS_REGION }}
+          output=json
+          EOF
+
       - name: Configure Docker credentials
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_SNOWPLOWCI_READ_USERNAME }}
           password: ${{ secrets.DOCKERHUB_SNOWPLOWCI_READ_PASSWORD }}
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-1
 
       - name: Set warehouse variables
         id: set_warehouse
        run: |
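The net effect of these workflow changes: authentication moves from long-lived secrets (aws-actions/configure-aws-credentials@v1 with static access keys) to short-lived credentials obtained through GitHub's OIDC token (permissions: id-token: write plus role-to-assume). The Store AWS credentials step then writes the temporary credentials that configure-aws-credentials exports as environment variables into ~/.aws, so containers that mount that directory can use the default provider chain. A minimal sketch of a verification step one could add after the credential setup (the step itself is illustrative, not part of this PR):

    - name: Verify AWS identity
      run: |
        # Should print the ARN of the assumed role, not an IAM user
        aws sts get-caller-identity
        # Confirm the test bucket is reachable with the temporary credentials
        aws s3 ls "s3://${S3_BUCKET}" --region "${AWS_REGION}"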
104 changes: 63 additions & 41 deletions .github/workflows/spark_deployment/docker-compose.yml
@@ -1,66 +1,88 @@
 version: '3'
 
+networks:
+  spark-network:
+    driver: bridge
+
 services:
   spark-master:
-    image: snowplow/spark-s3-iceberg:latest
-    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
+    image: snowplow/spark-s3-iceberg:latest-iam
+    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
     hostname: spark-master
     ports:
       - '8080:8080'
       - '7077:7077'
-    environment:
-      - SPARK_LOCAL_IP=spark-master
-      - SPARK_MASTER_HOST=spark-master
-      - SPARK_MASTER_PORT=7077
-      - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
-      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-      - AWS_REGION=eu-west-1
-      - AWS_DEFAULT_REGION=eu-west-1
-    volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+    environment: &spark-env
+      SPARK_LOCAL_IP: spark-master
+      SPARK_MASTER_HOST: spark-master
+      SPARK_MASTER_PORT: 7077
+      AWS_REGION: eu-west-1
+      AWS_DEFAULT_REGION: eu-west-1
+      SOFTWARE_AMAZON_AWSSDK_HTTP_SERVICE_IMPL: "software.amazon.awssdk.http.apache.ApacheSdkHttpService"
+    volumes: &spark-volumes
+      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf:ro
+      - ./logs:/spark/logs
+      - ./events:/tmp/spark-events
+      - ~/.aws:/root/.aws:ro
     networks:
       - spark-network
+    deploy:
+      resources:
+        limits:
+          memory: 2G
+          cpus: '1'
 
   spark-worker:
-    image: snowplow/spark-s3-iceberg:latest
-    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
+    image: snowplow/spark-s3-iceberg:latest-iam
+    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
     depends_on:
       - spark-master
     environment:
-      - SPARK_WORKER_CORES=2
-      - SPARK_WORKER_MEMORY=4G
-      - SPARK_EXECUTOR_MEMORY=3G
-      - SPARK_LOCAL_IP=spark-worker
-      - SPARK_MASTER=spark://spark-master:7077
-      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-      - AWS_REGION=eu-west-1
-      - AWS_DEFAULT_REGION=eu-west-1
-    volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+      <<: *spark-env
+      SPARK_LOCAL_IP: spark-worker
+      SPARK_MASTER: spark://spark-master:7077
+      SPARK_WORKER_CORES: 2
+      SPARK_WORKER_MEMORY: 2g
+    volumes: *spark-volumes
     networks:
       - spark-network
+    deploy:
+      resources:
+        limits:
+          memory: 2.5G
+          cpus: '2'
 
   thrift-server:
-    image: snowplow/spark-s3-iceberg:latest
-    command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
+    image: snowplow/spark-s3-iceberg:latest-iam
+    command: >
+      /bin/bash -c '
+      sleep 30 &&
+      /spark/sbin/start-thriftserver.sh
+      --master spark://spark-master:7077
+      --driver-memory 1g
+      --executor-memory 2g
+      --conf "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
+      --conf "spark.sql.catalog.spark_catalog.type=hive"
+      --conf "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
+      --conf "spark.sql.defaultCatalog=iceberg_catalog"
+      --hiveconf hive.server2.thrift.port=10000
+      --hiveconf hive.server2.thrift.bind.host=0.0.0.0 &&
+      tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out'
     ports:
       - '10000:10000'
       - '4040:4040'
     depends_on:
       - spark-master
       - spark-worker
     environment:
-      - SPARK_LOCAL_IP=thrift-server
-      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-      - AWS_REGION=eu-west-1
-      - AWS_DEFAULT_REGION=eu-west-1
-    volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+      <<: *spark-env
+      SPARK_LOCAL_IP: thrift-server
+      HIVE_SERVER2_THRIFT_PORT: 10000
+      HIVE_SERVER2_THRIFT_BIND_HOST: 0.0.0.0
+    volumes: *spark-volumes
     networks:
-      - spark-network
-
-networks:
-  spark-network:
-    driver: bridge
+      - spark-network
+    deploy:
+      resources:
+        limits:
+          memory: 2G
+          cpus: '1'
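The rewritten compose file deduplicates configuration with YAML anchors and merge keys: &spark-env and &spark-volumes are defined once on spark-master, and the worker and Thrift server pull them in via <<: *spark-env and volumes: *spark-volumes, overriding only service-specific values such as SPARK_LOCAL_IP. Static AWS keys disappear from the environment; instead the host's ~/.aws directory (populated by the workflow step above) is mounted read-only into each container. A quick way to check that the anchors expand as intended, using standard Compose commands (nothing here is specific to this PR):

    # Render the fully resolved config; each service should show the merged
    # environment plus its own overrides
    docker compose -f docker-compose.yml config

    # Start the cluster and confirm the Thrift server binds port 10000
    docker compose up -d
    docker compose logs -f thrift-server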
79 changes: 42 additions & 37 deletions .github/workflows/spark_deployment/spark-defaults.conf
@@ -1,44 +1,49 @@
+# Core Infrastructure
 spark.master spark://spark-master:7077
+spark.eventLog.enabled true
+spark.eventLog.dir /tmp/spark-events
 
-spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
-spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
-spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
-spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
-spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
-spark.sql.defaultCatalog glue
-spark.sql.catalog.glue.database dbt-spark-iceberg
+# Core Spark SQL Settings
+spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
+spark.sql.catalog.spark_catalog.type hive
+spark.sql.defaultCatalog iceberg_catalog
 
-spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
-spark.hadoop.fs.s3a.access.key <AWS_ACCESS_KEY_ID>
-spark.hadoop.fs.s3a.secret.key <AWS_SECRET_ACCESS_KEY>
-spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
-spark.hadoop.fs.s3a.path.style.access true
-spark.hadoop.fs.s3a.region eu-west-1
-spark.hadoop.fs.s3a.aws.region eu-west-1
+# Hive and Thrift Server Settings
+spark.sql.hive.metastore.version 2.3.9
+spark.sql.hive.metastore.jars builtin
+spark.sql.warehouse.dir /tmp/spark-warehouse
+spark.sql.hive.thriftServer.singleSession true
+hive.server2.thrift.port 10000
+hive.server2.thrift.bind.host 0.0.0.0
 
-# Enabling AWS SDK V4 signing (required for regions launched after January 2014)
-spark.hadoop.com.amazonaws.services.s3.enableV4 true
-spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
+# Memory and Performance Settings (Optimized for GitHub runner)
+spark.memory.fraction 0.6
+spark.memory.storageFraction 0.3
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.sql.adaptive.enabled true
+spark.sql.shuffle.partitions 100
+spark.default.parallelism 100
+spark.driver.memory 1g
+spark.executor.memory 2g
+spark.executor.cores 1
 
-# Hive Metastore Configuration (using AWS Glue)
-spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory
+# AWS Integration
+spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.aws.credentials.provider com.amazonaws.auth.DefaultAWSCredentialsProviderChain
+spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
 
-# Thrift Server Configuration for better performance in concurrent environments
-spark.sql.hive.thriftServer.singleSession false
-spark.sql.hive.thriftServer.async true
-# spark.sql.hive.thriftServer.maxWorkerThreads 100
-# spark.sql.hive.thriftServer.minWorkerThreads 50
-# spark.sql.hive.thriftServer.workerQueue.size 2000
+# S3 Performance Settings
+spark.hadoop.fs.s3a.connection.maximum 50
+spark.hadoop.fs.s3a.connection.timeout 300000
+spark.hadoop.fs.s3a.threads.max 10
+spark.hadoop.fs.s3a.multipart.size 32M
+spark.hadoop.fs.s3a.fast.upload true
+spark.hadoop.fs.s3a.fast.upload.buffer disk
+spark.hadoop.fs.s3a.path.style.access true
 
-# Memory and Performance Tuning
-# spark.driver.memory 2g
-# spark.executor.memory 3g
-# spark.worker.memory 4g
 spark.network.timeout 600s
 spark.sql.broadcastTimeout 600s
-spark.sql.adaptive.enabled true
-spark.serializer org.apache.spark.serializer.KryoSerializer
 
-# Logging and Debugging
-spark.eventLog.enabled true
-spark.eventLog.dir /tmp/spark-events
+# Iceberg Catalog Configuration
+spark.sql.catalog.iceberg_catalog org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.iceberg_catalog.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
+spark.sql.catalog.iceberg_catalog.io-impl org.apache.iceberg.aws.s3.S3FileIO
+spark.sql.catalog.iceberg_catalog.warehouse s3a://dbt-testing-automation/warehouse
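With com.amazonaws.auth.DefaultAWSCredentialsProviderChain, the S3A connector now resolves credentials at runtime from environment variables, the mounted ~/.aws profile, or an instance role, so no keys are templated into the config file, and the default catalog becomes the Glue-backed Iceberg catalog writing to s3a://dbt-testing-automation/warehouse. Since the tests talk to the Thrift server on port 10000, a dbt profile along the following lines would exercise the connection (profile and schema names are illustrative, assuming the dbt-spark adapter):

    spark:
      target: ci
      outputs:
        ci:
          type: spark
          method: thrift        # HiveThriftServer2 started by docker-compose
          host: localhost
          port: 10000
          schema: github_integration_tests   # illustrative schema name
          connect_retries: 5
          connect_timeout: 60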