diff --git a/.github/workflows/pr_tests_spark.yml b/.github/workflows/pr_tests_spark.yml
index c94a9e3d..164e6259 100644
--- a/.github/workflows/pr_tests_spark.yml
+++ b/.github/workflows/pr_tests_spark.yml
@@ -10,6 +10,7 @@ env:
   SPARK_SCHEMA: default
   AWS_REGION: eu-west-1
   AWS_DEFAULT_REGION: eu-west-1
+  DOCKER_PLATFORM: linux/amd64
 
 jobs:
   pr_tests_spark:
@@ -63,65 +64,32 @@ jobs:
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: eu-west-1
 
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-
       - name: Install Docker Compose
         run: |
           sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
           sudo chmod +x /usr/local/bin/docker-compose
 
-      - name: Build and start Spark cluster
+      - name: Start Spark cluster
         working-directory: .github/workflows/spark_deployment
         run: |
-          # docker-compose build
           docker-compose up -d
           echo "Waiting for Spark services to start..."
-          sleep 90
+          sleep 120
 
       - name: Check running containers
         working-directory: .github/workflows/spark_deployment
         run: docker ps
 
-      - name: Check Docker network
-        working-directory: .github/workflows/spark_deployment
-        run: |
-          docker network ls
-          # docker network inspect spark-network
-
       - name: Print Docker logs
+        if: failure()
         working-directory: .github/workflows/spark_deployment
         run: |
           echo "Docker logs for spark-master:"
-          docker-compose logs --tail=1000 spark-master
+          docker-compose logs spark-master
           echo "Docker logs for spark-worker:"
-          docker-compose logs --tail=1000 spark-worker
+          docker-compose logs spark-worker
           echo "Docker logs for thrift-server:"
-          docker-compose logs --tail=1000 thrift-server
-
-      - name: Verify Spark configuration
-        working-directory: .github/workflows/spark_deployment
-        run: |
-          echo "Verifying Spark configuration..."
-          docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf"
-
-      - name: Wait for Thrift Server
-        run: |
-          echo "Waiting for Thrift Server to be fully operational..."
-          sleep 60
-
-      - name: Check ThriftServer Process
-        working-directory: .github/workflows/spark_deployment
-        run: docker-compose exec -T thrift-server bash -c "ps aux | grep ThriftServer"
-
-      - name: Check Latest ThriftServer Log
-        working-directory: .github/workflows/spark_deployment
-        run: docker-compose exec -T thrift-server bash -c "tail -n 50 /spark/logs/\$(ls -t /spark/logs/ | grep thriftserver | head -n1)"
-
-      - name: Test ThriftServer connection with Beeline
-        working-directory: .github/workflows/spark_deployment
-        run: |
-          docker-compose exec -T thrift-server bash -c '/spark/bin/beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;"'
+          docker-compose logs thrift-server
 
       - name: "Pre-test: Drop ci schemas"
         run: |
diff --git a/.github/workflows/spark_deployment/docker-compose.yml b/.github/workflows/spark_deployment/docker-compose.yml
index 56d5c6b9..739aa93c 100644
--- a/.github/workflows/spark_deployment/docker-compose.yml
+++ b/.github/workflows/spark_deployment/docker-compose.yml
@@ -5,6 +5,7 @@ networks:
 services:
   spark-master:
     image: snowplow/spark-s3-iceberg:v2
+    platform: ${DOCKER_PLATFORM:-linux/amd64}
     command: ["/bin/bash", "-c", "/opt/spark/sbin/start-master.sh -h spark-master --properties-file /opt/spark/conf/spark-defaults.conf && tail -f /opt/spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
     hostname: spark-master
     ports:
@@ -26,6 +27,7 @@ services:
 
   spark-worker:
     image: snowplow/spark-s3-iceberg:v2
+    platform: ${DOCKER_PLATFORM:-linux/amd64}
     command: ["/bin/bash", "-c", "sleep 10 && /opt/spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /opt/spark/conf/spark-defaults.conf && tail -f /opt/spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
     depends_on:
       - spark-master
@@ -46,6 +48,7 @@
 
   thrift-server:
     image: snowplow/spark-s3-iceberg:v2
+    platform: ${DOCKER_PLATFORM:-linux/amd64}
     command: ["/bin/bash", "-c", "sleep 30 && /opt/spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /opt/spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
     ports:
       - '10000:10000'