Skip to content

Update pr_tests_spark.yml #13

Update pr_tests_spark.yml

Update pr_tests_spark.yml #13

Workflow file for this run

# Workflow that runs the dbt integration tests against a Spark target.
name: pr_tests_spark

# Triggers: pull requests targeting `main`, plus direct pushes to the
# long-lived and feature branches listed below.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      # Quoted so the glob is always treated as a plain string pattern.
      - 'feature/**'
      - dev
      - staging
      - template-spark-tests
      - spark_prep
# Every run joins the same concurrency group, so runs of this workflow are
# serialized rather than executed in parallel.
concurrency: dbt_integration_tests

# Workflow-wide environment variables, visible to every step.
env:
  # Relative path; `run` steps execute from ./integration_tests.
  DBT_PROFILES_DIR: ./ci
  # NOTE(review): `spark-master` is the Docker Compose service name. The dbt
  # commands run on the runner host, not inside the compose network - confirm
  # the profile in ./ci resolves this host (port 10000 is published to the
  # runner, so `localhost` may be what is actually needed).
  SPARK_MASTER_HOST: spark-master
  SPARK_USER: spark
  SPARK_SCHEMA: default
# Single job: builds a throwaway Spark master/worker + Thrift Server cluster
# with Docker Compose, runs a series of diagnostics against it, then runs the
# dbt integration tests and tears the cluster down.
jobs:
  pr_tests:
    name: pr_tests
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run:` step executes from the integration test project, so the
        # generated Dockerfile / docker-compose.yml live there too.
        working-directory: ./integration_tests
    strategy:
      matrix:
        # Quoted so the version specifier is always read as a string.
        dbt_version:
          - '1.*'
        warehouse:
          - spark
    steps:
      - name: Check out
        uses: actions/checkout@v3

      # Strips the last dotted component of the dbt version and replaces the
      # remaining dots with underscores (e.g. `1.*` -> `1`).
      - name: Set SCHEMA_SUFFIX env
        run: >-
          echo "SCHEMA_SUFFIX=$(echo ${DBT_VERSION%.*} | tr . _)" >> $GITHUB_ENV
        env:
          DBT_VERSION: '${{ matrix.dbt_version }}'

      - name: Set DEFAULT_TARGET env
        run: |
          echo "DEFAULT_TARGET=${{ matrix.warehouse }}" >> $GITHUB_ENV

      - name: Python setup
        uses: actions/setup-python@v4
        with:
          python-version: '3.8.x'

      - name: Pip cache
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: >-
            ${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }}
          restore-keys: >-
            ${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }}

      - name: Install spark dependencies
        run: |
          pip install --upgrade pip wheel setuptools
          pip install -Iv "dbt-spark[PyHive]==${{ matrix.dbt_version }}" --upgrade
          dbt deps

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      # Installs the standalone docker-compose v1 binary; later steps rely on
      # its `<project>_<service>_1` container naming.
      - name: Install Docker Compose
        run: |
          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
          sudo chmod +x /usr/local/bin/docker-compose

      - name: Check Docker and Docker Compose versions
        run: |
          docker --version
          docker-compose --version

      # Writes the Spark image definition. The heredoc is unquoted, so `\$`
      # keeps the variable references literal for Docker to expand.
      # The tarball is fetched from archive.apache.org because
      # downloads.apache.org only hosts the current releases, and
      # netcat-openbsd is installed because a later connectivity check
      # calls `nc` inside the container.
      - name: Create Dockerfile
        run: |
          cat << EOF > Dockerfile
          FROM openjdk:11-jdk-slim
          ENV SPARK_VERSION=3.5.1
          ENV HADOOP_VERSION=3.3.4
          ENV SPARK_HOME=/spark
          RUN apt-get update && apt-get install -y curl wget procps rsync ssh iputils-ping net-tools jq netcat-openbsd
          RUN wget --tries=5 --retry-connrefused --waitretry=1 --timeout=20 https://archive.apache.org/dist/spark/spark-\${SPARK_VERSION}/spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
              tar -xvzf spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
              mv spark-\${SPARK_VERSION}-bin-hadoop3 \${SPARK_HOME} && \
              rm spark-\${SPARK_VERSION}-bin-hadoop3.tgz
          ENV PATH=\$PATH:\${SPARK_HOME}/bin:\${SPARK_HOME}/sbin
          WORKDIR \${SPARK_HOME}
          CMD ["bash"]
          EOF

      # Mounted into the master container as /spark/conf/spark-defaults.conf.
      - name: Create spark-defaults.conf
        run: |
          cat << EOF > spark-defaults.conf
          spark.sql.hive.thriftServer.singleSession true
          spark.hadoop.hive.server2.thrift.port 10000
          EOF

      # Two services on one bridge network: the master (which also starts the
      # Thrift Server and publishes port 10000) and a single worker.
      - name: Create docker-compose.yml
        run: |
          cat << EOF > docker-compose.yml
          version: '3'
          networks:
            spark-network:
              driver: bridge
          services:
            spark-master:
              build: .
              command: |
                bash -c "
                /spark/bin/spark-sql --conf spark.sql.hive.metastore.jars=builtin --conf spark.sql.hive.metastore.version=2.3.9 --conf spark.sql.catalogImplementation=hive -e 'show databases;' &&
                /spark/sbin/start-master.sh &&
                /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 &&
                tail -f /spark/logs/*"
              ports:
                - "8080:8080"
                - "7077:7077"
                - "10000:10000"
                - "4040:4040"
              environment:
                - SPARK_MODE=master
                - SPARK_MASTER_HOST=spark-master
                - SPARK_MASTER_PORT=7077
                - SPARK_MASTER_WEBUI_PORT=8080
              volumes:
                - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
              networks:
                - spark-network
            spark-worker:
              build: .
              command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
              depends_on:
                - spark-master
              environment:
                - SPARK_MODE=worker
                - SPARK_WORKER_CORES=2
                - SPARK_WORKER_MEMORY=2g
                - SPARK_WORKER_PORT=8081
                - SPARK_WORKER_WEBUI_PORT=8081
                - SPARK_MASTER=spark://spark-master:7077
              networks:
                - spark-network
          EOF

      - name: Debug Docker Compose file
        run: cat docker-compose.yml

      - name: Build and start Spark cluster
        run: |
          docker-compose build --no-cache
          docker-compose up -d

      - name: Check running containers
        run: docker ps

      # Fixed delay; nothing here polls for readiness.
      - name: Wait for services to start
        run: |
          echo "Waiting for Spark services to start..."
          sleep 120

      # The network and container names below assume the compose project name
      # is derived from the working directory (`integration_tests`).
      - name: Check Docker network
        run: |
          docker network ls
          docker network inspect integration_tests_spark-network

      - name: Print Docker logs
        run: |
          echo "Docker logs for spark-master:"
          docker-compose logs --tail=1000 spark-master
          echo "Docker logs for spark-worker:"
          docker-compose logs --tail=1000 spark-worker

      - name: Inspect Docker containers
        run: |
          echo "Inspecting spark-master container:"
          docker inspect integration_tests_spark-master_1
          echo "Inspecting spark-worker container:"
          docker inspect integration_tests_spark-worker_1

      - name: Check Spark cluster status
        run: |
          docker-compose exec -T spark-master bash -c "jps && ps aux | grep spark && netstat -tuln"
          docker-compose exec -T spark-worker bash -c "jps && ps aux | grep spark && netstat -tuln"

      - name: Debug Spark Master Configuration
        run: docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf"

      - name: Debug Spark Master Logs
        run: docker-compose exec -T spark-master bash -c "cat /spark/logs/spark--org.apache.spark.deploy.master.Master-*.out"

      - name: Check ThriftServer Process
        run: docker-compose exec -T spark-master bash -c "ps aux | grep ThriftServer"

      - name: List Spark Logs
        run: docker-compose exec -T spark-master bash -c "ls -l /spark/logs/"

      # `\$` defers the command substitution to the shell inside the container.
      - name: Check Latest ThriftServer Log
        run: docker-compose exec -T spark-master bash -c "tail -n 50 /spark/logs/\$(ls -t /spark/logs/ | grep thriftserver | head -n1)"

      - name: Check if port 10000 is actually listening inside the spark-master container
        run: docker-compose exec -T spark-master bash -c "netstat -tuln | grep 10000"

      # - name: Try to connect to the Thrift server from the spark-master container itself (with timeout)
      #   run: docker-compose exec -T spark-master bash -c "timeout 5 curl -v telnet://spark-master:10000"

      # `nc` is provided by the netcat-openbsd package installed in the image.
      - name: Check network connectivity
        run: |
          docker-compose exec -T spark-master bash -c "ping -c 4 spark-master"
          docker-compose exec -T spark-master bash -c "nc -zv spark-master 10000"

      - name: Verify ThriftServer JDBC URL
        run: |
          docker-compose exec -T spark-master bash -c 'echo "jdbc:hive2://spark-master:10000"'

      - name: Test ThriftServer connection with Beeline
        run: |
          docker-compose exec -T spark-master bash -c '
            beeline -u "jdbc:hive2://spark-master:10000" -n root -e "SHOW DATABASES;"
          '

      # Purely diagnostic: nothing in this workflow enables event logging or
      # creates /spark/spark-events, so a missing directory must not fail the
      # job.
      - name: Check Spark event logs
        run: |
          docker-compose exec -T spark-master bash -c "cat /spark/spark-events/* 2>/dev/null || echo 'No Spark event logs found under /spark/spark-events'"

      - name: Run simple Spark SQL query
        run: |
          docker-compose exec -T spark-master bash -c '
            spark-sql --conf spark.sql.hive.thriftServer.singleSession=true \
                      --conf spark.sql.catalogImplementation=hive \
                      -e "SELECT 1 as test;"
          '

      - name: Check Spark Master UI
        run: |
          docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.workers'"

      - name: Verify Hive metastore
        run: |
          docker-compose exec -T spark-master bash -c '
            spark-sql --conf spark.sql.hive.metastore.jars=builtin \
                      --conf spark.sql.hive.metastore.version=2.3.9 \
                      --conf spark.sql.catalogImplementation=hive \
                      -e "show databases;"
          '

      - name: Check ThriftServer UI
        run: |
          docker-compose exec -T spark-master bash -c "curl -s http://spark-master:4040/api/v1/applications | jq '.[0].name'"

      - name: Check Spark Applications
        run: |
          docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.activeapps[0].name'"

      - name: Wait for Thrift Server
        run: |
          echo "Waiting for Thrift Server to be fully operational..."
          sleep 30

      # The same cleanup macro runs before and after the test suite.
      - name: 'Pre-test: Drop ci schemas'
        run: |
          dbt run-operation post_ci_cleanup --target spark

      - name: Run tests
        run: ./.scripts/integration_tests.sh -d spark

      - name: 'Post-test: Drop ci schemas'
        run: |
          dbt run-operation post_ci_cleanup --target spark

      # Runs even when an earlier step failed so the containers are removed.
      - name: Cleanup Spark cluster
        if: always()
        run: |
          docker-compose down