# Update pr_tests_spark.yml #20
# Workflow file for this run
name: pr_tests_spark | |
on: | |
pull_request: | |
branches: | |
- main | |
push: | |
branches: | |
- feature/** | |
- dev | |
- staging | |
- template-spark-tests | |
- spark_prep | |
concurrency: dbt_integration_tests | |
env: | |
DBT_PROFILES_DIR: ./ci | |
SPARK_MASTER_HOST: spark-master | |
SPARK_USER: spark | |
SPARK_SCHEMA: default | |
jobs: | |
pr_tests: | |
name: pr_tests | |
runs-on: ubuntu-latest | |
defaults: | |
run: | |
working-directory: ./integration_tests | |
strategy: | |
matrix: | |
dbt_version: | |
- 1.* | |
warehouse: | |
- spark | |
steps: | |
- name: Check out | |
uses: actions/checkout@v3 | |
- name: Set SCHEMA_SUFFIX env | |
run: >- | |
echo "SCHEMA_SUFFIX=$(echo ${DBT_VERSION%.*} | tr . _)" >> $GITHUB_ENV | |
env: | |
DBT_VERSION: '${{ matrix.dbt_version }}' | |
- name: Set DEFAULT_TARGET env | |
run: | | |
echo "DEFAULT_TARGET=${{ matrix.warehouse }}" >> $GITHUB_ENV | |
- name: Python setup | |
uses: actions/setup-python@v4 | |
with: | |
python-version: 3.8.x | |
- name: Pip cache | |
uses: actions/cache@v3 | |
with: | |
path: ~/.cache/pip | |
key: >- | |
${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }} | |
restore-keys: >- | |
${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }} | |
- name: Install spark dependencies | |
run: | | |
pip install --upgrade pip wheel setuptools | |
pip install -Iv "dbt-spark[PyHive]==${{ matrix.dbt_version }}" --upgrade | |
dbt deps | |
- name: Set up Docker Buildx | |
uses: docker/setup-buildx-action@v1 | |
- name: Install Docker Compose | |
run: | | |
sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose | |
sudo chmod +x /usr/local/bin/docker-compose | |
- name: Check Docker and Docker Compose versions | |
run: | | |
docker --version | |
docker-compose --version | |
- name: Create Dockerfile | |
run: | | |
cat << EOF > Dockerfile | |
FROM openjdk:11-jdk-slim | |
ENV SPARK_VERSION=3.5.1 | |
ENV HADOOP_VERSION=3.3.4 | |
ENV SPARK_HOME=/spark | |
RUN apt-get update && apt-get install -y curl wget procps rsync ssh iputils-ping net-tools jq | |
RUN wget --tries=5 --retry-connrefused --waitretry=1 --timeout=20 https://downloads.apache.org/spark/spark-\${SPARK_VERSION}/spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \ | |
tar -xvzf spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \ | |
mv spark-\${SPARK_VERSION}-bin-hadoop3 \${SPARK_HOME} && \ | |
rm spark-\${SPARK_VERSION}-bin-hadoop3.tgz | |
ENV PATH=\$PATH:\${SPARK_HOME}/bin:\${SPARK_HOME}/sbin | |
RUN mkdir -p /spark/spark-warehouse && \ | |
chown -R root:root /spark/spark-warehouse && \ | |
chmod -R 777 /spark/spark-warehouse | |
WORKDIR \${SPARK_HOME} | |
CMD ["bash"] | |
EOF | |
- name: Create spark-defaults.conf | |
run: | | |
cat << EOF > spark-defaults.conf | |
spark.sql.hive.thriftServer.singleSession true | |
spark.hadoop.hive.server2.thrift.port 10000 | |
spark.sql.warehouse.dir /spark/spark-warehouse | |
javax.jdo.option.ConnectionURL jdbc:derby:;databaseName=/spark/metastore_db;create=true | |
EOF | |
- name: Create docker-compose.yml | |
run: | | |
cat << EOF > docker-compose.yml | |
version: '3' | |
networks: | |
spark-network: | |
driver: bridge | |
services: | |
spark-master: | |
build: . | |
command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"] | |
hostname: spark-master | |
ports: | |
- '8080:8080' | |
- '7077:7077' | |
environment: | |
- SPARK_LOCAL_IP=spark-master | |
- SPARK_MASTER_HOST=spark-master | |
- SPARK_MASTER_PORT=7077 | |
- SPARK_MASTER_OPTS="-Dspark.driver.memory=2g" | |
volumes: | |
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf | |
networks: | |
- spark-network | |
spark-worker: | |
build: . | |
command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"] | |
depends_on: | |
- spark-master | |
environment: | |
- SPARK_WORKER_CORES=2 | |
- SPARK_WORKER_MEMORY=4G | |
- SPARK_EXECUTOR_MEMORY=3G | |
- SPARK_LOCAL_IP=spark-worker | |
- SPARK_MASTER=spark://spark-master:7077 | |
volumes: | |
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf | |
networks: | |
- spark-network | |
thrift-server: | |
build: . | |
command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"] | |
ports: | |
- '10000:10000' | |
depends_on: | |
- spark-master | |
- spark-worker | |
environment: | |
- SPARK_LOCAL_IP=thrift-server | |
volumes: | |
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf | |
networks: | |
- spark-network | |
EOF | |
- name: Debug Docker Compose file | |
run: cat docker-compose.yml | |
- name: Build and start Spark cluster | |
run: | | |
docker-compose build --no-cache | |
docker-compose up -d | |
echo "Waiting for Spark services to start..." | |
sleep 90 # Increased wait time to account for Thrift Server startup | |
- name: Check running containers | |
run: docker ps | |
- name: Initialize Metastore | |
run: | | |
echo "Initializing metastore..." | |
docker-compose exec -T thrift-server bash -c ' | |
spark-sql --conf spark.sql.hive.metastore.version=2.3.9 \ | |
--conf spark.sql.hive.metastore.jars=builtin \ | |
--conf spark.sql.warehouse.dir=/spark/spark-warehouse \ | |
--conf javax.jdo.option.ConnectionURL="jdbc:derby:;databaseName=/spark/metastore_db;create=true" \ | |
-e "CREATE DATABASE IF NOT EXISTS default;" | |
' | |
echo "Metastore initialization completed." | |
- name: Debug Spark Warehouse | |
run: | | |
docker-compose exec -T spark-master bash -c " | |
ls -la /spark/spark-warehouse | |
ls -la /spark | |
" | |
- name: Check Docker network | |
run: | | |
docker network ls | |
docker network inspect integration_tests_spark-network | |
- name: Print Docker logs | |
run: | | |
echo "Docker logs for spark-master:" | |
docker-compose logs --tail=1000 spark-master | |
echo "Docker logs for spark-worker:" | |
docker-compose logs --tail=1000 spark-worker | |
- name: Inspect Docker containers | |
run: | | |
echo "Inspecting spark-master container:" | |
docker inspect integration_tests_spark-master_1 | |
echo "Inspecting spark-worker container:" | |
docker inspect integration_tests_spark-worker_1 | |
- name: Check Spark cluster status | |
run: | | |
docker-compose exec -T spark-master bash -c "jps && ps aux | grep spark && netstat -tuln" | |
docker-compose exec -T spark-worker bash -c "jps && ps aux | grep spark && netstat -tuln" | |
- name: Debug Spark Master Configuration | |
run: docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf" | |
- name: Debug Spark Master Logs | |
run: docker-compose exec -T spark-master bash -c "cat /spark/logs/spark--org.apache.spark.deploy.master.Master-*.out" | |
- name: Check ThriftServer Process | |
run: docker-compose exec -T thrift-server bash -c "ps aux | grep ThriftServer" | |
- name: Check Latest ThriftServer Log | |
run: docker-compose exec -T thrift-server bash -c "tail -n 50 /spark/logs/\$(ls -t /spark/logs/ | grep thriftserver | head -n1)" | |
- name: Test ThriftServer connection with Beeline | |
run: | | |
docker-compose exec -T thrift-server bash -c ' | |
beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;" | |
' | |
- name: List Spark Logs | |
run: docker-compose exec -T spark-master bash -c "ls -l /spark/logs/" | |
- name: Check if port 10000 is actually listening inside the spark-master container | |
run: docker-compose exec -T spark-master bash -c "netstat -tuln | grep 10000" | |
- name: Verify ThriftServer JDBC URL | |
run: | | |
docker-compose exec -T spark-master bash -c 'echo "jdbc:hive2://spark-master:10000"' | |
- name: Create Spark events directory | |
run: | | |
docker-compose exec -T spark-master bash -c "mkdir -p /spark/spark-events && ls -l /spark/spark-events" | |
- name: Run simple Spark SQL query | |
run: | | |
echo "Running Spark SQL query..." | |
docker-compose exec -T spark-master bash -c ' | |
spark-sql --conf spark.sql.hive.metastore.version=2.3.9 \ | |
--conf spark.sql.hive.metastore.jars=builtin \ | |
--conf spark.sql.warehouse.dir=/spark/spark-warehouse \ | |
--conf javax.jdo.option.ConnectionURL="jdbc:derby:;databaseName=/spark/metastore_db;create=true" \ | |
-e "SELECT 1 as test;" | |
' | |
echo "Spark SQL query completed." | |
- name: Check Spark Master UI | |
run: | | |
echo "Checking Spark Master UI..." | |
docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.workers'" | |
- name: Verify Hive metastore | |
run: | | |
echo "Verifying Hive metastore..." | |
docker-compose exec -T spark-master bash -c ' | |
spark-sql --conf spark.sql.hive.metastore.version=2.3.9 \ | |
--conf spark.sql.hive.metastore.jars=builtin \ | |
--conf spark.sql.warehouse.dir=/spark/spark-warehouse \ | |
--conf javax.jdo.option.ConnectionURL="jdbc:derby:;databaseName=/spark/metastore_db;create=true" \ | |
-e "SHOW DATABASES;" | |
' | |
- name: Check ThriftServer UI | |
run: | | |
echo "Checking ThriftServer UI..." | |
docker-compose exec -T spark-master bash -c "curl -s http://spark-master:4040/api/v1/applications | jq '.[0].name'" | |
- name: Check Spark Applications | |
run: | | |
echo "Checking Spark Applications..." | |
docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.activeapps[0].name'" | |
- name: Wait for Thrift Server | |
run: | | |
echo "Waiting for Thrift Server to be fully operational..." | |
sleep 30 | |
- name: 'Pre-test: Drop ci schemas' | |
run: | | |
dbt run-operation post_ci_cleanup --target spark | |
- name: Run tests | |
run: | | |
echo "Running DBT tests..." | |
./.scripts/integration_tests.sh -d spark | |
echo "DBT tests completed." | |
- name: 'Post-test: Drop ci schemas' | |
run: | | |
dbt run-operation post_ci_cleanup --target spark | |
- name: Cleanup Spark cluster | |
if: always() | |
run: | | |
docker-compose down |