# pr_tests_spark.yml
name: Spark S3 Integration Tests
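# Spins up a standalone Spark cluster (master, worker, Thrift server) with Docker Compose,
# backed by S3 and an AWS Glue Iceberg catalog, then runs the dbt integration tests against
# it over the Thrift/PyHive connection. The concurrency group below serializes runs so
# parallel PRs do not fight over the same CI schemas.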
on:
pull_request:
branches:
- main
push:
branches:
- feature/**
- dev
- staging
- template-spark-tests
- spark_prep
concurrency: dbt_integration_tests
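# Connection defaults; SPARK_USER, SPARK_SCHEMA and the AWS regions are presumably read by
# the dbt profiles under ./ci and by the AWS SDK inside the containers started below.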
env:
DBT_PROFILES_DIR: ./ci
SPARK_MASTER_HOST: spark-master
SPARK_USER: spark
SPARK_SCHEMA: default
AWS_REGION: eu-west-1
AWS_DEFAULT_REGION: eu-west-1
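# For reference, a ./ci/profiles.yml spark target compatible with this setup could look
# roughly like the sketch below. This is an illustrative assumption only (the profile name
# and the retry/timeout values are made up); the real file is whatever ships in ./ci.
#
#   integration_tests:
#     target: spark
#     outputs:
#       spark:
#         type: spark
#         method: thrift
#         host: localhost
#         port: 10000
#         user: "{{ env_var('SPARK_USER') }}"
#         schema: "{{ env_var('SPARK_SCHEMA') }}"
#         connect_retries: 5
#         connect_timeout: 60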
jobs:
spark_s3_integration_tests:
name: Spark S3 Integration Tests
runs-on: ubuntu-latest
defaults:
run:
working-directory: ./integration_tests
strategy:
matrix:
dbt_version:
- 1.*
warehouse:
- spark
steps:
- name: Check out
uses: actions/checkout@v3
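      # Derive a schema suffix from the matrix dbt version ("1.*" becomes "1"), presumably
      # used by the ./ci profiles to keep test schemas for different dbt versions apart.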
- name: Set SCHEMA_SUFFIX env
run: >-
echo "SCHEMA_SUFFIX=$(echo ${DBT_VERSION%.*} | tr . _)" >> $GITHUB_ENV
env:
DBT_VERSION: '${{ matrix.dbt_version }}'
- name: Set DEFAULT_TARGET env
run: |
echo "DEFAULT_TARGET=${{ matrix.warehouse }}" >> $GITHUB_ENV
- name: Python setup
uses: actions/setup-python@v4
with:
python-version: 3.8.x
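      # Pin dbt-spark with the PyHive extra so dbt can talk to the Thrift server started
      # below; boto3/awscli cover the S3 side, and dbt deps installs package dependencies.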
- name: Install dependencies
run: |
pip install --upgrade pip wheel setuptools
pip install -Iv "dbt-spark[PyHive]==${{ matrix.dbt_version }}" --upgrade
pip install boto3 awscli
dbt deps
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Install Docker Compose
run: |
sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
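      # A single image, built from this Dockerfile, is reused by the master, worker and
      # Thrift server services: Spark 3.5.1 on OpenJDK 11 plus the hadoop-aws, AWS SDK
      # bundle and Iceberg Spark runtime jars needed for S3A access and the Glue catalog.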
- name: Create Dockerfile
run: |
cat << EOF > Dockerfile
FROM openjdk:11-jdk-slim
ENV SPARK_VERSION=3.5.1
ENV HADOOP_VERSION=3.3.4
ENV SPARK_HOME=/spark
RUN apt-get update && apt-get install -y curl wget procps rsync ssh iputils-ping net-tools jq
RUN wget --tries=5 --retry-connrefused --waitretry=1 --timeout=20 https://downloads.apache.org/spark/spark-\${SPARK_VERSION}/spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
tar -xvzf spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
mv spark-\${SPARK_VERSION}-bin-hadoop3 \${SPARK_HOME} && \
rm spark-\${SPARK_VERSION}-bin-hadoop3.tgz
ENV PATH=\$PATH:\${SPARK_HOME}/bin:\${SPARK_HOME}/sbin
RUN mkdir -p /spark/spark-warehouse && \
chown -R root:root /spark/spark-warehouse && \
chmod -R 777 /spark/spark-warehouse
# Install AWS Glue libraries
RUN curl -L -o /spark/jars/aws-java-sdk-bundle-1.11.1026.jar https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar && \
curl -L -o /spark/jars/hadoop-aws-3.3.4.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
curl -L -o /spark/jars/iceberg-spark-runtime-3.5_2.12-1.3.1.jar https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.3.1/iceberg-spark-runtime-3.5_2.12-1.3.1.jar
WORKDIR \${SPARK_HOME}
CMD ["bash"]
EOF
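      # Shared Spark configuration: the warehouse lives on S3 via S3A, and an Iceberg
      # SparkCatalog named "glue", backed by AWS Glue, is made the default catalog.
      # Because the EOF delimiter is unquoted, the AWS credentials are expanded by the
      # runner shell when this file is written.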
- name: Create spark-defaults.conf
run: |
cat << EOF > spark-defaults.conf
spark.master spark://spark-master:7077
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog glue
spark.sql.catalog.glue.database dbt-spark-iceberg
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key ${AWS_ACCESS_KEY_ID}
spark.hadoop.fs.s3a.secret.key ${AWS_SECRET_ACCESS_KEY}
spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.region eu-west-1
spark.hadoop.fs.s3a.aws.region eu-west-1
spark.hadoop.com.amazonaws.services.s3.enableV4 true
spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory
EOF
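      # Three services share one bridge network: a master (UI on 8080), a worker, and a
      # Thrift server exposing HiveServer2 on 10000, which is what dbt-spark connects to.
      # The sleeps give the master time to come up before the worker and Thrift server start.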
- name: Create docker-compose.yml
run: |
cat << EOF > docker-compose.yml
version: '3'
networks:
spark-network:
driver: bridge
services:
spark-master:
build: .
command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
hostname: spark-master
ports:
- '8080:8080'
- '7077:7077'
environment:
- SPARK_LOCAL_IP=spark-master
- SPARK_MASTER_HOST=spark-master
- SPARK_MASTER_PORT=7077
- SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_REGION=eu-west-1
- AWS_DEFAULT_REGION=eu-west-1
volumes:
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf
networks:
- spark-network
spark-worker:
build: .
command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
depends_on:
- spark-master
environment:
- SPARK_WORKER_CORES=2
- SPARK_WORKER_MEMORY=4G
- SPARK_EXECUTOR_MEMORY=3G
- SPARK_LOCAL_IP=spark-worker
- SPARK_MASTER=spark://spark-master:7077
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_REGION=eu-west-1
- AWS_DEFAULT_REGION=eu-west-1
volumes:
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf
networks:
- spark-network
thrift-server:
build: .
command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
ports:
- '10000:10000'
depends_on:
- spark-master
- spark-worker
environment:
- SPARK_LOCAL_IP=thrift-server
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_REGION=eu-west-1
- AWS_DEFAULT_REGION=eu-west-1
volumes:
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf
networks:
- spark-network
EOF
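      # Build the image and start the cluster detached; the fixed sleep is a crude
      # readiness wait before the checks below.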
- name: Build and start Spark cluster
run: |
docker-compose build --no-cache
docker-compose up -d
echo "Waiting for Spark services to start..."
sleep 90
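      # The next several steps are diagnostics only: container, network, log and process
      # checks that make failures easier to debug from the workflow output.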
- name: Check running containers
run: docker ps
- name: Check Docker network
run: |
docker network ls
docker network inspect integration_tests_spark-network
- name: Print Docker logs
run: |
echo "Docker logs for spark-master:"
docker-compose logs --tail=1000 spark-master
echo "Docker logs for spark-worker:"
docker-compose logs --tail=1000 spark-worker
echo "Docker logs for thrift-server:"
docker-compose logs --tail=1000 thrift-server
- name: Check Spark cluster status
run: |
docker-compose exec -T spark-master bash -c "jps && ps aux | grep spark && netstat -tuln"
docker-compose exec -T spark-worker bash -c "jps && ps aux | grep spark && netstat -tuln"
docker-compose exec -T thrift-server bash -c "jps && ps aux | grep spark && netstat -tuln"
- name: Check Spark Master UI
run: |
echo "Checking Spark Master UI..."
docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.workers'"
- name: Verify Spark configuration
run: |
echo "Verifying Spark configuration..."
docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf"
- name: Wait for Thrift Server
run: |
echo "Waiting for Thrift Server to be fully operational..."
sleep 60 # Increased wait time
- name: Check ThriftServer Process
run: docker-compose exec -T thrift-server bash -c "ps aux | grep ThriftServer"
- name: Check Latest ThriftServer Log
run: docker-compose exec -T thrift-server bash -c "tail -n 50 /spark/logs/\$(ls -t /spark/logs/ | grep thriftserver | head -n1)"
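      # Sanity-check the JDBC endpoint from inside the container before running dbt.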
- name: Test ThriftServer connection with Beeline
run: |
docker-compose exec -T thrift-server bash -c '
beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;"
'
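      # post_ci_cleanup (a macro defined for these integration tests) drops the CI schemas;
      # running it both before and after the tests keeps reruns from hitting stale objects.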
- name: 'Pre-test: Drop ci schemas'
run: |
dbt run-operation post_ci_cleanup --target spark
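      # The actual test run: the helper script presumably wraps the dbt commands for the
      # selected warehouse (here spark).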
- name: Run tests
run: |
echo "Running DBT tests..."
./.scripts/integration_tests.sh -d spark
echo "DBT tests completed."
- name: 'Post-test: Drop ci schemas'
run: |
dbt run-operation post_ci_cleanup --target spark
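      # Tear the cluster down even if earlier steps failed.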
- name: Cleanup Spark cluster
if: always()
run: |
docker-compose down