# pr_tests_spark.yml
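# GitHub Actions workflow: stands up a single-node Spark cluster (master, worker and
# Thrift server) with Docker Compose, then runs the dbt integration tests against it.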
name: pr_tests_spark
on:
pull_request:
branches:
- main
push:
branches:
- feature/**
- dev
- staging
- template-spark-tests
- spark_prep
concurrency: dbt_integration_tests
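# Spark connection details exported to every step; the dbt profiles under ./ci
# presumably pick these up via env_var().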
env:
DBT_PROFILES_DIR: ./ci
SPARK_MASTER_HOST: spark-master
SPARK_USER: spark
SPARK_SCHEMA: default
jobs:
pr_tests:
name: pr_tests
runs-on: ubuntu-latest
defaults:
run:
working-directory: ./integration_tests
strategy:
matrix:
dbt_version:
- 1.*
warehouse:
- spark
steps:
- name: Check out
uses: actions/checkout@v3
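      # Derive a schema suffix from the dbt version (e.g. "1.*" -> "1", dots become
      # underscores), presumably to keep CI schemas unique per matrix entry.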
- name: Set SCHEMA_SUFFIX env
run: >-
echo "SCHEMA_SUFFIX=$(echo ${DBT_VERSION%.*} | tr . _)" >> $GITHUB_ENV
env:
DBT_VERSION: '${{ matrix.dbt_version }}'
- name: Set DEFAULT_TARGET env
run: |
echo "DEFAULT_TARGET=${{ matrix.warehouse }}" >> $GITHUB_ENV
- name: Python setup
uses: actions/setup-python@v4
with:
python-version: 3.8.x
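      # Cache pip downloads per dbt version / warehouse combination.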
- name: Pip cache
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: >-
${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }}
restore-keys: >-
${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }}
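      # dbt-spark with the PyHive extra provides the Thrift connection method used
      # to reach the cluster started below.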
- name: Install spark dependencies
run: |
pip install --upgrade pip wheel setuptools
pip install -Iv "dbt-spark[PyHive]==${{ matrix.dbt_version }}" --upgrade
dbt deps
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
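      # Standalone docker-compose v1 binary; the container and network names used in
      # the later debug steps follow its underscore naming scheme.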
- name: Install Docker Compose
run: |
sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
- name: Check Docker and Docker Compose versions
run: |
docker --version
docker-compose --version
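      # Build a single image used by both master and worker: OpenJDK 11 plus the
      # pinned Spark 3.5.1 binary distribution.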
- name: Create Dockerfile
run: |
cat << EOF > Dockerfile
FROM openjdk:11-jdk-slim
ENV SPARK_VERSION=3.5.1
ENV HADOOP_VERSION=3.3.4
ENV SPARK_HOME=/spark
RUN apt-get update && apt-get install -y curl wget procps rsync ssh iputils-ping net-tools jq
RUN wget --tries=5 --retry-connrefused --waitretry=1 --timeout=20 https://downloads.apache.org/spark/spark-\${SPARK_VERSION}/spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
tar -xvzf spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
mv spark-\${SPARK_VERSION}-bin-hadoop3 \${SPARK_HOME} && \
rm spark-\${SPARK_VERSION}-bin-hadoop3.tgz
ENV PATH=\$PATH:\${SPARK_HOME}/bin:\${SPARK_HOME}/sbin
WORKDIR \${SPARK_HOME}
CMD ["bash"]
EOF
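      # Minimal Spark config: a single-session Thrift server on the default
      # HiveServer2 port 10000.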
- name: Create spark-defaults.conf
run: |
cat << EOF > spark-defaults.conf
spark.sql.hive.thriftServer.singleSession true
spark.hadoop.hive.server2.thrift.port 10000
EOF
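      # Two services on one bridge network: spark-master runs a one-off spark-sql
      # query to initialise the Hive metastore, then starts the standalone master and
      # the Thrift server; spark-worker registers with the master on port 7077.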
- name: Create docker-compose.yml
run: |
cat << EOF > docker-compose.yml
version: '3'
networks:
spark-network:
driver: bridge
services:
spark-master:
build: .
command: |
bash -c "
/spark/bin/spark-sql --conf spark.sql.hive.metastore.jars=builtin --conf spark.sql.hive.metastore.version=2.3.9 --conf spark.sql.catalogImplementation=hive -e 'show databases;' &&
/spark/sbin/start-master.sh &&
/spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 &&
tail -f /spark/logs/*"
ports:
- "8080:8080"
- "7077:7077"
- "10000:10000"
- "4040:4040"
environment:
- SPARK_MODE=master
- SPARK_MASTER_HOST=spark-master
- SPARK_MASTER_PORT=7077
- SPARK_MASTER_WEBUI_PORT=8080
volumes:
- ./spark-defaults.conf:/spark/conf/spark-defaults.conf
networks:
- spark-network
spark-worker:
build: .
command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
depends_on:
- spark-master
environment:
- SPARK_MODE=worker
- SPARK_WORKER_CORES=2
- SPARK_WORKER_MEMORY=2g
- SPARK_WORKER_PORT=8081
- SPARK_WORKER_WEBUI_PORT=8081
- SPARK_MASTER=spark://spark-master:7077
networks:
- spark-network
EOF
- name: Debug Docker Compose file
run: cat docker-compose.yml
- name: Build and start Spark cluster
run: |
docker-compose build --no-cache
docker-compose up -d
- name: Check running containers
run: docker ps
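      # No health endpoint is polled; a fixed sleep is assumed to be long enough for
      # the master, worker and Thrift server to come up.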
- name: Wait for services to start
run: |
echo "Waiting for Spark services to start..."
sleep 120
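      # The steps below are diagnostics only: network, container state, logs, running
      # processes and the Thrift server port.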
- name: Check Docker network
run: |
docker network ls
docker network inspect integration_tests_spark-network
- name: Print Docker logs
run: |
echo "Docker logs for spark-master:"
docker-compose logs --tail=1000 spark-master
echo "Docker logs for spark-worker:"
docker-compose logs --tail=1000 spark-worker
- name: Inspect Docker containers
run: |
echo "Inspecting spark-master container:"
docker inspect integration_tests_spark-master_1
echo "Inspecting spark-worker container:"
docker inspect integration_tests_spark-worker_1
- name: Check Spark cluster status
run: |
docker-compose exec -T spark-master bash -c "jps && ps aux | grep spark && netstat -tuln"
docker-compose exec -T spark-worker bash -c "jps && ps aux | grep spark && netstat -tuln"
- name: Debug Spark Master Configuration
run: docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf"
- name: Debug Spark Master Logs
run: docker-compose exec -T spark-master bash -c "cat /spark/logs/spark--org.apache.spark.deploy.master.Master-*.out"
- name: Check ThriftServer Process
run: docker-compose exec -T spark-master bash -c "ps aux | grep ThriftServer"
- name: List Spark Logs
run: docker-compose exec -T spark-master bash -c "ls -l /spark/logs/"
- name: Check Latest ThriftServer Log
run: docker-compose exec -T spark-master bash -c "tail -n 50 /spark/logs/\$(ls -t /spark/logs/ | grep thriftserver | head -n1)"
- name: Check if port 10000 is actually listening inside the spark-master container
run: docker-compose exec -T spark-master bash -c "netstat -tuln | grep 10000"
# - name: Try to connect to the Thrift server from the spark-master container itself (with timeout)
# run: docker-compose exec -T spark-master bash -c "timeout 5 curl -v telnet://spark-master:10000"
# - name: Check network connectivity
# run: |
# docker-compose exec -T spark-master bash -c "ping -c 4 spark-master"
# docker-compose exec -T spark-master bash -c "nc -zv spark-master 10000"
- name: Verify ThriftServer JDBC URL
run: |
docker-compose exec -T spark-master bash -c 'echo "jdbc:hive2://spark-master:10000"'
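      # Smoke-test the JDBC endpoint; this should match what the dbt profile in ./ci
      # connects to.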
- name: Test ThriftServer connection with Beeline
run: |
docker-compose exec -T spark-master bash -c '
beeline -u "jdbc:hive2://spark-master:10000" -n root -e "SHOW DATABASES;"
'
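      # Assumes event logging has written to /spark/spark-events; with only the
      # spark-defaults.conf above, this directory may be absent.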
- name: Check Spark event logs
run: |
docker-compose exec -T spark-master bash -c "cat /spark/spark-events/*"
- name: Run simple Spark SQL query
run: |
docker-compose exec -T spark-master bash -c '
spark-sql --conf spark.sql.hive.thriftServer.singleSession=true \
--conf spark.sql.catalogImplementation=hive \
-e "SELECT 1 as test;"
'
- name: Check Spark Master UI
run: |
docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.workers'"
- name: Verify Hive metastore
run: |
docker-compose exec -T spark-master bash -c '
spark-sql --conf spark.sql.hive.metastore.jars=builtin \
--conf spark.sql.hive.metastore.version=2.3.9 \
--conf spark.sql.catalogImplementation=hive \
-e "show databases;"
'
- name: Check ThriftServer UI
run: |
docker-compose exec -T spark-master bash -c "curl -s http://spark-master:4040/api/v1/applications | jq '.[0].name'"
- name: Check Spark Applications
run: |
docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.activeapps[0].name'"
- name: Wait for Thrift Server
run: |
echo "Waiting for Thrift Server to be fully operational..."
sleep 30
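      # post_ci_cleanup drops any leftover CI schemas before and after the test run.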
- name: 'Pre-test: Drop ci schemas'
run: |
dbt run-operation post_ci_cleanup --target spark
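      # The integration test script presumably seeds, runs and tests the dbt project
      # against the spark target.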
- name: Run tests
run: ./.scripts/integration_tests.sh -d spark
- name: 'Post-test: Drop ci schemas'
run: |
dbt run-operation post_ci_cleanup --target spark
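      # Always tear the cluster down, even when earlier steps failed.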
- name: Cleanup Spark cluster
if: always()
run: |
docker-compose down