This repository has been archived by the owner on May 1, 2023. It is now read-only.
forked from alltheplaces/alltheplaces
-
Notifications
You must be signed in to change notification settings - Fork 0
170 lines (146 loc) · 6.75 KB
/
spider-pr.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
name: Run Changed Spiders in Pull Requests
# Trigger: pull requests that modify spider modules. (The previous comment
# claimed this ran "on pushes", which was wrong — only pull_request events
# matching the paths filter fire this workflow.)
on:
  pull_request:
    paths:
      - 'locations/spiders/*.py'
jobs:
  test-spider:
    runs-on: ubuntu-latest
    env:
      # NOTE(review): base64-obfuscated AWS credentials committed to the repo.
      # Obfuscation is not protection — these must be scoped, low-privilege,
      # write-only CI keys. Prefer repository secrets or OIDC role assumption.
      AWS_KEY: QUtJQTRLMllUQzIzRzVZRkxENkg=
      AWS_SEC: b1RmNXNxc0VSNlNKdFNhbUdlcjJwZHR6YXQwNXpRRDRPZUtoVkpPSA==
      BUCKET: alltheplaces.openaddresses.io
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
          cache: 'pipenv'
      - name: Install pipenv
        run: |
          pip --quiet install pipenv
      - name: Install dependencies
        run: |
          pipenv --bare install --deploy
          sudo apt-get install jq
      - name: Run Scrapy check
        run: |
          pipenv run scrapy check
      - name: Run spiders that changed
        shell: bash
        run: |
          # Markdown table header for the PR comment; "\n" stays literal in
          # bash and becomes a JSON newline escape when POSTed below.
          PR_COMMENT_BODY="I ran the spiders in this pull request and got these results:\\n\\n|Spider|Results|Log|\\n|---|---|---|\\n"
          # Decode the obfuscated credentials for the aws CLI.
          export AWS_ACCESS_KEY_ID=$(echo -n "${AWS_KEY}" | base64 --decode)
          export AWS_SECRET_ACCESS_KEY=$(echo -n "${AWS_SEC}" | base64 --decode)
          export AWS_REGION="us-east-1"
          # Ask the GitHub API which files this PR touches.
          # NOTE(review): ${{ }} expressions are interpolated into this script
          # before it runs — keep untrusted PR data (titles, branch names) out
          # of such interpolations; event numbers and secrets are safe here.
          pr_file_changes=$(curl -sL --header 'authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' "https://api.github.com/repos/alltheplaces/alltheplaces/pulls/${{ github.event.number }}/files")
          (>&2 echo "PR response: ${pr_file_changes}")
          # Keep only files that still exist (skip deletions).
          SPIDERS=$(jq -r '.[] | select(.status != "removed") | .filename' <<< "$pr_file_changes")
          retval=$?
          if [ "$retval" -ne 0 ]; then
            (>&2 echo "checking file changes failed. response was ${pr_file_changes}")
            exit 1
          fi
          # Cap the number of spiders a single PR may exercise in CI.
          if [ "$(wc -l <<< "$SPIDERS")" -gt 5 ]; then
            (>&2 echo "refusing to run on more than 5 spiders")
            exit 1
          fi
          # $SPIDERS is deliberately unquoted: one repo-relative path per word,
          # and spider paths contain no whitespace. The -n guard prevents grep
          # from falling back to stdin when the list is empty.
          if [ -n "$SPIDERS" ] && grep -q -m 1 PLAYWRIGHT $SPIDERS; then
            echo "Playwright detected. Installing requirements."
            pipenv run playwright install-deps
            pipenv run playwright install firefox
          fi
          RUN_DIR="${GITHUB_WORKSPACE}/output"
          EXIT_CODE=0
          for file_changed in $SPIDERS
          do
            # The PR may also touch non-spider files; only run actual spiders.
            if [[ "$file_changed" != locations/spiders/* ]]; then
              echo "${file_changed} is not a spider. Skipping."
              continue
            fi
            spider="${file_changed}"
            (>&2 echo "${spider} running ...")
            SPIDER_NAME=$(basename "$spider")
            SPIDER_NAME=${SPIDER_NAME%.py}
            SPIDER_RUN_DIR="${RUN_DIR}/${SPIDER_NAME}"
            mkdir -p "${SPIDER_RUN_DIR}"
            LOGFILE="${SPIDER_RUN_DIR}/log.txt"
            OUTFILE="${SPIDER_RUN_DIR}/output.geojson"
            # Short, bounded run: stop after 60s or on the first error.
            pipenv run scrapy runspider \
              -o "file://${OUTFILE}:geojson" \
              --loglevel=INFO \
              --logfile="${LOGFILE}" \
              -s CLOSESPIDER_TIMEOUT=60 \
              -s CLOSESPIDER_ERRORCOUNT=1 \
              -s LOGSTATS_FILE="${SPIDER_RUN_DIR}/stats.json" \
              "$spider"
            # Classify the run by scanning the log for closespider markers.
            FAILURE_REASON="success"
            if grep -q "Spider closed (closespider_errorcount)" "$LOGFILE"; then
              (>&2 echo "${spider} exited with errors")
              EXIT_CODE=1
              FAILURE_REASON="exception"
            elif grep -q "Spider closed (closespider_timeout)" "$LOGFILE"; then
              # A timeout is expected for long-running spiders; not a failure.
              (>&2 echo "${spider} exited because of timeout")
              FAILURE_REASON="timeout"
            fi
            aws --only-show-errors s3 cp "${LOGFILE}" "s3://${BUCKET}/ci/${GITHUB_RUN_ID}/${SPIDER_NAME}/log.txt"
            retval=$?
            if [ "$retval" -ne 0 ]; then
              (>&2 echo "log copy to s3 failed with exit code ${retval}")
              exit 1
            fi
            LOGFILE_URL="https://data.alltheplaces.xyz/ci/${GITHUB_RUN_ID}/${SPIDER_NAME}/log.txt"
            echo "${spider} log: ${LOGFILE_URL}"
            if [ -f "$OUTFILE" ]; then
              # item_scraped_count is absent ("null") when nothing was scraped.
              FEATURE_COUNT=$(jq --raw-output '.item_scraped_count' "${SPIDER_RUN_DIR}/stats.json")
              if [ "$FEATURE_COUNT" = "null" ]; then
                FEATURE_COUNT="0"
              fi
              if [ "$FEATURE_COUNT" = "0" ]; then
                echo "${spider} has no output"
                FAILURE_REASON="no output"
                PR_COMMENT_BODY="${PR_COMMENT_BODY}|[\`$spider\`](https://github.com/${GITHUB_REPOSITORY}/blob/${GITHUB_SHA}/${spider})| (No Output) |Resulted in a \`${FAILURE_REASON}\` ([Log](${LOGFILE_URL}))|\\n"
                EXIT_CODE=1
                continue
              fi
              aws s3 cp --only-show-errors "${OUTFILE}" "s3://${BUCKET}/ci/${GITHUB_RUN_ID}/${SPIDER_NAME}/output.geojson"
              retval=$?
              if [ "$retval" -ne 0 ]; then
                (>&2 echo "output copy to s3 failed with exit code ${retval}")
                exit 1
              fi
              OUTFILE_URL="https://data.alltheplaces.xyz/ci/${GITHUB_RUN_ID}/${SPIDER_NAME}/output.geojson"
              if grep -q 'Stored geojson feed' "$LOGFILE"; then
                echo "${spider} has ${FEATURE_COUNT} features: https://data.alltheplaces.xyz/map.html?show=${OUTFILE_URL}"
              fi
              PR_COMMENT_BODY="${PR_COMMENT_BODY}|[\`$spider\`](https://github.com/${GITHUB_REPOSITORY}/blob/${GITHUB_SHA}/${spider})|[${FEATURE_COUNT} items](${OUTFILE_URL}) ([Map](https://data.alltheplaces.xyz/map.html?show=${OUTFILE_URL}))|Resulted in a \`${FAILURE_REASON}\` ([Log](${LOGFILE_URL}))|\\n"
            else
              echo "${spider} has no output"
              FAILURE_REASON="no output"
              PR_COMMENT_BODY="${PR_COMMENT_BODY}|[\`$spider\`](https://github.com/${GITHUB_REPOSITORY}/blob/${GITHUB_SHA}/${spider})| (No Output) |Resulted in a \`${FAILURE_REASON}\` ([Log](${LOGFILE_URL}))|\\n"
              EXIT_CODE=1
            fi
            (>&2 echo "${spider} done")
          done
          # If no spider actually ran there is nothing to report; stop before
          # posting an empty results table. (Bug fix: this previously `echo`ed
          # the exit code instead of exiting, so the comment was posted anyway.)
          if [ -z "$(ls "${RUN_DIR}" 2>/dev/null)" ]; then
            echo "Nothing ran. Exiting."
            exit $EXIT_CODE
          fi
          if [ -z "${{ secrets.GITHUB_TOKEN }}" ]; then
            echo "No GITHUB_TOKEN set"
          else
            if [ "${{ github.event.number }}" != "false" ]; then
              # Post the accumulated results table as a PR comment.
              curl \
                -s \
                -XPOST \
                -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
                -d "{\"body\":\"${PR_COMMENT_BODY}\"}" \
                "https://api.github.com/repos/${GITHUB_REPOSITORY}/issues/${{ github.event.number }}/comments"
              echo "Added a comment to pull https://github.com/${GITHUB_REPOSITORY}/pull/${{ github.event.number }}"
            else
              echo "Not posting to GitHub because no pull event number set"
            fi
          fi
          exit $EXIT_CODE