#!/bin/bash -e

print_description() {
    echo 'This script can be used to deploy and update a `refine.bio` instance stack.'
    echo 'It will create all of the AWS infrastructure (roles/instances/db/network/etc),'
    echo 'open an ingress, kill all running Nomad jobs, perform a database migration,'
    echo 're-define and re-register Nomad job specifications, and finally close the'
    echo 'ingress. This can be run from a CI/CD machine or a local dev box.'
    echo 'This script must be run from /infrastructure!'
}

print_options() {
    echo 'This script accepts the following arguments: -e, -d, -v, -u, -r, and -h.'
    echo '-h prints this help message and exits.'
    echo '-e specifies the environment you would like to deploy to and is not optional. Its valid values are:'
    echo '   "-e prod" will deploy the production stack. This should only be used from a CD machine.'
    echo '   "-e staging" will deploy the staging stack. This should only be used from a CD machine.'
    echo '   "-e dev" will deploy a dev stack which is appropriate for a single developer to use to test.'
    echo '-d may be used to override the Dockerhub repo where the images will be pulled from.'
    echo '   This may also be specified by setting the TF_VAR_dockerhub_repo environment variable.'
    echo '   If unset, defaults to the value in `infrastructure/environments/$env`, which is "ccdlstaging"'
    echo '   for dev and staging environments and "ccdl" for prod.'
    echo '   This option is useful for testing code changes. Images with the code to be tested can be pushed'
    echo '   to your private Dockerhub repo and then the system will find them.'
    echo '-v specifies the version of the system which is being deployed and is not optional.'
    echo "-u specifies the username of the deployer. Should be the developer's name in development stacks."
    echo '   This option may be omitted, in which case the TF_VAR_user variable MUST be set instead.'
    echo '-r specifies the AWS region to deploy the stack to. Defaults to us-east-1.'
}

while getopts ":e:d:v:u:r:h" opt; do
    case $opt in
    e)
        export env=$OPTARG
        export TF_VAR_stage=$OPTARG
        ;;
    d)
        export TF_VAR_dockerhub_repo=$OPTARG
        ;;
    v)
        export SYSTEM_VERSION=$OPTARG
        ;;
    u)
        export TF_VAR_user=$OPTARG
        ;;
    r)
        export TF_VAR_region=$OPTARG
        ;;
    h)
        print_description
        echo
        print_options
        exit 0
        ;;
    \?)
        echo "Invalid option: -$OPTARG" >&2
        print_options >&2
        exit 1
        ;;
    :)
        echo "Option -$OPTARG requires an argument." >&2
        print_options >&2
        exit 1
        ;;
    esac
done

if [[ $env != "dev" && $env != "staging" && $env != "prod" ]]; then
    echo 'Error: must specify environment as either "dev", "staging", or "prod" with -e.'
    exit 1
fi

if [[ -z $TF_VAR_user ]]; then
    echo 'Error: must specify the username by either providing the -u argument or setting TF_VAR_user.'
    exit 1
fi

if [[ -z $SYSTEM_VERSION ]]; then
    echo 'Error: must specify the system version with -v.'
    exit 1
fi

if [[ -z $TF_VAR_region ]]; then
    TF_VAR_region=us-east-1
fi

# This function checks what the status of the Nomad agent is.
# Returns an HTTP response code, i.e. 200 means everything is ok.
check_nomad_status () {
    curl --write-out '%{http_code}' \
        --silent \
        --output /dev/null \
        "$NOMAD_ADDR/v1/status/leader"
}
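
# For example, this is how the polling loop further down uses it:
#   nomad_status=$(check_nomad_status)   # "200" once the leader responds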

# We have terraform output the environment variables via a single
# output variable, which we then read in as JSON using the command
# line tool `jq` so that we can use them from bash.
format_environment_variables () {
    json_env_vars=$(terraform output -json environment_variables | jq -c '.value[]')
    for row in $json_env_vars; do
        env_var_assignment=$(echo $row | jq -r ".name")=$(echo $row | jq -r ".value")
        export $env_var_assignment
        echo $env_var_assignment >> prod_env
    done
}
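
# For example, a row like {"name":"DATABASE_HOST","value":"10.0.0.5"}
# (hypothetical values) becomes `export DATABASE_HOST=10.0.0.5`, and the
# same assignment is appended to prod_env.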

# Load $ALL_CCDL_IMAGES and helper functions.
source ../common.sh

# Make our IP address known to terraform.
export TF_VAR_host_ip=$(dig +short myip.opendns.com @resolver1.opendns.com)

for IMAGE in $ALL_CCDL_IMAGES; do
    # For each image we need to set the env var that is used by our
    # scripts and the env var that gets picked up by terraform because
    # it is prefixed with TF_VAR_.
    IMAGE_UPPER=$(echo $IMAGE | tr a-z A-Z)
    export ${IMAGE_UPPER}_DOCKER_IMAGE=dr_$IMAGE:$SYSTEM_VERSION
    export TF_VAR_${IMAGE}_docker_image=dr_$IMAGE:$SYSTEM_VERSION
done
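
# For example, if $ALL_CCDL_IMAGES contained "foreman" and the version
# were v1.0.0 (hypothetical), this loop would set both
#   FOREMAN_DOCKER_IMAGE=dr_foreman:v1.0.0
#   TF_VAR_foreman_docker_image=dr_foreman:v1.0.0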

# Copy ingress config to top level so it can be applied.
cp deploy/ci_ingress.tf .

# Always init terraform first, especially since we're using a remote backend.
./init_terraform.sh

if [[ ! -f terraform.tfstate ]]; then
    ran_init_build=true
    echo "No terraform state file found, applying initial terraform deployment."

    # These files are inputs but are created by format_nomad_with_env.sh
    # based on outputs from terraform. Kind of a Catch-22, but we can
    # get around it by providing dummy files to get bootstrapped.
    touch api-configuration/environment
    touch foreman-configuration/environment

    # Output the plan for debugging deployments later.
    # Until terraform plan supports -var-file the plan is wrong.
    # terraform plan

    if [[ ! -z $CIRCLE_BUILD_NUM ]]; then
        # Make sure we can't expose secrets in circleci.
        terraform apply -var-file=environments/$env.tfvars -auto-approve > /dev/null
    else
        terraform apply -var-file=environments/$env.tfvars -auto-approve
    fi
fi

# We have to format the environment variables once before the initial
# deploy..
rm -f prod_env
format_environment_variables

../format_nomad_with_env.sh -p api -e $env -o $(pwd)/api-configuration/
../format_nomad_with_env.sh -p foreman -e $env -o $(pwd)/foreman-configuration/

if [[ -z $ran_init_build ]]; then
    # Open up ingress to AWS for Circle, stop jobs, migrate DB.
    echo "Deploying with ingress.."

    # Output the plan for debugging deployments later.
    # Until terraform plan supports -var-file the plan is wrong.
    # terraform plan

    if [[ ! -z $CIRCLE_BUILD_NUM ]]; then
        # Make sure we can't expose secrets in circleci.
        terraform apply -var-file=environments/$env.tfvars -auto-approve > /dev/null
    else
        terraform apply -var-file=environments/$env.tfvars -auto-approve
    fi
fi

# Find the address of the Nomad server.
export NOMAD_LEAD_SERVER_IP=$(terraform output nomad_server_1_ip)
export NOMAD_ADDR=http://$NOMAD_LEAD_SERVER_IP:4646
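
# 4646 is Nomad's default HTTP API port; /v1/status/leader (the endpoint
# hit by check_nomad_status above) returns the address of the cluster's
# current leader once the cluster is up.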

# Wait for Nomad to get started in case the server just went up for
# the first time.
echo "Confirming Nomad cluster.."
start_time=$(date +%s)
diff=0
nomad_status=$(check_nomad_status)
# Poll for up to 300 seconds. (-lt forces a numeric comparison; `<`
# inside [[ ]] would compare the strings lexicographically.)
while [[ $diff -lt 300 && $nomad_status != "200" ]]; do
    sleep 1
    nomad_status=$(check_nomad_status)
    let "diff = $(date +%s) - $start_time"
done

# Kill base Nomad jobs so no new jobs can be queued.
echo "Killing base jobs.. (this takes a while..)"
if [[ $(nomad status) != "No running jobs" ]]; then
    for job in $(nomad status | grep running | awk '{print $1}' | grep --invert-match /)
    do
        # '|| true' so that if a job is garbage collected before we can remove it
        # the error doesn't interrupt our deploy.
        nomad stop -purge -detach $job > /dev/null || true &
    done
fi
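
# Nomad parameterized ("dispatch") job IDs contain a "/" -- they look
# like <job>/dispatch-<timestamp>-<id> -- which is why the grep above
# filters slashes out and the loop below filters for them.
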
# Wait to make sure that all base jobs are killed so no new jobs can
# be queued while we kill the parameterized Nomad jobs.
wait $(jobs -p)

# Kill parameterized Nomad jobs so no jobs will be running when we
# apply migrations.
echo "Killing dispatch jobs.. (this also takes a while..)"
if [[ $(nomad status) != "No running jobs" ]]; then
    counter=0
    for job in $(nomad status | awk '{print $1}' | grep /)
    do
        # Skip the header row for jobs.
        if [ $job != "ID" ]; then
            # '|| true' so that if a job is garbage collected before we can remove it
            # the error doesn't interrupt our deploy.
            nomad stop -purge -detach $job > /dev/null || true &
            counter=$((counter+1))
        fi

        # Wait for all the jobs to stop every 100 so we don't knock
        # over the deploy box if there are thousands.
        if [[ "$counter" -gt 100 ]]; then
            wait $(jobs -p)
            counter=0
        fi
    done
fi

# Wait for any remaining jobs to die.
wait $(jobs -p)

# Make sure that prod_env is empty since we only ever append to it.
# prod_env is a temporary file we use to pass environment variables to
# `docker run` commands when running migrations.
rm -f prod_env

# (cont'd from above) ..and we format the environment variables once
# again after the update has been applied.
format_environment_variables

# Get an image to run the migrations with.
docker pull $DOCKERHUB_REPO/$FOREMAN_DOCKER_IMAGE

# Migrate auth.
# Override database settings so we connect directly to the RDS instance.
docker run \
    --env-file prod_env \
    --env DATABASE_HOST=$RDS_HOST \
    --env DATABASE_PORT=$DATABASE_HIDDEN_PORT \
    --env RUNNING_IN_CLOUD=False \
    $DOCKERHUB_REPO/$FOREMAN_DOCKER_IMAGE python3 manage.py migrate auth

# Apply general migrations.
docker run \
    --env-file prod_env \
    --env DATABASE_HOST=$RDS_HOST \
    --env DATABASE_PORT=$DATABASE_HIDDEN_PORT \
    --env RUNNING_IN_CLOUD=False \
    $DOCKERHUB_REPO/$FOREMAN_DOCKER_IMAGE python3 manage.py migrate

# Template the environment variables for production into the Nomad job
# specs and API configs.
mkdir -p nomad-job-specs
../format_nomad_with_env.sh -p workers -e $env -o $(pwd)/nomad-job-specs
../format_nomad_with_env.sh -p surveyor -e $env -o $(pwd)/nomad-job-specs

# The API and foreman aren't run as Nomad jobs, but the templater still works.
../format_nomad_with_env.sh -p foreman -e $env -o $(pwd)/foreman-configuration
../format_nomad_with_env.sh -p api -e $env -o $(pwd)/api-configuration/

# Don't leave secrets lying around!
rm -f prod_env

# Re-register Nomad jobs.
echo "Registering new job specifications.."
nomad_job_specs=nomad-job-specs/*
for nomad_job_spec in $nomad_job_specs; do
    nomad run $nomad_job_spec &
done
echo "Job registrations have been fired off."

# Ensure the latest image version is being used for the Foreman
terraform taint aws_instance.foreman_server_1
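
# `terraform taint` marks the instance as tainted so the next
# `terraform apply` destroys and recreates it, meaning the foreman
# boots fresh with the image tagged with the new version.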

# Remove the ingress config so the next `terraform apply` will remove
# access for Circle.
echo "Removing ingress.."
rm ci_ingress.tf

if [[ ! -z $CIRCLE_BUILD_NUM ]]; then
    # Make sure we can't expose secrets in circleci.
    terraform apply -var-file=environments/$env.tfvars -auto-approve > /dev/null
else
    terraform apply -var-file=environments/$env.tfvars -auto-approve
fi

# We try to avoid rebuilding the API server because we can only run
# certbot 5 times a week. Therefore we pull the newest image and
# restart the API this way rather than by tainting the server like we
# do for the foreman.
chmod 600 data-refinery-key.pem
API_IP_ADDRESS=$(terraform output -json api_server_1_ip | jq -c '.value' | tr -d '"')
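
# `terraform output -json api_server_1_ip` prints a JSON object along
# the lines of {"sensitive":false,"type":"string","value":"10.0.0.1"}
# (address hypothetical; the exact shape varies by Terraform version),
# so jq extracts the value and tr strips the surrounding quotes.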

# To check whether the docker container needs to be stopped before it
# can be started, grep the `docker ps` output for the container's name.
# If it's not found, grep returns a non-zero exit code, so fall back to
# an empty string in that case.
container_running=$(ssh -o StrictHostKeyChecking=no \
    -o ServerAliveInterval=15 \
    -o ConnectTimeout=5 \
    -i data-refinery-key.pem \
    ubuntu@$API_IP_ADDRESS "docker ps -a" | grep dr_api || echo "")

# If $container_running is empty, the container isn't running, which
# means the instance is still spinning up. The container will be started
# by the API's init script, so there's no need to do anything more.
# However, if $container_running isn't empty then we need to stop and
# restart it. (Note that `docker ps -a` also lists stopped containers;
# the `docker rm -f` below handles either state.)
if [[ ! -z $container_running ]]; then
    echo "Restarting API with latest image."

    ssh -o StrictHostKeyChecking=no \
        -o ServerAliveInterval=15 \
        -o ConnectTimeout=5 \
        -i data-refinery-key.pem \
        ubuntu@$API_IP_ADDRESS "docker pull $DOCKERHUB_REPO/$API_DOCKER_IMAGE"

    ssh -o StrictHostKeyChecking=no \
        -o ServerAliveInterval=15 \
        -o ConnectTimeout=5 \
        -i data-refinery-key.pem \
        ubuntu@$API_IP_ADDRESS "docker rm -f dr_api"

    scp -o StrictHostKeyChecking=no \
        -o ServerAliveInterval=15 \
        -o ConnectTimeout=5 \
        -i data-refinery-key.pem \
        api-configuration/environment ubuntu@$API_IP_ADDRESS:/home/ubuntu/environment

    ssh -o StrictHostKeyChecking=no \
        -o ServerAliveInterval=15 \
        -o ConnectTimeout=5 \
        -i data-refinery-key.pem \
        ubuntu@$API_IP_ADDRESS "docker run \
            --env-file environment \
            -e DATABASE_HOST=$DATABASE_HOST \
            -e DATABASE_NAME=$DATABASE_NAME \
            -e DATABASE_USER=$DATABASE_USER \
            -e DATABASE_PASSWORD=$DATABASE_PASSWORD \
            -v /tmp/volumes_static:/tmp/www/static \
            --log-driver=awslogs \
            --log-opt awslogs-region=$REGION \
            --log-opt awslogs-group=data-refinery-log-group-$USER-$STAGE \
            --log-opt awslogs-stream=log-stream-api-$USER-$STAGE \
            -p 8081:8081 \
            --name=dr_api \
            -it -d $DOCKERHUB_REPO/$API_DOCKER_IMAGE /bin/sh -c /home/user/collect_and_run_uwsgi.sh"

    # Don't leave secrets lying around.
    ssh -o StrictHostKeyChecking=no \
        -o ServerAliveInterval=15 \
        -o ConnectTimeout=5 \
        -i data-refinery-key.pem \
        ubuntu@$API_IP_ADDRESS "rm -f environment"
fi

echo "Deploy completed successfully."