diff --git a/jobs/garden/monit b/jobs/garden/monit index 806a1416..5b9739d1 100644 --- a/jobs/garden/monit +++ b/jobs/garden/monit @@ -20,11 +20,3 @@ check process garden group vcap <% end %> - -check process garden-healthchecker - with pidfile /var/vcap/sys/run/bpm/garden/garden-healthchecker.pid - start program "/var/vcap/jobs/bpm/bin/bpm start garden -p garden-healthchecker" - stop program "/var/vcap/jobs/bpm/bin/bpm stop garden -p garden-healthchecker" - if 1 restarts within 1 cycles then exec "/var/vcap/packages/garden-runc-healthchecker/bin/restart-monit-job garden <%= p('healthchecker.failure_counter_file') %>" - depends on garden - group vcap diff --git a/jobs/garden/spec b/jobs/garden/spec index 125909ed..0d2a8fb4 100644 --- a/jobs/garden/spec +++ b/jobs/garden/spec @@ -23,7 +23,6 @@ templates: bin/containerd_utils.erb: bin/containerd_utils bin/pre-start: bin/pre-start bin/post-start: bin/post-start - config/healthchecker.yml.erb: config/healthchecker.yml packages: - guardian @@ -309,7 +308,3 @@ properties: logging.format.timestamp: description: "Format for timestamp in component logs. Valid values are 'unix-epoch' and 'rfc3339'." default: "unix-epoch" - - healthchecker.failure_counter_file: - description: "File used by the healthchecker to monitor consecutive failures." - default: /var/vcap/data/garden/counters/consecutive_healthchecker_failures.count diff --git a/jobs/garden/templates/bin/restart-garden.erb b/jobs/garden/templates/bin/restart-garden.erb deleted file mode 100644 index f7df11db..00000000 --- a/jobs/garden/templates/bin/restart-garden.erb +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -PIDFILE="/var/vcap/sys/run/garden/restart-garden.pid" -FAILURE_COUNTER_FILE="<%= p("healthchecker.failure_counter_file") %>" - -# As this script might run longer than a monit cycle (10s) and thus might be -# triggered several times, it must be ensured that it runs only once. -[[ -s "$PIDFILE" ]] && exit - -function on_exit { - rm -f $PIDFILE -} - -trap on_exit EXIT - -echo "$BASHPID" > "$PIDFILE" - -LOGFILE="/var/vcap/sys/log/garden/restart-garden.log" -echo "$(date) - pid: $BASHPID - Monit triggered restart" >> "$LOGFILE" - -failure_counter="$(cat ${FAILURE_COUNTER_FILE})" - -if (( failure_counter < 10 )); then - /var/vcap/bosh/bin/monit restart garden - sleep 1 - echo "$(date) - pid: $BASHPID - Waiting for garden to be restarted" >> "$LOGFILE" - - until /var/vcap/bosh/bin/monit summary | grep garden | grep -v healthchecker | grep running; do - sleep 1 - done - /var/vcap/bosh/bin/monit reload garden-healthchecker - echo "$(date) - pid: $BASHPID - garden was restarted" >> "$LOGFILE" -else - echo "$(date) - pid: $BASHPID - 10 consecutive failures in a row. Stopping healthcheck to avoid constantly bringing down the main service." >> "${LOGFILE}" - /var/vcap/bosh/bin/monit unmonitor garden-healthchecker -fi diff --git a/jobs/garden/templates/config/bpm.yml.erb b/jobs/garden/templates/config/bpm.yml.erb index ac79716d..f67bc0b3 100644 --- a/jobs/garden/templates/config/bpm.yml.erb +++ b/jobs/garden/templates/config/bpm.yml.erb @@ -26,16 +26,3 @@ processes: shared: true writable: true <%- } } -%> - - name: garden-healthchecker - executable: /var/vcap/packages/garden-runc-healthchecker/bin/healthchecker - args: - - -c - - /var/vcap/jobs/garden/config/healthchecker.yml - additional_volumes: -<%- if p('garden.listen_network') == "unix" -%> - - path: /var/vcap/data/garden/garden.sock - writable: true - mount_only: true -<%- end -%> - - path: <%= File.dirname(p('healthchecker.failure_counter_file')) %> - writable: true diff --git a/jobs/garden/templates/config/healthchecker.yml.erb b/jobs/garden/templates/config/healthchecker.yml.erb deleted file mode 100644 index 7a7bd66b..00000000 --- a/jobs/garden/templates/config/healthchecker.yml.erb +++ /dev/null @@ -1,25 +0,0 @@ -<%= - -if p('garden.listen_network') == "unix" - health_endpoint = { - 'socket' => p('garden.listen_address'), - 'path' => '/ping', - } -else - ip, port = p('garden.listen_address').split(':') - health_endpoint = { - 'host' => ip, - 'port' => Integer(port), - 'path' => '/ping' - } -end - -config = { - 'component_name' => 'garden-healthchecker', - 'failure_counter_file' => p('healthchecker.failure_counter_file'), - 'log_level' => p('garden.log_level'), - 'healthcheck_endpoint' => health_endpoint, -} - -config.to_yaml --%> diff --git a/spec/jobs/garden_spec.rb b/spec/jobs/garden_spec.rb index 1eba2f05..8fe92266 100644 --- a/spec/jobs/garden_spec.rb +++ b/spec/jobs/garden_spec.rb @@ -91,18 +91,6 @@ expect(rendered_template['server']['cpu-entitlement-per-share']).to eql(0) end - context 'healthchecker' do - let(:template) { job.template('config/healthchecker.yml') } - let(:rendered_template) { YAML.load(template.render(properties)) } - - it 'parses out the correct health endpoint' do - expect(rendered_template['healthcheck_endpoint']).to eql({ - 'socket' => '/var/vcap/data/garden/garden.sock', - 'path' => '/ping', - }) - end - end - context 'cpu throttling' do context 'by default' do it 'sets the enable cpu throttling per share to false' do @@ -239,34 +227,18 @@ end context 'with a listen address' do - before do + it 'switches to a listen address and port' do properties.merge!( 'garden' => { 'listen_network' => 'tcp', 'listen_address' => '127.0.0.1:5555' } ) - end - - it 'switches to a listen address and port' do rendered_template = IniParse.parse(template.render(properties)) expect(rendered_template['server']['bind-ip']).to eql('127.0.0.1') expect(rendered_template['server']['bind-port']).to eql(5555) end - context 'healthchecker' do - let(:template) { job.template('config/healthchecker.yml') } - let(:rendered_template) { YAML.load(template.render(properties)) } - - it 'parses out the correct health endpoint' do - expect(rendered_template['healthcheck_endpoint']).to eql({ - 'port' => 5555, - 'host' => '127.0.0.1', - 'path' => '/ping', - }) - end - end - # it 'throws an exception if the ip is invalid' do # properties.merge!({ # 'garden' => {