forked from skyscrapers/monitoring-plugins
-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_couchdb_replications.py
executable file
·125 lines (103 loc) · 4 KB
/
check_couchdb_replications.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
'''Check to make sure that all the replications on the local CouchDB server
are actually running.'''
import argparse
from datetime import datetime, timedelta
from operator import itemgetter
import requests
import sys
import tempfile
import base64
# Nagios status codes, listed in ascending numeric order
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3
def _source_sort_key(rep):
    '''Return a string sort key for a replication document's source.'''
    source = rep['source']
    if isinstance(source, dict):
        # The source may be given as an object such as {"url": ...} instead
        # of a plain string; dicts cannot be ordered on Python 3.
        return source.get('url') or ''
    return source


def main(host, replicator, auth, age_timeout):
    '''Check every replication document against the server's active tasks.

    Fetches all documents of the replicator database and the server's list
    of active tasks, merges each matching task into its replication
    document, and prints a single Nagios-style status line.

    Args:
        host: Base URL of the CouchDB server, e.g. 'http://localhost:5984'.
        replicator: Path of the replicator database, e.g. '/_replicator'.
        auth: Optional 'username:password' string sent as HTTP Basic
            authentication; a falsy value sends no Authorization header.
        age_timeout: Number of seconds (any value int() accepts) after which
            a replication without a recent update is reported as stale.

    Returns:
        OK if no problem was found, WARNING if a replication is stale,
        CRITICAL if a replication is not in the 'triggered' state, and
        UNKNOWN if the server could not be queried.

    Raises:
        ValueError: If age_timeout cannot be converted to an integer.
        Errors raised while decoding the JSON responses are not caught here.
    '''
    headers = {
        'Content-type': 'application/json'
    }
    if auth:
        # b64encode requires bytes on Python 3; a Python 2 str already is.
        if not isinstance(auth, bytes):
            auth = auth.encode('utf-8')
        token = base64.b64encode(auth)
        if not isinstance(token, str):
            token = token.decode('ascii')
        headers['Authorization'] = 'Basic %s' % token

    # Query the server and parse into objects
    try:
        replicator_url = host + replicator + '/_all_docs?include_docs=true'
        replications = requests.get(replicator_url, headers=headers)
        active_tasks = requests.get(host + '/_active_tasks', headers=headers)
    except Exception:
        # Deliberately broad: any failure to reach the server is UNKNOWN.
        print('REPLICATION STATUS UNKNOWN - Error connecting to server')
        return UNKNOWN

    # Report the status code of whichever request actually failed.
    for response in (replications, active_tasks):
        if response.status_code >= 400:
            print('REPLICATION STATUS UNKNOWN - Error connecting to server '
                  '(HTTP %i)' % response.status_code)
            return UNKNOWN

    # Only documents with a 'source' field are treated as replications.
    docs = [row['doc'] for row in replications.json()['rows']]
    reps = sorted((doc for doc in docs if 'source' in doc),
                  key=_source_sort_key)

    # Index the active tasks by replication id. Only the part before the
    # first '+' is kept so the key can be looked up with a document's
    # '_replication_id'.
    tasks = {}
    for task in active_tasks.json():
        rep_id = task.get('replication_id', '')
        if not rep_id:
            # Tasks without a replication id cannot match any document.
            continue
        tasks[rep_id.split('+')[0]] = task

    # Merge and check
    status = OK
    problems = []
    now = datetime.now()
    max_age = timedelta(seconds=int(age_timeout))
    for rep in reps:
        task = tasks.get(rep.get('_replication_id'))
        if task:
            rep.update(task)

        doc_id = rep['_id']
        rep_state = rep.get('_replication_state', 'N/A')
        reason = rep.get('_replication_state_reason', '')
        # Without 'updated_on' the age is measured from the epoch, which
        # always exceeds the timeout and is reported as 'infinity'.
        updated = rep.get('updated_on', 0)
        age = now - datetime.fromtimestamp(updated)
        rep_problems = []

        # Check that all replications are in the triggered state
        if rep_state != 'triggered':
            state = rep_state
            if reason:
                state += ', ' + reason
            rep_problems.append('state: %s' % state)
            status = CRITICAL

        # Check that all replications have updated recently
        if age > max_age:
            age_text = str(age) if updated > 0 else 'infinity'
            rep_problems.append('age: %s' % age_text)
            # Never downgrade a CRITICAL set by an earlier replication.
            status = max(status, WARNING)

        if rep_problems:
            problems.append('%s (%s)' % (doc_id, '; '.join(rep_problems)))

    # Build the output string
    labels = {OK: 'OK', WARNING: 'WARNING', CRITICAL: 'CRITICAL'}
    output = 'REPLICATION STATUS %s - %i replications, %i problems' % (
        labels.get(status, 'UNKNOWN'), len(reps), len(problems))
    if problems:
        output += ': ' + ', '.join(problems)
    print(output)
    return status
if __name__ == '__main__':
    # Command-line entry point: parse the options, run the check and exit
    # with the resulting Nagios status code.
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', '-s', default='http://localhost:5984')
    parser.add_argument('--replicator', '-d', default='/_replicator')
    parser.add_argument('--age-timeout', '-t', default=120,
                        help='Warning time for stale replications, in s')
    parser.add_argument('--auth', '-a',
                        help='Basic HTTP authentication string as "username:password"')
    args = parser.parse_args()

    try:
        status = main(args.host, args.replicator, args.auth, args.age_timeout)
    except Exception as e:
        # Any unexpected failure is still reported as a one-line UNKNOWN
        # result. The parenthesised single-argument print works on both
        # Python 2 and Python 3.
        print('REPLICATION STATUS UNKNOWN - Python exception %s' % str(e))
        status = UNKNOWN
    sys.exit(status)