Skip to content

Commit

Permalink
#72 - Adding the ability to control incident status
Browse files Browse the repository at this point in the history
  • Loading branch information
mtakaki committed Oct 25, 2019
1 parent 9e5d42f commit e132f86
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 45 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ endpoint:
expectation:
- type: HTTP_STATUS
status_range: 200-300
incident: MAJOR
- type: LATENCY
threshold: 1
- type: REGEX
Expand Down Expand Up @@ -65,6 +66,19 @@ frequency: 30
- **latency_unit**, the latency unit used when reporting the metrics. It will automatically convert to the specified unit. It's not mandatory and it will default to **seconds**. Available units: `ms`, `s`, `m`, `h`.
- **frequency**, how often we'll send a request to the given URL. The unit is in seconds.

Each `expectation` has its own default incident status. It can be overridden by setting the `incident` property to any of the following values:
- `PARTIAL`
- `MAJOR`
- `PERFORMANCE`

Choosing one of these statuses lets you control what kind of incident a failed expectation is reported as. These are the default incident statuses for each `expectation` type:

| Expectation | Incident status |
| ----------- | --------------- |
| HTTP_STATUS | PARTIAL |
| LATENCY | PERFORMANCE |
| REGEX | PARTIAL |

## Setting up

The application should be installed using **virtualenv**, through the following command:
Expand Down
97 changes: 58 additions & 39 deletions cachet_url_monitor/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(self, component_id):
self.component_id = component_id

def __str__(self):
return repr('Component with id [%d] does not exist.' % (self.component_id,))
return repr(f'Component with id [{self.component_id}] does not exist.')


class MetricNonexistentError(Exception):
Expand All @@ -49,15 +49,15 @@ def __init__(self, metric_id):
self.metric_id = metric_id

def __str__(self):
return repr('Metric with id [%d] does not exist.' % (self.metric_id,))
return repr(f'Metric with id [{self.metric_id}] does not exist.')


def get_current_status(endpoint_url, component_id, headers):
"""Retrieves the current status of the component that is being monitored. It will fail if the component does
not exist or doesn't respond with the expected data.
:return component status.
"""
get_status_request = requests.get('%s/components/%s' % (endpoint_url, component_id), headers=headers)
get_status_request = requests.get(f'{endpoint_url}/components/{component_id}', headers=headers)

if get_status_request.ok:
# The component exists.
Expand All @@ -69,7 +69,7 @@ def get_current_status(endpoint_url, component_id, headers):
def normalize_url(url):
"""If passed url doesn't include schema return it with default one - http."""
if not url.lower().startswith('http'):
return 'http://%s' % url
return f'http://{url}'
return url


Expand Down Expand Up @@ -120,7 +120,7 @@ def __init__(self, config_file):
os.environ.get('CACHET_PUBLIC_INCIDENTS') or self.data['cachet']['public_incidents'])

self.logger.info('Monitoring URL: %s %s' % (self.endpoint_method, self.endpoint_url))
self.expectations = [Expectaction.create(expectation) for expectation in self.data['endpoint']['expectation']]
self.expectations = [Expectation.create(expectation) for expectation in self.data['endpoint']['expectation']]
for expectation in self.expectations:
self.logger.info('Registered expectation: %s' % (expectation,))

Expand Down Expand Up @@ -157,16 +157,15 @@ def validate(self):
configuration_errors.append('%s.%s' % (key, sub_key))

if ('endpoint' in self.data and 'expectation' in
self.data['endpoint']):
self.data['endpoint']):
if (not isinstance(self.data['endpoint']['expectation'], list) or
(isinstance(self.data['endpoint']['expectation'], list) and
len(self.data['endpoint']['expectation']) == 0)):
len(self.data['endpoint']['expectation']) == 0)):
configuration_errors.append('endpoint.expectation')

if len(configuration_errors) > 0:
raise ConfigurationValidationError(
'Config file [%s] failed validation. Missing keys: %s' % (self.config_file,
', '.join(configuration_errors)))
f"Config file [{self.config_file}] failed validation. Missing keys: {', '.join(configuration_errors)}")

def evaluate(self):
"""Sends the request to the URL set in the configuration and executes
Expand All @@ -175,9 +174,10 @@ def evaluate(self):
"""
try:
if self.endpoint_header is not None:
self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout, headers=self.endpoint_header)
self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout,
headers=self.endpoint_header)
else:
self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout)
self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout)
self.current_timestamp = int(time.time())
except requests.ConnectionError:
self.message = 'The URL is unreachable: %s %s' % (self.endpoint_method, self.endpoint_url)
Expand Down Expand Up @@ -208,7 +208,7 @@ def evaluate(self):
self.logger.info(self.message)

def print_out(self):
self.logger.info('Current configuration:\n%s' % (self.__repr__()))
self.logger.info(f'Current configuration:\n{self.__repr__()}')

def __repr__(self):
temporary_data = copy.deepcopy(self.data)
Expand All @@ -224,7 +224,7 @@ def if_trigger_update(self):

if self.status != 1:
self.current_fails = self.current_fails + 1
self.logger.info('Failure #%s with threshold set to %s' % (self.current_fails, self.allowed_fails))
self.logger.warning(f'Failure #{self.current_fails} with threshold set to {self.allowed_fails}')
if self.current_fails <= self.allowed_fails:
self.trigger_update = False
return
Expand Down Expand Up @@ -276,8 +276,7 @@ def push_metrics(self):
# Successful metrics upload
self.logger.info('Metric uploaded: %.6f %s' % (value, self.latency_unit))
else:
self.logger.warning('Metric upload failed with status [%d]' %
(metrics_request.status_code,))
self.logger.warning(f'Metric upload failed with status [{metrics_request.status_code}]')

def push_incident(self):
"""If the component status has changed, we create a new incident (if this is the first time it becomes unstable)
Expand All @@ -291,36 +290,33 @@ def push_incident(self):
'component_status': self.status,
'notify': True}

incident_request = requests.put('%s/incidents/%d' % (self.api_url, self.incident_id), params=params,
incident_request = requests.put(f'{self.api_url}/incidents/{self.incident_id}', params=params,
headers=self.headers)
if incident_request.ok:
# Successful incident update
self.logger.info(
'Incident updated, API healthy again: component status [%d], message: "%s"' % (
self.status, self.message))
f'Incident updated, API healthy again: component status [{self.status}], message: "{self.message}"')
del self.incident_id
else:
self.logger.warning('Incident update failed with status [%d], message: "%s"' % (
incident_request.status_code, self.message))
self.logger.warning(
f'Incident update failed with status [{incident_request.status_code}], message: "{self.message}"')
elif not hasattr(self, 'incident_id') and self.status != st.COMPONENT_STATUS_OPERATIONAL:
# This is the first time the incident is being created.
params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': self.public_incidents,
'component_id': self.component_id, 'component_status': self.status, 'notify': True}
incident_request = requests.post('%s/incidents' % (self.api_url,), params=params, headers=self.headers)
incident_request = requests.post(f'{self.api_url}/incidents', params=params, headers=self.headers)
if incident_request.ok:
# Successful incident upload.
self.incident_id = incident_request.json()['data']['id']
self.logger.info(
'Incident uploaded, API unhealthy: component status [%d], message: "%s"' % (
self.status, self.message))
f'Incident uploaded, API unhealthy: component status [{self.status}], message: "{self.message}"')
else:
self.logger.warning(
'Incident upload failed with status [%d], message: "%s"' % (
incident_request.status_code, self.message))
f'Incident upload failed with status [{incident_request.status_code}], message: "{self.message}"')


class Expectaction(object):
"""Base class for URL result expectations. Any new excpectation should extend
class Expectation(object):
"""Base class for URL result expectations. Any new expectation should extend
this class and the name added to create() method.
"""

Expand All @@ -329,13 +325,17 @@ def create(configuration):
"""Creates the expectation that matches the `type` set in the given
configuration.
"""
# If a new expectation is created, this is where we need to add it.
expectations = {
'HTTP_STATUS': HttpStatus,
'LATENCY': Latency,
'REGEX': Regex
}
return expectations.get(configuration['type'])(configuration)

def __init__(self, configuration):
self.incident_status = self.parse_incident_status(configuration)

@abc.abstractmethod
def get_status(self, response):
"""Returns the status of the API, following cachet's component status
Expand All @@ -346,43 +346,58 @@ def get_status(self, response):
def get_message(self, response):
"""Gets the error message."""

@abc.abstractmethod
def get_default_incident(self):
"""Returns the default status when this incident happens."""

def parse_incident_status(self, configuration):
return st.INCIDENT_MAP.get(configuration.get('incident', None), self.get_default_incident())

class HttpStatus(Expectaction):

class HttpStatus(Expectation):
def __init__(self, configuration):
self.status_range = HttpStatus.parse_range(configuration['status_range'])
super(HttpStatus, self).__init__(configuration)

@staticmethod
def parse_range(range_string):
statuses = range_string.split("-")
if len(statuses) == 1:
# When there was no range given, we should treat the first number as a single status check.
return (int(statuses[0]), int(statuses[0]) + 1)
return int(statuses[0]), int(statuses[0]) + 1
else:
# We shouldn't look into more than one value, as this is a range value.
return (int(statuses[0]), int(statuses[1]))
return int(statuses[0]), int(statuses[1])

def get_status(self, response):
if response.status_code >= self.status_range[0] and response.status_code < self.status_range[1]:
if self.status_range[0] <= response.status_code < self.status_range[1]:
return st.COMPONENT_STATUS_OPERATIONAL
else:
return st.COMPONENT_STATUS_PARTIAL_OUTAGE
return self.incident_status

def get_default_incident(self):
return st.COMPONENT_STATUS_PARTIAL_OUTAGE

def get_message(self, response):
return 'Unexpected HTTP status (%s)' % (response.status_code,)
return f'Unexpected HTTP status ({response.status_code})'

def __str__(self):
return repr('HTTP status range: %s' % (self.status_range,))
return repr(f'HTTP status range: {self.status_range}')


class Latency(Expectaction):
class Latency(Expectation):
def __init__(self, configuration):
self.threshold = configuration['threshold']
super(Latency, self).__init__(configuration)

def get_status(self, response):
if response.elapsed.total_seconds() <= self.threshold:
return st.COMPONENT_STATUS_OPERATIONAL
else:
return st.COMPONENT_STATUS_PERFORMANCE_ISSUES
return self.incident_status

def get_default_incident(self):
return st.COMPONENT_STATUS_PERFORMANCE_ISSUES

def get_message(self, response):
return 'Latency above threshold: %.4f seconds' % (response.elapsed.total_seconds(),)
Expand All @@ -391,19 +406,23 @@ def __str__(self):
return repr('Latency threshold: %.4f seconds' % (self.threshold,))


class Regex(Expectaction):
class Regex(Expectation):
def __init__(self, configuration):
self.regex_string = configuration['regex']
self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL)
super(Regex, self).__init__(configuration)

def get_status(self, response):
if self.regex.match(response.text):
return st.COMPONENT_STATUS_OPERATIONAL
else:
return st.COMPONENT_STATUS_PARTIAL_OUTAGE
return self.incident_status

def get_default_incident(self):
return st.COMPONENT_STATUS_PARTIAL_OUTAGE

def get_message(self, response):
return 'Regex did not match anything in the body'

def __str__(self):
return repr('Regex: %s' % (self.regex_string,))
return repr(f'Regex: {self.regex_string}')
15 changes: 12 additions & 3 deletions cachet_url_monitor/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,21 @@
These are all constants and are coupled to cachet's API configuration.
"""


COMPONENT_STATUS_OPERATIONAL = 1
COMPONENT_STATUS_PERFORMANCE_ISSUES = 2
COMPONENT_STATUS_PARTIAL_OUTAGE = 3
COMPONENT_STATUS_MAJOR_OUTAGE = 4

COMPONENT_STATUSES = [COMPONENT_STATUS_OPERATIONAL,
COMPONENT_STATUS_PERFORMANCE_ISSUES, COMPONENT_STATUS_PARTIAL_OUTAGE,
COMPONENT_STATUS_MAJOR_OUTAGE]
COMPONENT_STATUS_PERFORMANCE_ISSUES, COMPONENT_STATUS_PARTIAL_OUTAGE,
COMPONENT_STATUS_MAJOR_OUTAGE]

INCIDENT_PARTIAL = 'PARTIAL'
INCIDENT_MAJOR = 'MAJOR'
INCIDENT_PERFORMANCE = 'PERFORMANCE'

INCIDENT_MAP = {
INCIDENT_PARTIAL: COMPONENT_STATUS_PARTIAL_OUTAGE,
INCIDENT_MAJOR: COMPONENT_STATUS_MAJOR_OUTAGE,
INCIDENT_PERFORMANCE: COMPONENT_STATUS_PERFORMANCE_ISSUES,
}
1 change: 1 addition & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ endpoint:
expectation:
- type: HTTP_STATUS
status_range: 200-300
incident: MAJOR
- type: LATENCY
threshold: 1
- type: REGEX
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from setuptools import setup

setup(name='cachet-url-monitor',
version='1.4',
version='1.5',
description='Cachet URL monitor plugin',
author='Mitsuo Takaki',
author_email='[email protected]',
Expand Down
4 changes: 2 additions & 2 deletions tests/test_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ def request(method, url, headers, timeout=None):
sys.modules['requests'].request = request
self.configuration.evaluate()

self.assertEqual(self.configuration.status, cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE,
'Component status set incorrectly')
self.assertEqual(self.configuration.status, cachet_url_monitor.status.COMPONENT_STATUS_MAJOR_OUTAGE,
'Component status set incorrectly or custom incident status is incorrectly parsed')

def test_evaluate_with_timeout(self):
def request(method, url, headers, timeout=None):
Expand Down

0 comments on commit e132f86

Please sign in to comment.