Skip to content

Commit

Permalink
Improve incident handling (#97)
Browse files Browse the repository at this point in the history
* Allow to customize incident titles

* Post incident update when issue is resolved

This provides better feedback as it includes the resolution time
in the incident history.
  • Loading branch information
nijel authored May 1, 2020
1 parent a92622a commit a04f7ae
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 12 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ webhooks:
title: "{title}"
message: "{message}"
priority: 5
messages:
incident_outage: "{name} is unavailable"
incident_operational: "{name} is operational"
incident_performance: "{name} has degraded performance"
```
- **endpoints**, the configuration of the URL(s) that will be monitored.
Expand Down Expand Up @@ -97,6 +101,10 @@ webhooks:
- **webhooks**, generic webhooks to be notified about incident updates
- **url**, webhook URL, will be interpolated
- **params**, POST parameters, will be interpolated
- **messages**, customizes the text of generated events; any **endpoint** parameter can be used in interpolation
- **incident_outage**, title of incident in case of outage
- **incident_performance**, title of incident in case of performance issues
- **incident_operational**, title of incident in case service is operational

Each `expectation` has its own default incident status. It can be overridden by setting the `incident` property to any of the following values:
- `PARTIAL`
Expand Down
11 changes: 5 additions & 6 deletions cachet_url_monitor/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,20 +108,19 @@ def push_metrics(self, metric_id: int, latency_time_unit: str, elapsed_time_in_s
params = {'id': metric_id, 'value': value, 'timestamp': timestamp}
return requests.post(f"{self.url}/metrics/{metric_id}/points", params=params, headers=self.headers)

def push_incident(self, status_value: status.ComponentStatus, is_public_incident: bool, component_id: int,
def push_incident(self, status_value: status.ComponentStatus, is_public_incident: bool, component_id: int, title: str,
previous_incident_id=None, message=None):
"""If the component status has changed, we create a new incident (if this is the first time it becomes unstable)
or updates the existing incident once it becomes healthy again.
"""
if previous_incident_id and status_value == status.ComponentStatus.OPERATIONAL:
# If the incident already exists, it means it was unhealthy but now it's healthy again.
params = {'status': status.IncidentStatus.FIXED.value, 'visible': is_public_incident,
'component_id': component_id, 'component_status': status_value.value, 'notify': True}
# If the incident already exists, it means it was unhealthy but now it's healthy again, post update
params = {'status': status.IncidentStatus.FIXED.value, 'message': title}

return requests.put(f'{self.url}/incidents/{previous_incident_id}', params=params, headers=self.headers)
return requests.post(f'{self.url}/incidents/{previous_incident_id}/updates', params=params, headers=self.headers)
elif not previous_incident_id and status_value != status.ComponentStatus.OPERATIONAL:
# This is the first time the incident is being created.
params = {'name': 'URL unavailable', 'message': message,
params = {'name': title, 'message': message,
'status': status.IncidentStatus.INVESTIGATING.value,
'visible': is_public_incident, 'component_id': component_id, 'component_status': status_value.value,
'notify': True}
Expand Down
30 changes: 27 additions & 3 deletions cachet_url_monitor/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@
# same exact structure.
configuration_mandatory_fields = ['url', 'method', 'timeout', 'expectation', 'component_id', 'frequency']

# Maps each component status to the key of the message template that is used
# to build the incident title. Several statuses share the "incident_outage"
# template.
incident_title_map = {
ComponentStatus.UNKNOWN: "incident_outage",
ComponentStatus.OPERATIONAL: "incident_operational",
ComponentStatus.PERFORMANCE_ISSUES: "incident_performance",
ComponentStatus.PARTIAL_OUTAGE: "incident_outage",
ComponentStatus.MAJOR_OUTAGE: "incident_outage",
}
# Built-in message templates, used when the "messages" section of the
# configuration is absent or does not define a given key. Placeholders such as
# {name} are filled in from the endpoint configuration via str.format().
default_messages = {
"incident_outage": "{name} is unavailable",
"incident_operational": "{name} is operational",
"incident_performance": "{name} has degraded performance",
}


class Configuration(object):
"""Represents a configuration file, but it also includes the functionality
Expand Down Expand Up @@ -51,6 +64,7 @@ def __init__(self, config, endpoint_index: int, client: CachetClient, webhooks :
self.endpoint_index = endpoint_index
self.data = config
self.endpoint = self.data['endpoints'][endpoint_index]
self.messages = config.get("messages", default_messages)
self.client = client
self.webhooks = webhooks or []

Expand Down Expand Up @@ -99,6 +113,12 @@ def __init__(self, config, endpoint_index: int, client: CachetClient, webhooks :
for expectation in self.expectations:
self.logger.info('Registered expectation: %s' % (expectation,))

def get_incident_title(self):
    """Build the incident title that matches the current component status.

    The status is translated to a message key through ``incident_title_map``;
    the template for that key is taken from the configured messages, falling
    back to ``default_messages``, and is then interpolated with the endpoint
    parameters.

    :return: The formatted incident title.
    """
    message_key = incident_title_map[self.status]
    fallback_template = default_messages[message_key]
    title_template = self.messages.get(message_key, fallback_template)
    return title_template.format(**self.endpoint)

def get_action(self):
"""Retrieves the action list from the configuration. If it's empty, returns an empty list.
:return: The list of actions, which can be an empty list.
Expand All @@ -124,6 +144,10 @@ def validate(self):
len(self.endpoint['expectation']) == 0)):
configuration_errors.append('endpoint.expectation')

for key, message in self.messages.items():
if not isinstance(message, str):
configuration_errors.append(f'message.{key}')

if len(configuration_errors) > 0:
raise ConfigurationValidationError(
'Endpoint [%s] failed validation. Missing keys: %s' % (self.endpoint,
Expand Down Expand Up @@ -249,7 +273,7 @@ def trigger_webhooks(self):
message = self.message
title = f'{self.endpoint["name"]} unavailable'
for webhook in self.webhooks:
webhook_request = webhook.push_incident(title, message)
webhook_request = webhook.push_incident(self.get_incident_title(), self.message)
if webhook_request.ok:
self.logger.info(f'Webhook {webhook.url} triggered with {title}')
else:
Expand All @@ -262,7 +286,7 @@ def push_incident(self):
if not self.trigger_update:
return
if hasattr(self, 'incident_id') and self.status == st.ComponentStatus.OPERATIONAL:
incident_request = self.client.push_incident(self.status, self.public_incidents, self.component_id,
incident_request = self.client.push_incident(self.status, self.public_incidents, self.component_id, self.get_incident_title(),
previous_incident_id=self.incident_id)

if incident_request.ok:
Expand All @@ -276,7 +300,7 @@ def push_incident(self):

self.trigger_webhooks()
elif not hasattr(self, 'incident_id') and self.status != st.ComponentStatus.OPERATIONAL:
incident_request = self.client.push_incident(self.status, self.public_incidents, self.component_id,
incident_request = self.client.push_incident(self.status, self.public_incidents, self.component_id, self.get_incident_title(),
message=self.message)
if incident_request.ok:
# Successful incident upload.
Expand Down
2 changes: 1 addition & 1 deletion cachet_url_monitor/webhook.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(self, url: str, params: Dict[str, str]):
def push_incident(self, title: str, message: str):
format_args = {
"title": title,
"message": message,
"message": message or title,
}
# Interpolate URL and params
url = self.url.format(**format_args)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ def test_webhooks(webhooks_configuration, mock_logger, mock_client):
mock_client.push_incident.return_value = push_incident_response
with requests_mock.mock() as m:
m.get('http://localhost:8080/swagger', exc=requests.HTTPError)
m.post('https://push.example.com/foo%20unavailable', text='')
m.post('https://push.example.com/message?token=%3Capptoken%3E&title=foo+unavailable', text='')
m.post('https://push.example.com/foo%20is%20unavailable', text='')
m.post('https://push.example.com/message?token=%3Capptoken%3E&title=foo+is+unavailable', text='')
webhooks_configuration.evaluate()

assert webhooks_configuration.status == cachet_url_monitor.status.ComponentStatus.PARTIAL_OUTAGE, 'Component status set incorrectly'
Expand Down

0 comments on commit a04f7ae

Please sign in to comment.