Skip to content

Commit

Permalink
Merge pull request #6 from spreaker/export-prometheus-metrics
Browse files Browse the repository at this point in the history
Export prometheus metrics
  • Loading branch information
Marco Pracucci authored Apr 3, 2019
2 parents 743a134 + a4a7714 commit 076b3ef
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

### 1.1.0 (2019-04-02)
- Introduced a built-in Prometheus exporter

### 1.0.4 (2019-03-29)
- [BUGFIX] Fixed memory leak

Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,22 @@ The cli supports the following arguments:
| `--instances-region REGION [REGION ...]` | yes | AWS regions where EC2 instances should be checked |
| `--frequency N` | | How frequently the service should be reconciled (in seconds). Defaults to `300` sec |
| `--single-run` | | Run a single reconcile and then exit |
| `--enable-prometheus` | | Enable the Prometheus exporter. Disabled by default |
| `--prometheus-host` | | The host on which the Prometheus exporter should listen. Defaults to `127.0.0.1` |
| `--prometheus-port` | | The port on which the Prometheus exporter should listen. Defaults to `9100` |
| `--log-level LOG_LEVEL` | | Minimum log level. Accepted values are: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. Defaults to `INFO` |


## Exported metrics

The application features an integrated Prometheus exporter. The following metrics are exported:

| Metric name | Labels | Description |
| ---------------------------------------------------------- | ------------ | ----------- |
| `aws_cloud_unmap_up` | `service_id` | Always `1`: can be used to check if it's running |
| `aws_cloud_unmap_last_reconcile_success_timestamp_seconds` | `service_id` | The timestamp (in seconds) of the last successful reconciliation |


## Development

Run the development environment:
Expand Down
30 changes: 29 additions & 1 deletion cloudunmap/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@
from typing import List
from pythonjsonlogger import jsonlogger
from .unmap import unmapTerminatedInstancesFromService
from prometheus_client import start_http_server, Gauge


# Prometheus metrics
# Liveness gauge for the exporter: main() sets it to 1 once at startup, labelled
# with the service ID, so a scraper can tell whether the process is running.
upMetric = Gauge(
    "aws_cloud_unmap_up",
    "Always 1 - can be used to check if it's running",
    labelnames=["service_id"])

# Gauge holding the Unix time (whole seconds) of the most recent successful
# reconcile; reconcile() updates it only when the unmap call reports success.
lastReconcileTimestampMetric = Gauge(
    name="aws_cloud_unmap_last_reconcile_success_timestamp_seconds",
    documentation="The timestamp (in seconds) of the last successful reconciliation",
    labelnames=["service_id"],
)


def parseArguments(argv: List[str]):
Expand All @@ -16,6 +29,9 @@ def parseArguments(argv: List[str]):
parser.add_argument("--instances-region", metavar="REGION", required=True, nargs='+', help="AWS region where EC2 instances should be checked")
parser.add_argument("--frequency", metavar="N", required=False, type=int, default=300, help="How frequently the service should be reconciled (in seconds)")
parser.add_argument("--single-run", required=False, default=False, action="store_true", help="Run a single reconcile and then exit")
parser.add_argument("--enable-prometheus", required=False, default=False, action="store_true", help="Enable the Prometheus exporter")
parser.add_argument("--prometheus-host", required=False, default="127.0.0.1", help="The host at which the Prometheus exporter should listen to")
parser.add_argument("--prometheus-port", required=False, default="9100", type=int, help="The port at which the Prometheus exporter should listen to")
parser.add_argument("--log-level", help="Minimum log level. Accepted values are: DEBUG, INFO, WARNING, ERROR, CRITICAL", default="INFO")

return parser.parse_args(argv)
Expand All @@ -25,9 +41,13 @@ def reconcile(serviceId: str, serviceRegion: str, instancesRegion: List[str]):
logger = logging.getLogger()

try:
unmapTerminatedInstancesFromService(serviceId, serviceRegion, instancesRegion)
success = unmapTerminatedInstancesFromService(serviceId, serviceRegion, instancesRegion)
except Exception as error:
logger.error(f"An error occurred while reconciling service {serviceId}: {str(error)}")
success = False

if success:
lastReconcileTimestampMetric.labels(serviceId).set(int(time.time()))


def main(args):
Expand All @@ -51,6 +71,14 @@ def _on_sigterm(signal, frame):
signal.signal(signal.SIGINT, _on_sigterm)
signal.signal(signal.SIGTERM, _on_sigterm)

# Start Prometheus exporter
if args.enable_prometheus:
start_http_server(args.prometheus_port, args.prometheus_host)
logger.info("Prometheus exporter listening on {host}:{port}".format(port=args.prometheus_port, host=args.prometheus_host))

# Set the up metric value, which will be steady to 1 for the entire app lifecycle
upMetric.labels(args.service_id).set(1)

# Reconcile
if args.single_run:
reconcile(args.service_id, args.service_region, args.instances_region)
Expand Down
5 changes: 3 additions & 2 deletions cloudunmap/unmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def matchServiceInstanceInRunningInstances(serviceInstance, runningInstances):
return False


def unmapTerminatedInstancesFromService(serviceId: str, serviceRegion: str, instancesRegions: List[str]):
def unmapTerminatedInstancesFromService(serviceId: str, serviceRegion: str, instancesRegions: List[str]) -> bool:
logger = logging.getLogger()
logger.info(f"Checking EC2 instances registered to service {serviceId} in {serviceRegion}")

Expand Down Expand Up @@ -63,7 +63,7 @@ def unmapTerminatedInstancesFromService(serviceId: str, serviceRegion: str, inst
# from the service
if len(unmatchingInstances) >= len(serviceInstances):
logger.warning(f"All instances registered to service {serviceId} appear to not match any running EC2 instance in {instancesRegions}, but skipping deregistering as safe protection")
return
return False

# Remove all unmatching instances from the service
logger.info(f"Found {len(unmatchingInstances)} instances in service {serviceId} not matching any running EC2 instance in {instancesRegions}")
Expand All @@ -73,3 +73,4 @@ def unmapTerminatedInstancesFromService(serviceId: str, serviceRegion: str, inst
sdClient.deregister_instance(ServiceId=serviceId, InstanceId=unmatchingInstance["Id"])

logger.info(f"Checked EC2 instances registered to service {serviceId} in {serviceRegion}")
return True
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
keywords = ['aws', 'cloud map'],
classifiers = [],
python_requires = ' >= 3',
install_requires = ["boto3==1.9.123", "python-json-logger==0.1.10"],
install_requires = ["boto3==1.9.123", "python-json-logger==0.1.10", "prometheus_client==0.6.0"],
extras_require = {
'dev': [
'flake8==3.7.7',
Expand Down
33 changes: 33 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import unittest
import boto3
import time
from unittest.mock import patch
from botocore.stub import Stubber
from cloudunmap.cli import main, parseArguments
from .mocks import mockBotoClient, mockServiceInstance, mockEC2Instance
from prometheus_client.registry import REGISTRY as prometheusDefaultRegistry


class TestCli(unittest.TestCase):
Expand Down Expand Up @@ -45,6 +47,10 @@ def testMainShouldReconcileService(self):
with patch("boto3.client", side_effect=self.botoClientMock):
main(parseArguments(["--service-id", "srv-1", "--service-region", "eu-west-1", "--instances-region", "eu-west-1", "--single-run"]))

# Check exported metrics
self.assertEqual(prometheusDefaultRegistry.get_sample_value("aws_cloud_unmap_up", labels={"service_id": "srv-1"}), 1)
self.assertAlmostEqual(prometheusDefaultRegistry.get_sample_value("aws_cloud_unmap_last_reconcile_success_timestamp_seconds", labels={"service_id": "srv-1"}), time.time(), delta=2)

self.ec2Stubber.assert_no_pending_responses()
self.sdStubber.assert_no_pending_responses()

Expand All @@ -55,5 +61,32 @@ def testMainShouldGracefullyHandleAnErrorWhileCallingAwsAPI(self):
with patch("boto3.client", side_effect=self.botoClientMock):
main(parseArguments(["--service-id", "srv-1", "--service-region", "eu-west-1", "--instances-region", "eu-west-1", "--single-run"]))

# Check exported metrics
self.assertEqual(prometheusDefaultRegistry.get_sample_value("aws_cloud_unmap_up", labels={"service_id": "srv-1"}), 1)
self.assertIsNone(prometheusDefaultRegistry.get_sample_value("aws_cloud_unmap_last_reconcile_success_timestamp_seconds", labels={"service_id": "srv-1"}))

self.ec2Stubber.assert_no_pending_responses()
self.sdStubber.assert_no_pending_responses()

def testMainShouldDoNotDeregisterInstancesIfAllRegisteredInstancesWouldBeDeregistered(self):
# Scenario: the service lists two registered instances but EC2 reports none of
# them, so every registered instance would be unmapped. The run must not be
# recorded as a successful reconcile.
#
# Mock Cloud Map client: only list_instances is stubbed (no deregister_instance
# response is queued), returning two registered instances.
self.sdStubber.add_response(
"list_instances",
{"Instances": [mockServiceInstance("i-1", "172.0.0.1"), mockServiceInstance("i-2", "2.2.2.2")]},
{"ServiceId": "srv-1", "MaxResults": 100})

# Mock EC2 client: no reservations are returned for the two instance IDs
self.ec2Stubber.add_response(
"describe_instances",
{"Reservations": []},
{"Filters": [{"Name": "instance-id", "Values": ["i-1", "i-2"]}], "MaxResults": 1000})

with patch("boto3.client", side_effect=self.botoClientMock):
main(parseArguments(["--service-id", "srv-1", "--service-region", "eu-west-1", "--instances-region", "eu-west-1", "--single-run"]))

# Check exported metrics: the up gauge is set, but no success timestamp sample exists.
# NOTE(review): the gauges are module-level objects registered in the default
# registry, so the assertIsNone below only holds if no earlier test in the same
# process recorded a successful reconcile for "srv-1" - confirm test ordering/isolation.
self.assertEqual(prometheusDefaultRegistry.get_sample_value("aws_cloud_unmap_up", labels={"service_id": "srv-1"}), 1)
self.assertIsNone(prometheusDefaultRegistry.get_sample_value("aws_cloud_unmap_last_reconcile_success_timestamp_seconds", labels={"service_id": "srv-1"}))

# Every stubbed response must have been consumed by the run
self.ec2Stubber.assert_no_pending_responses()
self.sdStubber.assert_no_pending_responses()

0 comments on commit 076b3ef

Please sign in to comment.