diff --git a/envoy/changelog.d/16718.added b/envoy/changelog.d/16718.added new file mode 100644 index 0000000000000..43a4f0f3f5ebd --- /dev/null +++ b/envoy/changelog.d/16718.added @@ -0,0 +1 @@ +Add connection limit metrics for envoy diff --git a/envoy/datadog_checks/envoy/check.py b/envoy/datadog_checks/envoy/check.py index 656569ca27535..dbc3576ca20b3 100644 --- a/envoy/datadog_checks/envoy/check.py +++ b/envoy/datadog_checks/envoy/check.py @@ -17,6 +17,7 @@ 'cluster_name': 'envoy_cluster', 'envoy_cluster_name': 'envoy_cluster', 'envoy_local_http_ratelimit_prefix': 'stat_prefix', # local rate limit + 'envoy_connection_limit_prefix': 'stat_prefix', # connection limit 'envoy_http_conn_manager_prefix': 'stat_prefix', # tracing 'envoy_listener_address': 'address', # listener 'envoy_virtual_cluster': 'virtual_envoy_cluster', # vhost @@ -90,6 +91,16 @@ 'metric_type': 'monotonic_count', 'new_name': 'listener.downstream_cx.count', }, + r'envoy_connection_limit_(.+)_active_connections$': { + 'label_name': 'stat_prefix', + 'metric_type': 'gauge', + 'new_name': 'connection_limit.active_connections', + }, + r'envoy_connection_limit_(.+)_limited_connections$': { + 'label_name': 'stat_prefix', + 'metric_type': 'monotonic_count', + 'new_name': 'connection_limit.limited_connections.count', + }, r'envoy_(.+)_http_local_rate_limit_enabled$': { 'label_name': 'stat_prefix', 'metric_type': 'monotonic_count', diff --git a/envoy/datadog_checks/envoy/metrics.py b/envoy/datadog_checks/envoy/metrics.py index a3a949989ca24..1eee9e9fb68b8 100644 --- a/envoy/datadog_checks/envoy/metrics.py +++ b/envoy/datadog_checks/envoy/metrics.py @@ -383,6 +383,8 @@ 'envoy_cluster_client_ssl_socket_factory_ssl_context_update_by_sds': 'cluster.client_ssl_socket_factory.ssl_context_update_by_sds', # noqa: E501 'envoy_cluster_client_ssl_socket_factory_upstream_context_secrets_not_ready': 'cluster.client_ssl_socket_factory.upstream_context_secrets_not_ready', # noqa: E501 
'envoy_cluster_client_ssl_socket_factory_downstream_context_secrets_not_ready': 'cluster.client_ssl_socket_factory.downstream_context_secrets_not_ready', # noqa: E501 + 'envoy_connection_limit_active_connections': 'connection_limit.active_connections', + 'envoy_connection_limit_limited_connections': 'connection_limit.limited_connections', } # fmt: off @@ -3933,6 +3935,20 @@ ), 'method': 'monotonic_count', }, + 'connection_limit.active_connections': { + 'tags': ( + ('stat_prefix',), + (), + ), + 'method': 'gauge', + }, + 'connection_limit.limited_connections': { + 'tags': ( + ('stat_prefix',), + (), + ), + 'method': 'monotonic_count', + }, } # fmt: on diff --git a/envoy/metadata.csv b/envoy/metadata.csv index 9d9ced2053061..18b5da051d9e5 100644 --- a/envoy/metadata.csv +++ b/envoy/metadata.csv @@ -149,6 +149,9 @@ envoy.cluster_manager.cluster_updated.count,count,,,,[OpenMetrics V2] Total clus envoy.cluster_manager.custer_updated_via_merge.count,count,,,,[OpenMetrics V2],0,envoy,, envoy.cluster_manager.update_merge_cancelled.count,count,,,,[OpenMetrics V2] Total merged updates that got cancelled and delivered early,0,envoy,, envoy.cluster_manager.update_out_of_merge_window.count,count,,,,[OpenMetrics V2] Total updates which arrived out of a merge window,0,envoy,, +envoy.connection_limit.active_connections,gauge,,,,[OpenMetrics V2 and Legacy] Number of currently active connections in the scope of this network filter chain,0,envoy,, +envoy.connection_limit.limited_connections.count,count,,,,[OpenMetrics V2] Total connections that have been rejected due to connection limit exceeded,0,envoy,, +envoy.connection_limit.limited_connections,count,,,,[Legacy] Total connections that have been rejected due to connection limit exceeded,0,envoy,, envoy.filesystem.flushed_by_timer.count,count,,,,[OpenMetrics V2],0,envoy,, envoy.filesystem.reopen_failed.count,count,,,,[OpenMetrics V2],0,envoy,, envoy.filesystem.write_buffered.count,count,,,,[OpenMetrics V2],0,envoy,, diff --git
a/envoy/tests/common.py b/envoy/tests/common.py index 56556ca8ab4c7..ed763aa2144f6 100644 --- a/envoy/tests/common.py +++ b/envoy/tests/common.py @@ -376,6 +376,13 @@ "vhost.vcluster.upstream_rq.count", ] +CONNECTION_LIMIT_METRICS = [ + "connection_limit.active_connections", + "connection_limit.limited_connections.count", +] + +CONNECTION_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:ingress_http' + LOCAL_RATE_LIMIT_METRICS = [ "http.local_rate_limit_enabled.count", "http.local_rate_limit_enforced.count", @@ -383,6 +390,8 @@ "http.local_rate_limit_ok.count", ] +RATE_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:http_local_rate_limiter' + CLUSTER_AND_LISTENER_SSL_METRICS = [ "cluster.client_ssl_socket_factory.downstream_context_secrets_not_ready.count", "cluster.client_ssl_socket_factory.ssl_context_update_by_sds.count", @@ -394,8 +403,6 @@ CONNECT_STATE_METRIC = ['control_plane.connected_state'] -RATE_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:http_local_rate_limiter' - FLAKY_METRICS = [ "listener.downstream_cx_active", "listener.downstream_cx_destroy.count", @@ -731,6 +738,8 @@ "tcp.on_demand_cluster_timeout.count", "tcp.upstream_flush.count", "tcp.upstream_flush_active", + "connection_limit.active_connections", + "connection_limit.limited_connections.count", ] diff --git a/envoy/tests/docker/api_v3/front-envoy.yaml b/envoy/tests/docker/api_v3/front-envoy.yaml index ef613f8b94436..1849c6ea5af12 100644 --- a/envoy/tests/docker/api_v3/front-envoy.yaml +++ b/envoy/tests/docker/api_v3/front-envoy.yaml @@ -24,6 +24,12 @@ static_resources: socket_address: {address: 0.0.0.0, port_value: 80} filter_chains: - filters: + - name: envoy.filters.network.connection_limit + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.connection_limit.v3.ConnectionLimit + stat_prefix: ingress_http + max_connections: 1000 + delay: 0s - name: envoy.http_connection_manager typed_config: "@type": 
type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager diff --git a/envoy/tests/fixtures/legacy/connection_limit.txt b/envoy/tests/fixtures/legacy/connection_limit.txt new file mode 100644 index 0000000000000..775f988c949c2 --- /dev/null +++ b/envoy/tests/fixtures/legacy/connection_limit.txt @@ -0,0 +1,2 @@ +connection_limit.ingress_http.active_connections: 0 +connection_limit.ingress_http.limited_connections: 0 diff --git a/envoy/tests/fixtures/legacy/stat_prefix b/envoy/tests/fixtures/legacy/stat_prefix index b3a191b8ab719..1ad6efc9e5bb4 100644 --- a/envoy/tests/fixtures/legacy/stat_prefix +++ b/envoy/tests/fixtures/legacy/stat_prefix @@ -8,3 +8,5 @@ cluster.foo.ext_authz.bar.disabled: 6 cluster.foo.ext_authz.bar.error: 7 cluster.foo.ext_authz.bar.failure_mode_allowed: 8 cluster.foo.ext_authz.bar.ok: 9 +connection_limit.ingress_http.active_connections: 0 +connection_limit.ingress_http.limited_connections: 0 diff --git a/envoy/tests/fixtures/openmetrics/openmetrics.txt b/envoy/tests/fixtures/openmetrics/openmetrics.txt index 99906261ee4aa..6a3d462c078dd 100644 --- a/envoy/tests/fixtures/openmetrics/openmetrics.txt +++ b/envoy/tests/fixtures/openmetrics/openmetrics.txt @@ -521,6 +521,10 @@ envoy_filesystem_write_buffered{} 7 envoy_http_downstream_cx_upgrades_total{envoy_http_conn_manager_prefix="admin"} 0 # TYPE envoy_cluster_manager_cluster_removed counter envoy_cluster_manager_cluster_removed{} 0 +# TYPE envoy_connection_limit_active_connections gauge +envoy_connection_limit_active_connections{envoy_connection_limit_prefix="ingress_http"} 0 +# TYPE envoy_connection_limit_limited_connections counter +envoy_connection_limit_limited_connections{envoy_connection_limit_prefix="ingress_http"} 0 # TYPE envoy_server_debug_assertion_failures counter envoy_server_debug_assertion_failures{} 0 # TYPE envoy_server_worker_3_watchdog_miss counter diff --git a/envoy/tests/legacy/common.py b/envoy/tests/legacy/common.py index 
ddac5df67f931..65502ddd7f837 100644 --- a/envoy/tests/legacy/common.py +++ b/envoy/tests/legacy/common.py @@ -52,7 +52,14 @@ "envoy.http_local_rate_limit.ok", ] -STAT_PREFIX_TAG = ['stat_prefix:http_local_rate_limiter', 'stat_prefix:foo_buz_112'] +RATE_LIMIT_STAT_PREFIX_TAG = ['stat_prefix:http_local_rate_limiter', 'stat_prefix:foo_buz_112'] + +CONNECTION_LIMIT_METRICS = [ + "envoy.connection_limit.active_connections", + "envoy.connection_limit.limited_connections", +] + +CONNECTION_LIMIT_STAT_PREFIX_TAG = ['stat_prefix:ingress_http'] RBAC_METRICS = [ "envoy.http.rbac.allowed", diff --git a/envoy/tests/legacy/test_unit.py b/envoy/tests/legacy/test_unit.py index dc3a359e06c50..cf6d7a84ec5e3 100644 --- a/envoy/tests/legacy/test_unit.py +++ b/envoy/tests/legacy/test_unit.py @@ -12,14 +12,16 @@ from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS from .common import ( + CONNECTION_LIMIT_METRICS, + CONNECTION_LIMIT_STAT_PREFIX_TAG, ENVOY_VERSION, EXT_METRICS, FLAVOR, HOST, INSTANCES, LOCAL_RATE_LIMIT_METRICS, + RATE_LIMIT_STAT_PREFIX_TAG, RBAC_METRICS, - STAT_PREFIX_TAG, ) CHECK_NAME = 'envoy' @@ -289,7 +291,7 @@ def test_stats_prefix_optional_tags( standard_tags.append('endpoint:{}'.format(instance["stats_url"])) tags_prefix = standard_tags + additional_tags c = check(instance) - mock_http_response(file_path=fixture_path(fixture_file)).return_value + mock_http_response(file_path=fixture_path(fixture_file)) dd_run_check(c) # To ensure that this change didn't break the old behavior, both the value and the tags are asserted. 
@@ -313,7 +315,20 @@ def test_local_rate_limit_metrics(aggregator, fixture_path, mock_http_response, for metric in LOCAL_RATE_LIMIT_METRICS: aggregator.assert_metric(metric) - for tag in STAT_PREFIX_TAG: + for tag in RATE_LIMIT_STAT_PREFIX_TAG: + aggregator.assert_metric_has_tag(metric, tag, count=1) + + aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + + +def test_connection_limit_metrics(aggregator, fixture_path, mock_http_response, check, dd_run_check): + instance = INSTANCES['main'] + c = check(instance) + + mock_http_response(file_path=fixture_path('./legacy/connection_limit.txt')) + dd_run_check(c) + for metric in CONNECTION_LIMIT_METRICS: + for tag in CONNECTION_LIMIT_STAT_PREFIX_TAG: aggregator.assert_metric_has_tag(metric, tag, count=1) aggregator.assert_metrics_using_metadata(get_metadata_metrics()) diff --git a/envoy/tests/test_e2e.py b/envoy/tests/test_e2e.py index 63ece2e38524e..fbcc79076fac9 100644 --- a/envoy/tests/test_e2e.py +++ b/envoy/tests/test_e2e.py @@ -8,6 +8,7 @@ from datadog_checks.envoy import Envoy from .common import ( + CONNECTION_LIMIT_METRICS, DEFAULT_INSTANCE, FLAKY_METRICS, LOCAL_RATE_LIMIT_METRICS, @@ -22,7 +23,7 @@ def test_e2e(dd_agent_check): aggregator = dd_agent_check(DEFAULT_INSTANCE, rate=True) - for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS: + for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CONNECTION_LIMIT_METRICS: formatted_metric = "envoy.{}".format(metric) if metric in FLAKY_METRICS: aggregator.assert_metric(formatted_metric, at_least=0) diff --git a/envoy/tests/test_integration.py b/envoy/tests/test_integration.py index 4020349d4baa9..efdca881192ec 100644 --- a/envoy/tests/test_integration.py +++ b/envoy/tests/test_integration.py @@ -9,6 +9,7 @@ from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS from .common import ( + CONNECTION_LIMIT_METRICS, DEFAULT_INSTANCE, ENVOY_VERSION, FLAKY_METRICS, @@ -31,7 +32,7 @@ def test_check(aggregator, dd_run_check, check): 
dd_run_check(c) dd_run_check(c) - for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS: + for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CONNECTION_LIMIT_METRICS: formatted_metric = "envoy.{}".format(metric) if metric in FLAKY_METRICS: aggregator.assert_metric(formatted_metric, at_least=0) diff --git a/envoy/tests/test_unit.py b/envoy/tests/test_unit.py index e7d1e00011c7b..263d86aa91760 100644 --- a/envoy/tests/test_unit.py +++ b/envoy/tests/test_unit.py @@ -12,6 +12,8 @@ from .common import ( CLUSTER_AND_LISTENER_SSL_METRICS, CONNECT_STATE_METRIC, + CONNECTION_LIMIT_METRICS, + CONNECTION_LIMIT_STAT_PREFIX_TAG, DEFAULT_INSTANCE, LOCAL_RATE_LIMIT_METRICS, MOCKED_PROMETHEUS_METRICS, @@ -52,6 +54,10 @@ def test_check(aggregator, dd_run_check, check, mock_http_response): for metric in CONNECT_STATE_METRIC: aggregator.assert_metric('envoy.{}'.format(metric)) + for metric in CONNECTION_LIMIT_METRICS: + aggregator.assert_metric('envoy.{}'.format(metric)) + aggregator.assert_metric_has_tag('envoy.{}'.format(metric), CONNECTION_LIMIT_STAT_PREFIX_TAG) + aggregator.assert_service_check( "envoy.openmetrics.health", status=AgentCheck.OK, tags=['endpoint:http://localhost:8001/stats/prometheus'] )