From 1581e7492a6d2dcfffdf5d4318e595a3662b29ec Mon Sep 17 00:00:00 2001 From: Thomas Pierce Date: Tue, 9 Jan 2024 11:57:05 -0800 Subject: [PATCH] TEMP PR - Add Span Metrics Processor and MetricAttributeGenerator interface --- .../distro/aws_span_metrics_processor.py | 111 ++++++++++++++++++ .../distro/metric_attribute_generator.py | 28 +++++ 2 files changed, 139 insertions(+) create mode 100644 opentelemetry-distro/src/amazon/opentelemetry/distro/aws_span_metrics_processor.py create mode 100644 opentelemetry-distro/src/amazon/opentelemetry/distro/metric_attribute_generator.py diff --git a/opentelemetry-distro/src/amazon/opentelemetry/distro/aws_span_metrics_processor.py b/opentelemetry-distro/src/amazon/opentelemetry/distro/aws_span_metrics_processor.py new file mode 100644 index 000000000..9cb107ccd --- /dev/null +++ b/opentelemetry-distro/src/amazon/opentelemetry/distro/aws_span_metrics_processor.py @@ -0,0 +1,111 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +from metric_attribute_generator import MetricAttributeGenerator +from typing_extensions import override + +from opentelemetry.context import Context +from opentelemetry.metrics import Histogram +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import BoundedAttributes, ReadableSpan, Span, SpanProcessor, StatusCode +from opentelemetry.semconv.trace import SpanAttributes + +_HTTP_STATUS_CODE = SpanAttributes.HTTP_STATUS_CODE +_NANOS_TO_MILLIS: float = 1_000_000.0 + +# Constants for deriving error and fault metrics +_ERROR_CODE_LOWER_BOUND: int = 400 +_ERROR_CODE_UPPER_BOUND: int = 499 +_FAULT_CODE_LOWER_BOUND: int = 500 +_FAULT_CODE_UPPER_BOUND: int = 599 + + +class AwsSpanMetricsProcessor(SpanProcessor): + """AwsSpanMetricsProcessor is SpanProcessor that generates metrics from spans + + This processor will generate metrics based on span data. It depends on a MetricAttributeGenerator being provided on + instantiation, which will provide a means to determine attributes which should be used to create metrics. A Resource + must also be provided, which is used to generate metrics. Finally, three Histogram must be provided, which will be + used to actually create desired metrics (see below) + + AwsSpanMetricsProcessor produces metrics for errors (e.g. HTTP 4XX status codes), faults (e.g. HTTP 5XX status + codes), and latency (in Milliseconds). Errors and faults are counted, while latency is measured with a histogram. + Metrics are emitted with attributes derived from span attributes. + + For highest fidelity metrics, this processor should be coupled with the AlwaysRecordSampler, which will result in + 100% of spans being sent to the processor. + """ + + # Metric instruments + _error_histogram: Histogram + _fault_histogram: Histogram + _latency_histogram: Histogram + + _generator: MetricAttributeGenerator + _resource: Resource + + def __init__(self, error_histogram: Histogram, fault_histogram: Histogram, latency_histogram: Histogram, generator: MetricAttributeGenerator, resource: Resource): + self._error_histogram = error_histogram + self._fault_histogram = fault_histogram + self._latency_histogram = latency_histogram + self._generator = generator + self._resource = resource + + @override + def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None: + return + + @override + def on_end(self, span: ReadableSpan) -> None: + attribute_dict: dict[str, BoundedAttributes] = self._generator.generate_metric_attributes_dict_from_span(span, self._resource) + map(lambda attributes: self._record_metrics(span, attributes), attribute_dict.values()) + + @override + def shutdown(self) -> None: + self.force_flush() + + @override + def force_flush(self, timeout_millis: int = None) -> bool: + return True + + def _record_metrics(self, span: ReadableSpan, attributes: BoundedAttributes) -> None: + # Only record metrics if non-empty attributes are returned. + if len(attributes) > 0: + self._record_error_or_fault(span, attributes) + self._record_latency(span, attributes) + + def _record_error_or_fault(self, span: ReadableSpan, attributes: BoundedAttributes) -> None: + # The logic to record error and fault should be kept in sync with the aws-xray exporter whenever possible except + # for the throttle. + # https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/awsxrayexporter/internal/translator/cause.go#L121-L160 + http_status_code: int = span.attributes.get(_HTTP_STATUS_CODE) + status_code: StatusCode = span.status.status_code + + if http_status_code is None: + http_status_code = attributes.get(_HTTP_STATUS_CODE) + + if _is_not_error_or_fault(http_status_code): + if StatusCode.ERROR == status_code: + self._error_histogram.record(0, attributes) + self._fault_histogram.record(1, attributes) + else: + self._error_histogram.record(0, attributes) + self._fault_histogram.record(0, attributes) + elif _ERROR_CODE_LOWER_BOUND <= http_status_code <= _ERROR_CODE_UPPER_BOUND: + self._error_histogram.record(1, attributes) + self._fault_histogram.record(0, attributes) + elif _FAULT_CODE_LOWER_BOUND <= http_status_code <= _FAULT_CODE_UPPER_BOUND: + self._error_histogram.record(0, attributes) + self._fault_histogram.record(1, attributes) + + def _record_latency(self, span: ReadableSpan, attributes: BoundedAttributes) -> None: + nanos: int = span.end_time - span.start_time + millis: float = nanos / _NANOS_TO_MILLIS + self._latency_histogram.record(millis, attributes) + + +def _is_not_error_or_fault(http_status_code: int) -> bool: + return (http_status_code is None + or http_status_code < _ERROR_CODE_LOWER_BOUND + or http_status_code > _FAULT_CODE_UPPER_BOUND) diff --git a/opentelemetry-distro/src/amazon/opentelemetry/distro/metric_attribute_generator.py b/opentelemetry-distro/src/amazon/opentelemetry/distro/metric_attribute_generator.py new file mode 100644 index 000000000..2faf5672a --- /dev/null +++ b/opentelemetry-distro/src/amazon/opentelemetry/distro/metric_attribute_generator.py @@ -0,0 +1,28 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import BoundedAttributes, ReadableSpan + + +class MetricAttributeGenerator: + """MetricAttributeGenerator is an interface for generating metric attributes from a span. + + Metric attribute generator defines an interface for classes that can generate specific attributes to be used by an + AwsSpanMetricsProcessor to produce metrics and by AwsMetricAttributesSpanExporter to wrap the original span. + """ + SERVICE_METRIC: str = "Service" + DEPENDENCY_METRIC: str = "Dependency" + + def generate_metric_attributes_dict_from_span(self, span: ReadableSpan, resource: Resource) -> [str, BoundedAttributes]: + """Generate metric attributes from a span. + + Given a span and associated resource, produce meaningful metric attributes for metrics produced from the span. + If no metrics should be generated from this span, return empty attributes. + + Args: + span - ReadableSpan to be used to generate metric attributes. + resource - Resource associated with Span to be used to generate metric attributes. + Returns: + A dictionary of Attributes objects with values assigned to key "Service" or "Dependency". It will contain + either 0, 1, or 2 items. + """