From 59d173112dd4e18607051bdfe078a517a91ff8db Mon Sep 17 00:00:00 2001 From: Luca Bello <36242061+lucabello@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:27:51 +0100 Subject: [PATCH] fix: set correct web.external-url and web.route-prefix (#302) * fix: set correct web.external-url and web.route-prefix * pin websockets * add reason why websockets is pinned * fetch cert_handler lib with a tls readiness workaround (#303) * Stop svc while waiting for cert * handle_exec * Lint * Debug * Partial revert * Cleanup * Update test * fetch-lib the library after merging that PR * trigger ci --------- Co-authored-by: Luca Bello <luca.bello@canonical.com> * fetch fixed cert_handler --------- Co-authored-by: Leon <82407168+sed-i@users.noreply.github.com> --- .../observability_libs/v1/cert_handler.py | 17 +- .../tempo_coordinator_k8s/v0/charm_tracing.py | 386 ++++++++++++++++-- .../v3/tls_certificates.py | 30 +- requirements.txt | 6 +- src/alertmanager.py | 3 + src/charm.py | 38 +- tests/unit/test_charm.py | 16 +- ...test_push_config_to_workload_on_startup.py | 9 +- .../test_remote_configuration_requirer.py | 29 +- tox.ini | 2 + 10 files changed, 462 insertions(+), 74 deletions(-) diff --git a/lib/charms/observability_libs/v1/cert_handler.py b/lib/charms/observability_libs/v1/cert_handler.py index 26be8793..7fcc3258 100644 --- a/lib/charms/observability_libs/v1/cert_handler.py +++ b/lib/charms/observability_libs/v1/cert_handler.py @@ -68,7 +68,7 @@ LIBID = "b5cd5cd580f3428fa5f59a8876dcbe6a" LIBAPI = 1 -LIBPATCH = 14 +LIBPATCH = 15 VAULT_SECRET_LABEL = "cert-handler-private-vault" @@ -127,7 +127,7 @@ class _RelationVaultBackend(_VaultBackend): _NEST_UNDER = "lib.charms.observability_libs.v1.cert_handler::vault" # This key needs to be relation-unique. If someone ever creates multiple Vault(_RelationVaultBackend) # instances backed by the same (peer) relation, they'll need to set different _NEST_UNDERs - # for each _RelationVaultBackend instance or they'll be fighting over it. + # for each _RelationVaultBackend instance, or they'll be fighting over it. def __init__(self, charm: CharmBase, relation_name: str): self.charm = charm @@ -344,6 +344,13 @@ def __init__( self.charm.on[self.certificates_relation_name].relation_joined, # pyright: ignore self._on_certificates_relation_joined, ) + # The following observer is a workaround. The tls-certificates lib sometimes fails to emit the custom + # "certificate_available" event on relation changed. Not sure why this was happening. We certainly have some + # tech debt here to address, but this workaround proved to work. + self.framework.observe( + self.charm.on[self.certificates_relation_name].relation_changed, # pyright: ignore + self._on_certificate_available, + ) self.framework.observe( self.certificates.on.certificate_available, # pyright: ignore self._on_certificate_available, @@ -366,7 +373,7 @@ def __init__( ) if refresh_events: - logger.warn( + logger.warning( "DEPRECATION WARNING. `refresh_events` is now deprecated. CertHandler will automatically refresh the CSR when necessary." ) @@ -429,7 +436,7 @@ def enabled(self) -> bool: See also the `available` property. """ # We need to check for units as a temporary workaround because of https://bugs.launchpad.net/juju/+bug/2024583 - # This could in theory not work correctly on scale down to 0 but it is necessary for the moment. + # This could in theory not work correctly on scale down to 0, but it is necessary for the moment. 
if not self.relation: return False @@ -636,7 +643,7 @@ def _on_all_certificates_invalidated(self, _: AllCertificatesInvalidatedEvent) - # Note: assuming "limit: 1" in metadata # The "certificates_relation_broken" event is converted to "all invalidated" custom # event by the tls-certificates library. Per convention, we let the lib manage the - # relation and we do not observe "certificates_relation_broken" directly. + # relation, and we do not observe "certificates_relation_broken" directly. self.vault.clear() # We do not generate a CSR here because the relation is gone. self.on.cert_changed.emit() # pyright: ignore diff --git a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py index 3aea50f0..cf8def11 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py @@ -117,6 +117,57 @@ def get_tracer(self) -> opentelemetry.trace.Tracer: See the official opentelemetry Python SDK documentation for usage: https://opentelemetry-python.readthedocs.io/en/latest/ + +## Caching traces +The `trace_charm` machinery will buffer any traces collected during charm execution and store them +to a file on the charm container until a tracing backend becomes available. At that point, it will +flush them to the tracing receiver. + +By default, the buffer is configured to start dropping old traces if any of these conditions apply: + +- the storage size exceeds 10 MiB +- the number of buffered events exceeds 100 + +You can configure this by, for example: + +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # only cache up to 42 events + buffer_max_events=42, + # only cache up to 42 MiB + buffer_max_size_mib=42, # minimum 10! +) +class MyCharm(CharmBase): + ... +``` + +Note that setting `buffer_max_events` to 0 will effectively disable the buffer. + +The path of the buffer file is by default in the charm's execution root, which for k8s charms means +that in case of pod churn, the cache will be lost. The recommended solution is to use an existing storage +(or add a new one) such as: + +```yaml +storage: + data: + type: filesystem + location: /charm-traces +``` + +and then configure the `@trace_charm` decorator to use it as path for storing the buffer: +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # store traces to a PVC so they're not lost on pod restart. + buffer_path="/charm-traces/buffer.file", +) +class MyCharm(CharmBase): + ... +``` + ## Upgrading from `v0` If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already @@ -174,6 +225,12 @@ def my_tracing_endpoint(self) -> Optional[str]: 3) If you were passing a certificate (str) using `server_cert`, you need to change it to provide an *absolute* path to the certificate file instead. """ +import typing + +from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import ( + encode_spans, +) +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter def _remove_stale_otel_sdk_packages(): @@ -225,6 +282,9 @@ def _remove_stale_otel_sdk_packages(): otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ") +# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. +# it could be trouble if someone ever decides to implement their own tracer parallel to +# ours and before the charm has inited. We assume they won't. 
_remove_stale_otel_sdk_packages() import functools @@ -238,6 +298,7 @@ def _remove_stale_otel_sdk_packages(): Any, Callable, Generator, + List, Optional, Sequence, Type, @@ -250,8 +311,12 @@ def _remove_stale_otel_sdk_packages(): import ops from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import Span, TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace import ReadableSpan, Span, TracerProvider +from opentelemetry.sdk.trace.export import ( + BatchSpanProcessor, + SpanExporter, + SpanExportResult, +) from opentelemetry.trace import INVALID_SPAN, Tracer from opentelemetry.trace import get_current_span as otlp_get_current_span from opentelemetry.trace import ( @@ -272,7 +337,7 @@ def _remove_stale_otel_sdk_packages(): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 3 +LIBPATCH = 4 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] @@ -280,7 +345,7 @@ def _remove_stale_otel_sdk_packages(): dev_logger = logging.getLogger("tracing-dev") # set this to 0 if you are debugging/developing this library source -dev_logger.setLevel(logging.CRITICAL) +dev_logger.setLevel(logging.ERROR) _CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof _C = TypeVar("_C", bound=_CharmType) @@ -290,6 +355,186 @@ def _remove_stale_otel_sdk_packages(): _GetterType = Union[Callable[[_CharmType], Optional[str]], property] CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" +BUFFER_DEFAULT_CACHE_FILE_NAME = ".charm_tracing_buffer.raw" +# we store the buffer as raw otlp-native protobuf (bytes) since it's hard to serialize/deserialize it in +# any portable format. Json dumping is supported, but loading isn't. +# cfr: https://github.com/open-telemetry/opentelemetry-python/issues/1003 + +BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB = 10 +_BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN = 10 +BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH = 100 +_MiB_TO_B = 2**20 # megabyte to byte conversion rate +_OTLP_SPAN_EXPORTER_TIMEOUT = 1 +"""Timeout in seconds that the OTLP span exporter has to push traces to the backend.""" + + +class _Buffer: + """Handles buffering for spans emitted while no tracing backend is configured or available. + + Use the max_event_history_length_buffering param of @trace_charm to tune + the amount of memory that this will hog on your units. + + The buffer is formatted as a bespoke byte dump (protobuf limitation). + We cannot store them as json because that is not well-supported by the sdk + (see https://github.com/open-telemetry/opentelemetry-python/issues/3364). + """ + + _SPANSEP = b"__CHARM_TRACING_BUFFER_SPAN_SEP__" + + def __init__(self, db_file: Path, max_event_history_length: int, max_buffer_size_mib: int): + self._db_file = db_file + self._max_event_history_length = max_event_history_length + self._max_buffer_size_mib = max(max_buffer_size_mib, _BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN) + + # set by caller + self.exporter: Optional[OTLPSpanExporter] = None + + def save(self, spans: typing.Sequence[ReadableSpan]): + """Save the spans collected by this exporter to the cache file. + + This method should be as fail-safe as possible. 
+ """ + if self._max_event_history_length < 1: + dev_logger.debug("buffer disabled: max history length < 1") + return + + current_history_length = len(self.load()) + new_history_length = current_history_length + len(spans) + if (diff := self._max_event_history_length - new_history_length) < 0: + self.drop(diff) + self._save(spans) + + def _serialize(self, spans: Sequence[ReadableSpan]) -> bytes: + # encode because otherwise we can't json-dump them + return encode_spans(spans).SerializeToString() + + def _save(self, spans: Sequence[ReadableSpan], replace: bool = False): + dev_logger.debug(f"saving {len(spans)} new spans to buffer") + old = [] if replace else self.load() + new = self._serialize(spans) + + try: + # if the buffer exceeds the size limit, we start dropping old spans until it does + + while len((new + self._SPANSEP.join(old))) > (self._max_buffer_size_mib * _MiB_TO_B): + if not old: + # if we've already dropped all spans and still we can't get under the + # size limit, we can't save this span + logger.error( + f"span exceeds total buffer size limit ({self._max_buffer_size_mib}MiB); " + f"buffering FAILED" + ) + return + + old = old[1:] + logger.warning( + f"buffer size exceeds {self._max_buffer_size_mib}MiB; dropping older spans... " + f"Please increase the buffer size, disable buffering, or ensure the spans can be flushed." + ) + + self._db_file.write_bytes(new + self._SPANSEP.join(old)) + except Exception: + logger.exception("error buffering spans") + + def load(self) -> List[bytes]: + """Load currently buffered spans from the cache file. + + This method should be as fail-safe as possible. + """ + if not self._db_file.exists(): + dev_logger.debug("buffer file not found. buffer empty.") + return [] + try: + spans = self._db_file.read_bytes().split(self._SPANSEP) + except Exception: + logger.exception(f"error parsing {self._db_file}") + return [] + return spans + + def drop(self, n_spans: Optional[int] = None): + """Drop some currently buffered spans from the cache file.""" + current = self.load() + if n_spans: + dev_logger.debug(f"dropping {n_spans} spans from buffer") + new = current[n_spans:] + else: + dev_logger.debug("emptying buffer") + new = [] + + self._db_file.write_bytes(self._SPANSEP.join(new)) + + def flush(self) -> Optional[bool]: + """Export all buffered spans to the given exporter, then clear the buffer. + + Returns whether the flush was successful, and None if there was nothing to flush. + """ + if not self.exporter: + dev_logger.debug("no exporter set; skipping buffer flush") + return False + + buffered_spans = self.load() + if not buffered_spans: + dev_logger.debug("nothing to flush; buffer empty") + return None + + errors = False + for span in buffered_spans: + try: + out = self.exporter._export(span) # type: ignore + if not (200 <= out.status_code < 300): + # take any 2xx status code as a success + errors = True + except ConnectionError: + dev_logger.debug( + "failed exporting buffered span; backend might be down or still starting" + ) + errors = True + except Exception: + logger.exception("unexpected error while flushing span batch from buffer") + errors = True + + if not errors: + self.drop() + else: + logger.error("failed flushing spans; buffer preserved") + return not errors + + @property + def is_empty(self): + """Utility to check whether the buffer has any stored spans. + + This is more efficient than attempting a load() given how large the buffer might be. 
+ """ + return (not self._db_file.exists()) or (self._db_file.stat().st_size == 0) + + +class _OTLPSpanExporter(OTLPSpanExporter): + """Subclass of OTLPSpanExporter to configure the max retry timeout, so that it fails a bit faster.""" + + # The issue we're trying to solve is that the model takes AGES to settle if e.g. tls is misconfigured, + # as every hook of a charm_tracing-instrumented charm takes about a minute to exit, as the charm can't + # flush the traces and keeps retrying for 'too long' + + _MAX_RETRY_TIMEOUT = 4 + # we give the exporter 4 seconds in total to succeed pushing the traces to tempo + # if it fails, we'll be caching the data in the buffer and flush it the next time, so there's no data loss risk. + # this means 2/3 retries (hard to guess from the implementation) and up to ~7 seconds total wait + + +class _BufferedExporter(InMemorySpanExporter): + def __init__(self, buffer: _Buffer) -> None: + super().__init__() + self._buffer = buffer + + def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: + self._buffer.save(spans) + return super().export(spans) + + def force_flush(self, timeout_millis: int = 0) -> bool: + # parent implementation is fake, so the timeout_millis arg is not doing anything. + result = super().force_flush(timeout_millis) + self._buffer.save(self.get_finished_spans()) + return result def is_enabled() -> bool: @@ -426,7 +671,10 @@ def _setup_root_span_initializer( charm_type: _CharmType, tracing_endpoint_attr: str, server_cert_attr: Optional[str], - service_name: Optional[str] = None, + service_name: Optional[str], + buffer_path: Optional[Path], + buffer_max_events: int, + buffer_max_size_mib: int, ): """Patch the charm's initializer.""" original_init = charm_type.__init__ @@ -445,18 +693,11 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): logger.info("Tracing DISABLED: skipping root span initialization") return - # already init some attrs that will be reinited later by calling original_init: - # self.framework = framework - # self.handle = Handle(None, self.handle_kind, None) - original_event_context = framework._event_context # default service name isn't just app name because it could conflict with the workload service name _service_name = service_name or f"{self.app.name}-charm" unit_name = self.unit.name - # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. - # it could be trouble if someone ever decides to implement their own tracer parallel to - # ours and before the charm has inited. We assume they won't. resource = Resource.create( attributes={ "service.name": _service_name, @@ -474,33 +715,60 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) + buffer_only = False + # whether we're only exporting to buffer, or also to the otlp exporter. + if not tracing_endpoint: # tracing is off if tracing_endpoint is None - return + # however we can buffer things until tracing comes online + buffer_only = True server_cert: Optional[Union[str, Path]] = ( _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None ) - if tracing_endpoint.startswith("https://") and not server_cert: + if (tracing_endpoint and tracing_endpoint.startswith("https://")) and not server_cert: logger.error( "Tracing endpoint is https, but no server_cert has been passed." 
"Please point @trace_charm to a `server_cert` attr. " "This might also mean that the tracing provider is related to a " "certificates provider, but this application is not (yet). " "In that case, you might just have to wait a bit for the certificates " - "integration to settle. " + "integration to settle. This span will be buffered." ) - return + buffer_only = True - exporter = OTLPSpanExporter( - endpoint=tracing_endpoint, - certificate_file=str(Path(server_cert).absolute()) if server_cert else None, - timeout=2, + buffer = _Buffer( + db_file=buffer_path or Path() / BUFFER_DEFAULT_CACHE_FILE_NAME, + max_event_history_length=buffer_max_events, + max_buffer_size_mib=buffer_max_size_mib, ) + previous_spans_buffered = not buffer.is_empty + + exporters: List[SpanExporter] = [] + if buffer_only: + # we have to buffer because we're missing necessary backend configuration + dev_logger.debug("buffering mode: ON") + exporters.append(_BufferedExporter(buffer)) + + else: + dev_logger.debug("buffering mode: FALLBACK") + # in principle, we have the right configuration to be pushing traces, + # but if we fail for whatever reason, we will put everything in the buffer + # and retry the next time + otlp_exporter = _OTLPSpanExporter( + endpoint=tracing_endpoint, + certificate_file=str(Path(server_cert).absolute()) if server_cert else None, + timeout=_OTLP_SPAN_EXPORTER_TIMEOUT, # give individual requests 1 second to succeed + ) + exporters.append(otlp_exporter) + exporters.append(_BufferedExporter(buffer)) + buffer.exporter = otlp_exporter + + for exporter in exporters: + processor = BatchSpanProcessor(exporter) + provider.add_span_processor(processor) - processor = BatchSpanProcessor(exporter) - provider.add_span_processor(processor) set_tracer_provider(provider) _tracer = get_tracer(_service_name) # type: ignore _tracer_token = tracer.set(_tracer) @@ -524,7 +792,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): @contextmanager def wrap_event_context(event_name: str): - dev_logger.info(f"entering event context: {event_name}") + dev_logger.debug(f"entering event context: {event_name}") # when the framework enters an event context, we create a span. with _span("event: " + event_name) as event_context_span: if event_context_span: @@ -538,12 +806,50 @@ def wrap_event_context(event_name: str): @functools.wraps(original_close) def wrap_close(): - dev_logger.info("tearing down tracer and flushing traces") + dev_logger.debug("tearing down tracer and flushing traces") span.end() opentelemetry.context.detach(span_token) # type: ignore tracer.reset(_tracer_token) tp = cast(TracerProvider, get_tracer_provider()) - tp.force_flush(timeout_millis=1000) # don't block for too long + flush_successful = tp.force_flush(timeout_millis=1000) # don't block for too long + + if buffer_only: + # if we're in buffer_only mode, it means we couldn't even set up the exporter for + # tempo as we're missing some data. + # so attempting to flush the buffer doesn't make sense + dev_logger.debug("tracing backend unavailable: all spans pushed to buffer") + + else: + dev_logger.debug("tracing backend found: attempting to flush buffer...") + + # if we do have an exporter for tempo, and we could send traces to it, + # we can attempt to flush the buffer as well. 
+ if not flush_successful: + logger.error("flushing FAILED: unable to push traces to backend.") + else: + dev_logger.debug("flush succeeded.") + + # the backend has accepted the spans generated during this event, + if not previous_spans_buffered: + # if the buffer was empty to begin with, any spans we collected now can be discarded + buffer.drop() + dev_logger.debug("buffer dropped: this trace has been sent already") + else: + # if the buffer was nonempty, we can attempt to flush it + dev_logger.debug("attempting buffer flush...") + buffer_flush_successful = buffer.flush() + if buffer_flush_successful: + dev_logger.debug("buffer flush OK") + elif buffer_flush_successful is None: + # TODO is this even possible? + dev_logger.debug("buffer flush OK; empty: nothing to flush") + else: + # this situation is pretty weird, I'm not even sure it can happen, + # because it would mean that we did manage + # to push traces directly to the tempo exporter (flush_successful), + # but the buffer flush failed to push to the same exporter! + logger.error("buffer flush FAILED") + tp.shutdown() original_close() @@ -558,6 +864,9 @@ def trace_charm( server_cert: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Union[str, Path]] = None, ) -> Callable[[_T], _T]: """Autoinstrument the decorated charm with tracing telemetry. @@ -599,6 +908,10 @@ def trace_charm( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. + Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. """ def _decorator(charm_type: _T) -> _T: @@ -609,6 +922,9 @@ def _decorator(charm_type: _T) -> _T: server_cert_attr=server_cert, service_name=service_name, extra_types=extra_types, + buffer_path=Path(buffer_path) if buffer_path else None, + buffer_max_size_mib=buffer_max_size_mib, + buffer_max_events=buffer_max_events, ) return charm_type @@ -621,6 +937,9 @@ def _autoinstrument( server_cert_attr: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Path] = None, ) -> _T: """Set up tracing on this charm class. @@ -653,13 +972,20 @@ def _autoinstrument( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. + Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. 
""" - dev_logger.info(f"instrumenting {charm_type}") + dev_logger.debug(f"instrumenting {charm_type}") _setup_root_span_initializer( charm_type, tracing_endpoint_attr, server_cert_attr=server_cert_attr, service_name=service_name, + buffer_path=buffer_path, + buffer_max_events=buffer_max_events, + buffer_max_size_mib=buffer_max_size_mib, ) trace_type(charm_type) for type_ in extra_types: @@ -675,12 +1001,12 @@ def trace_type(cls: _T) -> _T: It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` has been instantiated. """ - dev_logger.info(f"instrumenting {cls}") + dev_logger.debug(f"instrumenting {cls}") for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - dev_logger.info(f"discovered {method}") + dev_logger.debug(f"discovered {method}") if method.__name__.startswith("__"): - dev_logger.info(f"skipping {method} (dunder)") + dev_logger.debug(f"skipping {method} (dunder)") continue # the span title in the general case should be: @@ -726,7 +1052,7 @@ def trace_function(function: _F, name: Optional[str] = None) -> _F: def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: - dev_logger.info(f"instrumenting {callable}") + dev_logger.debug(f"instrumenting {callable}") # sig = inspect.signature(callable) @functools.wraps(callable) diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py index da7fa95e..141412b0 100644 --- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py +++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py @@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 20 +LIBPATCH = 23 PYDEPS = ["cryptography", "jsonschema"] @@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: ) else: try: + secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") logger.debug( "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}" ) - secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") + # Juju < 3.6 will create a new revision even if the content is the same + if ( + secret.get_content(refresh=True).get("certificate", "") + == certificate.certificate + ): + logger.debug( + "Secret %s with correct certificate already exists", + f"{LIBID}-{csr_in_sha256_hex}", + ) + continue secret.set_content( {"certificate": certificate.certificate, "csr": certificate.csr} ) @@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None: provider_certificate = self._find_certificate_in_relation_data(csr) if not provider_certificate: # A secret expired but we did not find matching certificate. Cleaning up + logger.warning( + "Failed to find matching certificate for csr, cleaning up secret %s", + event.secret.label, + ) event.secret.remove_all_revisions() return if not provider_certificate.expiry_time: # A secret expired but matching certificate is invalid. 
Cleaning up + logger.warning( + "Certificate matching csr is invalid, cleaning up secret %s", + event.secret.label, + ) event.secret.remove_all_revisions() return @@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti return provider_certificate return None - def _get_csr_from_secret(self, secret: Secret) -> str: + def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]: """Extract the CSR from the secret label or content. This function is a workaround to maintain backwards compatibility and fix the issue reported in https://github.com/canonical/tls-certificates-interface/issues/228 """ - if not (csr := secret.get_content().get("csr", "")): + try: + content = secret.get_content(refresh=True) + except SecretNotFoundError: + return None + if not (csr := content.get("csr", None)): # In versions <14 of the Lib we were storing the CSR in the label of the secret # The CSR now is stored int the content of the secret, which was a breaking change # Here we get the CSR if the secret was created by an app using libpatch 14 or lower diff --git a/requirements.txt b/requirements.txt index cfe46334..7b0d8ffc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,7 @@ jsonschema # Code: https://github.com/canonical/operator/ # Docs: https://ops.rtfd.io/ # Deps: charm -# pinned to 2.16 as 2.17 breaks our unittests -ops == 2.16 +ops # YAML processing framework # Code: https://github.com/yaml/pyyaml @@ -36,3 +35,6 @@ pydantic>=2 # Deps: tracing opentelemetry-exporter-otlp-proto-http>=1.21.0 + +# Deps: lib/charms/observability_libs/v0/kubernetes_compute_resources_patch.py +tenacity diff --git a/src/alertmanager.py b/src/alertmanager.py index a0e37f3f..0e6e7243 100644 --- a/src/alertmanager.py +++ b/src/alertmanager.py @@ -84,6 +84,7 @@ def __init__( api_port: int, ha_port: int, web_external_url: str, + web_route_prefix: str, config_path: str, web_config_path: str, tls_enabled: Callable[[], bool], @@ -103,6 +104,7 @@ def __init__( self._ha_port = ha_port self.api = Alertmanager(endpoint_url=web_external_url, cafile=cafile) self._web_external_url = web_external_url + self._web_route_prefix = web_route_prefix self._config_path = config_path self._web_config_path = web_config_path self._is_tls_enabled = tls_enabled @@ -190,6 +192,7 @@ def _command(): f"--web.listen-address=:{self._api_port} " f"--cluster.listen-address={listen_netloc_arg} " f"--web.external-url={self._web_external_url} " + f"--web.route-prefix={self._web_route_prefix} " f"{web_config_arg}" f"{peer_cmd_args}" ) diff --git a/src/charm.py b/src/charm.py index 0f03ed0b..e1ed6662 100755 --- a/src/charm.py +++ b/src/charm.py @@ -106,7 +106,7 @@ def __init__(self, *args): self.ingress = IngressPerAppRequirer( self, port=self.api_port, - scheme=lambda: urlparse(self._internal_url).scheme, + scheme=self._scheme, strip_prefix=True, redirect_https=True, ) @@ -142,7 +142,8 @@ def __init__(self, *args): resource_reqs_func=self._resource_reqs_from_config, ) self.framework.observe( - self.resources_patch.on.patch_failed, self._on_k8s_patch_failed # pyright: ignore + self.resources_patch.on.patch_failed, + self._on_k8s_patch_failed, # pyright: ignore ) # Self-monitoring @@ -178,7 +179,8 @@ def __init__(self, *args): peer_netlocs=peer_ha_netlocs, api_port=self.api_port, ha_port=self._ports.ha, - web_external_url=self._internal_url, + web_external_url=self._external_url, + web_route_prefix="/", config_path=self._config_path, web_config_path=self._web_config_path, 
tls_enabled=self._is_tls_ready, @@ -210,10 +212,12 @@ def __init__(self, *args): # Action events self.framework.observe( - self.on.show_config_action, self._on_show_config_action # pyright: ignore + self.on.show_config_action, + self._on_show_config_action, # pyright: ignore ) self.framework.observe( - self.on.check_config_action, self._on_check_config # pyright: ignore + self.on.check_config_action, + self._on_check_config, # pyright: ignore ) def set_ports(self): @@ -404,7 +408,7 @@ def _render_manifest(self) -> ConfigFileSystemState: self._templates_path: config_suite.templates, self._amtool_config_path: config_suite.amtool, self._server_cert_path: self.server_cert.server_cert, - self._key_path: self.server_cert.private_key if self.server_cert.enabled else None, + self._key_path: self.server_cert.private_key, self._ca_cert_path: self.server_cert.ca_cert, } ) @@ -420,6 +424,9 @@ def _common_exit_hook(self, update_ca_certs: bool = False) -> None: self.unit.status = MaintenanceStatus("Waiting for pod startup to complete") return + if update_ca_certs or (self.server_cert.available and not self._certs_on_disk): + self._update_ca_certs() + # Make sure the external url is valid if external_url := self._external_url: parsed = urlparse(external_url) @@ -440,9 +447,7 @@ def _common_exit_hook(self, update_ca_certs: bool = False) -> None: # - https://github.com/canonical/prometheus-k8s-operator/issues/530, self.alertmanager_provider.update(external_url=self._internal_url) - self.ingress.provide_ingress_requirements( - scheme=urlparse(self._internal_url).scheme, port=self.api_port - ) + self.ingress.provide_ingress_requirements(scheme=self._scheme, port=self.api_port) self._scraping.update_scrape_job_spec(self.self_scraping_job) if self.peer_relation: @@ -460,9 +465,6 @@ def _common_exit_hook(self, update_ca_certs: bool = False) -> None: self.unit.status = BlockedStatus(str(e)) return - if update_ca_certs: - self._update_ca_certs() - # Update pebble layer self.alertmanager_workload.update_layer() @@ -571,8 +573,9 @@ def _get_peer_hostnames(self, include_this_unit=True) -> List[str]: return hostnames - def _is_tls_ready(self) -> bool: - """Returns True if the workload is ready to operate in TLS mode.""" + @property + def _certs_on_disk(self) -> bool: + """Check if the TLS setup is ready on disk.""" return ( self.container.can_connect() and self.container.exists(self._server_cert_path) @@ -580,6 +583,13 @@ def _is_tls_ready(self) -> bool: and self.container.exists(self._ca_cert_path) ) + def _is_tls_ready(self) -> bool: + """Returns True if the workload is ready to operate in TLS mode.""" + return self.server_cert.available and self._certs_on_disk + + def _is_waiting_for_cert(self) -> bool: + return self.server_cert.enabled and not self.server_cert.available + @property def _internal_url(self) -> str: """Return the fqdn dns-based in-cluster (private) address of the alertmanager api server.""" diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 50c6d8b4..eca9f252 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -221,11 +221,13 @@ def setUp(self, *unused): @k8s_resource_multipatch @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) def test_show_config(self, *_unused): - tls_paths = { + conditional_tls_paths = { self.harness.charm._server_cert_path, - self.harness.charm._key_path, self.harness.charm._ca_cert_path, } + unconditional_paths = { + self.harness.charm._key_path, + } # GIVEN an isolated charm (see setUp, decorator) # 
WHEN the "show-config" action runs @@ -237,14 +239,16 @@ def test_show_config(self, *_unused): # AND configs DOES NOT contain cert-related entries # results.configs is a list of dicts, [{"path": ..., "content": ...}, {...}, ...]. paths_rendered = {d["path"] for d in yaml.safe_load(results["configs"])} - for filepath in tls_paths: + for filepath in conditional_tls_paths: self.assertNotIn(filepath, paths_rendered) + for filepath in unconditional_paths: + self.assertIn(filepath, paths_rendered) # AND GIVEN a tls relation is in place rel_id = self.harness.add_relation("certificates", "ca") self.harness.add_relation_unit(rel_id, "ca/0") # AND cert files are on disk - for filepath in tls_paths: + for filepath in conditional_tls_paths: self.harness.model.unit.get_container("alertmanager").push( filepath, "test", make_dirs=True ) @@ -257,5 +261,7 @@ def test_show_config(self, *_unused): # AND configs contains cert-related entries paths_rendered = {d["path"] for d in yaml.safe_load(results["configs"])} - for filepath in tls_paths: + for filepath in conditional_tls_paths: + self.assertIn(filepath, paths_rendered) + for filepath in unconditional_paths: self.assertIn(filepath, paths_rendered) diff --git a/tests/unit/test_push_config_to_workload_on_startup.py b/tests/unit/test_push_config_to_workload_on_startup.py index af4142cb..3475a7d5 100644 --- a/tests/unit/test_push_config_to_workload_on_startup.py +++ b/tests/unit/test_push_config_to_workload_on_startup.py @@ -24,12 +24,14 @@ @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("0.0.0", "")) +@patch("subprocess.run") class TestPushConfigToWorkloadOnStartup(unittest.TestCase): """Feature: Push config to workload on startup. Background: Charm starts up with initial hooks. """ + @patch("subprocess.run") @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("0.0.0", "")) @k8s_resource_multipatch @patch("lightkube.core.client.GenericSyncClient") @@ -45,7 +47,7 @@ def setUp(self, *_): self.harness.begin_with_initial_hooks() @given(st.booleans()) - def test_single_unit_cluster(self, is_leader): + def test_single_unit_cluster(self, is_leader, _): """Scenario: Current unit is the only unit present.""" # WHEN only one unit is self.assertEqual(self.harness.model.app.planned_units(), 1) @@ -100,13 +102,14 @@ def test_multi_unit_cluster(self, *_): ) self.assertIn("--cluster.peer=", command) - def test_charm_blocks_on_connection_error(self): + def test_charm_blocks_on_connection_error(self, *_): self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus) self.harness.set_can_connect(CONTAINER_NAME, False) self.harness.update_config({"templates_file": "doesn't matter"}) self.assertNotIsInstance(self.harness.charm.unit.status, ActiveStatus) +@patch("subprocess.run") class TestInvalidConfig(unittest.TestCase): """Feature: Charm must block when invalid config is provided. 
@@ -118,6 +121,8 @@ def setUp(self): self.harness = Harness(AlertmanagerCharm) self.addCleanup(self.harness.cleanup) + self.harness.handle_exec("alertmanager", ["update-ca-certificates", "--fresh"], result="") + @k8s_resource_multipatch @patch("lightkube.core.client.GenericSyncClient") @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) diff --git a/tests/unit/test_remote_configuration_requirer.py b/tests/unit/test_remote_configuration_requirer.py index 97741806..5aa2ed2f 100644 --- a/tests/unit/test_remote_configuration_requirer.py +++ b/tests/unit/test_remote_configuration_requirer.py @@ -5,7 +5,7 @@ import logging import unittest from typing import cast -from unittest.mock import Mock, patch +from unittest.mock import patch import yaml from charms.alertmanager_k8s.v0.alertmanager_remote_configuration import ( @@ -41,16 +41,25 @@ """ +@patch("subprocess.run") class TestAlertmanagerRemoteConfigurationRequirer(unittest.TestCase): + @patch("subprocess.run") @patch("lightkube.core.client.GenericSyncClient") @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None) @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) @k8s_resource_multipatch - def setUp(self, _) -> None: + def setUp(self, *_) -> None: self.harness = testing.Harness(AlertmanagerCharm) self.addCleanup(self.harness.cleanup) self.harness.set_leader(True) + self.harness.handle_exec("alertmanager", ["update-ca-certificates", "--fresh"], result="") + self.harness.handle_exec( + "alertmanager", + [WorkloadManager._amtool_path, "check-config", AlertmanagerCharm._config_path], + result="", + ) + # TODO: Once we're on ops 2.0.0+ this can be removed as begin_with_initial_hooks() # now does it. self.harness.set_can_connect("alertmanager", True) @@ -70,14 +79,11 @@ def setUp(self, _) -> None: ) self.harness.add_relation_unit(self.relation_id, "remote-config-provider/0") - @patch("ops.model.Container.exec") @k8s_resource_multipatch def test_valid_config_pushed_to_relation_data_bag_updates_alertmanager_config( - self, patched_exec + self, + *_, ): - patched_exec_mock = Mock() - patched_exec_mock.wait_output.return_value = ("whatever", "") - patched_exec.return_value = patched_exec_mock expected_config = remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG) # add juju topology to "group_by" route = cast(dict, expected_config.get("route", {})) @@ -104,6 +110,7 @@ def test_valid_config_pushed_to_relation_data_bag_updates_alertmanager_config( @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) def test_configs_available_from_both_relation_data_bag_and_charm_config_block_charm( self, + *_, ): sample_remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG) self.harness.update_relation_data( @@ -117,15 +124,12 @@ def test_configs_available_from_both_relation_data_bag_and_charm_config_block_ch self.harness.charm.unit.status, BlockedStatus("Multiple configs detected") ) - @patch("ops.model.Container.exec") @patch("config_builder.default_config", yaml.safe_load(TEST_ALERTMANAGER_DEFAULT_CONFIG)) @k8s_resource_multipatch def test_invalid_config_pushed_to_the_relation_data_bag_does_not_update_alertmanager_config( - self, patched_exec + self, + *_, ): - patched_exec_mock = Mock() - patched_exec_mock.wait_output.return_value = ("whatever", "") - patched_exec.return_value = patched_exec_mock invalid_config = yaml.safe_load("some: invalid_config") self.harness.update_relation_data( @@ -141,6 +145,7 @@ def 
test_invalid_config_pushed_to_the_relation_data_bag_does_not_update_alertman @k8s_resource_multipatch def test_templates_pushed_to_relation_data_bag_are_saved_to_templates_file_in_alertmanager( self, + *_, ): sample_remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG) test_template = '{{define "myTemplate"}}do something{{end}}' diff --git a/tox.ini b/tox.ini index 2d744f55..66da8f20 100644 --- a/tox.ini +++ b/tox.ini @@ -124,6 +124,8 @@ deps = numpy # https://github.com/juju/python-libjuju/issues/1025 juju<=3.3.0,>=3.0 + # https://github.com/juju/python-libjuju/issues/1184 + websockets<14 pytest pytest-operator pytest-httpserver
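
For context on the headline fix, here is a minimal illustrative sketch (not part of the patch; `build_command` and the literal URL/port values are hypothetical) of why the charm now passes both flags: Alertmanager derives `--web.route-prefix` from the path component of `--web.external-url` unless the prefix is set explicitly, so a charm that advertises an ingress URL containing a path prefix while the ingress is configured with `strip_prefix=True` (as in `src/charm.py` above) must pin the route prefix to `/` to keep the API reachable.

from urllib.parse import urlparse


def build_command(api_port: int, web_external_url: str, web_route_prefix: str = "/") -> str:
    """Hypothetical stand-in for the WorkloadManager._command() change in src/alertmanager.py.

    Without an explicit --web.route-prefix, Alertmanager serves its API under the path of
    --web.external-url (e.g. /<model>-<app>). Because the ingress strips that prefix before
    proxying, such requests would 404; pinning the prefix to "/" keeps the workload serving
    at the root while still advertising the external URL for links and alert notifications.
    """
    scheme = urlparse(web_external_url).scheme
    assert scheme in ("http", "https"), "external url must be a full http(s) URL"
    return (
        "alertmanager "
        f"--web.listen-address=:{api_port} "
        f"--web.external-url={web_external_url} "
        f"--web.route-prefix={web_route_prefix}"
    )


if __name__ == "__main__":
    # Example: an ingress-provided URL of the form http://<ingress-ip>/<model>-<app>
    print(build_command(9093, "http://10.43.8.34/welcome-alertmanager-k8s"))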