From 59d173112dd4e18607051bdfe078a517a91ff8db Mon Sep 17 00:00:00 2001
From: Luca Bello <36242061+lucabello@users.noreply.github.com>
Date: Tue, 3 Dec 2024 12:27:51 +0100
Subject: [PATCH] fix: set correct web.external-url and web.route-prefix (#302)

* fix: set correct web.external-url and web.route-prefix

* pin websockets

* add reason why websockets is pinned

* fetch cert_handler lib with a tls readiness workaround (#303)

* Stop svc while waiting for cert

* handle_exec

* Lint

* Debug

* Partial revert

* Cleanup

* Update test

* fetch-lib the library after merging that PR

* trigger ci

---------

Co-authored-by: Luca Bello <luca.bello@canonical.com>

* fetch fixed cert_handler

---------

Co-authored-by: Leon <82407168+sed-i@users.noreply.github.com>
---
 .../observability_libs/v1/cert_handler.py     |  17 +-
 .../tempo_coordinator_k8s/v0/charm_tracing.py | 386 ++++++++++++++++--
 .../v3/tls_certificates.py                    |  30 +-
 requirements.txt                              |   6 +-
 src/alertmanager.py                           |   3 +
 src/charm.py                                  |  38 +-
 tests/unit/test_charm.py                      |  16 +-
 ...test_push_config_to_workload_on_startup.py |   9 +-
 .../test_remote_configuration_requirer.py     |  29 +-
 tox.ini                                       |   2 +
 10 files changed, 462 insertions(+), 74 deletions(-)

diff --git a/lib/charms/observability_libs/v1/cert_handler.py b/lib/charms/observability_libs/v1/cert_handler.py
index 26be8793..7fcc3258 100644
--- a/lib/charms/observability_libs/v1/cert_handler.py
+++ b/lib/charms/observability_libs/v1/cert_handler.py
@@ -68,7 +68,7 @@
 
 LIBID = "b5cd5cd580f3428fa5f59a8876dcbe6a"
 LIBAPI = 1
-LIBPATCH = 14
+LIBPATCH = 15
 
 VAULT_SECRET_LABEL = "cert-handler-private-vault"
 
@@ -127,7 +127,7 @@ class _RelationVaultBackend(_VaultBackend):
     _NEST_UNDER = "lib.charms.observability_libs.v1.cert_handler::vault"
     # This key needs to be relation-unique. If someone ever creates multiple Vault(_RelationVaultBackend)
     # instances backed by the same (peer) relation, they'll need to set different _NEST_UNDERs
-    # for each _RelationVaultBackend instance or they'll be fighting over it.
+    # for each _RelationVaultBackend instance, or they'll be fighting over it.
 
     def __init__(self, charm: CharmBase, relation_name: str):
         self.charm = charm
@@ -344,6 +344,13 @@ def __init__(
             self.charm.on[self.certificates_relation_name].relation_joined,  # pyright: ignore
             self._on_certificates_relation_joined,
         )
+        # The following observer is a workaround: the tls-certificates lib sometimes fails to emit the custom
+        # "certificate_available" event on relation-changed, and we are not sure why. There is certainly some
+        # tech debt here to address, but this workaround has proven to work.
+        self.framework.observe(
+            self.charm.on[self.certificates_relation_name].relation_changed,  # pyright: ignore
+            self._on_certificate_available,
+        )
         self.framework.observe(
             self.certificates.on.certificate_available,  # pyright: ignore
             self._on_certificate_available,
@@ -366,7 +373,7 @@ def __init__(
         )
 
         if refresh_events:
-            logger.warn(
+            logger.warning(
                 "DEPRECATION WARNING. `refresh_events` is now deprecated. CertHandler will automatically refresh the CSR when necessary."
             )
 
@@ -429,7 +436,7 @@ def enabled(self) -> bool:
         See also the `available` property.
         """
         # We need to check for units as a temporary workaround because of https://bugs.launchpad.net/juju/+bug/2024583
-        # This could in theory not work correctly on scale down to 0 but it is necessary for the moment.
+        # This could in theory not work correctly on scale down to 0, but it is necessary for the moment.
 
         if not self.relation:
             return False
@@ -636,7 +643,7 @@ def _on_all_certificates_invalidated(self, _: AllCertificatesInvalidatedEvent) -
         # Note: assuming "limit: 1" in metadata
         # The "certificates_relation_broken" event is converted to "all invalidated" custom
         # event by the tls-certificates library. Per convention, we let the lib manage the
-        # relation and we do not observe "certificates_relation_broken" directly.
+        # relation, and we do not observe "certificates_relation_broken" directly.
         self.vault.clear()
         # We do not generate a CSR here because the relation is gone.
         self.on.cert_changed.emit()  # pyright: ignore
diff --git a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py
index 3aea50f0..cf8def11 100644
--- a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py
+++ b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py
@@ -117,6 +117,57 @@ def get_tracer(self) -> opentelemetry.trace.Tracer:
 See the official opentelemetry Python SDK documentation for usage:
 https://opentelemetry-python.readthedocs.io/en/latest/
 
+
+## Caching traces
+The `trace_charm` machinery will buffer any traces collected during charm execution and store them
+in a file on the charm container until a tracing backend becomes available. At that point, it will
+flush them to the tracing receiver.
+
+By default, the buffer is configured to start dropping old traces if any of these conditions apply:
+
+- the storage size exceeds 10 MiB
+- the number of buffered events exceeds 100
+
+You can configure these limits, for example:
+
+```python
+@trace_charm(
+    tracing_endpoint="my_tracing_endpoint",
+    server_cert="_server_cert",
+    # only cache up to 42 events
+    buffer_max_events=42,
+    # only cache up to 42 MiB
+    buffer_max_size_mib=42,  # minimum 10!
+)
+class MyCharm(CharmBase):
+    ...
+```
+
+Note that setting `buffer_max_events` to 0 will effectively disable the buffer.
+
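+For example, a minimal sketch of turning buffering off entirely (attribute names as in the examples above):
+
+```python
+@trace_charm(
+    tracing_endpoint="my_tracing_endpoint",
+    server_cert="_server_cert",
+    # do not buffer any spans while no backend is reachable
+    buffer_max_events=0,
+)
+class MyCharm(CharmBase):
+    ...
+```
+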
+By default, the buffer file is stored in the charm's execution root, which for k8s charms means the
+cache is lost on pod churn. The recommended solution is to use an existing storage (or add a new one),
+such as:
+
+```yaml
+storage:
+  data:
+    type: filesystem
+    location: /charm-traces
+```
+
+and then configure the `@trace_charm` decorator to use it as the path for storing the buffer:
+```python
+@trace_charm(
+    tracing_endpoint="my_tracing_endpoint",
+    server_cert="_server_cert",
+    # store traces to a PVC so they're not lost on pod restart.
+    buffer_path="/charm-traces/buffer.file",
+)
+class MyCharm(CharmBase):
+    ...
+```
+
 ## Upgrading from `v0`
 
 If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already
@@ -174,6 +225,12 @@ def my_tracing_endpoint(self) -> Optional[str]:
 3) If you were passing a certificate (str) using `server_cert`, you need to change it to
 provide an *absolute* path to the certificate file instead.
 """
+import typing
+
+from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import (
+    encode_spans,
+)
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
 
 
 def _remove_stale_otel_sdk_packages():
@@ -225,6 +282,9 @@ def _remove_stale_otel_sdk_packages():
     otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ")
 
 
+# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm.
+# this could cause trouble if someone ever implements their own tracer in parallel to
+# ours before the charm has been initialized. We assume they won't.
 _remove_stale_otel_sdk_packages()
 
 import functools
@@ -238,6 +298,7 @@ def _remove_stale_otel_sdk_packages():
     Any,
     Callable,
     Generator,
+    List,
     Optional,
     Sequence,
     Type,
@@ -250,8 +311,12 @@ def _remove_stale_otel_sdk_packages():
 import ops
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.resources import Resource
-from opentelemetry.sdk.trace import Span, TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace import ReadableSpan, Span, TracerProvider
+from opentelemetry.sdk.trace.export import (
+    BatchSpanProcessor,
+    SpanExporter,
+    SpanExportResult,
+)
 from opentelemetry.trace import INVALID_SPAN, Tracer
 from opentelemetry.trace import get_current_span as otlp_get_current_span
 from opentelemetry.trace import (
@@ -272,7 +337,7 @@ def _remove_stale_otel_sdk_packages():
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
 
-LIBPATCH = 3
+LIBPATCH = 4
 
 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"]
 
@@ -280,7 +345,7 @@ def _remove_stale_otel_sdk_packages():
 dev_logger = logging.getLogger("tracing-dev")
 
 # set this to 0 if you are debugging/developing this library source
-dev_logger.setLevel(logging.CRITICAL)
+dev_logger.setLevel(logging.ERROR)
 
 _CharmType = Type[CharmBase]  # the type CharmBase and any subclass thereof
 _C = TypeVar("_C", bound=_CharmType)
@@ -290,6 +355,186 @@ def _remove_stale_otel_sdk_packages():
 _GetterType = Union[Callable[[_CharmType], Optional[str]], property]
 
 CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED"
+BUFFER_DEFAULT_CACHE_FILE_NAME = ".charm_tracing_buffer.raw"
+# we store the buffer as raw otlp-native protobuf (bytes) since it's hard to serialize/deserialize it in
+# any portable format. JSON dumping is supported, but loading isn't.
+# cf. https://github.com/open-telemetry/opentelemetry-python/issues/1003
+
+BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB = 10
+_BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN = 10
+BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH = 100
+_MiB_TO_B = 2**20  # mebibyte-to-byte conversion factor
+_OTLP_SPAN_EXPORTER_TIMEOUT = 1
+"""Timeout in seconds that the OTLP span exporter has to push traces to the backend."""
+
+
+class _Buffer:
+    """Handles buffering for spans emitted while no tracing backend is configured or available.
+
+    Use the buffer_max_events and buffer_max_size_mib params of @trace_charm to tune
+    how much disk space this buffer may take up on your units.
+
+    The buffer is stored as a bespoke byte dump (a protobuf limitation).
+    We cannot store the spans as JSON because that is not well supported by the SDK
+    (see https://github.com/open-telemetry/opentelemetry-python/issues/3364).
+    """
+
+    _SPANSEP = b"__CHARM_TRACING_BUFFER_SPAN_SEP__"
+
+    def __init__(self, db_file: Path, max_event_history_length: int, max_buffer_size_mib: int):
+        self._db_file = db_file
+        self._max_event_history_length = max_event_history_length
+        self._max_buffer_size_mib = max(max_buffer_size_mib, _BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN)
+
+        # set by caller
+        self.exporter: Optional[OTLPSpanExporter] = None
+
+    def save(self, spans: typing.Sequence[ReadableSpan]):
+        """Save the spans collected by this exporter to the cache file.
+
+        This method should be as fail-safe as possible.
+        """
+        if self._max_event_history_length < 1:
+            dev_logger.debug("buffer disabled: max history length < 1")
+            return
+
+        current_history_length = len(self.load())
+        new_history_length = current_history_length + len(spans)
+        if (diff := self._max_event_history_length - new_history_length) < 0:
+            self.drop(diff)
+        self._save(spans)
+
+    def _serialize(self, spans: Sequence[ReadableSpan]) -> bytes:
+        # serialize the spans to the otlp protobuf wire format (see the module-level note on why not JSON)
+        return encode_spans(spans).SerializeToString()
+
+    def _save(self, spans: Sequence[ReadableSpan], replace: bool = False):
+        dev_logger.debug(f"saving {len(spans)} new spans to buffer")
+        old = [] if replace else self.load()
+        new = self._serialize(spans)
+
+        try:
+            # if the buffer exceeds the size limit, we start dropping old spans until it no longer does
+
+            while len((new + self._SPANSEP.join(old))) > (self._max_buffer_size_mib * _MiB_TO_B):
+                if not old:
+                    # if we've already dropped all spans and still we can't get under the
+                    # size limit, we can't save this span
+                    logger.error(
+                        f"span exceeds total buffer size limit ({self._max_buffer_size_mib}MiB); "
+                        f"buffering FAILED"
+                    )
+                    return
+
+                old = old[1:]
+                logger.warning(
+                    f"buffer size exceeds {self._max_buffer_size_mib}MiB; dropping older spans... "
+                    f"Please increase the buffer size, disable buffering, or ensure the spans can be flushed."
+                )
+
+            self._db_file.write_bytes(new + self._SPANSEP.join(old))
+        except Exception:
+            logger.exception("error buffering spans")
+
+    def load(self) -> List[bytes]:
+        """Load currently buffered spans from the cache file.
+
+        This method should be as fail-safe as possible.
+        """
+        if not self._db_file.exists():
+            dev_logger.debug("buffer file not found. buffer empty.")
+            return []
+        try:
+            spans = self._db_file.read_bytes().split(self._SPANSEP)
+        except Exception:
+            logger.exception(f"error parsing {self._db_file}")
+            return []
+        return spans
+
+    def drop(self, n_spans: Optional[int] = None):
+        """Drop some currently buffered spans from the cache file."""
+        current = self.load()
+        if n_spans:
+            dev_logger.debug(f"dropping {n_spans} spans from buffer")
+            new = current[n_spans:]
+        else:
+            dev_logger.debug("emptying buffer")
+            new = []
+
+        self._db_file.write_bytes(self._SPANSEP.join(new))
+
+    def flush(self) -> Optional[bool]:
+        """Export all buffered spans to the given exporter, then clear the buffer.
+
+        Returns whether the flush was successful, and None if there was nothing to flush.
+        """
+        if not self.exporter:
+            dev_logger.debug("no exporter set; skipping buffer flush")
+            return False
+
+        buffered_spans = self.load()
+        if not buffered_spans:
+            dev_logger.debug("nothing to flush; buffer empty")
+            return None
+
+        errors = False
+        for span in buffered_spans:
+            try:
+                out = self.exporter._export(span)  # type: ignore
+                if not (200 <= out.status_code < 300):
+                    # take any 2xx status code as a success
+                    errors = True
+            except ConnectionError:
+                dev_logger.debug(
+                    "failed exporting buffered span; backend might be down or still starting"
+                )
+                errors = True
+            except Exception:
+                logger.exception("unexpected error while flushing span batch from buffer")
+                errors = True
+
+        if not errors:
+            self.drop()
+        else:
+            logger.error("failed flushing spans; buffer preserved")
+        return not errors
+
+    @property
+    def is_empty(self):
+        """Utility to check whether the buffer has any stored spans.
+
+        This is more efficient than attempting a load() given how large the buffer might be.
+        """
+        return (not self._db_file.exists()) or (self._db_file.stat().st_size == 0)
+
+
+class _OTLPSpanExporter(OTLPSpanExporter):
+    """Subclass of OTLPSpanExporter to configure the max retry timeout, so that it fails a bit faster."""
+
+    # The issue we're trying to solve is that the model takes ages to settle if, e.g., TLS is misconfigured:
+    # every hook of a charm_tracing-instrumented charm takes about a minute to exit, because the charm can't
+    # flush the traces and keeps retrying for too long.
+
+    _MAX_RETRY_TIMEOUT = 4
+    # we give the exporter a 4-second retry timeout to succeed in pushing the traces to tempo;
+    # if it fails, we cache the data in the buffer and flush it the next time, so there's no risk of data loss.
+    # this amounts to 2-3 retries (hard to tell from the implementation) and up to ~7 seconds of total wait.
+
+
+class _BufferedExporter(InMemorySpanExporter):
+    """Span exporter that, on top of keeping spans in memory, persists them to the buffer file."""
+
+    def __init__(self, buffer: _Buffer) -> None:
+        super().__init__()
+        self._buffer = buffer
+
+    def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult:
+        self._buffer.save(spans)
+        return super().export(spans)
+
+    def force_flush(self, timeout_millis: int = 0) -> bool:
+        # the parent implementation is a no-op, so the timeout_millis arg has no effect.
+        result = super().force_flush(timeout_millis)
+        self._buffer.save(self.get_finished_spans())
+        return result
 
 
 def is_enabled() -> bool:
@@ -426,7 +671,10 @@ def _setup_root_span_initializer(
     charm_type: _CharmType,
     tracing_endpoint_attr: str,
     server_cert_attr: Optional[str],
-    service_name: Optional[str] = None,
+    service_name: Optional[str],
+    buffer_path: Optional[Path],
+    buffer_max_events: int,
+    buffer_max_size_mib: int,
 ):
     """Patch the charm's initializer."""
     original_init = charm_type.__init__
@@ -445,18 +693,11 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
             logger.info("Tracing DISABLED: skipping root span initialization")
             return
 
-        # already init some attrs that will be reinited later by calling original_init:
-        # self.framework = framework
-        # self.handle = Handle(None, self.handle_kind, None)
-
         original_event_context = framework._event_context
         # default service name isn't just app name because it could conflict with the workload service name
         _service_name = service_name or f"{self.app.name}-charm"
 
         unit_name = self.unit.name
-        # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm.
-        # it could be trouble if someone ever decides to implement their own tracer parallel to
-        # ours and before the charm has inited. We assume they won't.
         resource = Resource.create(
             attributes={
                 "service.name": _service_name,
@@ -474,33 +715,60 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
         # if anything goes wrong with retrieving the endpoint, we let the exception bubble up.
         tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type)
 
+        buffer_only = False
+        # whether we're only exporting to buffer, or also to the otlp exporter.
+
         if not tracing_endpoint:
             # tracing is off if tracing_endpoint is None
-            return
+            # however we can buffer things until tracing comes online
+            buffer_only = True
 
         server_cert: Optional[Union[str, Path]] = (
             _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None
         )
 
-        if tracing_endpoint.startswith("https://") and not server_cert:
+        if (tracing_endpoint and tracing_endpoint.startswith("https://")) and not server_cert:
             logger.error(
                 "Tracing endpoint is https, but no server_cert has been passed."
                 "Please point @trace_charm to a `server_cert` attr. "
                 "This might also mean that the tracing provider is related to a "
                 "certificates provider, but this application is not (yet). "
                 "In that case, you might just have to wait a bit for the certificates "
-                "integration to settle. "
+                "integration to settle. This span will be buffered."
             )
-            return
+            buffer_only = True
 
-        exporter = OTLPSpanExporter(
-            endpoint=tracing_endpoint,
-            certificate_file=str(Path(server_cert).absolute()) if server_cert else None,
-            timeout=2,
+        buffer = _Buffer(
+            db_file=buffer_path or Path() / BUFFER_DEFAULT_CACHE_FILE_NAME,
+            max_event_history_length=buffer_max_events,
+            max_buffer_size_mib=buffer_max_size_mib,
         )
+        previous_spans_buffered = not buffer.is_empty
+
+        exporters: List[SpanExporter] = []
+        if buffer_only:
+            # we have to buffer because we're missing necessary backend configuration
+            dev_logger.debug("buffering mode: ON")
+            exporters.append(_BufferedExporter(buffer))
+
+        else:
+            dev_logger.debug("buffering mode: FALLBACK")
+            # in principle, we have the right configuration to be pushing traces,
+            # but if we fail for whatever reason, we will put everything in the buffer
+            # and retry the next time
+            otlp_exporter = _OTLPSpanExporter(
+                endpoint=tracing_endpoint,
+                certificate_file=str(Path(server_cert).absolute()) if server_cert else None,
+                timeout=_OTLP_SPAN_EXPORTER_TIMEOUT,  # give individual requests 1 second to succeed
+            )
+            exporters.append(otlp_exporter)
+            exporters.append(_BufferedExporter(buffer))
+            buffer.exporter = otlp_exporter
+
+        for exporter in exporters:
+            processor = BatchSpanProcessor(exporter)
+            provider.add_span_processor(processor)
 
-        processor = BatchSpanProcessor(exporter)
-        provider.add_span_processor(processor)
         set_tracer_provider(provider)
         _tracer = get_tracer(_service_name)  # type: ignore
         _tracer_token = tracer.set(_tracer)
@@ -524,7 +792,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
 
         @contextmanager
         def wrap_event_context(event_name: str):
-            dev_logger.info(f"entering event context: {event_name}")
+            dev_logger.debug(f"entering event context: {event_name}")
             # when the framework enters an event context, we create a span.
             with _span("event: " + event_name) as event_context_span:
                 if event_context_span:
@@ -538,12 +806,50 @@ def wrap_event_context(event_name: str):
 
         @functools.wraps(original_close)
         def wrap_close():
-            dev_logger.info("tearing down tracer and flushing traces")
+            dev_logger.debug("tearing down tracer and flushing traces")
             span.end()
             opentelemetry.context.detach(span_token)  # type: ignore
             tracer.reset(_tracer_token)
             tp = cast(TracerProvider, get_tracer_provider())
-            tp.force_flush(timeout_millis=1000)  # don't block for too long
+            flush_successful = tp.force_flush(timeout_millis=1000)  # don't block for too long
+
+            if buffer_only:
+                # if we're in buffer_only mode, we couldn't even set up the tempo exporter
+                # because we're missing the necessary backend configuration, so attempting
+                # to flush the buffer doesn't make sense.
+                dev_logger.debug("tracing backend unavailable: all spans pushed to buffer")
+
+            else:
+                dev_logger.debug("tracing backend found: attempting to flush buffer...")
+
+                # if we do have an exporter for tempo, and we could send traces to it,
+                # we can attempt to flush the buffer as well.
+                if not flush_successful:
+                    logger.error("flushing FAILED: unable to push traces to backend.")
+                else:
+                    dev_logger.debug("flush succeeded.")
+
+                    # the backend has accepted the spans generated during this event.
+                    if not previous_spans_buffered:
+                        # if the buffer was empty to begin with, any spans we collected now can be discarded
+                        buffer.drop()
+                        dev_logger.debug("buffer dropped: this trace has been sent already")
+                    else:
+                        # if the buffer was nonempty, we can attempt to flush it
+                        dev_logger.debug("attempting buffer flush...")
+                        buffer_flush_successful = buffer.flush()
+                        if buffer_flush_successful:
+                            dev_logger.debug("buffer flush OK")
+                        elif buffer_flush_successful is None:
+                            # TODO is this even possible?
+                            dev_logger.debug("buffer flush OK; empty: nothing to flush")
+                        else:
+                            # this situation is pretty weird, I'm not even sure it can happen,
+                            # because it would mean that we did manage
+                            # to push traces directly to the tempo exporter (flush_successful),
+                            # but the buffer flush failed to push to the same exporter!
+                            logger.error("buffer flush FAILED")
+
             tp.shutdown()
             original_close()
 
@@ -558,6 +864,9 @@ def trace_charm(
     server_cert: Optional[str] = None,
     service_name: Optional[str] = None,
     extra_types: Sequence[type] = (),
+    buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH,
+    buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB,
+    buffer_path: Optional[Union[str, Path]] = None,
 ) -> Callable[[_T], _T]:
     """Autoinstrument the decorated charm with tracing telemetry.
 
@@ -599,6 +908,10 @@ def trace_charm(
         Defaults to the juju application name this charm is deployed under.
     :param extra_types: pass any number of types that you also wish to autoinstrument.
         For example, charm libs, relation endpoint wrappers, workload abstractions, ...
+    :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering.
+    :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped.
+        Minimum 10MiB.
+    :param buffer_path: path to buffer file to use for saving buffered spans.
     """
 
     def _decorator(charm_type: _T) -> _T:
@@ -609,6 +922,9 @@ def _decorator(charm_type: _T) -> _T:
             server_cert_attr=server_cert,
             service_name=service_name,
             extra_types=extra_types,
+            buffer_path=Path(buffer_path) if buffer_path else None,
+            buffer_max_size_mib=buffer_max_size_mib,
+            buffer_max_events=buffer_max_events,
         )
         return charm_type
 
@@ -621,6 +937,9 @@ def _autoinstrument(
     server_cert_attr: Optional[str] = None,
     service_name: Optional[str] = None,
     extra_types: Sequence[type] = (),
+    buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH,
+    buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB,
+    buffer_path: Optional[Path] = None,
 ) -> _T:
     """Set up tracing on this charm class.
 
@@ -653,13 +972,20 @@ def _autoinstrument(
         Defaults to the juju application name this charm is deployed under.
     :param extra_types: pass any number of types that you also wish to autoinstrument.
         For example, charm libs, relation endpoint wrappers, workload abstractions, ...
+    :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering.
+    :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped.
+        Minimum 10MiB.
+    :param buffer_path: path to buffer file to use for saving buffered spans.
     """
-    dev_logger.info(f"instrumenting {charm_type}")
+    dev_logger.debug(f"instrumenting {charm_type}")
     _setup_root_span_initializer(
         charm_type,
         tracing_endpoint_attr,
         server_cert_attr=server_cert_attr,
         service_name=service_name,
+        buffer_path=buffer_path,
+        buffer_max_events=buffer_max_events,
+        buffer_max_size_mib=buffer_max_size_mib,
     )
     trace_type(charm_type)
     for type_ in extra_types:
@@ -675,12 +1001,12 @@ def trace_type(cls: _T) -> _T:
     It assumes that this class is only instantiated after a charm type decorated with `@trace_charm`
     has been instantiated.
     """
-    dev_logger.info(f"instrumenting {cls}")
+    dev_logger.debug(f"instrumenting {cls}")
     for name, method in inspect.getmembers(cls, predicate=inspect.isfunction):
-        dev_logger.info(f"discovered {method}")
+        dev_logger.debug(f"discovered {method}")
 
         if method.__name__.startswith("__"):
-            dev_logger.info(f"skipping {method} (dunder)")
+            dev_logger.debug(f"skipping {method} (dunder)")
             continue
 
         # the span title in the general case should be:
@@ -726,7 +1052,7 @@ def trace_function(function: _F, name: Optional[str] = None) -> _F:
 
 
 def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F:
-    dev_logger.info(f"instrumenting {callable}")
+    dev_logger.debug(f"instrumenting {callable}")
 
     # sig = inspect.signature(callable)
     @functools.wraps(callable)
diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py
index da7fa95e..141412b0 100644
--- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py
+++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py
@@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 20
+LIBPATCH = 23
 
 PYDEPS = ["cryptography", "jsonschema"]
 
@@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
                     )
                 else:
                     try:
+                        secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
                         logger.debug(
                             "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}"
                         )
-                        secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
+                        # Juju < 3.6 will create a new revision even if the content is the same
+                        if (
+                            secret.get_content(refresh=True).get("certificate", "")
+                            == certificate.certificate
+                        ):
+                            logger.debug(
+                                "Secret %s with correct certificate already exists",
+                                f"{LIBID}-{csr_in_sha256_hex}",
+                            )
+                            continue
                         secret.set_content(
                             {"certificate": certificate.certificate, "csr": certificate.csr}
                         )
@@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None:
         provider_certificate = self._find_certificate_in_relation_data(csr)
         if not provider_certificate:
             # A secret expired but we did not find matching certificate. Cleaning up
+            logger.warning(
+                "Failed to find matching certificate for csr, cleaning up secret %s",
+                event.secret.label,
+            )
             event.secret.remove_all_revisions()
             return
 
         if not provider_certificate.expiry_time:
             # A secret expired but matching certificate is invalid. Cleaning up
+            logger.warning(
+                "Certificate matching csr is invalid, cleaning up secret %s",
+                event.secret.label,
+            )
             event.secret.remove_all_revisions()
             return
 
@@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti
             return provider_certificate
         return None
 
-    def _get_csr_from_secret(self, secret: Secret) -> str:
+    def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]:
         """Extract the CSR from the secret label or content.
 
         This function is a workaround to maintain backwards compatibility
         and fix the issue reported in
         https://github.com/canonical/tls-certificates-interface/issues/228
         """
-        if not (csr := secret.get_content().get("csr", "")):
+        try:
+            content = secret.get_content(refresh=True)
+        except SecretNotFoundError:
+            return None
+        if not (csr := content.get("csr", None)):
             # In versions <14 of the Lib we were storing the CSR in the label of the secret
             # The CSR is now stored in the content of the secret, which was a breaking change
             # Here we get the CSR if the secret was created by an app using libpatch 14 or lower
diff --git a/requirements.txt b/requirements.txt
index cfe46334..7b0d8ffc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,8 +11,7 @@ jsonschema
 # Code: https://github.com/canonical/operator/
 # Docs: https://ops.rtfd.io/
 # Deps: charm
-# pinned to 2.16 as 2.17 breaks our unittests
-ops == 2.16
+ops
 
 # YAML processing framework
 # Code: https://github.com/yaml/pyyaml
@@ -36,3 +35,6 @@ pydantic>=2
 
 # Deps: tracing
 opentelemetry-exporter-otlp-proto-http>=1.21.0
+
+# Deps: lib/charms/observability_libs/v0/kubernetes_compute_resources_patch.py
+tenacity
diff --git a/src/alertmanager.py b/src/alertmanager.py
index a0e37f3f..0e6e7243 100644
--- a/src/alertmanager.py
+++ b/src/alertmanager.py
@@ -84,6 +84,7 @@ def __init__(
         api_port: int,
         ha_port: int,
         web_external_url: str,
+        web_route_prefix: str,
         config_path: str,
         web_config_path: str,
         tls_enabled: Callable[[], bool],
@@ -103,6 +104,7 @@ def __init__(
         self._ha_port = ha_port
         self.api = Alertmanager(endpoint_url=web_external_url, cafile=cafile)
         self._web_external_url = web_external_url
+        self._web_route_prefix = web_route_prefix
         self._config_path = config_path
         self._web_config_path = web_config_path
         self._is_tls_enabled = tls_enabled
@@ -190,6 +192,7 @@ def _command():
                 f"--web.listen-address=:{self._api_port} "
                 f"--cluster.listen-address={listen_netloc_arg} "
                 f"--web.external-url={self._web_external_url} "
+                f"--web.route-prefix={self._web_route_prefix} "
                 f"{web_config_arg}"
                 f"{peer_cmd_args}"
             )
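+            # For illustration only (hypothetical URL): the assembled flags end up looking like
+            #   ... --web.external-url=http://am.example.com/mymodel-alertmanager --web.route-prefix=/ ...
+            # i.e. Alertmanager advertises the external URL but serves its API under the root route
+            # prefix, while the ingress strips the external path prefix before proxying (strip_prefix=True).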
diff --git a/src/charm.py b/src/charm.py
index 0f03ed0b..e1ed6662 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -106,7 +106,7 @@ def __init__(self, *args):
         self.ingress = IngressPerAppRequirer(
             self,
             port=self.api_port,
-            scheme=lambda: urlparse(self._internal_url).scheme,
+            scheme=self._scheme,
             strip_prefix=True,
             redirect_https=True,
         )
@@ -142,7 +142,8 @@ def __init__(self, *args):
             resource_reqs_func=self._resource_reqs_from_config,
         )
         self.framework.observe(
-            self.resources_patch.on.patch_failed, self._on_k8s_patch_failed  # pyright: ignore
+            self.resources_patch.on.patch_failed,
+            self._on_k8s_patch_failed,  # pyright: ignore
         )
 
         # Self-monitoring
@@ -178,7 +179,8 @@ def __init__(self, *args):
             peer_netlocs=peer_ha_netlocs,
             api_port=self.api_port,
             ha_port=self._ports.ha,
-            web_external_url=self._internal_url,
+            web_external_url=self._external_url,
+            web_route_prefix="/",
             config_path=self._config_path,
             web_config_path=self._web_config_path,
             tls_enabled=self._is_tls_ready,
@@ -210,10 +212,12 @@ def __init__(self, *args):
 
         # Action events
         self.framework.observe(
-            self.on.show_config_action, self._on_show_config_action  # pyright: ignore
+            self.on.show_config_action,
+            self._on_show_config_action,  # pyright: ignore
         )
         self.framework.observe(
-            self.on.check_config_action, self._on_check_config  # pyright: ignore
+            self.on.check_config_action,
+            self._on_check_config,  # pyright: ignore
         )
 
     def set_ports(self):
@@ -404,7 +408,7 @@ def _render_manifest(self) -> ConfigFileSystemState:
                 self._templates_path: config_suite.templates,
                 self._amtool_config_path: config_suite.amtool,
                 self._server_cert_path: self.server_cert.server_cert,
-                self._key_path: self.server_cert.private_key if self.server_cert.enabled else None,
+                self._key_path: self.server_cert.private_key,
                 self._ca_cert_path: self.server_cert.ca_cert,
             }
         )
@@ -420,6 +424,9 @@ def _common_exit_hook(self, update_ca_certs: bool = False) -> None:
             self.unit.status = MaintenanceStatus("Waiting for pod startup to complete")
             return
 
+        if update_ca_certs or (self.server_cert.available and not self._certs_on_disk):
+            self._update_ca_certs()
+
         # Make sure the external url is valid
         if external_url := self._external_url:
             parsed = urlparse(external_url)
@@ -440,9 +447,7 @@ def _common_exit_hook(self, update_ca_certs: bool = False) -> None:
         #  - https://github.com/canonical/prometheus-k8s-operator/issues/530,
         self.alertmanager_provider.update(external_url=self._internal_url)
 
-        self.ingress.provide_ingress_requirements(
-            scheme=urlparse(self._internal_url).scheme, port=self.api_port
-        )
+        self.ingress.provide_ingress_requirements(scheme=self._scheme, port=self.api_port)
         self._scraping.update_scrape_job_spec(self.self_scraping_job)
 
         if self.peer_relation:
@@ -460,9 +465,6 @@ def _common_exit_hook(self, update_ca_certs: bool = False) -> None:
             self.unit.status = BlockedStatus(str(e))
             return
 
-        if update_ca_certs:
-            self._update_ca_certs()
-
         # Update pebble layer
         self.alertmanager_workload.update_layer()
 
@@ -571,8 +573,9 @@ def _get_peer_hostnames(self, include_this_unit=True) -> List[str]:
 
         return hostnames
 
-    def _is_tls_ready(self) -> bool:
-        """Returns True if the workload is ready to operate in TLS mode."""
+    @property
+    def _certs_on_disk(self) -> bool:
+        """Check if the TLS setup is ready on disk."""
         return (
             self.container.can_connect()
             and self.container.exists(self._server_cert_path)
@@ -580,6 +583,13 @@ def _is_tls_ready(self) -> bool:
             and self.container.exists(self._ca_cert_path)
         )
 
+    def _is_tls_ready(self) -> bool:
+        """Returns True if the workload is ready to operate in TLS mode."""
+        return self.server_cert.available and self._certs_on_disk
+
+    def _is_waiting_for_cert(self) -> bool:
+        return self.server_cert.enabled and not self.server_cert.available
+
     @property
     def _internal_url(self) -> str:
         """Return the fqdn dns-based in-cluster (private) address of the alertmanager api server."""
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 50c6d8b4..eca9f252 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -221,11 +221,13 @@ def setUp(self, *unused):
     @k8s_resource_multipatch
     @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0"))
     def test_show_config(self, *_unused):
-        tls_paths = {
+        conditional_tls_paths = {
             self.harness.charm._server_cert_path,
-            self.harness.charm._key_path,
             self.harness.charm._ca_cert_path,
         }
+        unconditional_paths = {
+            self.harness.charm._key_path,
+        }
 
         # GIVEN an isolated charm (see setUp, decorator)
         # WHEN the "show-config" action runs
@@ -237,14 +239,16 @@ def test_show_config(self, *_unused):
         # AND configs DOES NOT contain cert-related entries
         # results.configs is a list of dicts, [{"path": ..., "content": ...}, {...}, ...].
         paths_rendered = {d["path"] for d in yaml.safe_load(results["configs"])}
-        for filepath in tls_paths:
+        for filepath in conditional_tls_paths:
             self.assertNotIn(filepath, paths_rendered)
+        for filepath in unconditional_paths:
+            self.assertIn(filepath, paths_rendered)
 
         # AND GIVEN a tls relation is in place
         rel_id = self.harness.add_relation("certificates", "ca")
         self.harness.add_relation_unit(rel_id, "ca/0")
         # AND cert files are on disk
-        for filepath in tls_paths:
+        for filepath in conditional_tls_paths:
             self.harness.model.unit.get_container("alertmanager").push(
                 filepath, "test", make_dirs=True
             )
@@ -257,5 +261,7 @@ def test_show_config(self, *_unused):
 
         # AND configs contains cert-related entries
         paths_rendered = {d["path"] for d in yaml.safe_load(results["configs"])}
-        for filepath in tls_paths:
+        for filepath in conditional_tls_paths:
+            self.assertIn(filepath, paths_rendered)
+        for filepath in unconditional_paths:
             self.assertIn(filepath, paths_rendered)
diff --git a/tests/unit/test_push_config_to_workload_on_startup.py b/tests/unit/test_push_config_to_workload_on_startup.py
index af4142cb..3475a7d5 100644
--- a/tests/unit/test_push_config_to_workload_on_startup.py
+++ b/tests/unit/test_push_config_to_workload_on_startup.py
@@ -24,12 +24,14 @@
 
 
 @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("0.0.0", ""))
+@patch("subprocess.run")
 class TestPushConfigToWorkloadOnStartup(unittest.TestCase):
     """Feature: Push config to workload on startup.
 
     Background: Charm starts up with initial hooks.
     """
 
+    @patch("subprocess.run")
     @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("0.0.0", ""))
     @k8s_resource_multipatch
     @patch("lightkube.core.client.GenericSyncClient")
@@ -45,7 +47,7 @@ def setUp(self, *_):
         self.harness.begin_with_initial_hooks()
 
     @given(st.booleans())
-    def test_single_unit_cluster(self, is_leader):
+    def test_single_unit_cluster(self, is_leader, _):
         """Scenario: Current unit is the only unit present."""
         # WHEN only one unit is
         self.assertEqual(self.harness.model.app.planned_units(), 1)
@@ -100,13 +102,14 @@ def test_multi_unit_cluster(self, *_):
         )
         self.assertIn("--cluster.peer=", command)
 
-    def test_charm_blocks_on_connection_error(self):
+    def test_charm_blocks_on_connection_error(self, *_):
         self.assertIsInstance(self.harness.charm.unit.status, ActiveStatus)
         self.harness.set_can_connect(CONTAINER_NAME, False)
         self.harness.update_config({"templates_file": "doesn't matter"})
         self.assertNotIsInstance(self.harness.charm.unit.status, ActiveStatus)
 
 
+@patch("subprocess.run")
 class TestInvalidConfig(unittest.TestCase):
     """Feature: Charm must block when invalid config is provided.
 
@@ -118,6 +121,8 @@ def setUp(self):
         self.harness = Harness(AlertmanagerCharm)
         self.addCleanup(self.harness.cleanup)
 
+        self.harness.handle_exec("alertmanager", ["update-ca-certificates", "--fresh"], result="")
+
     @k8s_resource_multipatch
     @patch("lightkube.core.client.GenericSyncClient")
     @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0"))
diff --git a/tests/unit/test_remote_configuration_requirer.py b/tests/unit/test_remote_configuration_requirer.py
index 97741806..5aa2ed2f 100644
--- a/tests/unit/test_remote_configuration_requirer.py
+++ b/tests/unit/test_remote_configuration_requirer.py
@@ -5,7 +5,7 @@
 import logging
 import unittest
 from typing import cast
-from unittest.mock import Mock, patch
+from unittest.mock import patch
 
 import yaml
 from charms.alertmanager_k8s.v0.alertmanager_remote_configuration import (
@@ -41,16 +41,25 @@
 """
 
 
+@patch("subprocess.run")
 class TestAlertmanagerRemoteConfigurationRequirer(unittest.TestCase):
+    @patch("subprocess.run")
     @patch("lightkube.core.client.GenericSyncClient")
     @patch.object(AlertmanagerCharm, "_update_ca_certs", lambda *a, **kw: None)
     @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", ""))
     @k8s_resource_multipatch
-    def setUp(self, _) -> None:
+    def setUp(self, *_) -> None:
         self.harness = testing.Harness(AlertmanagerCharm)
         self.addCleanup(self.harness.cleanup)
         self.harness.set_leader(True)
 
+        self.harness.handle_exec("alertmanager", ["update-ca-certificates", "--fresh"], result="")
+        self.harness.handle_exec(
+            "alertmanager",
+            [WorkloadManager._amtool_path, "check-config", AlertmanagerCharm._config_path],
+            result="",
+        )
+
         # TODO: Once we're on ops 2.0.0+ this can be removed as begin_with_initial_hooks()
         # now does it.
         self.harness.set_can_connect("alertmanager", True)
@@ -70,14 +79,11 @@ def setUp(self, _) -> None:
         )
         self.harness.add_relation_unit(self.relation_id, "remote-config-provider/0")
 
-    @patch("ops.model.Container.exec")
     @k8s_resource_multipatch
     def test_valid_config_pushed_to_relation_data_bag_updates_alertmanager_config(
-        self, patched_exec
+        self,
+        *_,
     ):
-        patched_exec_mock = Mock()
-        patched_exec_mock.wait_output.return_value = ("whatever", "")
-        patched_exec.return_value = patched_exec_mock
         expected_config = remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG)
         # add juju topology to "group_by"
         route = cast(dict, expected_config.get("route", {}))
@@ -104,6 +110,7 @@ def test_valid_config_pushed_to_relation_data_bag_updates_alertmanager_config(
     @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", ""))
     def test_configs_available_from_both_relation_data_bag_and_charm_config_block_charm(
         self,
+        *_,
     ):
         sample_remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG)
         self.harness.update_relation_data(
@@ -117,15 +124,12 @@ def test_configs_available_from_both_relation_data_bag_and_charm_config_block_ch
             self.harness.charm.unit.status, BlockedStatus("Multiple configs detected")
         )
 
-    @patch("ops.model.Container.exec")
     @patch("config_builder.default_config", yaml.safe_load(TEST_ALERTMANAGER_DEFAULT_CONFIG))
     @k8s_resource_multipatch
     def test_invalid_config_pushed_to_the_relation_data_bag_does_not_update_alertmanager_config(
-        self, patched_exec
+        self,
+        *_,
     ):
-        patched_exec_mock = Mock()
-        patched_exec_mock.wait_output.return_value = ("whatever", "")
-        patched_exec.return_value = patched_exec_mock
         invalid_config = yaml.safe_load("some: invalid_config")
 
         self.harness.update_relation_data(
@@ -141,6 +145,7 @@ def test_invalid_config_pushed_to_the_relation_data_bag_does_not_update_alertman
     @k8s_resource_multipatch
     def test_templates_pushed_to_relation_data_bag_are_saved_to_templates_file_in_alertmanager(
         self,
+        *_,
     ):
         sample_remote_config = yaml.safe_load(TEST_ALERTMANAGER_REMOTE_CONFIG)
         test_template = '{{define "myTemplate"}}do something{{end}}'
diff --git a/tox.ini b/tox.ini
index 2d744f55..66da8f20 100644
--- a/tox.ini
+++ b/tox.ini
@@ -124,6 +124,8 @@ deps =
     numpy
     # https://github.com/juju/python-libjuju/issues/1025
     juju<=3.3.0,>=3.0
+    # https://github.com/juju/python-libjuju/issues/1184
+    websockets<14
     pytest
     pytest-operator
     pytest-httpserver