From f589853ca58e92cb57b06e90a040deac32f66a30 Mon Sep 17 00:00:00 2001
From: Caleigh Runge-Hottman <crungeho@redhat.com>
Date: Thu, 14 Mar 2024 11:34:50 -0400
Subject: [PATCH] Add fatal LoggingThread error backout strategy

In https://github.com/release-engineering/kobo/pull/169,
fatal LoggingThread errors were logged to the worker's local
log file before exiting.

In https://github.com/release-engineering/kobo/pull/248, a more
drastic measure was taken: all exceptions were indefinitely
retried and the ability to write exceptions to the worker's
local log file was removed. This approach could prevent the
LoggingThread from terminating when encountering a fatal error.

This commit combines the two approaches, backing out and exiting
only after determining we've identified a persistent fatal error.

The means by which we identify a fatal (vs a temporary/non-fatal)
LoggingThread error is by simply retrying the `upload_task_log`
method during a defined interval (defined by the LoggingThread's
`_timeout` attribute).

If the method continues to fail for the duration of the interval
(i.e., does not succeed by the time the timeout is reached), we
can consider the error to be fatal. At this point, we attempt to
instead write the error to the worker's local log file, and raise
the exception.

Note that the timeout can be toggled using the
`KOBO_LOGGING_THREAD_TIMEOUT` environment variable.
---
 kobo/worker/logger.py | 23 ++++++++++++++++++++++-
 tests/test_logger.py  | 27 +++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/kobo/worker/logger.py b/kobo/worker/logger.py
index f3ceb0b..46951ae 100644
--- a/kobo/worker/logger.py
+++ b/kobo/worker/logger.py
@@ -2,6 +2,7 @@
 
 import threading
 import time
+import os
 
 import six
 
@@ -30,6 +31,7 @@ def __init__(self, hub, task_id, *args, **kwargs):
         self._running = True
         self._send_time = 0
         self._send_data = b""
+        self._timeout = int(os.environ.get("KOBO_LOGGING_THREAD_TIMEOUT", 120))
 
     def read_queue(self):
         out = self._queue.get_nowait()
@@ -67,7 +69,26 @@ def run(self):
                 self._send_time = now
                 self._send_data = b""
             except Exception:
-                continue
+                # Any exception other than an XML-RPC fault may be fatal. It is
+                # possible that we've encountered a retryable error, such as a
+                # temporary network disruption between worker and hub. Attempt
+                # to retry for a bit.
+                if now - self._send_time <= self._timeout:
+                    continue
+
+                # If the timemout has been exceeded, we can assume we've
+                # encountered a non-temporary, fatal exception.
+                #
+                # Since upload_task_log is apparently not working, we can't get
+                # this into the task logs, but it should at least be possible
+                # for this to get into the worker's local log file.
+                if self._logger:
+                    msg = "\n".join([
+                        "Fatal error in LoggingThread",
+                        kobo.tback.Traceback().get_traceback(),
+                    ])
+                    self._logger.log_critical(msg)
+                raise
 
     def write(self, data):
         """Add data to the queue and set the event for sending queue content."""
diff --git a/tests/test_logger.py b/tests/test_logger.py
index d2a85e6..9ef6404 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -56,6 +56,33 @@ def test_upload_task_log_after_some_time(self):
         self.assertFalse(thread.is_alive())
         self.assertFalse(thread._running)
 
+    # Following test intentionally kills a thread with an exception.
+    @pytest.mark.filterwarnings("ignore::pytest.PytestUnhandledThreadExceptionWarning")
+    def test_logs_on_fatal_error(self):
+        # Set up a logger whose output we'll be able to inspect.
+        logs = StringIO()
+        logger = logging.getLogger('TestLoggingThread')
+        logger.addHandler(logging.StreamHandler(logs))
+        kobo_logger = LoggingBase(logger)
+
+        mock_hub = Mock()
+        mock_hub.upload_task_log.side_effect = RuntimeError("Simulated error")
+
+        thread = LoggingThread(mock_hub, 9999, logger=kobo_logger)
+        thread.daemon = True
+        thread.start()
+
+        thread.write('This is a log message!')
+        # Since we set up a fatal error, we expect the thread to die soon
+        # despite not calling stop().
+        thread.join(10.0)
+        self.assertFalse(thread.is_alive())
+
+        # Before dying, it should have written something useful to the logs.
+        captured = logs.getvalue()
+        self.assertIn('Fatal error in LoggingThread', captured)
+        self.assertIn('RuntimeError: Simulated error', captured)
+
     def test_logs_during_temporary_outage(self):
         # Messages written to the logging thread during a temporary
         # outage should be uploaded (and not discarded) after the hub