Merge branch 'master' into main-2491-fix-oomkill-graphs-for-slack
itisallgood authored Nov 18, 2024
2 parents 325b5c9 + a58fee3 commit d10f645
Showing 6 changed files with 97 additions and 9 deletions.
43 changes: 42 additions & 1 deletion docs/configuration/ai-analysis.rst
@@ -223,7 +223,7 @@ To use HolmesGPT with the Robusta UI, one further step may be necessary, dependi
* If you store the Robusta UI token in a Kubernetes secret, follow the instructions below.

Note: the same Robusta UI token is used for the Robusta UI sink and for HolmesGPT.

Reading the Robusta UI Token from a secret in HolmesGPT
************************************************************

@@ -428,3 +428,44 @@ Finally, after updating your ``generated_values.yaml``, apply the changes to you

helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>

This will update the deployment to use the custom Docker image, which includes the new binaries. The ``toolsets`` defined in the configuration will now be available for Holmes to use, including any new binaries like ``jq``.


Adding Permissions for Additional Resources
----------------------------------------------

There are scenarios where HolmesGPT may require access to additional Kubernetes resources or CRDs to perform specific analyses or interact with external tools.

You will need to extend its ClusterRole rules whenever HolmesGPT needs to access resources that are not included in its default configuration.

Common Scenarios for Adding Permissions:

* External integrations and CRDs: when HolmesGPT needs to access custom resources in your cluster, such as Argo CD ``Application`` resources or Istio ``VirtualService`` resources.
* Additional built-in Kubernetes resources: by default, Holmes can access only a limited set of built-in resources; for example, it has no access to Kubernetes Secrets. You can grant it access to additional built-in cluster resources when that is useful for your use case, as shown in the sketch below.
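
A minimal sketch of such a rule, granting Holmes read-only access to built-in ``NetworkPolicy`` resources (a hypothetical example; adjust ``apiGroups``, ``resources``, and ``verbs`` to the resources you actually need):

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      customClusterRoleRules:
        - apiGroups: ["networking.k8s.io"]   # API group of the built-in resource
          resources: ["networkpolicies"]     # lowercase plural resource name
          verbs: ["get", "list", "watch"]    # read-only access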

As an example, let's consider a case where we ask HolmesGPT to analyze the state of Argo CD applications and projects to troubleshoot issues with application deployments managed by Argo CD, but it does not have access to the relevant CRDs.

**Steps to Add Permissions for Argo CD:**

1. **Update generated_values.yaml with Required Permissions:**

Add the following configuration to your ``generated_values.yaml``, defining the required rules under ``holmes.customClusterRoleRules``:

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      customClusterRoleRules:
        - apiGroups: ["argoproj.io"]
          resources: ["applications", "appprojects"]
          verbs: ["get", "list", "watch"]

2. **Apply the Configuration:**

Deploy the updated configuration using Helm:

.. code-block:: bash

    helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>

This will grant HolmesGPT the necessary permissions to analyze Argo CD applications and projects. Now you can ask HolmesGPT questions like "What is the current status of all Argo CD applications in the cluster?" and it will be able to answer.
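
To combine several integrations, list multiple rules under ``customClusterRoleRules``. A sketch that keeps the Argo CD access above and adds Istio's ``VirtualService`` CRD (Istio's networking CRDs are served from the ``networking.istio.io`` API group):

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      customClusterRoleRules:
        - apiGroups: ["argoproj.io"]
          resources: ["applications", "appprojects"]
          verbs: ["get", "list", "watch"]
        - apiGroups: ["networking.istio.io"]
          resources: ["virtualservices"]
          verbs: ["get", "list", "watch"]
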
4 changes: 2 additions & 2 deletions playbooks/robusta_playbooks/prometheus_simulation.py
@@ -219,10 +219,10 @@ def alertmanager_alert(event: ExecutionBaseEvent, action_params: AlertmanagerAle

try:
requests.post(
f"{alertmanager_url}/api/v1/alerts",
f"{alertmanager_url}/api/v2/alerts",
data=json.dumps(alerts),
headers=headers,
)
).raise_for_status()
except Exception:
logging.exception(f"Failed to create alertmanager alerts {alerts}")
raise ActionException(ErrorCodes.ALERT_MANAGER_REQUEST_FAILED)
7 changes: 5 additions & 2 deletions src/robusta/core/discovery/discovery.py
@@ -169,7 +169,6 @@ def __create_service_info(
)

@staticmethod

def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSet, Pod, ReplicaSet]) -> ServiceInfo:
return Discovery.__create_service_info_from_hikaru(
obj.metadata,
@@ -187,7 +186,7 @@ def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSe
def discovery_process() -> DiscoveryResults:
create_monkey_patches()
Discovery.stacktrace_thread_active = True
threading.Thread(target=Discovery.stack_dump_on_signal).start()
threading.Thread(target=Discovery.stack_dump_on_signal, daemon=True).start()
pods_metadata: List[V1ObjectMeta] = []
node_requests = defaultdict(list) # map between node name, to request of pods running on it
active_services: List[ServiceInfo] = []
@@ -247,6 +246,10 @@ def discovery_process() -> DiscoveryResults:
for role_binding in role_bindings.items:
ns = role_binding.metadata.namespace

if not role_binding.subjects:
logging.info(f"Skipping role binding: {role_binding.metadata.name} in ns: {role_binding.metadata.namespace}")
continue

for subject in role_binding.subjects:
if subject.kind == "Group":
groupname_to_namespaces[subject.name].append(ns)
6 changes: 3 additions & 3 deletions src/robusta/core/model/openshift_group.py
@@ -1,4 +1,4 @@
from typing import Dict
from typing import Dict, Optional

from pydantic import BaseModel

@@ -9,8 +9,8 @@ class OpenshiftGroup(BaseModel):
namespace: str = ""
users: list[str] = []
namespaces: list[str] = []
labels: Dict[str, str]
annotations: Dict[str, str]
labels: Optional[Dict[str, str]] = None
annotations: Optional[Dict[str, str]] = None
deleted: bool = False

def get_service_key(self) -> str:
2 changes: 1 addition & 1 deletion src/robusta/core/sinks/robusta/robusta_sink.py
@@ -331,8 +331,8 @@ def __discover_resources(self) -> DiscoveryResults:

self.__pods_running_count = results.pods_running_count

self.__assert_openshift_groups_cache_initialized()
if results.openshift_groups:
self.__assert_openshift_groups_cache_initialized()
self.__publish_new_openshift_groups(results.openshift_groups)

# save the cached services for the resolver.
44 changes: 44 additions & 0 deletions tests/discovery/test_discovery.py
@@ -0,0 +1,44 @@
import signal
from concurrent.futures import ProcessPoolExecutor
from contextlib import contextmanager
from http import HTTPStatus
from typing import Any, Generator, NoReturn
from unittest.mock import patch

import kubernetes
import pytest
from kubernetes.client.exceptions import ApiException

from robusta.core.discovery.discovery import Discovery


# pytest-timeout requires pytest>=7, https://github.com/pytest-dev/pytest-timeout/blob/main/setup.cfg
@contextmanager
def time_limit(seconds: int) -> Generator[None, Any, None]:
    def signal_handler(_signum: Any, _frame: Any) -> NoReturn:
        pytest.fail("Test took too much time...")

    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)


def _patch_worker() -> None:
    # Patch list_node inside the pool worker so discovery always fails with a server error.
    def _patched(self: Any, **_: Any) -> NoReturn:
        raise ApiException(HTTPStatus.INTERNAL_SERVER_ERROR, reason="Internal Server Error")

    kubernetes.client.CoreV1Api.list_node = _patched


def test_discovery_recovery_on_failure():
    with time_limit(20):
        patched_pool = ProcessPoolExecutor(1, initializer=_patch_worker)
        with patch.object(Discovery, "executor", new=patched_pool):
            with pytest.raises(ApiException):
                Discovery.discover_resources()

        # The failing pool should be shut down, while Discovery is left with a usable executor.
        assert patched_pool._shutdown_thread
        assert not Discovery.executor._shutdown_thread
