Skip to content

Commit

Permalink
Merge pull request #1139 from SEKOIA-IO/fix/tehtris_remove_duplicates
Browse files Browse the repository at this point in the history
Fix: Tehtris remove duplicates from result (257)
  • Loading branch information
vg-svitla authored Oct 24, 2024
2 parents cc7a19d + fda2ea6 commit 924f8da
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 7 deletions.
6 changes: 6 additions & 0 deletions Tehtris/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

## 2024-10-24 - 1.15.2

### Fixed

- Remove duplicates from results when fetching new events from the API

## 2024-05-30 - 1.15.1

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion Tehtris/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"name": "TEHTRIS",
"uuid": "1528d749-d353-4e38-ab1b-6e01d7595569",
"slug": "tehtris",
"version": "1.15.1",
"version": "1.15.2",
"categories": [
"Endpoint"
]
Expand Down
16 changes: 13 additions & 3 deletions Tehtris/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Tehtris/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ python = ">=3.10,<3.12"
sekoia-automation-sdk = "^1.13.0"
orjson = "^3.7.7"
python-dateutil = "^2.8.2"
cachetools = "^5.4.0"

[tool.poetry.dev-dependencies]
pytest = "*"
Expand Down
2 changes: 1 addition & 1 deletion Tehtris/tehtris_modules/metrics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from prometheus_client import Counter, Histogram, Gauge
from prometheus_client import Counter, Gauge, Histogram

# Declare google prometheus metrics
prom_namespace_tehtris = "symphony_module_tehtris"
Expand Down
25 changes: 23 additions & 2 deletions Tehtris/tehtris_modules/trigger_tehtris_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
from collections.abc import Generator
from datetime import datetime, timedelta, timezone
from functools import cached_property
from typing import Any

import orjson
from cachetools import Cache, LRUCache
from dateutil.parser import isoparse
from sekoia_automation.connector import Connector, DefaultConnectorConfiguration

Expand Down Expand Up @@ -31,6 +33,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.from_date = datetime.now(timezone.utc) - timedelta(minutes=1)
self.fetch_events_limit = 100
self.events_cache: Cache = LRUCache(maxsize=1000) # TODO: is it enough to have 1000 event ids in cache?

@cached_property
def client(self):
Expand Down Expand Up @@ -77,15 +80,33 @@ def __fetch_next_events(self, from_date: datetime, offset: int):
)
return events

def _remove_duplicates(self, events: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""
Remove duplicates events from the fetched events and update the cache with new ids.
Args:
events: list[dict[str, Any]]
Returns:
list[dict[str, Any]]:
"""
result = [event for event in events if event["uid"] not in self.events_cache]
self.events_cache.update({event["uid"]: None for event in result})

return result

def fetch_events(self) -> Generator[list, None, None]:
has_more_message = True
most_recent_date_seen = self.from_date
offset = 0

while has_more_message:
# fetch events from the current context
next_events = self.__fetch_next_events(self.from_date, offset)
INCOMING_MESSAGES.labels(intake_key=self.configuration.intake_key).inc(len(next_events))
fetched_events = self.__fetch_next_events(self.from_date, offset)
INCOMING_MESSAGES.labels(intake_key=self.configuration.intake_key).inc(len(fetched_events))

# remove duplicates events from previous fetch
next_events = self._remove_duplicates(fetched_events)

# if the number of fetched events equals the limit, additional events are remaining
has_more_message = len(next_events) == self.fetch_events_limit
Expand Down
86 changes: 86 additions & 0 deletions Tehtris/tests/test_tehtris_event_trigger.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime, timezone
from typing import Any
from unittest.mock import MagicMock, patch

import pytest
Expand Down Expand Up @@ -37,6 +38,40 @@ def trigger(symphony_storage, patch_datetime_now):
yield trigger


def message(event_id: int) -> dict[str, Any]:
    """
    Build a sample TEHTRIS XDR event payload for tests.

    Both ``id`` and the trailing part of ``uid`` are derived from
    *event_id*, so distinct ids yield distinct events while every other
    field stays fixed.
    """
    # flake8: noqa
    return {
        "rflId": 1,
        "time": "2022-10-19T12:00:00.163407+00:00",
        "lvl": 5,
        "module": "das",
        "eventName": "HeuristicAlert",
        "ipSrc": "1.2.3.4",
        "ipDst": "5.6.7.8",
        "egKBId": 110000031301810,
        "description": "Suspect spawn tree detected\n─ (Example\\doe-j) C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe (24644)\n── (Example\\doe-j) C:\\Windows\\System32\\cmd.exe (24876)\n\nNo remediation taken",
        "os_release__": "11",
        "pid": 24876,
        "domain__": "example.org",
        "os_version__": "10.0.22621",
        "cmdline": 'C:\\WINDOWS\\system32\\cmd.exe /d /c "C:\\Users\\doe-j\\AppData\\Local\\Programs\\IT Hit\\IT Hit Edit Doc Opener Host 5\\NativeHost.exe" chrome-extension://mdfaonmaoigngflemfmkboffllkopopm/ --parent-window=0 < \\\\.\\pipe\\LOCAL\\edge.nativeMessaging.in.c7c2f388b0eb2f77 > \\\\.\\pipe\\LOCAL\\edge.nativeMessaging.out.c7c2f388b0eb2f77',
        "username": "Example\\doe-j",
        "pCreateDatetime": "2022-10-19T12:00:00.098346+00:00",
        "location": "",
        "os_server__": False,
        "sha256": "01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b",
        "ppid": 24644,
        "uuid__": "3be682e9-5568-4dbf-8e2d-5b36159945da",
        "path": "C:\\Windows\\System32\\cmd.exe",
        "tag": "YBE_PDT_WIN",
        "uid": f"{event_id};windows;HOST01;example.org",
        "os__": "windows",
        "os_architecture__": "x86_64",
        "hostname__": "HOST01",
        "id": event_id,
    }


@pytest.fixture
def message1():
# flake8: noqa
Expand Down Expand Up @@ -120,6 +155,57 @@ def test_fetch_events(trigger, message1, message2):
assert next(trigger.fetch_events()) == [message1, message2]


def test_fetch_events_without_duplicates(trigger, message1, message2):
    """Events already seen in a previous fetch must not be forwarded again."""
    with requests_mock.Mocker() as mock:
        batch_one = [message(event_id) for event_id in (1, 2, 3)]
        batch_two = [message(event_id) for event_id in (2, 3, 4, 5)]

        mock.get(
            "https://abc.api.tehtris.net/api/xdr/v1/event",
            status_code=200,
            json=batch_one,
        )

        # First poll: every event is new, so all three come back.
        first_result = next(trigger.fetch_events())
        assert [event["id"] for event in first_result] == [1, 2, 3]
        assert [event["uid"] for event in first_result] == [
            "1;windows;HOST01;example.org",
            "2;windows;HOST01;example.org",
            "3;windows;HOST01;example.org",
        ]

        # The uids of the forwarded events are now recorded in the cache.
        assert trigger.events_cache == {
            "1;windows;HOST01;example.org": None,
            "2;windows;HOST01;example.org": None,
            "3;windows;HOST01;example.org": None,
        }

        # Second poll overlaps the first on events 2 and 3.
        mock.get(
            "https://abc.api.tehtris.net/api/xdr/v1/event",
            status_code=200,
            json=batch_two,
        )

        second_result = next(trigger.fetch_events())

        # Only the genuinely new events (4 and 5) survive deduplication.
        assert [event["id"] for event in second_result] == [4, 5]
        assert [event["uid"] for event in second_result] == [
            "4;windows;HOST01;example.org",
            "5;windows;HOST01;example.org",
        ]
        assert len(second_result) == 2


def test_fetch_events_pagination(trigger, message1, message2):
first_batch = [message1] * 100
second_batch = [message2] * 25
Expand Down

0 comments on commit 924f8da

Please sign in to comment.