diff --git a/Tehtris/CHANGELOG.md b/Tehtris/CHANGELOG.md index 313a53942..82d6f2151 100644 --- a/Tehtris/CHANGELOG.md +++ b/Tehtris/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## 2024-05-30 - 1.15.2 + +### Fixed + +- Remove duplicates from results when fetching new events from the API + ## 2024-05-30 - 1.15.1 ### Fixed diff --git a/Tehtris/manifest.json b/Tehtris/manifest.json index 89af6e23e..87fa57c6b 100644 @@ -29,7 +29,7 @@ "name": "TEHTRIS", "uuid": "1528d749-d353-4e38-ab1b-6e01d7595569", "slug": "tehtris", - "version": "1.15.1", + "version": "1.15.2", "categories": [ "Endpoint" ] diff --git a/Tehtris/poetry.lock b/Tehtris/poetry.lock index 6b83d8d24..9996b549f 100644 @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "arrow" @@ -163,6 +163,17 @@ dev = ["CacheControl[filecache,redis]", "black", "build", "cherrypy", "furo", "m filecache = ["filelock (>=3.8.0)"] redis = ["redis (>=2.10.5)"] +[[package]] +name = "cachetools" +version = "5.5.0" +description = "Extensible memoizing collections and decorators" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, +] + [[package]] name = "certifi" version = "2024.2.2" @@ -1407,7 +1418,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2107,4 +2117,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.12" -content-hash = "889a157f2e2a5c14a9d79c84f0247dbec007fa7c982c609d3704197955785465" +content-hash = 
"1629d0af5beb32ec74ca50751b79a26d416f34f847ed84471a968cfd330df932" diff --git a/Tehtris/pyproject.toml b/Tehtris/pyproject.toml index 8fcb0ee0e..4080ddaf4 100644 --- a/Tehtris/pyproject.toml +++ b/Tehtris/pyproject.toml @@ -9,6 +9,7 @@ python = ">=3.10,<3.12" sekoia-automation-sdk = "^1.13.0" orjson = "^3.7.7" python-dateutil = "^2.8.2" +cachetools = "^5.4.0" [tool.poetry.dev-dependencies] pytest = "*" diff --git a/Tehtris/tehtris_modules/metrics.py b/Tehtris/tehtris_modules/metrics.py index 4e48cf316..5f7115ca6 100644 --- a/Tehtris/tehtris_modules/metrics.py +++ b/Tehtris/tehtris_modules/metrics.py @@ -1,4 +1,4 @@ -from prometheus_client import Counter, Histogram, Gauge +from prometheus_client import Counter, Gauge, Histogram # Declare google prometheus metrics prom_namespace_tehtris = "symphony_module_tehtris" diff --git a/Tehtris/tehtris_modules/trigger_tehtris_events.py b/Tehtris/tehtris_modules/trigger_tehtris_events.py index 13837fbbd..a7403c273 100644 --- a/Tehtris/tehtris_modules/trigger_tehtris_events.py +++ b/Tehtris/tehtris_modules/trigger_tehtris_events.py @@ -2,8 +2,10 @@ from collections.abc import Generator from datetime import datetime, timedelta, timezone from functools import cached_property +from typing import Any import orjson +from cachetools import Cache, LRUCache from dateutil.parser import isoparse from sekoia_automation.connector import Connector, DefaultConnectorConfiguration @@ -31,6 +33,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.from_date = datetime.now(timezone.utc) - timedelta(minutes=1) self.fetch_events_limit = 100 + self.events_cache: Cache = LRUCache(maxsize=1000) # TODO: is it enough to have 1000 event ids in cache? 
@cached_property def client(self): @@ -77,6 +80,21 @@ def __fetch_next_events(self, from_date: datetime, offset: int): ) return events + def _remove_duplicates(self, events: list[dict[str, Any]]) -> list[dict[str, Any]]: + """ + Remove duplicate events from the fetched events and update the cache with new ids. + + Args: + events: list[dict[str, Any]] + + Returns: + list[dict[str, Any]]: + """ + result = [event for event in events if event["uid"] not in self.events_cache] + self.events_cache.update({event["uid"]: None for event in result}) + + return result + def fetch_events(self) -> Generator[list, None, None]: has_more_message = True most_recent_date_seen = self.from_date @@ -84,8 +102,11 @@ while has_more_message: # fetch events from the current context - next_events = self.__fetch_next_events(self.from_date, offset) - INCOMING_MESSAGES.labels(intake_key=self.configuration.intake_key).inc(len(next_events)) + fetched_events = self.__fetch_next_events(self.from_date, offset) + INCOMING_MESSAGES.labels(intake_key=self.configuration.intake_key).inc(len(fetched_events)) + + # remove duplicate events from previous fetch + next_events = self._remove_duplicates(fetched_events) # if the number of fetched events equals the limit, additional events are remaining has_more_message = len(next_events) == self.fetch_events_limit diff --git a/Tehtris/tests/test_tehtris_event_trigger.py b/Tehtris/tests/test_tehtris_event_trigger.py index 03386115f..efd779811 100644 --- a/Tehtris/tests/test_tehtris_event_trigger.py +++ b/Tehtris/tests/test_tehtris_event_trigger.py @@ -1,4 +1,5 @@ from datetime import datetime, timezone +from typing import Any from unittest.mock import MagicMock, patch import pytest @@ -37,6 +38,40 @@ def trigger(symphony_storage, patch_datetime_now): yield trigger +def message(event_id: int) -> dict[str, Any]: + # flake8: noqa + return { + "rflId": 1, + "time": "2022-10-19T12:00:00.163407+00:00", + "lvl": 5, 
+ "module": "das", + "eventName": "HeuristicAlert", + "ipSrc": "1.2.3.4", + "ipDst": "5.6.7.8", + "egKBId": 110000031301810, + "description": "Suspect spawn tree detected\nā”€ (Example\\doe-j) C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe (24644)\nā”€ā”€ (Example\\doe-j) C:\\Windows\\System32\\cmd.exe (24876)\n\nNo remediation taken", + "os_release__": "11", + "pid": 24876, + "domain__": "example.org", + "os_version__": "10.0.22621", + "cmdline": 'C:\\WINDOWS\\system32\\cmd.exe /d /c "C:\\Users\\doe-j\\AppData\\Local\\Programs\\IT Hit\\IT Hit Edit Doc Opener Host 5\\NativeHost.exe" chrome-extension://mdfaonmaoigngflemfmkboffllkopopm/ --parent-window=0 < \\\\.\\pipe\\LOCAL\\edge.nativeMessaging.in.c7c2f388b0eb2f77 > \\\\.\\pipe\\LOCAL\\edge.nativeMessaging.out.c7c2f388b0eb2f77', + "username": "Example\\doe-j", + "pCreateDatetime": "2022-10-19T12:00:00.098346+00:00", + "location": "", + "os_server__": False, + "sha256": "01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b", + "ppid": 24644, + "uuid__": "3be682e9-5568-4dbf-8e2d-5b36159945da", + "path": "C:\\Windows\\System32\\cmd.exe", + "tag": "YBE_PDT_WIN", + "uid": f"{event_id};windows;HOST01;example.org", + "os__": "windows", + "os_architecture__": "x86_64", + "hostname__": "HOST01", + "id": event_id, + } + + @pytest.fixture def message1(): # flake8: noqa @@ -120,6 +155,57 @@ def test_fetch_events(trigger, message1, message2): assert next(trigger.fetch_events()) == [message1, message2] +def test_fetch_events_without_duplicates(trigger, message1, message2): + with requests_mock.Mocker() as mock: + first_batch = [ + message(1), + message(2), + message(3), + ] + + second_batch = [ + message(2), + message(3), + message(4), + message(5), + ] + + mock.get( + "https://abc.api.tehtris.net/api/xdr/v1/event", + status_code=200, + json=first_batch, + ) + + result_first = next(trigger.fetch_events()) + assert [event["id"] for event in result_first] == [1, 2, 3] + assert [event["uid"] for event 
in result_first] == [ + "1;windows;HOST01;example.org", + "2;windows;HOST01;example.org", + "3;windows;HOST01;example.org", + ] + + assert trigger.events_cache == { + "1;windows;HOST01;example.org": None, + "2;windows;HOST01;example.org": None, + "3;windows;HOST01;example.org": None, + } + + mock.get( + "https://abc.api.tehtris.net/api/xdr/v1/event", + status_code=200, + json=second_batch, + ) + + result_second = next(trigger.fetch_events()) + + assert [event["id"] for event in result_second] == [4, 5] + assert [event["uid"] for event in result_second] == [ + "4;windows;HOST01;example.org", + "5;windows;HOST01;example.org", + ] + assert len(result_second) == 2 + + def test_fetch_events_pagination(trigger, message1, message2): first_batch = [message1] * 100 second_batch = [message2] * 25