Skip to content

Commit

Permalink
Merge pull request #282 from TogetherCrew/feat/281-violation-detection-process-all-categories
Browse files Browse the repository at this point in the history

feat: processing all messages if empty resources array was given!
  • Loading branch information
amindadgar authored Sep 11, 2024
2 parents 4248e5c + e0094af commit 6851638
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 7 deletions.
16 changes: 9 additions & 7 deletions dags/violation_detection_helpers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def extract(
if `None`, no filtering would be applied to it
resources : list[str]
a list of resources to extract data from
if empty, process all messages
recompute : bool
if `False`, extract the non-labeled data after the latest ones
if `True`, extract `rawmemberactivities` data from the given date
Expand Down Expand Up @@ -98,13 +99,14 @@ def extract(
},
}

cursor = self.client[self.platform_id]["rawmemberactivities"].find(
{
**date_query,
"text": {"$ne": None},
self.resource_name: {"$in": resources},
}
)
query = {**date_query, "text": {"$ne": None}}

# Add resource filter only if resources are not empty
if resources:
query[self.resource_name] = {"$in": resources}

cursor = self.client[self.platform_id]["rawmemberactivities"].find(query)

return cursor, override_recompute

def _find_latest_labeled(self, label_field: str = "vdLabel") -> datetime | None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,60 @@ def test_extract_all_resources(self):
self.assertEqual(results, sample_data)
self.assertTrue(override_recompute)

def test_extract_all_resources_given_empty_resource(self):
    """Extracting with an empty ``resources`` list must return every message.

    Two activities with different ``category_id`` values are stored, then
    ``extract`` is called with ``resources=[]``. Both documents are expected
    back, and the returned ``override_recompute`` flag is expected to be true.
    """

    def build_activity(author_id, source_id, text, category_id):
        # Each call returns fresh dict/list objects so the two documents
        # never share nested state.
        return {
            "author_id": author_id,
            "date": datetime(2022, 1, 1),
            "source_id": source_id,
            "text": text,
            "metadata": {
                "topic_id": None,
                "category_id": category_id,
            },
            "actions": [
                {
                    "name": "message",
                    "type": "emitter",
                }
            ],
            "interactions": [],
        }

    sample_data = [
        build_activity("1", "8888", "test_test", "34567"),
        build_activity("2", "8880", "test_test 2", "34569"),
    ]
    collection = self.client[self.platform_id]["rawmemberactivities"]
    collection.insert_many(sample_data)

    extractor = ExtractPlatformRawData(self.platform_id, "category_id")
    cursor, override_recompute = extractor.extract(
        from_date=datetime(2020, 1, 1),
        to_date=None,
        resources=[],  # empty on purpose: no resource filter should apply
        recompute=False,
    )
    fetched = list(cursor)

    self.assertEqual(
        len(fetched), 2, "We need to fetch all data if no resource was given"
    )
    self.assertEqual(fetched, sample_data)
    self.assertTrue(override_recompute)

def test_extract_some_resources(self):
sample_data = [
{
Expand Down

0 comments on commit 6851638

Please sign in to comment.