Skip to content

Commit

Permalink
Merge pull request #337 from TogetherCrew/fix/336-violation-detection…
Browse files Browse the repository at this point in the history
…-no-cursor-found

fix: decreased the cursor batch size to reduce the idle time of the mongo cursor!
  • Loading branch information
amindadgar authored Dec 8, 2024
2 parents bc758f7 + dc36229 commit 7bb0f83
Showing 1 changed file with 18 additions and 4 deletions.
22 changes: 18 additions & 4 deletions dags/violation_detection_helpers/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,31 @@


class TransformPlatformRawData:
def __init__(self) -> None:
def __init__(self, cursor_batch_size: int = 10) -> None:
"""
Set up the transformer that labels platform raw data for violations

Parameters
------------
cursor_batch_size : int
the pymongo cursor batch size, applied to the data cursor in `transform`
lowering it could increase the IO (more requests to mongo)
increasing it could reduce the IO but increase the idle time
of the mongo cursor
default is 10 documents per batch in each cursor
"""
# kept on the instance; `transform` passes it to the cursor's `batch_size`
self.batch_size = cursor_batch_size
# NOTE(review): presumably the model `transform` uses to label texts;
# its usage is outside the visible part of this diff, confirm there
self.classifier = Classifier()

def transform(
self,
raw_data: Cursor,
data_cursor: Cursor,
) -> list[dict]:
"""
transform a list of platform's `rawmemberactivities` by labeling them
Parameters
-------------
raw_data : Cursor
data_cursor : Cursor
the data cursor to be transformed
(using cursor for more efficiency of database)
the transformation here is to label the violation for texts
Expand All @@ -28,13 +40,15 @@ def transform(
labeled_data : list[dict]
the same data but with a label for violation detection
"""
data_cursor = data_cursor.hint({"$natural": 1}).batch_size(self.batch_size)

labeled_data = []

# caching label per source_id
# since we might have multiple document with same text
cached_label: dict[str, str] = {}

for record in raw_data:
for record in data_cursor:
try:
data = copy.deepcopy(record)

Expand Down

0 comments on commit 7bb0f83

Please sign in to comment.