Skip to content

Commit

Permalink
Merge pull request #254 from TogetherCrew/feat/245-cache-source-id
Browse files Browse the repository at this point in the history
feat: Added a caching mechanism using python code!
  • Loading branch information
amindadgar authored Aug 8, 2024
2 parents 0f83b7d + 5ec69a4 commit 72b168f
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion dags/violation_detection_helpers/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,22 @@ def transform(
the same data but with a label for violation detection
"""
labeled_data = []

# cache the label per source_id,
# since we might have multiple documents with the same text
cached_label: dict[str, str] = {}

for record in raw_data:
try:
data = copy.deepcopy(record)

text = record["text"]
label = self.classifier.classify(text)

# classify only if this source_id has not been labeled before
if data["source_id"] not in cached_label:
cached_label[data["source_id"]] = self.classifier.classify(text)

label = cached_label[data["source_id"]]

data.setdefault("metadata", {})
data["metadata"]["vdLabel"] = label
Expand Down

0 comments on commit 72b168f

Please sign in to comment.