Kf/visualizer (#72)
- A more robust implementation of the corpus processing step in the generic pipeline, which outputs triplets, triplet statistics, entity and predicate mappings, graph data, and a static graph.
- A revamped approach to clustering and labeling of entities and predicates.
- A visualization tool.
KasperFyhn authored May 28, 2024
1 parent ac16326 commit 6cc2b53
Showing 39 changed files with 17,749 additions and 499 deletions.
1 change: 1 addition & 0 deletions config/eschatology.toml
@@ -12,6 +12,7 @@ text_column = "body"
 [docprocessing]
 enabled = true
 batch_size = 5
+prefer_gpu_for_coref = false
 
 [corpusprocessing]
 enabled = true
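
For context, a minimal sketch of how a pipeline step might read the new flag, assuming the file is parsed with Python's standard tomllib; the lookup and fallback default are illustrative assumptions, not the project's actual config loader:

import tomllib  # standard library in Python 3.11+; older versions can use the tomli backport

with open("config/eschatology.toml", "rb") as f:
    config = tomllib.load(f)

# Hypothetical lookup; the key comes from the diff above, the default is an assumption.
prefer_gpu = config["docprocessing"].get("prefer_gpu_for_coref", False)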
5 changes: 4 additions & 1 deletion paper/extract_triplets_newspapers.py
@@ -7,8 +7,11 @@
 from transformers import AutoTokenizer
 import argparse
 
+from conspiracies.preprocessing.wordpiece_length_normalization import (
+    wordpiece_length_normalization,
+)
+
 # Conspiracies
-from conspiracies.preproc import wordpiece_length_normalization
 from extract_utils import load_ndjson, write_txt
 
 
153 changes: 0 additions & 153 deletions paper/network_creation.py

This file was deleted.

2 changes: 1 addition & 1 deletion paper/src/context_threads.py
@@ -210,7 +210,7 @@ def range_of_dates(start_date, extra_days):
     # write contests to ndjson
     with open(
         os.path.join(
-            f"tweet_threads_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date() }_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date() + datetime.timedelta(days=args.extra_days-1)}.ndjson",
+            f"tweet_threads_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date()}_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date() + datetime.timedelta(days=args.extra_days-1)}.ndjson",
        ),
        "w",
    ) as f:
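
The one-character fix above removes a stray space before the closing brace of the f-string, which previously leaked into the generated filename. A small self-contained illustration of the corrected expression, with arbitrary example values standing in for args.start_date and args.extra_days:

import datetime

start = datetime.datetime.strptime("2020-03-01", "%Y-%m-%d").date()
end = start + datetime.timedelta(days=7 - 1)  # extra_days - 1
print(f"tweet_threads_{start}_{end}.ndjson")
# tweet_threads_2020-03-01_2020-03-07.ndjson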
1 change: 1 addition & 0 deletions pyproject.toml
@@ -76,6 +76,7 @@ docs = [
     "sphinx_design>=0.3.0,<0.3.1",
     "myst-nb>=0.6.0,<1.17.0",
     "sphinx_togglebutton>=0.2.3,<0.4.0",
+    "ipython==8.1.0"
 ]
 tutorials = [
     "jupyter>=1.0.0,<1.1.0"
6 changes: 5 additions & 1 deletion src/conspiracies/common/fileutils.py
@@ -1,8 +1,12 @@
 import glob
 import logging
+from pathlib import Path
+from typing import Union
 
 
-def iter_lines_of_files(glob_pattern: str):
+def iter_lines_of_files(glob_pattern: Union[str, Path]):
+    if isinstance(glob_pattern, Path):
+        glob_pattern = glob_pattern.as_posix()
     files = glob.glob(glob_pattern, recursive=True)
     logging.debug(
         "The glob pattern '%s' resulted in the following files: %s",
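
With this change, callers can pass either a str or a pathlib.Path. A brief usage sketch, assuming (as the function name suggests) that the generator yields individual lines from the matched files; the glob pattern and directory layout are made up:

from pathlib import Path

# Equivalent calls after this change; Path.as_posix() yields the same pattern string.
for line in iter_lines_of_files("output/**/*.ndjson"):
    print(line)
for line in iter_lines_of_files(Path("output/**/*.ndjson")):
    print(line)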
89 changes: 89 additions & 0 deletions src/conspiracies/corpusprocessing/aggregation.py
@@ -0,0 +1,89 @@
from collections import Counter
from typing import List, TypedDict, Dict, Optional, Union, Callable, Iterable, Tuple

from pydantic import BaseModel

from conspiracies.corpusprocessing.clustering import Mappings
from conspiracies.corpusprocessing.triplet import Triplet


def min_max_normalizer(values: Iterable[Union[int, float]]) -> Callable[[float], float]:
    if not isinstance(values, list):
        values = list(values)
    min_value = min(values)
    max_value = max(values)
    if min_value == max_value:
        return lambda x: 0.0
    return lambda x: (x - min_value) / (max_value - min_value)


class StatsEntry(TypedDict):
    key: Union[str, Tuple[str, ...]]
    frequency: int
    norm_frequency: float


class StatsDict(Dict[str, StatsEntry]):
    @classmethod
    def from_counter(cls, counter: Counter):
        normalizer = min_max_normalizer(counter.values())
        return cls(
            {
                key: StatsEntry(
                    key=key,
                    frequency=value,
                    norm_frequency=normalizer(value),
                )
                for key, value in counter.items()
            },
        )


class TripletStats(BaseModel):
    triplets: StatsDict
    entities: StatsDict
    predicates: StatsDict

    def entries(self) -> Dict[str, List[StatsEntry]]:
        return {
            statsdict: list(getattr(self, statsdict).values())
            for statsdict in ("triplets", "entities", "predicates")
        }


class TripletAggregator:
    def __init__(self, mappings: Optional[Mappings] = None):
        self._mappings = mappings

    def aggregate(
        self,
        triplets: List[Triplet],
        remove_identical_subj_and_obj: bool = True,
    ):
        mapped_triplets = [
            (
                self._mappings.map_entity(triplet.subject.text),
                self._mappings.map_predicate(triplet.predicate.text),
                self._mappings.map_entity(triplet.object.text),
            )
            for triplet in triplets
        ]
        if remove_identical_subj_and_obj:
            mapped_triplets = [t for t in mapped_triplets if t[0] != t[2]]
        triplet_counts = Counter(mapped_triplets)
        entity_counts = Counter(
            entity for triplet in mapped_triplets for entity in (triplet[0], triplet[2])
        )
        predicate_counts = Counter(triplet[1] for triplet in mapped_triplets)
        return TripletStats(
            triplets=StatsDict.from_counter(triplet_counts),
            entities=StatsDict.from_counter(entity_counts),
            predicates=StatsDict.from_counter(predicate_counts),
        )
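
A short usage sketch of the building blocks above with toy counts (the entity names are made up): StatsDict.from_counter attaches a min-max-normalized frequency to every key of a Counter.

from collections import Counter

counts = Counter({"climate": 5, "vaccine": 3, "5g": 1})
stats = StatsDict.from_counter(counts)
print(stats["climate"])  # {'key': 'climate', 'frequency': 5, 'norm_frequency': 1.0}
print(stats["5g"])       # {'key': '5g', 'frequency': 1, 'norm_frequency': 0.0}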
