Kf/visualizer (#72)
- A more robust implementation of the corpus processing step in the generic pipeline, which outputs triplets, triplet statistics, entity and predicate mappings, graph data, and a static graph.
- A revamped approach to clustering and labeling of entities and predicates.
- A visualization tool.
KasperFyhn authored May 28, 2024
1 parent ac16326 commit 6cc2b53
Showing 39 changed files with 17,749 additions and 499 deletions.
1 change: 1 addition & 0 deletions config/eschatology.toml
@@ -12,6 +12,7 @@ text_column = "body"
 [docprocessing]
 enabled = true
 batch_size = 5
+prefer_gpu_for_coref = false
 
 [corpusprocessing]
 enabled = true
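
For context, a minimal sketch of how a pipeline step might read the new flag, assuming the file is parsed with Python's standard tomllib; the lookup and fallback default are illustrative assumptions, not the project's actual config loader:

import tomllib  # standard library in Python 3.11+; older versions can use the tomli backport

with open("config/eschatology.toml", "rb") as f:
    config = tomllib.load(f)

# Hypothetical lookup; the key comes from the diff above, the default is an assumption.
prefer_gpu = config["docprocessing"].get("prefer_gpu_for_coref", False)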
5 changes: 4 additions & 1 deletion paper/extract_triplets_newspapers.py
@@ -7,8 +7,11 @@
 from transformers import AutoTokenizer
 import argparse
 
+from conspiracies.preprocessing.wordpiece_length_normalization import (
+    wordpiece_length_normalization,
+)
+
 # Conspiracies
-from conspiracies.preproc import wordpiece_length_normalization
 from extract_utils import load_ndjson, write_txt
 
 
153 changes: 0 additions & 153 deletions paper/network_creation.py

This file was deleted.

2 changes: 1 addition & 1 deletion paper/src/context_threads.py
@@ -210,7 +210,7 @@ def range_of_dates(start_date, extra_days):
     # write contests to ndjson
     with open(
         os.path.join(
-            f"tweet_threads_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date() }_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date() + datetime.timedelta(days=args.extra_days-1)}.ndjson",
+            f"tweet_threads_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date()}_{datetime.datetime.strptime(args.start_date, '%Y-%m-%d').date() + datetime.timedelta(days=args.extra_days-1)}.ndjson",
        ),
        "w",
    ) as f:
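
The one-character fix above removes a stray space before the closing brace of the f-string, which previously leaked into the generated filename. A small self-contained illustration of the corrected expression, with arbitrary example values standing in for args.start_date and args.extra_days:

import datetime

start = datetime.datetime.strptime("2020-03-01", "%Y-%m-%d").date()
end = start + datetime.timedelta(days=7 - 1)  # extra_days - 1
print(f"tweet_threads_{start}_{end}.ndjson")
# tweet_threads_2020-03-01_2020-03-07.ndjson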
1 change: 1 addition & 0 deletions pyproject.toml
@@ -76,6 +76,7 @@ docs = [
     "sphinx_design>=0.3.0,<0.3.1",
     "myst-nb>=0.6.0,<1.17.0",
     "sphinx_togglebutton>=0.2.3,<0.4.0",
+    "ipython==8.1.0"
 ]
 tutorials = [
     "jupyter>=1.0.0,<1.1.0"
6 changes: 5 additions & 1 deletion src/conspiracies/common/fileutils.py
@@ -1,8 +1,12 @@
 import glob
 import logging
+from pathlib import Path
+from typing import Union
 
 
-def iter_lines_of_files(glob_pattern: str):
+def iter_lines_of_files(glob_pattern: Union[str, Path]):
+    if isinstance(glob_pattern, Path):
+        glob_pattern = glob_pattern.as_posix()
     files = glob.glob(glob_pattern, recursive=True)
     logging.debug(
         "The glob pattern '%s' resulted in the following files: %s",
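
With this change, callers can pass either a str or a pathlib.Path. A brief usage sketch, assuming (as the function name suggests) that the generator yields individual lines from the matched files; the glob pattern and directory layout are made up:

from pathlib import Path

# Equivalent calls after this change; Path.as_posix() yields the same pattern string.
for line in iter_lines_of_files("output/**/*.ndjson"):
    print(line)
for line in iter_lines_of_files(Path("output/**/*.ndjson")):
    print(line)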
89 changes: 89 additions & 0 deletions src/conspiracies/corpusprocessing/aggregation.py
@@ -0,0 +1,89 @@
from collections import Counter
from typing import List, TypedDict, Dict, Optional, Union, Callable, Iterable, Tuple

from pydantic import BaseModel

from conspiracies.corpusprocessing.clustering import Mappings
from conspiracies.corpusprocessing.triplet import Triplet


def min_max_normalizer(values: Iterable[Union[int, float]]) -> Callable[[float], float]:
    if not isinstance(values, list):
        values = list(values)
    min_value = min(values)
    max_value = max(values)
    if min_value == max_value:
        return lambda x: 0.0
    return lambda x: (x - min_value) / (max_value - min_value)


class StatsEntry(TypedDict):
    key: Union[str, Tuple[str, ...]]
    frequency: int
    norm_frequency: float


class StatsDict(Dict[str, StatsEntry]):
    @classmethod
    def from_counter(cls, counter: Counter):
        normalizer = min_max_normalizer(counter.values())
        return cls(
            {
                key: StatsEntry(
                    key=key,
                    frequency=value,
                    norm_frequency=normalizer(value),
                )
                for key, value in counter.items()
            },
        )


class TripletStats(BaseModel):
    triplets: StatsDict
    entities: StatsDict
    predicates: StatsDict

    def entries(self) -> Dict[str, List[StatsEntry]]:
        return {
            statsdict: list(getattr(self, statsdict).values())
            for statsdict in ("triplets", "entities", "predicates")
        }


class TripletAggregator:
    def __init__(self, mappings: Optional[Mappings] = None):
        self._mappings = mappings

    def aggregate(
        self,
        triplets: List[Triplet],
        remove_identical_subj_and_obj: bool = True,
    ):
        mapped_triplets = [
            (
                self._mappings.map_entity(triplet.subject.text),
                self._mappings.map_predicate(triplet.predicate.text),
                self._mappings.map_entity(triplet.object.text),
            )
            for triplet in triplets
        ]
        if remove_identical_subj_and_obj:
            mapped_triplets = [t for t in mapped_triplets if t[0] != t[2]]
        triplet_counts = Counter(mapped_triplets)
        entity_counts = Counter(
            entity for triplet in mapped_triplets for entity in (triplet[0], triplet[2])
        )
        predicate_counts = Counter(triplet[1] for triplet in mapped_triplets)
        return TripletStats(
            triplets=StatsDict.from_counter(triplet_counts),
            entities=StatsDict.from_counter(entity_counts),
            predicates=StatsDict.from_counter(predicate_counts),
        )
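
A short usage sketch of the building blocks above with toy counts (the entity names are made up): StatsDict.from_counter attaches a min-max-normalized frequency to every key of a Counter.

from collections import Counter

counts = Counter({"climate": 5, "vaccine": 3, "5g": 1})
stats = StatsDict.from_counter(counts)
print(stats["climate"])  # {'key': 'climate', 'frequency': 5, 'norm_frequency': 1.0}
print(stats["5g"])       # {'key': '5g', 'frequency': 1, 'norm_frequency': 0.0}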
