From 268f036b13b9490c4627d4a13957b1efe727bc74 Mon Sep 17 00:00:00 2001 From: stinenyhus Date: Fri, 14 Apr 2023 09:17:59 +0200 Subject: [PATCH 1/9] embedding, clustering, visualization --- paper/network_creation.py | 343 ++++++++++++++++++++++++ paper/umap_hdb.py | 538 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 881 insertions(+) create mode 100644 paper/network_creation.py create mode 100644 paper/umap_hdb.py diff --git a/paper/network_creation.py b/paper/network_creation.py new file mode 100644 index 0000000..859f9ef --- /dev/null +++ b/paper/network_creation.py @@ -0,0 +1,343 @@ +import networkx as nx +import json +from collections import Counter +import matplotlib.pyplot as plt +import os +import numpy as np +import typing +from typing import Dict, Tuple, Optional +from networkx.drawing.layout import ( + fruchterman_reingold_layout, + spring_layout, + kamada_kawai_layout, +) + + +def load_json(file_path: str) -> dict: + """Loads a json file. + + Args: + file_path (str): path to the json file + + Returns: + data (dict): the data from the file as a dictionary + """ + with open(file_path, mode="r", encoding="utf8") as f: + data = json.load(f) + return data + + +def most_frequent_tuples( + node_dict: Dict[int, tuple], + n: int, + hard_filter: bool = False, +) -> Dict[int, tuple]: + """Find the tuples that contain one of the n most frequent elements. + + Args: + node_dict (dict): dictionary with node ids as keys and node pairs as values + n (int): The number of most frequent elements to keep + + Returns: + result (dict): filtered dictionary with node ids as keys and node pairs as values + """ + element_counter: typing.Counter = Counter() + # Count the frequency for all elements and identify the n most frequent + for i, tup in node_dict.items(): + for string in tup: + element_counter[string] += 1 + most_common_strings = set(string for string, _ in element_counter.most_common(n)) + highest_freq = element_counter.most_common(1)[0][1] + + # Only keep the nodes that contain one of the most frequent elements + result = {} + for i, tup in node_dict.items(): + if any(string in most_common_strings for string in tup): + fr = sum([element_counter[tup[0]], element_counter[tup[1]]]) + if hard_filter and fr >= highest_freq: + result[i] = tup + else: + result[i] = tup + return result + + +def get_nodes_edges( + event: str, + file: str, + remove_self_edges: bool = True, + n_most_frequent: int = 10, + hard_filter: bool = False, +) -> Tuple[Dict[int, tuple], Dict[int, tuple]]: + """Loads nodes and edges from a json file, removes self edges and only + keeps the n most frequent nodes. + + Args: + event (str): the name of the event + file (str): the specific file to load. Must be placed in a folder with the event name + remove_self_edges (bool, optional): Whether or not to remove self edges. Defaults to True. + n_most_frequent (int, optional): The number of most frequent elements to include. Defaults to 10. 
+ + Returns: + Tuple[Dict[int, tuple], Dict[int, tuple]]: a dictionary with the most frequent nodes and + a dictionary with the associated edges + """ + nodes_edges = load_json( + os.path.join(event, file), + ) + nodes = {i: tuple(edge) for i, edge in enumerate(nodes_edges["nodes"])} + edges = {i: edge for i, edge in enumerate(nodes_edges["edges"])} + + # Removing self edges and only keeping the most frequent nodes + if remove_self_edges: + nodes = {i: (e1, e2) for i, (e1, e2) in nodes.items() if e1 != e2} + most_frequent_nodes = most_frequent_tuples(nodes, n_most_frequent, hard_filter) + associated_edges = { + i: edge for i, edge in edges.items() if i in most_frequent_nodes.keys() + } + return most_frequent_nodes, associated_edges + + +def quantile_min_value(lst, quantile): + q = np.quantile(lst, quantile) + return min(filter(lambda x: x >= q, lst)) + + +def create_network_graph( + node_list, + edge_list, + title: str = "Narrative network graph", + layout=fruchterman_reingold_layout, + k: float = 0.3, + node_size_mult: float = 4, + fontsize: int = 12, + color: str = "#2a89d6", + plot_coordinates: bool = False, + seed: Optional[int] = None, + save=False, +): + G = nx.Graph() + G.add_edges_from(list(node_list.values())) + c = Counter(list(node_list.values())) # edge weights = frequency of edge + for u, v, d in G.edges(data=True): + # Make the graph undirected - for some reason, the tuples are sometimes reversed in the edge list + d["weight"] = c[(u, v)] + c[(v, u)] + + edge_label_weight_cutoff = quantile_min_value(list(c.values()), 0.90) + edges_to_draw = {} + for n, nodes in node_list.items(): + if c[nodes] >= edge_label_weight_cutoff: + edges_to_draw[nodes] = edge_list[n] + + if layout == kamada_kawai_layout: + # element_counter: typing.Counter = Counter() + # for tup in node_list.values(): + # for string in tup: + # element_counter[string] += 1 + # most_common = element_counter.most_common(5) + + # value_dict = {k:100 for k,_ in most_common[1:]} + # dist = {most_common[0][1]:value_dict} + # pos = layout(G, dist=dist) + pos = layout(G, scale=2) + else: + pos = layout(G, k=k, seed=seed) + + degrees = nx.degree(G) + + plt.figure(figsize=(10, 10)) + plt.title(title, color="#115691") + nx.draw( + G, + pos, + node_size=[ + k[1] ** node_size_mult if k[1] ** node_size_mult < 1000 else 1000 + for k in degrees + ], + node_color=color, + edge_color=color + "80", + width=[d["weight"] ** 2 for _, _, d in G.edges(data=True)], + ) + nx.draw_networkx_edge_labels( + G, + pos, + edge_labels=edges_to_draw, + font_size=fontsize - 2, + label_pos=0.5, + bbox=dict( + facecolor="white", + edgecolor=color, + alpha=0.8, + boxstyle="round,pad=0.2", + ), + ) + + offset = 0.015 + for node, (x, y) in pos.items(): + h_align = "center" + v_align = "center" + if x < 0: + x -= offset + h_align = "right" + if x > 0: + x += offset + h_align = "left" + if y < 0: + y -= offset + v_align = "top" + if y > 0: + y += offset + v_align = "bottom" + if plot_coordinates: + label = f"{node} ({x:.2f}, {y:.2f})" + else: + label = node + plt.text( + x, + y, + label, + fontsize=fontsize, + color="#404040", + ha=h_align, + va=v_align, + bbox=dict( + facecolor="white", + edgecolor="#404040", + alpha=0.8, + boxstyle="round,pad=0.1", + ), + ) + if save: + plt.savefig(f"{save}.png", format="PNG") + return G + + +# Twitter + +twitter_week_1_nodes, twitter_week_1_edges = get_nodes_edges( + "extracted_triplets_tweets/covid_week_1", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", + hard_filter=True, +) + +twitter_week_1_graph = 
create_network_graph( + twitter_week_1_nodes, + twitter_week_1_edges, + title="Covid-19 lockdown week 1 - Twitter", + layout=spring_layout, + # layout=kamada_kawai_layout, + k=2.5, + node_size_mult=2.5, + fontsize=11, + save="fig/twitter_week_1_graph", +) + +# News papers + +news_week_1_nodes, news_week_1_edges = get_nodes_edges( + "extracted_triplets_papers/covid_week_1", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", +) + +news_week_2_nodes, news_week_2_edges = get_nodes_edges( + "extracted_triplets_papers/covid_week_2", + # "paraphrase_nodes_edges.json", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", +) + + +news_week_1_graph = create_network_graph( + news_week_1_nodes, + news_week_1_edges, + title="Covid-19 lockdown week 1 - Newspapers", + k=1.5, + node_size_mult=2.5, + fontsize=10, + save="fig/news_week_1_graph", +) + +news_week_2_graph = create_network_graph( + news_week_2_nodes, + news_week_2_edges, + title="Week 2 of the COVID-19 lockdown", + # layout=spring_layout, + # layout=kamada_kawai_layout, + k=2.5, + node_size_mult=2.5, + fontsize=10, + save="fig/news_week_2_graph", +) + + +### Using danskbert instead +# week_1_dansk_nodes, week_1_dansk_edges = get_nodes_edges( +# "covid_week_1", +# "danskBERT_nodes_edges.json", +# ) + +# week_2_dansk_nodes, week_2_dansk_edges = get_nodes_edges( +# "covid_week_2", +# "danskBERT_nodes_edges.json", +# ) + +### Testing different hyperparameters + +week_1_nodes_15, week_1_edges_15 = get_nodes_edges( + "covid_week_1", + "paraphrase_dim=40_neigh=15_clust=15_samp=5_nodes_edges.json", +) +week_1_nodes_10, week_1_edges_10 = get_nodes_edges( + "covid_week_1", + "paraphrase_dim=40_neigh=15_clust=10_samp=5_nodes_edges.json", +) +week_1_nodes_8, week_1_edges_8 = get_nodes_edges( + "covid_week_1", + "paraphrase_dim=40_neigh=15_clust=8_samp=5_nodes_edges.json", +) +week_1_nodes_5, week_1_edges_5 = get_nodes_edges( + "covid_week_1", + "paraphrase_dim=40_neigh=15_clust=5_samp=5_nodes_edges.json", +) +week_1_para_graph_15 = create_network_graph( + week_1_nodes_15, + week_1_edges_15, + k=1, + node_size_mult=2.5, + fontsize=10, +) +week_1_para_graph_10 = create_network_graph( + week_1_nodes_10, + week_1_edges_10, + k=1, + node_size_mult=2.5, + fontsize=10, +) +week_1_para_graph_8 = create_network_graph( + week_1_nodes_8, + week_1_edges_8, + k=1, + node_size_mult=2.5, + fontsize=10, +) +week_1_para_graph_5 = create_network_graph( + week_1_nodes_5, + week_1_edges_5, + k=1, + node_size_mult=2.5, + fontsize=10, +) + + +# week_1_dansk_graph = create_network_graph( +# week_1_dansk_nodes, +# week_1_dansk_edges, +# k=0.6, +# node_size_mult=2.5, +# fontsize=11, +# ) +# week_2_dansk_graph = create_network_graph( +# week_2_dansk_nodes, +# week_2_dansk_edges, +# k=0.6, +# node_size_mult=2.5, +# fontsize=11, +# ) diff --git a/paper/umap_hdb.py b/paper/umap_hdb.py new file mode 100644 index 0000000..912d176 --- /dev/null +++ b/paper/umap_hdb.py @@ -0,0 +1,538 @@ +import json +from typing import Tuple, List, Dict, Optional, Union +import os +import spacy +from umap import UMAP +from hdbscan import HDBSCAN +from sentence_transformers import SentenceTransformer +from stop_words import get_stop_words +from sklearn.preprocessing import StandardScaler +from numpy import ndarray +from collections import Counter +import random +import argparse + + +def read_txt(path: str): + with open(path, mode="r", encoding="utf8") as f: + lines = f.read().splitlines() + return lines + + +def triplet_from_line(line: str) -> Union[Tuple[str, str, str], None]: + 
"""Converts a line from a txt file to a triplet. + + Lines that are not exactly three elements are ignored. + All elements in the triplet are stripped of whitespace and lowercased. + Args: + line (str): Line from a txt file + Returns: + triplet (tuple): Triplet + """ + as_list = line.split(", ") + if len(as_list) != 3: + return None + if line in ["Subject, Predicate, Object", "---, ---, ---"]: + return None + return tuple(map(str.strip, map(str.lower, as_list))) # type: ignore + + +def filter_triplets_with_stopwords( + triplets: List[Tuple[str, str, str]], + stopwords: List[str], + soft: bool = True, +) -> List[Tuple[str, str, str]]: + """Filters triplets that contain a stopword. + + Args: + triplets (List[Tuple[str, str, str]]): List of triplets. A triplet is a tuple of three strings. + stopwords (List[str]): List of stopwords + soft (bool): If True, only the subject and object are checked for stopwords. If False, the whole triplet is checked. + Returns: + filtered_triplets (List[Tuple[str, str, str]]): List of triplets without stopwords + """ + filtered_triplets = [] + if soft: + for triplet in triplets: + subject, predicate, obj = triplet + if subject not in stopwords and obj not in stopwords: + filtered_triplets.append(triplet) + else: + for triplet in triplets: + if not any(stopword in triplet for stopword in stopwords): + filtered_triplets.append(triplet) + return filtered_triplets + + +def load_triplets( + file_path: str, + soft_filtering: bool = True, + shuffle: bool = True, +) -> Tuple[list, list, list, list]: + """Loads triplets from a file and filters them. + + Args: + file_name (str): Name of the file to load triplets from + soft_filtering (bool): Whether to use soft filtering or not + shuffle (bool): Whether to shuffle the triplets or not + Returns: + subjects (list): List of subjects + predicates (list): List of predicates + objects (list): List of objects + filtered_triplets (list): List of filtered triplets + """ + triplets_list: List[Tuple[str, str, str]] = [] + data = read_txt(file_path) + triplets_list = [ + triplet_from_line(line) for line in data if triplet_from_line(line) # type: ignore + ] + filtered_triplets = filter_triplets_with_stopwords( + triplets_list, + get_stop_words("danish"), + soft=soft_filtering, + ) + + if shuffle: + random.shuffle(filtered_triplets) + + subjects = [ + triplet[0] + for triplet in filtered_triplets + if triplet[0] not in ["Subject", "---"] + ] + predicates = [ + triplet[1] + for triplet in filtered_triplets + if triplet[0] not in ["Predicate", "---"] + ] + objects = [ + triplet[2] + for triplet in filtered_triplets + if triplet[0] not in ["Object", "---"] + ] + return subjects, predicates, objects, filtered_triplets + + +def freq_of_most_frequent(list_of_strings: List[str]) -> Tuple[str, float]: + """Calculates the frequency of the most frequent element in a list of + strings. + + Frequency is measured as how much of the list the most frequent element takes up, percentage-wise. + Args: + list_of_strings (List[str]): List of strings to find the most frequent element in + Returns: + most_common_string (str), percentage (float): Frequency of the most frequent element, its percentage + """ + most_common = Counter(list_of_strings).most_common(1)[0] + most_common_string = most_common[0] + percentage = most_common[1] / len(list_of_strings) + return most_common_string, percentage + + +def most_frequent_token(list_of_strings: List[str], nlp) -> Tuple[str, float]: + """Finds the most frequent token in a list of strings. 
+
+    Args:
+        list_of_strings (List[str]): List of strings to find the most frequent token in
+        nlp (spacy.lang): Spacy language model
+    Returns:
+        most_common_token (str), percentage (float): The most frequent token and its share of all tokens
+    """
+    token_list = []
+    for string in list_of_strings:
+        doc = nlp(string)
+        for token in doc:
+            token_list.append(token.text)
+    most_common_token, percentage = freq_of_most_frequent(token_list)
+    return most_common_token, percentage
+
+
+def get_cluster_label(
+    cluster: List[Tuple[str, int]],
+    nlp,
+    first_cutoff: float = 0.8,
+    min_cluster_length: int = 10,
+    second_cutoff: float = 0.3,
+) -> Union[str, None]:
+    """Finds a label for a cluster, or None if the cluster should be removed.
+
+    Args:
+        cluster (List[Tuple[str, int]]): Cluster of (element, index) tuples
+        nlp (spacy.lang): Spacy language model
+        first_cutoff (float): Minimum share of the most frequent element.
+            Every cluster whose most frequent element is at least this frequent gets that element as its label.
+        min_cluster_length (int): Clusters smaller than this that fail first_cutoff are removed.
+        second_cutoff (float): Fallback cutoff for clusters that are large enough but fail first_cutoff.
+    Returns:
+        label (str or None): The cluster label according to the rules, or None if the cluster should be removed
+    """
+    cluster_strings = [element[0] for element in cluster]
+    most_common, percentage = freq_of_most_frequent(cluster_strings)
+
+    # If the most frequent element is frequent enough, use it as label
+    if percentage >= first_cutoff:
+        return most_common
+
+    else:
+        # If the cluster is not clearly defined and it is short,
+        # return None to indicate it should be removed
+        if len(cluster) < min_cluster_length:
+            return None
+
+        # Clusters that are large and relatively clearly defined
+        if percentage >= second_cutoff:
+            return most_common
+
+        # If the cluster is not clearly defined, find the most frequent token
+        most_common_token, percentage = most_frequent_token(cluster_strings, nlp)
+
+        # The most frequent token must be frequent enough to be used as label,
+        # otherwise return None to indicate it should be removed
+        if percentage >= 0.2:  # TODO: Make this a parameter
+            return most_common_token
+        else:
+            return None
+
+
+def cluster_dict(
+    topic_labels: ndarray,
+    input_list: List[str],
+) -> Dict[int, List[Tuple[str, int]]]:
+    """Collects the elements of each HDBSCAN cluster.
+
+    Args:
+        topic_labels (ndarray): Cluster label for each element; -1 marks noise
+        input_list (List[str]): The list of strings that were clustered
+    Returns:
+        clusters (Dict[int, List[Tuple[str, int]]]): Maps each cluster id to its (element, index) tuples
+    """
+    assert len(topic_labels) == len(
+        input_list,
+    ), "Length of topic labels and input list must be equal"
+    # Local renamed so it does not shadow this function's own name
+    clusters: Dict[int, list] = {i: [] for i in range(max(topic_labels) + 1)}
+    topic_tuples = zip(topic_labels, input_list)
+    for index, (topic_n, element) in enumerate(topic_tuples):
+        if topic_n != -1:
+            clusters[topic_n].append((element, index))
+    return clusters
+
+
+def label_clusters(
+    cluster_dict,
+    nlp,
+    first_cutoff: float = 0.8,
+    min_cluster_length: int = 10,
+    second_cutoff: float = 0.3,
+    predicates: bool = False,
+):
+    """Labels clusters according to the rules in `get_cluster_label`.
+    Args:
+        cluster_dict (Dict[int, List[Tuple[str, int]]]): Dictionary of clusters to label
+        nlp (spacy.lang): Spacy language model to use for tokenization of less defined clusters
+        first_cutoff (float): Minimum share of the most frequent element.
+            All clusters whose most frequent element is at least this frequent get that element as their label.
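+        predicates (bool): If True, min_cluster_length is overridden to 1 and
+            second_cutoff to 0.0, since predicate clusters should be kept regardless of size.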
+ min_cluster_length (int): Minimum number of elements in a cluster + second_cutoff (float): Minimum percentage of the most frequent element. + This cutoff is only used if the cluster has a less clearly defined label, but is still large enough. + + Returns: + dict_with_labels (Dict[str, Dict[str, List[str]]]): Dictionary of clusters with labels + The keys are the labels, the values are dictionaries with the keys + "cluster" (final elements in the cluster), and + "n_elements" (number of elements in the final cluster) + """ + if predicates: + min_cluster_length = 1 + second_cutoff = 0.0 + dict_with_labels: Dict[str, dict] = {} + # Get the label for each cluster + for cluster in cluster_dict.values(): + cluster_label = get_cluster_label( + cluster, + nlp, + first_cutoff, + min_cluster_length, + second_cutoff, + ) + if cluster_label: # If the cluster is not None + if ( + cluster_label in dict_with_labels.keys() + ): # If the label already exists, merge + dict_with_labels[cluster_label]["cluster"].extend(cluster) + dict_with_labels[cluster_label]["n_elements"] += len(cluster) + else: # If the label does not exist, create a new entry + dict_with_labels[cluster_label] = {} + dict_with_labels[cluster_label]["cluster"] = cluster + dict_with_labels[cluster_label]["n_elements"] = len(cluster) + + # Remove clusters that are too small even after merging identical clusters + clusters_to_keep = { + label: content + for label, content in dict_with_labels.items() + if content["n_elements"] > min_cluster_length + } + return clusters_to_keep + + +def embed_and_cluster( + list_to_embed: List[str], + embedding_model: str = "vesteinn/DanskBERT", + n_dimensions: int = 40, + n_neighbors: int = 15, + min_cluster_size: int = 5, + min_samples: int = 3, + min_topic_size: int = 10, + predicates: bool = False, +): + """Embeds and clusters a list of strings. 
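+
+    The pipeline is: SentenceTransformer embeddings -> StandardScaler ->
+    UMAP (dimensionality reduction) -> HDBSCAN (clustering) -> cluster
+    labeling and pruning via `label_clusters`.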
+
+    Args:
+        list_to_embed (List[str]): List of strings to embed and cluster
+        embedding_model (str): Name of the sentence-transformers model to embed with
+        n_dimensions (int): Number of dimensions to reduce the embedding space to
+        n_neighbors (int): Number of neighbors to use for UMAP
+        min_cluster_size (int): Minimum cluster size for HDBSCAN
+        min_samples (int): Minimum number of samples for HDBSCAN
+        min_topic_size (int): Minimum number of elements in a cluster
+        predicates (bool): Passed on to label_clusters; True when clustering predicates
+    Returns:
+        clusters (Dict[str, Dict[str, List[str]]]): Dictionary of clusters with labels
+            The keys are the labels, the values are dictionaries with the keys
+            "cluster" (final elements in the cluster), and
+            "n_elements" (number of elements in the final cluster)
+    """
+
+    # Bind the loaded model to its own name so the string argument is not shadowed
+    sentence_model = SentenceTransformer(embedding_model)
+
+    # Embed and reduce the embedding space
+    print("Embedding and reducing embedding space")
+    embeddings = sentence_model.encode(list_to_embed)  # type: ignore
+    scaled_embeddings = StandardScaler().fit_transform(embeddings)
+    reducer = UMAP(n_components=n_dimensions, n_neighbors=n_neighbors)
+    reduced_embeddings = reducer.fit_transform(scaled_embeddings)
+
+    # Cluster with HDBSCAN
+    print("Clustering")
+    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
+    hdbscan_model.fit(reduced_embeddings)
+    hdbscan_labels = hdbscan_model.labels_
+    assert len(hdbscan_labels) == len(
+        list_to_embed,
+    ), "Length of hdbscan labels and input list must be equal"
+    clusters = cluster_dict(hdbscan_labels, list_to_embed)
+
+    # Label and prune clusters
+    print("Labeling clusters")
+    nlp = spacy.load("da_core_news_sm")
+    labeled_clusters = label_clusters(
+        clusters,
+        nlp,
+        min_cluster_length=min_topic_size,
+        predicates=predicates,
+    )
+
+    return labeled_clusters
+
+
+def create_nodes_and_edges(
+    subj_obj_clusters: Dict[str, Dict[str, List[str]]],
+    predicate_clusters: Dict[str, Dict[str, List[str]]],
+    n_elements: int,
+    no_predicate_filler: str = "",
+    save: Optional[Union[bool, str]] = False,
+):
+    """Creates nodes and edges from clusters of subjects, objects and predicates.
+    Args:
+        subj_obj_clusters (Dict[str, Dict[str, List[str]]]): Dictionary of clusters with labels
+            The keys are the labels, the values are dictionaries with the keys
+            "cluster" (final elements in the cluster), and
+            "n_elements" (number of elements in the final cluster)
+        predicate_clusters (Dict[str, Dict[str, List[str]]]): Dictionary of clusters with labels
+            The keys are the labels, the values are dictionaries with the keys
+            "cluster" (final elements in the cluster), and
+            "n_elements" (number of elements in the final cluster)
+        no_predicate_filler (str): String to use as filler for predicates that do not have a cluster
+        save (Optional[Union[bool, str]]): If a string, nodes and edges are saved to a json file.
+            If False, does not save.
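+        n_elements (int): Number of triplets. In the combined subject+object clustering,
+            an index below n_elements is a subject and an index i >= n_elements is object i - n_elements.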
+
+    Returns:
+        nodes, edges (Tuple[List[Tuple[str, str]], List[str]]): nodes and edges for the graph
+    """
+
+    labelled_subjects = {i: "" for i in range(0, n_elements)}
+    labelled_objects = {i: "" for i in range(0, n_elements)}
+
+    # Distribute the cluster labels over subject and object positions
+    for label, content in subj_obj_clusters.items():
+        cluster = content["cluster"]
+        for _, index in cluster:  # type: ignore
+            if index < n_elements:  # type: ignore
+                labelled_subjects[index] = label  # type: ignore
+            else:
+                labelled_objects[index - n_elements] = label  # type: ignore
+    labelled_subjects = {  # type: ignore
+        i: label for i, label in labelled_subjects.items() if label != ""  # type: ignore
+    }
+    labelled_objects = {
+        i: label for i, label in labelled_objects.items() if label != ""
+    }
+
+    labelled_predicates = {i: "" for i in range(0, n_elements)}
+    for label, content in predicate_clusters.items():
+        cluster = content["cluster"]
+        for _, index in cluster:  # type: ignore
+            labelled_predicates[index] = label  # type: ignore
+    labelled_predicates = {
+        i: label for i, label in labelled_predicates.items() if label != ""
+    }
+
+    # A node pair is only created when both the subject and the object of a
+    # triplet received a cluster label
+    nodes = []
+    edges = []
+    for s_index, subject in labelled_subjects.items():
+        if s_index in labelled_objects.keys():
+            nodes.append((subject, labelled_objects[s_index]))
+            if s_index in labelled_predicates.keys():
+                edges.append(labelled_predicates[s_index])
+            else:
+                edges.append(no_predicate_filler)
+
+    if save:
+        with open(save, "w") as f:
+            json.dump({"nodes": nodes, "edges": edges}, f)
+    return nodes, edges
+
+
+def main(
+    path: str,
+    embedding_model: str,
+    dim=40,
+    n_neighbors=15,
+    min_cluster_size=5,
+    min_samples=3,
+    min_topic_size=20,
+    save: Union[bool, str] = False,
+):
+    # Load triplets
+    print("Loading triplets")
+    subjects, predicates, objects, filtered_triplets = load_triplets(
+        path,
+        soft_filtering=True,
+        shuffle=True,
+    )
+
+    # Derive the output file name from the hyperparameters
+    if save:
+        save = path.replace(
+            "triplets.txt",
+            f"{embedding_model}_dim={dim}_neigh={n_neighbors}_clust={min_cluster_size}_samp={min_samples}_nodes_edges.json",
+        )
+
+    model = (
+        "vesteinn/DanskBERT"
+        if embedding_model == "danskBERT"
+        else "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+    )
+
+    print(
+        f"Dimensions: {dim}, neighbors: {n_neighbors}, min cluster size: {min_cluster_size}, samples: {min_samples}, min topic size: {min_topic_size}",
+    )
+    print("\n_________________\n")
+    print("Embedding and clustering predicates")
+    # For predicates, we want to keep all clusters -> min_topic_size=1
+    predicate_clusters = embed_and_cluster(
+        list_to_embed=predicates,
+        embedding_model=model,
+        n_dimensions=dim,
+        n_neighbors=n_neighbors,
+        min_cluster_size=min_cluster_size,
+        min_samples=min_samples,
+        min_topic_size=1,
+        predicates=True,
+    )
+
+    print("\n_________________\n")
+    print("Embedding and clustering subjects and objects together")
+    subj_obj = subjects + objects
+    subj_obj_clusters = embed_and_cluster(
+        list_to_embed=subj_obj,
+        embedding_model=model,
+        n_dimensions=dim,
+        n_neighbors=n_neighbors,
+        min_cluster_size=min_cluster_size,
+        min_samples=min_samples,
+        min_topic_size=min_topic_size,
+    )
+
+    # Create nodes and edges
+    print("Creating nodes and edges")
+
+    assert (
+        len(subjects) == len(objects) == len(predicates)
+    ), "Subjects, objects and predicates must have the same length"
+    nodes, edges = create_nodes_and_edges(
+        subj_obj_clusters,
+        predicate_clusters,
+        len(subjects),
+        save=save,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-e",
+        "--event",
+        type=str,
+        help="Event to cluster. Must include name of source folder (newspapers or twitter) and event",
+    )
+    parser.add_argument(
+        "-emb",
+        "--embedding_model",
+        type=str,
+        default="paraphrase",
+        help="""Which embedding model to use, default is paraphrase.
+        The other option is danskBERT""",
+    )
+    parser.add_argument(
+        "-dim",
+        "--n_dimensions",
+        type=int,
+        default=40,
+        help="Number of dimensions to reduce the embedding space to",
+    )
+    parser.add_argument(
+        "-neigh",
+        "--n_neighbors",
+        type=int,
+        default=15,
+        help="Number of neighbors to use for UMAP",
+    )
+    parser.add_argument(
+        "-min_clust",
+        "--min_cluster_size",
+        type=int,
+        default=5,
+        help="Minimum cluster size for HDBSCAN",
+    )
+    parser.add_argument(
+        "-min_samp",
+        "--min_samples",
+        type=int,
+        default=3,
+        help="Minimum number of samples for HDBSCAN",
+    )
+    parser.add_argument(
+        "-save",
+        "--save",
+        # type=bool would treat any non-empty string (even "False") as True,
+        # so expose this as a store_true flag instead
+        action="store_true",
+        help="Whether or not to save nodes and edges to a json file",
+    )
+
+    args = parser.parse_args()
+    path = os.path.join(args.event, "triplets.txt")
+    main(
+        path,
+        embedding_model=args.embedding_model,
+        dim=args.n_dimensions,
+        n_neighbors=args.n_neighbors,
+        min_cluster_size=args.min_cluster_size,
+        min_samples=args.min_samples,
+        save=args.save,
+    )

From af9475f00d68af92890a47665641f766704d3b22 Mon Sep 17 00:00:00 2001
From: stinenyhus
Date: Mon, 17 Apr 2023 09:58:25 +0200
Subject: [PATCH 2/9] minor updates

---
 paper/network_creation.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/paper/network_creation.py b/paper/network_creation.py
index 859f9ef..6e49717 100644
--- a/paper/network_creation.py
+++ b/paper/network_creation.py
@@ -4,8 +4,9 @@
 import matplotlib.pyplot as plt
 import os
 import numpy as np
+import ndjson
 import typing
-from typing import Dict, Tuple, Optional
+from typing import Dict, Tuple, Optional, Union
 from networkx.drawing.layout import (
     fruchterman_reingold_layout,
     spring_layout,
     kamada_kawai_layout,
 )
@@ -54,8 +55,9 @@ def most_frequent_tuples(
     for i, tup in node_dict.items():
         if any(string in most_common_strings for string in tup):
             fr = sum([element_counter[tup[0]], element_counter[tup[1]]])
-            if hard_filter and fr >= highest_freq:
-                result[i] = tup
+            if hard_filter:
+                if fr >= highest_freq:
+                    result[i] = tup
             else:
                 result[i] = tup
     return result
@@ -67,6 +69,7 @@ def get_nodes_edges(
     event: str,
     file: str,
     remove_self_edges: bool = True,
     n_most_frequent: int = 10,
     hard_filter: bool = False,
+    save: Optional[str] = None,
 ) -> Tuple[Dict[int, tuple], Dict[int, tuple]]:
     """Loads nodes and edges from a json file, removes self edges and only
     keeps the n most frequent nodes.
@@ -94,6 +97,15 @@ def get_nodes_edges( associated_edges = { i: edge for i, edge in edges.items() if i in most_frequent_nodes.keys() } + if save: + with open(save, "w") as f: + ndjson.dump( + { + "nodes": list(most_frequent_nodes.values()), + "edges": list(associated_edges.values()), + }, + f, + ) return most_frequent_nodes, associated_edges @@ -150,7 +162,7 @@ def create_network_graph( G, pos, node_size=[ - k[1] ** node_size_mult if k[1] ** node_size_mult < 1000 else 1000 + k[1] ** node_size_mult # if k[1] ** node_size_mult < 1000 else 1000 for k in degrees ], node_color=color, @@ -217,6 +229,7 @@ def create_network_graph( "extracted_triplets_tweets/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", hard_filter=True, + save="twitter_week_1_nodes_edges.ndjson", ) twitter_week_1_graph = create_network_graph( @@ -226,9 +239,9 @@ def create_network_graph( layout=spring_layout, # layout=kamada_kawai_layout, k=2.5, - node_size_mult=2.5, + node_size_mult=2, fontsize=11, - save="fig/twitter_week_1_graph", + # save="fig/twitter_week_1_graph", ) # News papers @@ -236,6 +249,7 @@ def create_network_graph( news_week_1_nodes, news_week_1_edges = get_nodes_edges( "extracted_triplets_papers/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", + save="news_week_1_nodes_edges.ndjson", ) news_week_2_nodes, news_week_2_edges = get_nodes_edges( @@ -250,7 +264,7 @@ def create_network_graph( news_week_1_edges, title="Covid-19 lockdown week 1 - Newspapers", k=1.5, - node_size_mult=2.5, + node_size_mult=2, fontsize=10, save="fig/news_week_1_graph", ) @@ -262,7 +276,7 @@ def create_network_graph( # layout=spring_layout, # layout=kamada_kawai_layout, k=2.5, - node_size_mult=2.5, + node_size_mult=2, fontsize=10, save="fig/news_week_2_graph", ) From 66e92e5420d7b9d122d0cd280cee537116c0f408 Mon Sep 17 00:00:00 2001 From: stinenyhus Date: Wed, 19 Apr 2023 11:06:51 +0200 Subject: [PATCH 3/9] add option for removing specific nodes --- paper/network_creation.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/paper/network_creation.py b/paper/network_creation.py index 6e49717..80d9d64 100644 --- a/paper/network_creation.py +++ b/paper/network_creation.py @@ -6,7 +6,7 @@ import numpy as np import ndjson import typing -from typing import Dict, Tuple, Optional, Union +from typing import Dict, Tuple, Optional, List from networkx.drawing.layout import ( fruchterman_reingold_layout, spring_layout, @@ -67,6 +67,7 @@ def get_nodes_edges( event: str, file: str, remove_self_edges: bool = True, + remove_custom_nodes: Optional[List[str]] = None, n_most_frequent: int = 10, hard_filter: bool = False, save: Optional[str] = None, @@ -93,6 +94,12 @@ def get_nodes_edges( # Removing self edges and only keeping the most frequent nodes if remove_self_edges: nodes = {i: (e1, e2) for i, (e1, e2) in nodes.items() if e1 != e2} + if remove_custom_nodes: + nodes = { + i: (e1, e2) + for i, (e1, e2) in nodes.items() + if e1 not in remove_custom_nodes and e2 not in remove_custom_nodes + } most_frequent_nodes = most_frequent_tuples(nodes, n_most_frequent, hard_filter) associated_edges = { i: edge for i, edge in edges.items() if i in most_frequent_nodes.keys() @@ -236,12 +243,33 @@ def create_network_graph( twitter_week_1_nodes, twitter_week_1_edges, title="Covid-19 lockdown week 1 - Twitter", - layout=spring_layout, + # layout=spring_layout, + # layout=kamada_kawai_layout, + k=2.5, + node_size_mult=2, + fontsize=11, + save="fig/twitter_week_1_graph", 
+)
+
+# No få
+twitter_week_1_nodes_rm_få, twitter_week_1_edges_rm_få = get_nodes_edges(
+    "extracted_triplets_tweets/covid_week_1",
+    "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json",
+    remove_custom_nodes=["få"],
+    hard_filter=True,
+    save="twitter_week_1_nodes_edges.ndjson",
+)
+
+twitter_week_1_graph = create_network_graph(
+    twitter_week_1_nodes_rm_få,
+    twitter_week_1_edges_rm_få,
+    title="Covid-19 lockdown week 1 - Twitter",
+    # layout=spring_layout,
+    # layout=kamada_kawai_layout,
+    k=2.5,
+    node_size_mult=2,
+    fontsize=11,
+    save="fig/twitter_week_1_graph_rm_få.png",
+)
 
 # News papers

From ac42124cb0b3b25ca4b083e08a2c18af265cc479 Mon Sep 17 00:00:00 2001
From: stinenyhus
Date: Tue, 25 Apr 2023 12:25:51 +0200
Subject: [PATCH 4/9] custom variables

---
 paper/network_creation.py | 51 ++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/paper/network_creation.py b/paper/network_creation.py
index 80d9d64..6b33acc 100644
--- a/paper/network_creation.py
+++ b/paper/network_creation.py
@@ -55,6 +55,8 @@ def most_frequent_tuples(
     for i, tup in node_dict.items():
         if any(string in most_common_strings for string in tup):
             fr = sum([element_counter[tup[0]], element_counter[tup[1]]])
+            # If hard_filter, keep only pairs of nodes at least as frequent
+            # as the single most frequent node
             if hard_filter:
                 if fr >= highest_freq:
                     result[i] = tup
@@ -79,7 +81,11 @@ def get_nodes_edges(
         event (str): the name of the event
         file (str): the specific file to load. Must be placed in a folder with the event name
         remove_self_edges (bool, optional): Whether or not to remove self edges. Defaults to True.
+        remove_custom_nodes (list, optional): A list of nodes to remove from the graph. Defaults to None.
         n_most_frequent (int, optional): The number of most frequent elements to include. Defaults to 10.
+        hard_filter (bool, optional): Passed to most_frequent_tuples. Whether or not to filter away
+            pairs of nodes that are less frequent than the single most frequent. Defaults to False.
+        save (str, optional): If provided, the filtered nodes and edges will be saved to this path.
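+            Note: saving goes through ndjson.dump, which expects an iterable of
+            records; handing it a single dict appears to write only the dict's
+            keys, so the saved file likely needs revisiting.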
Returns: Tuple[Dict[int, tuple], Dict[int, tuple]]: a dictionary with the most frequent nodes and @@ -128,7 +134,9 @@ def create_network_graph( layout=fruchterman_reingold_layout, k: float = 0.3, node_size_mult: float = 4, + edge_weight_mult: float = 2, fontsize: int = 12, + quantile_value: float = 0.90, color: str = "#2a89d6", plot_coordinates: bool = False, seed: Optional[int] = None, @@ -141,22 +149,13 @@ def create_network_graph( # Make the graph undirected - for some reason, the tuples are sometimes reversed in the edge list d["weight"] = c[(u, v)] + c[(v, u)] - edge_label_weight_cutoff = quantile_min_value(list(c.values()), 0.90) + edge_label_weight_cutoff = quantile_min_value(list(c.values()), quantile_value) edges_to_draw = {} for n, nodes in node_list.items(): if c[nodes] >= edge_label_weight_cutoff: edges_to_draw[nodes] = edge_list[n] if layout == kamada_kawai_layout: - # element_counter: typing.Counter = Counter() - # for tup in node_list.values(): - # for string in tup: - # element_counter[string] += 1 - # most_common = element_counter.most_common(5) - - # value_dict = {k:100 for k,_ in most_common[1:]} - # dist = {most_common[0][1]:value_dict} - # pos = layout(G, dist=dist) pos = layout(G, scale=2) else: pos = layout(G, k=k, seed=seed) @@ -164,7 +163,11 @@ def create_network_graph( degrees = nx.degree(G) plt.figure(figsize=(10, 10)) - plt.title(title, color="#115691") + plt.title( + title, + color="#115691", + fontsize=fontsize + 2, + ) nx.draw( G, pos, @@ -174,7 +177,7 @@ def create_network_graph( ], node_color=color, edge_color=color + "80", - width=[d["weight"] ** 2 for _, _, d in G.edges(data=True)], + width=[d["weight"] ** edge_weight_mult for _, _, d in G.edges(data=True)], ) nx.draw_networkx_edge_labels( G, @@ -272,6 +275,30 @@ def create_network_graph( save="fig/twitter_week_1_graph_rm_få.png", ) +# With old triplet extraction instead of GPT +twitter_week_1_nodes_multi, twitter_week_1_edges_multi = get_nodes_edges( + "extracted_triplets_tweets/covid_week_1_multi", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", + # remove_custom_nodes=["få"], + hard_filter=False, + n_most_frequent=10, + # save="twitter_week_1_nodes_edges_multi.ndjson", +) + +twitter_week_1_graph_multi = create_network_graph( + twitter_week_1_nodes_multi, + twitter_week_1_edges_multi, + title="Covid-19 lockdown week 1 - Twitter", + # layout=spring_layout, + # layout=kamada_kawai_layout, + k=2.5, + node_size_mult=3, + edge_weight_mult=0.7, + fontsize=12, + quantile_value=0.6, + # save="fig/twitter_week_1_graph_rm_få.png", +) + # News papers news_week_1_nodes, news_week_1_edges = get_nodes_edges( From 70aa2d690010ab1c1aba9355573215c78a7ab247 Mon Sep 17 00:00:00 2001 From: stinenyhus Date: Wed, 26 Apr 2023 12:00:12 +0200 Subject: [PATCH 5/9] change color, add mink start newspapers --- paper/network_creation.py | 63 ++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/paper/network_creation.py b/paper/network_creation.py index 6b33acc..18360d9 100644 --- a/paper/network_creation.py +++ b/paper/network_creation.py @@ -137,7 +137,9 @@ def create_network_graph( edge_weight_mult: float = 2, fontsize: int = 12, quantile_value: float = 0.90, - color: str = "#2a89d6", + node_color: str = "#2a89d6", + edge_color: str = "#FE9322", + fig_size: int = 10, plot_coordinates: bool = False, seed: Optional[int] = None, save=False, @@ -162,11 +164,11 @@ def create_network_graph( degrees = nx.degree(G) - plt.figure(figsize=(10, 10)) + 
plt.figure(figsize=(fig_size, fig_size)) plt.title( title, - color="#115691", - fontsize=fontsize + 2, + color="k", + fontsize=fontsize + 4, ) nx.draw( G, @@ -175,8 +177,8 @@ def create_network_graph( k[1] ** node_size_mult # if k[1] ** node_size_mult < 1000 else 1000 for k in degrees ], - node_color=color, - edge_color=color + "80", + node_color=node_color, + edge_color=edge_color + "80", width=[d["weight"] ** edge_weight_mult for _, _, d in G.edges(data=True)], ) nx.draw_networkx_edge_labels( @@ -185,9 +187,10 @@ def create_network_graph( edge_labels=edges_to_draw, font_size=fontsize - 2, label_pos=0.5, + # font_color=edge_color, bbox=dict( facecolor="white", - edgecolor=color, + edgecolor=edge_color, alpha=0.8, boxstyle="round,pad=0.2", ), @@ -218,12 +221,12 @@ def create_network_graph( y, label, fontsize=fontsize, - color="#404040", + color="k", ha=h_align, va=v_align, bbox=dict( facecolor="white", - edgecolor="#404040", + edgecolor=node_color, alpha=0.8, boxstyle="round,pad=0.1", ), @@ -234,14 +237,12 @@ def create_network_graph( # Twitter - twitter_week_1_nodes, twitter_week_1_edges = get_nodes_edges( "extracted_triplets_tweets/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", hard_filter=True, save="twitter_week_1_nodes_edges.ndjson", ) - twitter_week_1_graph = create_network_graph( twitter_week_1_nodes, twitter_week_1_edges, @@ -250,10 +251,10 @@ def create_network_graph( # layout=kamada_kawai_layout, k=2.5, node_size_mult=2, + node_color="#A82800", fontsize=11, save="fig/twitter_week_1_graph", ) - # No få twitter_week_1_nodes_rm_få, twitter_week_1_edges_rm_få = get_nodes_edges( "extracted_triplets_tweets/covid_week_1", @@ -262,7 +263,6 @@ def create_network_graph( hard_filter=True, save="twitter_week_1_nodes_edges.ndjson", ) - twitter_week_1_graph = create_network_graph( twitter_week_1_nodes_rm_få, twitter_week_1_edges_rm_få, @@ -271,10 +271,10 @@ def create_network_graph( # layout=kamada_kawai_layout, k=2.5, node_size_mult=2, + node_color="#A82800", fontsize=11, save="fig/twitter_week_1_graph_rm_få.png", ) - # With old triplet extraction instead of GPT twitter_week_1_nodes_multi, twitter_week_1_edges_multi = get_nodes_edges( "extracted_triplets_tweets/covid_week_1_multi", @@ -294,17 +294,44 @@ def create_network_graph( k=2.5, node_size_mult=3, edge_weight_mult=0.7, + node_color="#A82800", + fontsize=12, + quantile_value=0.6, + save="fig/twitter_week_1_graph_multi.png", +) + +### News papers + +# Mink start + +news_mink_start_nodes, news_mink_start_edges = get_nodes_edges( + "extracted_triplets_papers/mink_start", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", + hard_filter=True, +) + +news_mink_start_graph = create_network_graph( + news_mink_start_nodes, + news_mink_start_edges, + title="Mink case start - Newspapers", + layout=spring_layout, + # layout=kamada_kawai_layout, + node_color="#A82800", + k=4, + node_size_mult=2.5, + edge_weight_mult=1, fontsize=12, + fig_size=9, quantile_value=0.6, - # save="fig/twitter_week_1_graph_rm_få.png", + save="fig/news_mink_start.png", ) -# News papers +# Covid week 1 news_week_1_nodes, news_week_1_edges = get_nodes_edges( "extracted_triplets_papers/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", - save="news_week_1_nodes_edges.ndjson", + # save="news_week_1_nodes_edges.ndjson", ) news_week_2_nodes, news_week_2_edges = get_nodes_edges( @@ -320,6 +347,7 @@ def create_network_graph( title="Covid-19 lockdown week 1 - Newspapers", k=1.5, node_size_mult=2, + 
node_color="#A82800", fontsize=10, save="fig/news_week_1_graph", ) @@ -332,6 +360,7 @@ def create_network_graph( # layout=kamada_kawai_layout, k=2.5, node_size_mult=2, + node_color="#A82800", fontsize=10, save="fig/news_week_2_graph", ) From 44be03ad94d67ffa480f06543efa1ff631367690 Mon Sep 17 00:00:00 2001 From: stinenyhus Date: Fri, 28 Apr 2023 14:24:34 +0200 Subject: [PATCH 6/9] add extra parameters --- paper/network_creation.py | 97 +++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/paper/network_creation.py b/paper/network_creation.py index 18360d9..ea355c9 100644 --- a/paper/network_creation.py +++ b/paper/network_creation.py @@ -127,14 +127,32 @@ def quantile_min_value(lst, quantile): return min(filter(lambda x: x >= q, lst)) +def min_max_normalize(list_to_normalize: list, min_constant=0.5) -> list: + """Normalizes a list between 0 and 1 using min-max normalization. + + Args: + list_to_normalize (list): The list to normalize + + Returns: + list: The normalized list + """ + min_value = min(list_to_normalize) + max_value = max(list_to_normalize) + if min_value == max_value: + return list_to_normalize + scaled_list = [(x - min_value) / (max_value - min_value) for x in list_to_normalize] + return [x + min_constant for x in scaled_list] + + def create_network_graph( node_list, edge_list, title: str = "Narrative network graph", layout=fruchterman_reingold_layout, k: float = 0.3, - node_size_mult: float = 4, - edge_weight_mult: float = 2, + node_size_mult: float = 3000, + node_size_min: float = 0.001, + edge_weight_mult: float = 5, fontsize: int = 12, quantile_value: float = 0.90, node_color: str = "#2a89d6", @@ -163,23 +181,26 @@ def create_network_graph( pos = layout(G, k=k, seed=seed) degrees = nx.degree(G) - + normalized_degrees = min_max_normalize( + [d[1] for d in degrees], + min_constant=node_size_min, + ) plt.figure(figsize=(fig_size, fig_size)) plt.title( title, color="k", fontsize=fontsize + 4, ) + edge_weights = min_max_normalize([d["weight"] for _, _, d in G.edges(data=True)]) nx.draw( G, pos, - node_size=[ - k[1] ** node_size_mult # if k[1] ** node_size_mult < 1000 else 1000 - for k in degrees - ], + # node_size=non_norm_degrees, + node_size=[d * node_size_mult for d in normalized_degrees], node_color=node_color, edge_color=edge_color + "80", - width=[d["weight"] ** edge_weight_mult for _, _, d in G.edges(data=True)], + # width=[d["weight"] ** edge_weight_mult for _, _, d in G.edges(data=True)], + width=[e * edge_weight_mult for e in edge_weights], ) nx.draw_networkx_edge_labels( G, @@ -241,8 +262,8 @@ def create_network_graph( "extracted_triplets_tweets/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", hard_filter=True, - save="twitter_week_1_nodes_edges.ndjson", ) + twitter_week_1_graph = create_network_graph( twitter_week_1_nodes, twitter_week_1_edges, @@ -250,54 +271,44 @@ def create_network_graph( # layout=spring_layout, # layout=kamada_kawai_layout, k=2.5, - node_size_mult=2, node_color="#A82800", fontsize=11, save="fig/twitter_week_1_graph", ) + # No få twitter_week_1_nodes_rm_få, twitter_week_1_edges_rm_få = get_nodes_edges( "extracted_triplets_tweets/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", remove_custom_nodes=["få"], hard_filter=True, - save="twitter_week_1_nodes_edges.ndjson", ) -twitter_week_1_graph = create_network_graph( +twitter_week_1_rm_få = create_network_graph( twitter_week_1_nodes_rm_få, twitter_week_1_edges_rm_få, title="Covid-19 lockdown week 
1 - Twitter", - # layout=spring_layout, - # layout=kamada_kawai_layout, k=2.5, - node_size_mult=2, node_color="#A82800", fontsize=11, - save="fig/twitter_week_1_graph_rm_få.png", + save="fig/twitter_week_1_graph_rm_få", ) # With old triplet extraction instead of GPT twitter_week_1_nodes_multi, twitter_week_1_edges_multi = get_nodes_edges( "extracted_triplets_tweets/covid_week_1_multi", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", - # remove_custom_nodes=["få"], hard_filter=False, n_most_frequent=10, - # save="twitter_week_1_nodes_edges_multi.ndjson", ) twitter_week_1_graph_multi = create_network_graph( twitter_week_1_nodes_multi, twitter_week_1_edges_multi, title="Covid-19 lockdown week 1 - Twitter", - # layout=spring_layout, - # layout=kamada_kawai_layout, k=2.5, - node_size_mult=3, - edge_weight_mult=0.7, node_color="#A82800", fontsize=12, quantile_value=0.6, - save="fig/twitter_week_1_graph_multi.png", + save="fig/twitter_week_1_graph_multi", ) ### News papers @@ -317,13 +328,37 @@ def create_network_graph( layout=spring_layout, # layout=kamada_kawai_layout, node_color="#A82800", + # node_size_mult=3000, k=4, - node_size_mult=2.5, - edge_weight_mult=1, fontsize=12, - fig_size=9, - quantile_value=0.6, - save="fig/news_mink_start.png", + # fig_size=9, + quantile_value=0.83, + save="fig/news_mink_start", +) + +# mink - Mogens Jensen resigning + +news_mink_mj_nodes, news_mink_mj_edges = get_nodes_edges( + "extracted_triplets_papers/mink_mogens_jensen", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", + hard_filter=True, + # n_most_frequent=20 +) + +news_mink_mj_graph = create_network_graph( + news_mink_mj_nodes, + news_mink_mj_edges, + title="Mink case, Mogens Jensen resigning - Newspapers", + layout=spring_layout, + # layout=kamada_kawai_layout, + node_color="#A82800", + k=4, + node_size_mult=2000, + # edge_weight_mult=1, + fontsize=12, + # fig_size=9, + # quantile_value=0.6, + save="fig/news_mink_mj", ) # Covid week 1 @@ -345,8 +380,6 @@ def create_network_graph( news_week_1_nodes, news_week_1_edges, title="Covid-19 lockdown week 1 - Newspapers", - k=1.5, - node_size_mult=2, node_color="#A82800", fontsize=10, save="fig/news_week_1_graph", @@ -356,12 +389,8 @@ def create_network_graph( news_week_2_nodes, news_week_2_edges, title="Week 2 of the COVID-19 lockdown", - # layout=spring_layout, - # layout=kamada_kawai_layout, k=2.5, - node_size_mult=2, node_color="#A82800", - fontsize=10, save="fig/news_week_2_graph", ) From 0ecbd5ef7b37a901a7b5ef4a84580c96f578d2ad Mon Sep 17 00:00:00 2001 From: stinenyhus Date: Tue, 2 May 2023 09:12:21 +0200 Subject: [PATCH 7/9] add twitter mink start --- paper/network_creation.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/paper/network_creation.py b/paper/network_creation.py index ea355c9..0c5aad8 100644 --- a/paper/network_creation.py +++ b/paper/network_creation.py @@ -311,6 +311,28 @@ def create_network_graph( save="fig/twitter_week_1_graph_multi", ) +# Mink start +twitter_mink_start_nodes, twitter_mink_start_edges = get_nodes_edges( + "extracted_triplets_tweets/mink_start", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", + hard_filter=True, +) + +twitter_mink_start_graph = create_network_graph( + twitter_mink_start_nodes, + twitter_mink_start_edges, + title="Mink case start - Twitter", + layout=spring_layout, + # layout=kamada_kawai_layout, + node_color="#A82800", + # node_size_mult=3000, + k=4, + fontsize=12, + # fig_size=9, + quantile_value=0.83, + 
save="fig/twitter_mink_start", +) + ### News papers # Mink start From e8ca46c9c8634417660957f45e0f6eaa58c3e500 Mon Sep 17 00:00:00 2001 From: stinenyhus Date: Fri, 5 May 2023 11:36:49 +0200 Subject: [PATCH 8/9] add mink start for twitter data multi model --- paper/network_creation.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/paper/network_creation.py b/paper/network_creation.py index 0c5aad8..99ce732 100644 --- a/paper/network_creation.py +++ b/paper/network_creation.py @@ -326,13 +326,31 @@ def create_network_graph( # layout=kamada_kawai_layout, node_color="#A82800", # node_size_mult=3000, - k=4, + k=3, fontsize=12, # fig_size=9, quantile_value=0.83, + seed=34, save="fig/twitter_mink_start", ) +twitter_mink_start_nodes_multi, twitter_mink_start_edges_multi = get_nodes_edges( + "extracted_triplets_tweets/mink_start_multi", + "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", + hard_filter=False, +) + +twitter_mink_start_graph_multi = create_network_graph( + twitter_mink_start_nodes_multi, + twitter_mink_start_edges_multi, + title="Mink case start - Twitter", + # layout=spring_layout, + node_color="#A82800", + k=3, + fontsize=12, + save="fig/twitter_mink_start_multi", +) + ### News papers # Mink start @@ -348,9 +366,7 @@ def create_network_graph( news_mink_start_edges, title="Mink case start - Newspapers", layout=spring_layout, - # layout=kamada_kawai_layout, node_color="#A82800", - # node_size_mult=3000, k=4, fontsize=12, # fig_size=9, @@ -376,7 +392,6 @@ def create_network_graph( node_color="#A82800", k=4, node_size_mult=2000, - # edge_weight_mult=1, fontsize=12, # fig_size=9, # quantile_value=0.6, From 8ad3649831f8201ec7de7971b45afe68525a7b15 Mon Sep 17 00:00:00 2001 From: stinenyhus Date: Wed, 10 May 2023 15:45:54 +0200 Subject: [PATCH 9/9] Update colors, add unlabeled graphs --- paper/network_creation.py | 333 +++++++++++++++----------------------- 1 file changed, 133 insertions(+), 200 deletions(-) diff --git a/paper/network_creation.py b/paper/network_creation.py index 99ce732..3d52cd7 100644 --- a/paper/network_creation.py +++ b/paper/network_creation.py @@ -2,6 +2,7 @@ import json from collections import Counter import matplotlib.pyplot as plt +from matplotlib import rcParams import os import numpy as np import ndjson @@ -147,19 +148,21 @@ def min_max_normalize(list_to_normalize: list, min_constant=0.5) -> list: def create_network_graph( node_list, edge_list, - title: str = "Narrative network graph", + title: Optional[str] = None, layout=fruchterman_reingold_layout, k: float = 0.3, node_size_mult: float = 3000, node_size_min: float = 0.001, edge_weight_mult: float = 5, fontsize: int = 12, - quantile_value: float = 0.90, - node_color: str = "#2a89d6", - edge_color: str = "#FE9322", + edge_quantile_value: float = 0.90, + node_quantile_value: Optional[float] = None, + node_color: str = "#146D25", + edge_color: str = "#54A463", fig_size: int = 10, plot_coordinates: bool = False, seed: Optional[int] = None, + draw_labels: bool = True, save=False, ): G = nx.Graph() @@ -169,7 +172,7 @@ def create_network_graph( # Make the graph undirected - for some reason, the tuples are sometimes reversed in the edge list d["weight"] = c[(u, v)] + c[(v, u)] - edge_label_weight_cutoff = quantile_min_value(list(c.values()), quantile_value) + edge_label_weight_cutoff = quantile_min_value(list(c.values()), edge_quantile_value) edges_to_draw = {} for n, nodes in node_list.items(): if c[nodes] >= edge_label_weight_cutoff: @@ -185,13 +188,17 @@ def 
create_network_graph( [d[1] for d in degrees], min_constant=node_size_min, ) + plt.figure(figsize=(fig_size, fig_size)) - plt.title( - title, - color="k", - fontsize=fontsize + 4, - ) + if title: + plt.title( + title, + color="k", + fontsize=fontsize + 8, + ) + edge_weights = min_max_normalize([d["weight"] for _, _, d in G.edges(data=True)]) + nx.draw( G, pos, @@ -202,58 +209,85 @@ def create_network_graph( # width=[d["weight"] ** edge_weight_mult for _, _, d in G.edges(data=True)], width=[e * edge_weight_mult for e in edge_weights], ) - nx.draw_networkx_edge_labels( - G, - pos, - edge_labels=edges_to_draw, - font_size=fontsize - 2, - label_pos=0.5, - # font_color=edge_color, - bbox=dict( - facecolor="white", - edgecolor=edge_color, - alpha=0.8, - boxstyle="round,pad=0.2", - ), - ) - - offset = 0.015 - for node, (x, y) in pos.items(): - h_align = "center" - v_align = "center" - if x < 0: - x -= offset - h_align = "right" - if x > 0: - x += offset - h_align = "left" - if y < 0: - y -= offset - v_align = "top" - if y > 0: - y += offset - v_align = "bottom" - if plot_coordinates: - label = f"{node} ({x:.2f}, {y:.2f})" - else: - label = node - plt.text( - x, - y, - label, - fontsize=fontsize, - color="k", - ha=h_align, - va=v_align, + if draw_labels: + nx.draw_networkx_edge_labels( + G, + pos, + edge_labels=edges_to_draw, + font_size=fontsize - 1, + label_pos=0.5, bbox=dict( facecolor="white", - edgecolor=node_color, + edgecolor=edge_color, alpha=0.8, - boxstyle="round,pad=0.1", + boxstyle="round,pad=0.2", ), ) + + offset = 0.015 + for node, (x, y) in pos.items(): + h_align = "center" + v_align = "center" + if x < 0: + x -= offset + h_align = "right" + if x > 0: + x += offset + h_align = "left" + if y < 0: + y -= offset + v_align = "top" + if y > 0: + y += offset + v_align = "bottom" + if plot_coordinates: + label = f"{node} ({x:.2f}, {y:.2f})" + else: + label = node + if node_quantile_value: + node_label_draw_cutoff = quantile_min_value( + [value for key, value in degrees], + node_quantile_value, + ) + if degrees[node] >= node_label_draw_cutoff: + plt.text( + x, + y, + label, + fontsize=fontsize, + color="k", + ha=h_align, + va=v_align, + bbox=dict( + facecolor="white", + edgecolor=node_color, + alpha=0.8, + boxstyle="round,pad=0.1", + ), + ) + else: + plt.text( + x, + y, + label, + fontsize=fontsize, + # fontname="Helvetica", + color="k", + ha=h_align, + va=v_align, + bbox=dict( + facecolor="white", + edgecolor=node_color, + alpha=0.8, + boxstyle="round,pad=0.1", + ), + ) if save: - plt.savefig(f"{save}.png", format="PNG") + plt.savefig( + f"{save}.png", + format="PNG", + bbox_inches="tight", + ) return G @@ -261,54 +295,52 @@ def create_network_graph( twitter_week_1_nodes, twitter_week_1_edges = get_nodes_edges( "extracted_triplets_tweets/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", - hard_filter=True, + n_most_frequent=3, ) +# GReens twitter_week_1_graph = create_network_graph( twitter_week_1_nodes, twitter_week_1_edges, - title="Covid-19 lockdown week 1 - Twitter", - # layout=spring_layout, - # layout=kamada_kawai_layout, + title="Twitter (GPT-3): First week of the lockdown", k=2.5, - node_color="#A82800", - fontsize=11, - save="fig/twitter_week_1_graph", + edge_quantile_value=0.9, + save="fig/twitter_week_1", ) -# No få -twitter_week_1_nodes_rm_få, twitter_week_1_edges_rm_få = get_nodes_edges( - "extracted_triplets_tweets/covid_week_1", - "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", - remove_custom_nodes=["få"], - hard_filter=True, -) 
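+# ("få" is Danish for "few"/"get"; it was presumably filtered out above
+# because it behaves like a stopword node in the Twitter graph.)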
-twitter_week_1_rm_få = create_network_graph( - twitter_week_1_nodes_rm_få, - twitter_week_1_edges_rm_få, - title="Covid-19 lockdown week 1 - Twitter", - k=2.5, - node_color="#A82800", - fontsize=11, - save="fig/twitter_week_1_graph_rm_få", -) + +# # No få +# twitter_week_1_nodes_rm_få, twitter_week_1_edges_rm_få = get_nodes_edges( +# "extracted_triplets_tweets/covid_week_1", +# "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", +# remove_custom_nodes=["få"], +# hard_filter=True, +# ) +# twitter_week_1_rm_få = create_network_graph( +# twitter_week_1_nodes_rm_få, +# twitter_week_1_edges_rm_få, +# title="Covid-19 lockdown week 1 - Twitter", +# k=2.5, +# node_color="#A82800", +# fontsize=11, +# save="fig/twitter_week_1_graph_rm_få", +# ) + + # With old triplet extraction instead of GPT twitter_week_1_nodes_multi, twitter_week_1_edges_multi = get_nodes_edges( "extracted_triplets_tweets/covid_week_1_multi", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", - hard_filter=False, - n_most_frequent=10, ) twitter_week_1_graph_multi = create_network_graph( twitter_week_1_nodes_multi, twitter_week_1_edges_multi, - title="Covid-19 lockdown week 1 - Twitter", - k=2.5, - node_color="#A82800", - fontsize=12, - quantile_value=0.6, - save="fig/twitter_week_1_graph_multi", + title="Twitter (Multi2OIE): First week of the lockdown", + k=3.5, + edge_quantile_value=0.8, + save="fig/twitter_week_1_multi_no_labels", + draw_labels=False, ) # Mink start @@ -321,33 +353,23 @@ def create_network_graph( twitter_mink_start_graph = create_network_graph( twitter_mink_start_nodes, twitter_mink_start_edges, - title="Mink case start - Twitter", + title="Twitter (GPT-3): First week of the mink case", layout=spring_layout, - # layout=kamada_kawai_layout, - node_color="#A82800", - # node_size_mult=3000, - k=3, - fontsize=12, - # fig_size=9, - quantile_value=0.83, - seed=34, + k=4, + edge_quantile_value=0.83, save="fig/twitter_mink_start", ) twitter_mink_start_nodes_multi, twitter_mink_start_edges_multi = get_nodes_edges( "extracted_triplets_tweets/mink_start_multi", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", - hard_filter=False, ) twitter_mink_start_graph_multi = create_network_graph( twitter_mink_start_nodes_multi, twitter_mink_start_edges_multi, - title="Mink case start - Twitter", - # layout=spring_layout, - node_color="#A82800", + title="Twitter (Multi2OIE): First week of the mink case", k=3, - fontsize=12, save="fig/twitter_mink_start_multi", ) @@ -364,13 +386,10 @@ def create_network_graph( news_mink_start_graph = create_network_graph( news_mink_start_nodes, news_mink_start_edges, - title="Mink case start - Newspapers", + title="Newspapers: First week of the mink case", layout=spring_layout, - node_color="#A82800", - k=4, - fontsize=12, - # fig_size=9, - quantile_value=0.83, + k=3, + edge_quantile_value=0.83, save="fig/news_mink_start", ) @@ -380,21 +399,14 @@ def create_network_graph( "extracted_triplets_papers/mink_mogens_jensen", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", hard_filter=True, - # n_most_frequent=20 ) news_mink_mj_graph = create_network_graph( news_mink_mj_nodes, news_mink_mj_edges, - title="Mink case, Mogens Jensen resigning - Newspapers", + title="Newspapers: Mogens Jensen's resignation", layout=spring_layout, - # layout=kamada_kawai_layout, - node_color="#A82800", - k=4, - node_size_mult=2000, - fontsize=12, - # fig_size=9, - # quantile_value=0.6, + k=2, save="fig/news_mink_mj", ) @@ -403,12 +415,10 @@ def create_network_graph( 
news_week_1_nodes, news_week_1_edges = get_nodes_edges( "extracted_triplets_papers/covid_week_1", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", - # save="news_week_1_nodes_edges.ndjson", ) news_week_2_nodes, news_week_2_edges = get_nodes_edges( "extracted_triplets_papers/covid_week_2", - # "paraphrase_nodes_edges.json", "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json", ) @@ -416,92 +426,15 @@ def create_network_graph( news_week_1_graph = create_network_graph( news_week_1_nodes, news_week_1_edges, - title="Covid-19 lockdown week 1 - Newspapers", - node_color="#A82800", - fontsize=10, - save="fig/news_week_1_graph", + title="Newspapers: First week of the lockdown", + save="fig/news_week_1", + k=2.5, ) news_week_2_graph = create_network_graph( news_week_2_nodes, news_week_2_edges, - title="Week 2 of the COVID-19 lockdown", + title="Newspapers: Second week of the lockdown", k=2.5, - node_color="#A82800", - save="fig/news_week_2_graph", -) - - -### Using danskbert instead -# week_1_dansk_nodes, week_1_dansk_edges = get_nodes_edges( -# "covid_week_1", -# "danskBERT_nodes_edges.json", -# ) - -# week_2_dansk_nodes, week_2_dansk_edges = get_nodes_edges( -# "covid_week_2", -# "danskBERT_nodes_edges.json", -# ) - -### Testing different hyperparameters - -week_1_nodes_15, week_1_edges_15 = get_nodes_edges( - "covid_week_1", - "paraphrase_dim=40_neigh=15_clust=15_samp=5_nodes_edges.json", -) -week_1_nodes_10, week_1_edges_10 = get_nodes_edges( - "covid_week_1", - "paraphrase_dim=40_neigh=15_clust=10_samp=5_nodes_edges.json", -) -week_1_nodes_8, week_1_edges_8 = get_nodes_edges( - "covid_week_1", - "paraphrase_dim=40_neigh=15_clust=8_samp=5_nodes_edges.json", -) -week_1_nodes_5, week_1_edges_5 = get_nodes_edges( - "covid_week_1", - "paraphrase_dim=40_neigh=15_clust=5_samp=5_nodes_edges.json", -) -week_1_para_graph_15 = create_network_graph( - week_1_nodes_15, - week_1_edges_15, - k=1, - node_size_mult=2.5, - fontsize=10, -) -week_1_para_graph_10 = create_network_graph( - week_1_nodes_10, - week_1_edges_10, - k=1, - node_size_mult=2.5, - fontsize=10, -) -week_1_para_graph_8 = create_network_graph( - week_1_nodes_8, - week_1_edges_8, - k=1, - node_size_mult=2.5, - fontsize=10, + save="fig/news_week_2", ) -week_1_para_graph_5 = create_network_graph( - week_1_nodes_5, - week_1_edges_5, - k=1, - node_size_mult=2.5, - fontsize=10, -) - - -# week_1_dansk_graph = create_network_graph( -# week_1_dansk_nodes, -# week_1_dansk_edges, -# k=0.6, -# node_size_mult=2.5, -# fontsize=11, -# ) -# week_2_dansk_graph = create_network_graph( -# week_2_dansk_nodes, -# week_2_dansk_edges, -# k=0.6, -# node_size_mult=2.5, -# fontsize=11, -# )
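
A minimal sketch of how the two scripts compose (an illustration, not part of
the patches above): umap_hdb.py clusters the extracted triplets and dumps
nodes/edges, which network_creation.py then draws. This assumes you run from
the paper/ directory, that the event folder contains a triplets.txt, and that
the module-level plotting calls in network_creation.py are moved under an
`if __name__ == "__main__":` guard so its functions import cleanly:

    from umap_hdb import main as cluster_triplets
    from network_creation import get_nodes_edges, create_network_graph

    # 1) Embed, reduce, cluster and label the triplets; save=True derives the
    #    output file name from the hyperparameters used.
    cluster_triplets(
        "extracted_triplets_tweets/covid_week_1/triplets.txt",
        embedding_model="paraphrase",
        save=True,
    )

    # 2) Load the resulting nodes/edges json and draw the narrative network.
    nodes, edges = get_nodes_edges(
        "extracted_triplets_tweets/covid_week_1",
        "paraphrase_dim=40_neigh=15_clust=5_samp=3_nodes_edges.json",
    )
    create_network_graph(nodes, edges, title="Covid-19 lockdown week 1", k=2.5)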