WIP LFR model graphs
ProbablyFaiz committed Jun 5, 2022
1 parent 4fd0a30 commit 1ef9ba2
Showing 7 changed files with 67 additions and 21 deletions.
assess.py (22 changes: 12 additions & 10 deletions)
@@ -6,13 +6,13 @@
 from sklearn.metrics import normalized_mutual_info_score
 
 from algorithm.edge_ratio import local_edge_ratio, global_edge_ratio
-from graph_generation import generate_sbm_graph
+from graph_generation_sbm import generate_sbm_graph
 from louvain import louvain_communities
 
 
 Partition = list[set[str]]
 
-GRAPH_SIZE = 500
+GRAPH_SIZE = 5000
 
 RANDOM_GRAPH_SEEDS = (
     2022_0,
@@ -28,14 +28,14 @@
 )
 
 COMMUNITY_MEASURES = {
-    "edge_ratio": {
-        "name": "Edge Ratio",
-        "partition_func": lambda G: louvain_communities(
-            G,
-            global_edge_ratio,
-            local_edge_ratio,
-        ),
-    },
+    # "edge_ratio": {
+    #     "name": "Edge Ratio",
+    #     "partition_func": lambda G: louvain_communities(
+    #         G,
+    #         global_edge_ratio,
+    #         local_edge_ratio,
+    #     ),
+    # },
     "modularity": {
         "name": "Modularity",
         "partition_func": lambda G: nx.community.louvain_communities(G),
@@ -76,6 +76,8 @@ def run_benchmarks(
         G = generate_sbm_graph(graph_size, seed=seed)
         partition = COMMUNITY_MEASURES[measure]["partition_func"](G)
         ground_truth_partition = G.graph["partition"]
+        print(f"Modularity for ground truth: {nx.algorithms.community.modularity(G, ground_truth_partition)}")
+        print(f"Modularity for algorithm: {nx.algorithms.community.modularity(G, partition)}")
         nmi_scores.append(nmi_score(ground_truth_partition, partition))
         print(
             f"Measure {measure}, Seed {seed}: NMI"
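Note: the nmi_score helper called above is not shown in this diff. Since sklearn's normalized_mutual_info_score compares per-node label vectors while the partitions here are lists of node sets, a minimal sketch of such a helper (a hypothetical reconstruction, not the repository's actual code) could be:

from sklearn.metrics import normalized_mutual_info_score

def nmi_score(partition_a, partition_b):
    # Map each node to the index of the community that contains it
    labels_a = {node: i for i, comm in enumerate(partition_a) for node in comm}
    labels_b = {node: i for i, comm in enumerate(partition_b) for node in comm}
    # Assumes both partitions cover the same node set
    nodes = sorted(labels_a)
    return normalized_mutual_info_score(
        [labels_a[n] for n in nodes], [labels_b[n] for n in nodes]
    )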
calculate_edge_probabilities.py (33 changes: 25 additions & 8 deletions)
@@ -4,21 +4,23 @@
 import networkx as nx
 import csv
 from pathlib import Path
+import powerlaw
 
 from main import load_network
 
 
 def main():
     G = load_network()
     # "court" field contains ground truth community labels
-    nodes_by_community = defaultdict(set)
-    for node in G.nodes:
-        court = G.nodes[node].get("court", None)
-        # Ignore nodes without a ground truth community label
-        if court is not None:
-            nodes_by_community[court].add(node)
-    create_edge_prob_csv(G, nodes_by_community)
-    create_community_size_csv(nodes_by_community)
+    # nodes_by_community = defaultdict(set)
+    # for node in G.nodes:
+    #     court = G.nodes[node].get("court", None)
+    #     # Ignore nodes without a ground truth community label
+    #     if court is not None:
+    #         nodes_by_community[court].add(node)
+    # create_edge_prob_csv(G, nodes_by_community)
+    # create_community_size_csv(nodes_by_community)
+    print(estimate_power_law_degree_exponent(G))
 
 
 def create_edge_prob_csv(G, nodes_by_community):
@@ -53,5 +55,20 @@ def create_community_size_csv(nodes_by_community: dict[set]):
         writer.writerow([community, len(nodes) / total_nodes])
 
 
+def estimate_power_law_degree_exponent(G):
+    degrees = [d for n, d in G.degree()]
+    fit = powerlaw.Fit(degrees)
+    # Now do a powerlaw fit for the community sizes
+    nodes_by_community = defaultdict(set)
+    for node in G.nodes:
+        court = G.nodes[node].get("court", None)
+        if court is not None:
+            nodes_by_community[court].add(node)
+    community_sizes = [len(nodes) for nodes in nodes_by_community.values()]
+    print(community_sizes)
+    fit2 = powerlaw.Fit(community_sizes)
+    return fit.power_law.alpha, fit2.power_law.alpha
+
+
 if __name__ == "__main__":
     main()
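Note: degree and community-size samples are integer-valued, so powerlaw.Fit's discrete=True option is usually a better match than the continuous default used above, and distribution_compare offers a quick check of the power law against an alternative. A sketch, not part of the commit (describe_fit is a hypothetical helper):

import powerlaw

def describe_fit(values):
    fit = powerlaw.Fit(values, discrete=True)  # integer-valued data
    # R > 0 favors the power law over the lognormal; p is the significance
    R, p = fit.distribution_compare("power_law", "lognormal")
    return fit.power_law.alpha, fit.power_law.xmin, R, p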
graph_generation_lfr.py (11 changes: 11 additions & 0 deletions)
@@ -0,0 +1,11 @@
+# Creates LFR benchmark graphs based on the properties of the main graph
+
+# Power law degree exponent = 3.5651290105965305
+# Power law community size exponent = 4.1918
+# Average degree = 6.783669266744867
+# Minimum degree = 1
+# Maximum degree = 6756
+# Minimum community size = 21716
+# Maximum community size = 116565
+
+import networkx as nx
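The module body is still a stub (hence the WIP commit message). As a sketch of a plausible next step, networkx's built-in LFR generator accepts the parameters recorded in the comments above; the mixing parameter mu is not measured in this commit, so the default below is a placeholder, and the measured community-size bounds are omitted since they only make sense for very large n:

import networkx as nx


def generate_lfr_graph(n, mu=0.1, seed=None):
    G = nx.LFR_benchmark_graph(
        n,
        tau1=3.5651290105965305,  # measured power law degree exponent
        tau2=4.1918,  # measured power law community size exponent
        mu=mu,  # placeholder mixing parameter, not measured in this commit
        average_degree=6.783669266744867,
        max_degree=6756,
        seed=seed,
    )
    # Mirror the SBM generator, which stores the ground truth partition on the
    # graph; LFR records each node's community as a frozenset node attribute.
    G.graph["partition"] = list({frozenset(G.nodes[v]["community"]) for v in G})
    return G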
graph_generation.py → graph_generation_sbm.py (6 changes: 4 additions & 2 deletions)
@@ -51,13 +51,15 @@ def generate_sbm_graph(
         edge_prob_mat = EDGE_PROBS
     if community_sizes is None:
         community_sizes = COMMUNITY_SIZES
+    # Scale edge probabilities by 50,000 / n
+    edge_prob_mat = [[p * 50000 / n for p in row] for row in edge_prob_mat]
     community_sizes = [int(round(n * s)) for s in community_sizes]
     G = nx.stochastic_block_model(
         community_sizes, edge_prob_mat, seed=seed, directed=True
     )
     # Add weight=1 to each edge
-    for u, v in G.edges:
-        G[u][v]["weight"] = 1
+    # for u, v in G.edges:
+    #     G[u][v]["weight"] = 1
     return G


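Note on the scaling added above: with p' = p * 50000 / n, a node's expected number of neighbors in a community holding a fraction c of the nodes is p' * (c * n) = 50000 * p * c, which is independent of n. The benchmark therefore keeps expected degrees roughly constant as the graph is resized, and the measured probabilities apply unchanged at n = 50,000.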
graph_samples.py (13 changes: 13 additions & 0 deletions)
@@ -0,0 +1,13 @@
+# From the main graph, randomly sample n nodes and their edges from the graph
+
+from random import sample
+
+import networkx as nx
+
+
+def generate_random_graph(
+    G: nx.DiGraph,
+    n: int,
+) -> nx.DiGraph:
+    random_nodes = sample(list(G.nodes), n)
+    return nx.subgraph(G, random_nodes)
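One caveat: nx.subgraph returns a read-only view backed by the parent graph, and uniform node sampling keeps only edges among the sampled nodes, so samples are typically much sparser than the source. A usage sketch (assumes the main graph loader from main.py, as used in calculate_edge_probabilities.py; n=1000 is an arbitrary choice):

from main import load_network
from graph_samples import generate_random_graph

G = load_network()
sample_view = generate_random_graph(G, 1000)
sample_graph = sample_view.copy()  # copy() detaches the read-only view for mutation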
louvain.py (2 changes: 1 addition & 1 deletion)
@@ -75,7 +75,7 @@ def louvain_partitions(
         yield partition
         new_community_score = global_community_measure(G, partition)
         # print(f"Calculated global measure score: {new_community_score}")
-        if new_community_score - comm_score <= threshold:
+        if abs(new_community_score - comm_score) <= threshold:
             return
         comm_score = new_community_score
         graph = _gen_graph(graph, inner_partition)
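Note on the abs() change above: with the one-sided check, any decrease in the global score, however large, satisfied new_community_score - comm_score <= threshold and terminated the partition loop immediately; taking the absolute value treats only changes within threshold as convergence, so the algorithm keeps refining even when a level temporarily lowers the global measure.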
requirements.txt (1 change: 1 addition & 0 deletions)
@@ -1,3 +1,4 @@
 networkx~=2.8
 scikit-learn~=1.1
 click~=8.1
+powerlaw
