diff --git a/assess.py b/assess.py index ca2b327..4e690f6 100644 --- a/assess.py +++ b/assess.py @@ -18,6 +18,7 @@ local_modularity_density, global_modularity_density, ) +from graph_generation_fs import generate_fs_graph from graph_generation_sbm import generate_sbm_graph from graph_generation_lfr import generate_lfr_graph @@ -57,14 +58,14 @@ local_modularity, ), }, - # "modularity_density": { - # "name": "Modularity Density", - # "partition_func": lambda G: louvain_communities( - # G, - # global_modularity_density, - # local_modularity_density, - # ), - # }, + "modularity_density": { + "name": "Modularity Density", + "partition_func": lambda G: louvain_communities( + G, + global_modularity_density, + local_modularity_density, + ), + }, } @@ -96,7 +97,7 @@ def run_benchmarks( print(f"Running benchmark for measure {measure}...") nmi_scores = [] for seed in graph_seeds: - G = generate_lfr_graph(graph_size, seed=seed) + G = generate_fs_graph(graph_size, seed=seed) partition = COMMUNITY_MEASURES[measure]["partition_func"](G) ground_truth_partition = G.graph["partition"] print( diff --git a/calculate_edge_probabilities.py b/calculate_edge_probabilities.py index 4db4854..c7ebb76 100644 --- a/calculate_edge_probabilities.py +++ b/calculate_edge_probabilities.py @@ -1,4 +1,5 @@ # Get edge probabilities between different communities of the graph to feed into the stochastic block model +import pickle from collections import defaultdict import networkx as nx @@ -57,6 +58,12 @@ def create_community_size_csv(nodes_by_community: dict[set]): def estimate_power_law_degree_exponent(G): + community_size_fit, degree_distrib_fit = power_law_fits(G) + cache_powerlaw_fits(community_size_fit, degree_distrib_fit) + return degree_distrib_fit.power_law.alpha, community_size_fit.power_law.alpha + + +def power_law_fits(G): degrees = [d for n, d in G.degree()] degree_distrib_fit = powerlaw.Fit(degrees) # Now do a powerlaw fit for the community sizes @@ -67,7 +74,16 @@ def 
# 1.56 is the average value of the default nx powerlaw sequence
+ """ + # Initialize random seed + if seed is not None: + randseed(seed) + G = nx.DiGraph() + nodes_per_community = [int(n * f) for f in COMM_FRACTIONS] + # Create power law distribution for degrees + deg_dist = [v * SCALING_FACTOR for v in nx.utils.powerlaw_sequence(n, exponent=DEGREE_ALPHA, seed=seed)] + curr_node = 0 + comm_nodes = [] + for community_idx, num_nodes in enumerate(nodes_per_community): + curr_comm = [] + for _ in range(num_nodes): + G.add_node(curr_node, community=community_idx, degree=deg_dist[curr_node]) + curr_comm.append(curr_node) + curr_node += 1 + comm_nodes.append(curr_comm) + all_nodes = [n for comm in comm_nodes for n in comm] + # Add edges + nodes = list(G.nodes(data=True)) + for node, data in nodes: + deg = data["degree"] + comm_idx = data["community"] + # Add (1 - INTER_COMMUNITY_FRAC) * deg edges to nodes in the same community + for n in range(round(deg * (1 - INTER_COMMUNITY_FRAC))): + random_node = choice(comm_nodes[comm_idx]) + G.add_edge(node, random_node, weight=1) + # Add (INTER_COMMUNITY_FRAC) * deg edges to nodes in other communities + for n in range(round(deg * INTER_COMMUNITY_FRAC)): + random_node = None + # Pick a node in a different community + while random_node is None: + selection = choice(all_nodes) + if G.nodes[selection]["community"] != comm_idx: + random_node = selection + G.add_edge(node, random_node, weight=1) + # Set partition on graph + G.graph['partition'] = [set(comm) for comm in comm_nodes] + return G + + +if __name__ == "__main__": + generate_fs_graph(1000) diff --git a/graph_generation_sbm.py b/graph_generation_sbm.py index 937eaaf..5c8b50c 100644 --- a/graph_generation_sbm.py +++ b/graph_generation_sbm.py @@ -7,6 +7,8 @@ from algorithm.edge_ratio import global_edge_ratio +AVG_DEGREE = 6.78366 + def get_edge_probabilities(edge_prob_csv: Path) -> list[list[float]]: # Outer key: citing community, inner key: cited community, value: probability of edges @@ -51,9 +53,9 @@ def generate_sbm_graph( edge_prob_mat = 
EDGE_PROBS if community_sizes is None: community_sizes = COMMUNITY_SIZES - # Scale edge probabilities by 100,000 / n - edge_prob_mat = [[p * 50000 / n for p in row] for row in edge_prob_mat] community_sizes = [int(round(n * s)) for s in community_sizes] + # Preserve the average degree of the original graph + edge_prob_mat = get_scaled_edge_probs(AVG_DEGREE, edge_prob_mat, community_sizes) G = nx.stochastic_block_model( community_sizes, edge_prob_mat, seed=seed, directed=True ) @@ -63,6 +65,21 @@ def generate_sbm_graph( return G +def get_scaled_edge_probs( + desired_avg_degree: float, + edge_prob_mat: list[list[float]], + community_sizes: list[int], +) -> list[list[float]]: + desired_num_edges = sum(community_sizes) * desired_avg_degree + num_edges_before_scaling = 0 + for i in range(len(edge_prob_mat)): + for j in range(len(edge_prob_mat[i])): + curr_possible_edges = community_sizes[i] * community_sizes[j] + num_edges_before_scaling += curr_possible_edges * edge_prob_mat[i][j] + scaling_factor = desired_num_edges / num_edges_before_scaling + return [[p * scaling_factor for p in row] for row in edge_prob_mat] + + def main(): G = generate_sbm_graph(1000) print(f"Edge ratio: {global_edge_ratio(G, G.graph['partition'])}") diff --git a/requirements.txt b/requirements.txt index be184fd..2f50622 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ networkx~=2.8 scikit-learn~=1.1 click~=8.1 powerlaw +scipy