"""Social Network Analysis applied to Data Warehouses.

5. Presentation examples based on a sample dataset.
"""

# The patch this file was recovered from also adds the edge list
# data/samples/presentation-example.csv (one "source,target" pair per line):
#   A,B  A,E  B,C  B,D  C,D  D,E  E,F  E,G  F,H  F,I  F,J  F,K  J,K

from __future__ import annotations

# 5.1. Import dependencies
import itertools
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import networkx as nx
from networkx.classes.graph import Graph
import numpy as np
# 5.2. Utility functions

# node_size assigned to the node(s) holding the highest metric value.
_MAX_NODE_SIZE = 3500


def _scaling_factor(highest_value: float) -> float:
    """Return the factor that maps ``highest_value`` onto ``_MAX_NODE_SIZE``.

    When the highest value is zero (no edges, or no node with a positive
    metric) the factor falls back to ``_MAX_NODE_SIZE`` instead of raising
    ``ZeroDivisionError``.
    """
    return _MAX_NODE_SIZE / highest_value if highest_value else _MAX_NODE_SIZE


def _draw_sized_network(graph: Graph, node_params: List[float]) -> None:
    """Draw ``graph`` with ``node_params`` used as both node size and colour."""
    plt.figure(figsize=(8, 5))
    nx.draw_networkx(
        graph,
        pos=nx.spring_layout(graph),
        edge_color="dimgray",
        node_color=node_params,
        node_size=node_params,
    )
    plt.axis("off")


def _group_nodes_by_value(
    node_values: List[Tuple[str, float]], highest_first: bool
) -> Dict[float, List[str]]:
    """Bucket nodes by metric value in a single pass.

    Returns a dict whose keys are the distinct values, inserted in descending
    order when ``highest_first`` is true and ascending otherwise; each value
    maps to the sorted list of nodes that share it.
    """
    buckets: Dict[float, List[str]] = {}
    for node, value in node_values:
        buckets.setdefault(value, []).append(node)
    return {
        value: sorted(buckets[value])
        for value in sorted(buckets, reverse=highest_first)
    }


def _take_groups_until_count(
    groups: Dict[float, List[str]], target_node_count: int
) -> Tuple[Dict[float, List[str]], int]:
    """Take leading groups until they hold at least ``target_node_count`` nodes.

    Whole groups are taken in the dict's iteration order, so the returned count
    may exceed the target. Iteration is bounded by the number of groups: a
    target above the total node count returns every group (the previous
    ``while`` loop never terminated in that case), and a target of zero or
    less returns ``({}, 0)`` instead of failing on an unbound local.
    """
    selected: Dict[float, List[str]] = {}
    node_count = 0
    for value, nodes in groups.items():
        if node_count >= target_node_count:
            break
        selected[value] = nodes
        node_count += len(nodes)
    return selected, node_count


def load_graph_from_csv(file: str) -> Graph:
    """Read a comma-delimited edge list from ``file`` into a ``nx.DiGraph``."""
    return nx.read_edgelist(file, delimiter=",", create_using=nx.DiGraph)


def format_graph_info(graph_id: str, graph: Graph) -> str:
    """Return a multi-line summary of ``graph`` headed by ``graph_id`` in upper case.

    The summary lists node count, edge count, density, average clustering
    coefficient and transitivity as computed by networkx.
    """
    return (
        f"{graph_id.upper()} GRAPH INFO:\n"
        f"  Number of nodes: {nx.number_of_nodes(graph)}\n"
        f"  Number of edges: {nx.number_of_edges(graph)}\n"
        f"  Density: {nx.density(graph)}\n"
        f"  Average clustering coefficient: {nx.average_clustering(graph)}\n"
        f"  Transitivity: {nx.transitivity(graph)}"
    )


def draw_out_degree_based_network(graph: Graph) -> None:
    """Draw ``graph`` with nodes sized and coloured by out-degree.

    The highest out-degree maps to ``_MAX_NODE_SIZE``. Nodes with out-degree 0
    are drawn as if their degree were 0.05 so they remain visible.
    """
    degrees = [degree for _, degree in graph.out_degree]
    # default=0 keeps an empty graph from raising in max().
    scaling_factor = _scaling_factor(max(degrees, default=0))
    node_params = [(degree or 0.05) * scaling_factor for degree in degrees]
    _draw_sized_network(graph, node_params)


def group_nodes_by_out_degree(
    graph: Graph, highest_first: bool = True
) -> Dict[int, List[str]]:
    """Group the nodes of ``graph`` by out-degree.

    Args:
        graph: Object whose ``out_degree`` iterates as ``(node, degree)`` pairs.
        highest_first: Order the groups from highest to lowest degree when
            true, lowest to highest otherwise.

    Returns:
        Dict mapping each distinct out-degree (as ``int``) to the sorted list
        of nodes having that out-degree.
    """
    grouped = _group_nodes_by_value(list(graph.out_degree), highest_first)
    return {int(degree): nodes for degree, nodes in grouped.items()}


def get_out_degree_critical_nodes_for_count(
    graph: Graph, target_node_count: int
) -> Tuple[Dict[int, List[str]], int]:
    """Select the highest out-degree groups covering ``target_node_count`` nodes.

    Prints the number of groups taken, the nodes they contain, and the share of
    the graph's total out-degree those nodes account for.

    Returns:
        ``(groups, node_count)``: the selected ``{out_degree: nodes}`` groups,
        highest degree first, and the number of nodes they contain.
    """
    degree_groups = group_nodes_by_out_degree(graph)
    degree_sum = sum(degree for _, degree in graph.out_degree)

    highest_degrees, node_count = _take_groups_until_count(
        degree_groups, target_node_count
    )
    group_count = len(highest_degrees)

    highest_degree_sum = sum(
        degree * len(nodes) for degree, nodes in highest_degrees.items()
    )
    # An edgeless graph has degree_sum == 0; report 0% rather than dividing by it.
    groups_degree_ratio = highest_degree_sum / degree_sum * 100 if degree_sum else 0.0
    print(
        f"{group_count}-critical-groups node count: {node_count}\n"
        f"{group_count}-critical-groups/total out-degree ratio:"
        f" {groups_degree_ratio:.0f}% ({highest_degree_sum}/{degree_sum})"
    )

    return highest_degrees, node_count


def draw_betweenness_centrality_based_network(graph: Graph) -> None:
    """Draw ``graph`` with nodes sized and coloured by unnormalized betweenness.

    The highest betweenness maps to ``_MAX_NODE_SIZE``. Nodes with betweenness
    0 are drawn as if they had a fifth of the lowest positive value.
    """
    values = list(nx.betweenness_centrality(graph, normalized=False).values())
    # default=1.0 covers graphs where no node has positive betweenness; the
    # original min() raised ValueError on the empty sequence.
    lowest_non_zero_bet_cent = min((v for v in values if v > 0), default=1.0)
    scaling_factor = _scaling_factor(max(values, default=0))
    node_params = [
        (betweenness or (lowest_non_zero_bet_cent / 5)) * scaling_factor
        for betweenness in values
    ]
    _draw_sized_network(graph, node_params)


def get_betweenness_centrality_list(bet_cent: Dict[str, float]) -> List[float]:
    """Return the betweenness values of ``bet_cent`` in the dict's own order."""
    return list(bet_cent.values())


def group_nodes_by_betweenness_centrality(
    bet_cent: Dict[str, float], highest_first: bool = True
) -> Dict[float, List[str]]:
    """Group nodes by betweenness centrality.

    Args:
        bet_cent: Mapping of node to betweenness centrality.
        highest_first: Order the groups from highest to lowest value when
            true, lowest to highest otherwise.

    Returns:
        Dict mapping each distinct value (as ``float``) to the sorted list of
        nodes having that value.
    """
    grouped = _group_nodes_by_value(list(bet_cent.items()), highest_first)
    return {float(value): nodes for value, nodes in grouped.items()}


def get_betweenness_centrality_critical_nodes_for_count(
    graph: Graph, target_node_count: int
) -> Tuple[Dict[float, List[str]], int]:
    """Select the highest-betweenness groups covering ``target_node_count`` nodes.

    Betweenness is computed unnormalized. Prints the number of groups taken and
    the nodes they contain.

    Returns:
        ``(groups, node_count)``: the selected ``{betweenness: nodes}`` groups,
        highest value first, and the number of nodes they contain.
    """
    bet_cent = nx.betweenness_centrality(graph, normalized=False)
    bc_groups = group_nodes_by_betweenness_centrality(bet_cent)

    highest_bc, node_count = _take_groups_until_count(bc_groups, target_node_count)
    group_count = len(highest_bc)

    print(f"{group_count}-critical-groups node count: {node_count}\n")

    return highest_bc, node_count


def draw_clustering_based_network(graph: Graph) -> None:
    """Draw ``graph`` with node size and colour inversely tied to clustering.

    Each node's parameter is ``0.5 - coefficient`` scaled by
    ``_MAX_NODE_SIZE / highest coefficient``, so lower coefficients give larger
    nodes. Nodes with coefficient 0 use a tenth of the lowest positive
    coefficient instead.
    """
    values = list(nx.clustering(graph).values())
    # default=1.0 covers graphs where every coefficient is 0; the original
    # min() raised ValueError on the empty sequence.
    lowest_non_zero_clust_coef = min((c for c in values if c > 0), default=1.0)
    scaling_factor = _scaling_factor(max(values, default=0))
    # NOTE(review): 0.5 is a fixed pivot. A coefficient of exactly 0.5 yields
    # size 0 and anything above 0.5 yields a negative size; confirm whether the
    # highest coefficient was meant here.
    node_params = [
        (0.5 - (clust_coef or (lowest_non_zero_clust_coef / 10))) * scaling_factor
        for clust_coef in values
    ]
    _draw_sized_network(graph, node_params)


def get_clustering_list(clustering: Dict[str, float]) -> List[float]:
    """Return the clustering coefficients of ``clustering`` in the dict's own order."""
    return list(clustering.values())


def group_nodes_by_clustering(clustering: Dict[str, float]) -> Dict[float, List[str]]:
    """Group nodes by clustering coefficient, lowest coefficient first.

    Returns:
        Dict mapping each distinct coefficient (as ``float``) to the sorted
        list of nodes having that coefficient.
    """
    grouped = _group_nodes_by_value(list(clustering.items()), False)
    return {float(value): nodes for value, nodes in grouped.items()}


def get_clustering_critical_nodes_for_count(
    graph: Graph, target_node_count: int
) -> Tuple[Dict[float, List[str]], int]:
    """Select the lowest-clustering groups covering ``target_node_count`` nodes.

    Prints the number of groups taken and the nodes they contain.

    Returns:
        ``(groups, node_count)``: the selected ``{coefficient: nodes}`` groups,
        lowest coefficient first, and the number of nodes they contain.
    """
    clustering = nx.clustering(graph)
    cl_groups = group_nodes_by_clustering(clustering)

    lowest_cl, node_count = _take_groups_until_count(cl_groups, target_node_count)
    group_count = len(lowest_cl)

    print(f"{group_count}-critical-groups node count: {node_count}\n")

    return lowest_cl, node_count


if __name__ == "__main__":
    # 5.3. Load the sample dataset from CSV
    sample_data_folder = "../data/samples"

    presentation_graph = load_graph_from_csv(
        f"{sample_data_folder}/presentation-example.csv"
    )
    print(f'{format_graph_info("presentation", presentation_graph)}\n')

    # 5.4. Draw the networks
    draw_out_degree_based_network(presentation_graph)

    presentation_critical_groups, _ = get_out_degree_critical_nodes_for_count(
        presentation_graph, 10
    )
    print(f"\n{presentation_critical_groups}")

    draw_betweenness_centrality_based_network(presentation_graph)

    presentation_critical_groups, _ = get_betweenness_centrality_critical_nodes_for_count(
        presentation_graph, 10
    )
    print(f"\n{presentation_critical_groups}")

    draw_clustering_based_network(presentation_graph)

    presentation_critical_groups, _ = get_clustering_critical_nodes_for_count(
        presentation_graph, 10
    )
    print(f"\n{presentation_critical_groups}")

    # Outside a notebook the figures are only displayed once show() is called.
    plt.show()