-
Notifications
You must be signed in to change notification settings - Fork 6
/
cluster.py
66 lines (56 loc) · 2.1 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
import igraph as ig
import numpy as np
#%%
# Read files
# Both inputs are tab-separated tables located in ../data.
nodes_df = pd.read_csv(
    '../data/citation_nodes-0.txt',
    sep='\t',
    # Pin 'abstract' to string dtype. low_memory=False makes pandas parse the
    # file in one go instead of in chunks, which avoids mixed-type inference.
    dtype={'abstract': 'str'},
    low_memory=False,
)
edges_df = pd.read_csv(
    '../data/citation_edges-0.txt',
    sep='\t',
)
#%%
# Create graph
# Every row of nodes_df becomes a vertex (looked up by its 'id' field) and
# every row of edges_df becomes a directed edge whose endpoints are given by
# the two foreign-key columns below.
endpoint_columns = ('citing_pub_id', 'cited_pub_id')
G = ig.Graph.DictList(
    vertices=nodes_df.to_dict('records'),
    edges=edges_df.to_dict('records'),
    directed=True,
    vertex_name_attr='id',
    edge_foreign_keys=endpoint_columns,
)
# The endpoint ids are now encoded in the graph structure itself, so remove
# the redundant copies stored as edge attributes.
for column in endpoint_columns:
    del G.es[column]
#%%
# Get weakly connected component
# Keep only the largest weakly connected component of the citation graph.
H = G.components(mode='weak').giant()
# Weight each edge by 1 / (out-degree of its source vertex), so the outgoing
# edge weights of every source vertex add up to 1.
degree = np.asarray(H.degree(mode='out'))
H.es['weight'] = [1.0 / degree[source] for source, _ in H.get_edgelist()]
# Convert to an undirected graph in place; edges that collapse onto the same
# vertex pair are merged and their attributes summed.
H.to_undirected(combine_edges='sum')
#%%
# Cluster publications
import random

# Seed Python's RNG and install it as igraph's random number generator so the
# clustering below runs from a fixed seed.
random.seed(0)
ig.set_random_number_generator(random)

# One Leiden run per resolution parameter. Each run after the first operates
# on the aggregated graph of the previous solution, so the solutions are
# nested: vertices of level k+1 are the clusters of level k.
res_params = [2e-5, 1e-5]
cluster_solutions = []
graph = H
for res in res_params:
    partition = graph.community_leiden(
        resolution_parameter=res,
        n_iterations=10,
        weights='weight',
        node_weights='weight',
    )
    cluster_solutions.append(partition)
    # Collapse every cluster into a single vertex, summing the 'weight'
    # attribute of the merged vertices and of the merged edges.
    graph = partition.cluster_graph(
        combine_vertices={'weight': 'sum'},
        combine_edges={'weight': 'sum'},
    )
#%%
# Make dataframe with clustering solution
# Result: pubs_df, indexed by publication id, restricted to the vertices of H
# with weight == 1, carrying one integer column 'clusters_<level>' per entry
# of cluster_solutions.
pubs_df = nodes_df.set_index('id')

# Materialise the vertex ids once; they are needed for every assignment below.
vertex_ids = H.vs['id']

# membership[v] is the cluster of original vertex v at the current level.
# Level k's partition is defined on the clusters of level k-1, so the
# per-level memberships are composed to map original vertices through.
membership = np.arange(H.vcount())
cluster_cols = []
for idx, clusters in enumerate(cluster_solutions):
    col = 'clusters_{}'.format(idx)
    # Fancy indexing composes the two mappings without a Python-level loop.
    membership = np.asarray(clusters.membership)[membership]
    # Rows of pubs_df that are not in H get NaN here, which makes the new
    # column float; it is cast to int after those rows are dropped.
    pubs_df.loc[vertex_ids, col] = membership
    cluster_cols.append(col)

# Keep only publications that are vertices of H, in vertex order.
pubs_df = pubs_df.loc[vertex_ids, :]
# NOTE(review): presumably weight == 1 selects the publications to report;
# confirm against the input data description.
pubs_df = pubs_df[pubs_df['weight'] == 1]
# astype returns a new frame, so no column is assigned on the filtered
# selection above (which would raise pandas' SettingWithCopyWarning).
pubs_df = pubs_df.astype(dict.fromkeys(cluster_cols, 'int'))
#%%
# Write results
# Two tab-separated files, both including the index column: the full
# publication table, and the cluster columns on their own.
cols = ['clusters_{}'.format(level) for level, _ in enumerate(cluster_solutions)]
pubs_df.to_csv('cluster_solutions_pubs.txt', sep='\t', index=True)
pubs_df.loc[:, cols].to_csv('cluster_solutions.txt', sep='\t', index=True)