-
Notifications
You must be signed in to change notification settings - Fork 0
/
document_clustering.py
227 lines (180 loc) · 8.33 KB
/
document_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""
=======================================
Clustering text documents using k-means
=======================================
This is an example showing how the scikit-learn can be used to cluster
documents by topics using a bag-of-words approach. This example uses
a scipy.sparse matrix to store the features instead of standard numpy arrays.
Two feature extraction methods can be used in this example:
- TfidfVectorizer uses a in-memory vocabulary (a python dict) to map the most
frequent words to features indices and hence compute a word occurrence
frequency (sparse) matrix. The word frequencies are then reweighted using
the Inverse Document Frequency (IDF) vector collected feature-wise over
the corpus.
- HashingVectorizer hashes word occurrences to a fixed dimensional space,
possibly with collisions. The word count vectors are then normalized to
each have l2-norm equal to one (projected to the euclidean unit-ball) which
seems to be important for k-means to work in high dimensional space.
HashingVectorizer does not provide IDF weighting as this is a stateless
model (the fit method does nothing). When IDF weighting is needed it can
be added by pipelining its output to a TfidfTransformer instance.
Two algorithms are demoed: ordinary k-means and its more scalable cousin
minibatch k-means.
Additionally, latent semantic analysis can also be used to reduce dimensionality
and discover latent patterns in the data.
It can be noted that k-means (and minibatch k-means) are very sensitive to
feature scaling and that in this case the IDF weighting helps improve the
quality of the clustering by quite a lot as measured against the "ground truth"
provided by the class label assignments of the 20 newsgroups dataset.
This improvement is not visible in the Silhouette Coefficient which is small
for both as this measure seem to suffer from the phenomenon called
"Concentration of Measure" or "Curse of Dimensionality" for high dimensional
datasets such as text data. Other measures such as V-measure and Adjusted Rand
Index are information theoretic based evaluation scores: as they are only based
on cluster assignments rather than distances, hence not affected by the curse
of dimensionality.
Note: as k-means is optimizing a non-convex objective function, it will likely
end up in a local optimum. Several runs with independent random init might be
necessary to get a good convergence.
"""
# Author: Peter Prettenhofer <[email protected]>
# Lars Buitinck
# License: BSD 3 clause
from __future__ import print_function
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s')
# parse commandline arguments
op = OptionParser()
op.add_option("--lsa",
dest="n_components", type="int",
help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
action="store_false", dest="minibatch", default=True,
help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
action="store_false", dest="use_idf", default=True,
help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
action="store_true", default=False,
help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
help="Maximum number of features (dimensions)"
" to extract from text.")
op.add_option("--verbose",
action="store_true", dest="verbose", default=False,
help="Print progress reports inside k-means algorithm.")
print(__doc__)
op.print_help()
def is_interactive():
return not hasattr(sys.modules['__main__'], '__file__')
# work-around for Jupyter notebook and IPython console
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if len(args) > 0:
op.error("this script takes no arguments.")
sys.exit(1)
# #############################################################################
# Load some categories from the training set
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
# Uncomment the following to do the analysis on all the categories
# categories = None
print("Loading 20 newsgroups dataset for categories:")
print(categories)
dataset = fetch_20newsgroups(subset='all', categories=categories,
shuffle=True, random_state=42)
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()
labels = dataset.target
true_k = np.unique(labels).shape[0]
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
if opts.use_idf:
# Perform an IDF normalization on the output of HashingVectorizer
hasher = HashingVectorizer(n_features=opts.n_features,
stop_words='english', alternate_sign=False,
norm=None, binary=False)
vectorizer = make_pipeline(hasher, TfidfTransformer())
else:
vectorizer = HashingVectorizer(n_features=opts.n_features,
stop_words='english',
alternate_sign=False, norm='l2',
binary=False)
else:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
min_df=2, stop_words='english',
use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()
if opts.n_components:
print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(opts.n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
print("done in %fs" % (time() - t0))
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
int(explained_variance * 100)))
print()
# #############################################################################
# Do the actual clustering
if opts.minibatch:
km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
verbose=opts.verbose)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
% metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, km.labels_, sample_size=1000))
print()
if not opts.use_hashing:
print("Top terms per cluster:")
if opts.n_components:
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
else:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
print("Cluster %d:" % i, end='')
for ind in order_centroids[i, :10]:
print(' %s' % terms[ind], end='')
print()