# util.py: text cleaning, word n-gram, and sparse tf-idf matrix helpers.
import re
import pickle
from collections import Counter, defaultdict

import numpy as np
from scipy.sparse import csr_matrix

def clean(raw):
    """Strip HTML tags and non-alphanumeric characters, drop words
    shorter than three letters, and return the lowercased text."""
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw)        # drop HTML tags
    cleanr = re.compile('[^a-zA-Z0-9]')
    cleantext = re.sub(cleanr, ' ', cleantext)  # keep letters and digits only
    cleanr = re.compile(r'\W*\b\w{1,2}\b')
    cleantext = re.sub(cleanr, '', cleantext)   # drop 1-2 character words
    return cleantext.lower()

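# A quick sketch of what clean() produces (the input string is made up):
#   clean("<p>The cat sat on 1 mat!</p>")
# gives " the cat sat mat  " (modulo leftover whitespace): tags and
# punctuation become spaces, and the short words "on" and "1" are dropped.
# The stray spaces are harmless if the result is later split() into tokens.
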
def group(inp, n=2):
    """Yield every contiguous window of n items from inp."""
    for i in range(len(inp) - (n - 1)):
        yield inp[i:i + n]

def group2words(inp):
    """Join each consecutive pair of tokens into a bigram string."""
    comb_2_words = []
    for f, s in group(inp, 2):
        comb_2_words.append(f + " " + s)
    return comb_2_words

def group3words(inp):
    """Join each consecutive triple of tokens into a trigram string."""
    comb_3_words = []
    for f, s, t in group(inp, 3):
        comb_3_words.append(f + " " + s + " " + t)
    return comb_3_words

def getKmers(inp):
    """Return the bigrams followed by the trigrams of inp as one list."""
    kmers = []
    kmers.extend(group2words(inp))
    kmers.extend(group3words(inp))
    return kmers

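# A usage sketch on a made-up token list:
#   getKmers(["the", "cat", "sat"])
# returns ["the cat", "cat sat", "the cat sat"], i.e. the word bigrams
# followed by the word trigrams.
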
def build_matrix(docs):
    r"""Build a sparse CSR matrix from a list of documents,
    each of which is a list of words/terms in the document.
    """
    nrows = len(docs)
    idx = {}  # term -> column index, in first-seen order
    tid = 0
    nnz = 0   # total number of stored (non-zero) entries
    # first pass: count non-zeros and assign a column id to each term
    for d in docs:
        # d = d[1:]  # (disabled) strip a leading rating token
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
    # set up memory
    ind = np.zeros(nnz, dtype=np.int64)        # column indices
    val = np.zeros(nnz, dtype=np.double)       # term frequencies
    ptr = np.zeros(nrows + 1, dtype=np.int64)  # row pointers
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # second pass: transfer the term counts into the CSR arrays
    for d in docs:
        # d = d[1:]  # (disabled) strip a leading rating token
        cnt = Counter(d)
        keys = [k for k, _ in cnt.most_common()]
        l = len(keys)
        for j, k in enumerate(keys):
            ind[j + n] = idx[k]
            val[j + n] = cnt[k]
        ptr[i + 1] = ptr[i] + l
        n += l
        i += 1
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    return mat

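# A minimal sketch with hypothetical documents; column ids follow
# first-seen term order (cat=0, sat=1, dog=2 here):
#   docs = [["cat", "sat", "cat"], ["dog", "sat"]]
#   mat = build_matrix(docs)
#   mat.shape      -> (2, 3)
#   mat.toarray()  -> [[2., 1., 0.],
#                      [0., 1., 1.]]
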
# scale matrix by idf and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r"""Scale a CSR matrix by inverse document frequency.
    Modifies mat in place and returns the idf scaling factors as a
    dict; if copy is True, scales and returns a copy of the matrix
    instead.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency: the number of rows each column appears in
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency -- df turns into idf, reusing memory
    for k, v in df.items():
        df[k] = np.log(nrows / float(v))
    # scale every stored value by the idf of its column
    for i in range(nnz):
        val[i] *= df[ind[i]]
    return df if copy is False else mat

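# Note: this is the unsmoothed idf, idf(t) = log(N / df(t)), so a term
# that appears in all N rows gets weight log(1) = 0 and is effectively
# zeroed out of the matrix.
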
def csr_l2normalize(mat, copy=False, **kargs):
    r"""Normalize the rows of a CSR matrix by their L-2 norm.
    Modifies mat in place; if copy is True, normalizes and returns
    a copy instead.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize: divide each row by its L-2 norm
    for i in range(nrows):
        rsum = 0.0
        for j in range(ptr[i], ptr[i + 1]):
            rsum += val[j] ** 2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0 / np.sqrt(rsum)
        for j in range(ptr[i], ptr[i + 1]):
            val[j] *= rsum
    if copy is True:
        return mat

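# The pipeline these defaults suggest (a sketch, not part of the
# original file):
#   mat = build_matrix(docs)
#   csr_idf(mat)           # scales mat in place; returns the idf dict
#   csr_l2normalize(mat)   # in place; rows now have unit L-2 norm
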
def save_pickle(matrix, filename):
    """Serialize an object to filename using the highest pickle protocol."""
    with open(filename, 'wb') as outfile:
        pickle.dump(matrix, outfile, pickle.HIGHEST_PROTOCOL)

def load_pickle(filename):
    """Load and return a previously pickled object from filename."""
    with open(filename, 'rb') as infile:
        matrix = pickle.load(infile)
    return matrix

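# Round-trip sketch (the filename is made up):
#   save_pickle(mat, "docs_tfidf.pkl")
#   mat2 = load_pickle("docs_tfidf.pkl")
#   (mat != mat2).nnz == 0   # True: the matrix survives the round trip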