# math_vectorizer.py
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import mmh3
from nltk.stem.porter import PorterStemmer
# stopwords can be downloaded by importing nltk to python and then executing the command nltk.download('stopwords')
from nltk.corpus import stopwords
# We will use itertools to efficiently flatten lists of lists
import itertools
# Due to the huge amount of data we will need sparse matrices
from scipy.sparse import csr_matrix
class MathHashingVectorizer(object):
    """Transform (mathematical) text into sparse vectors with numerical entries.

    The typical use of this class is to remove the stopwords stored in ``self.stop`` from each text,
    hash the remaining words into the index range of a fixed-size feature space and assemble the
    result into a sparse matrix whose rows can be used as samples when training a classifier.

    Methods:
    __init__: Constructor method.
    seeded_mmh3: Seeded hashing function based on Murmurhash3.
    update_stopwords: Append stopwords.
    remove_from_text: Removes all stopwords from the input text.
    porter_stem: Applies Porter stemming to the input text.
    stem: Applies Porter stemming to all text not in math mode.
    encode: Returns an ndarray where entries correspond to hashing values of the input text.
    vectorize_encoded: Transforms a pandas Series of encoded values to a sparse matrix.
    vectorize: Transforms a pandas Series of texts to a sparse matrix suitable for training a classifier.
    """

    def __init__(self, random_seed=123, n=2**21):
        """
        Constructor method.

        :param random_seed: seed passed to the hash function, for reproducibility.
        :param n: The dimension of the hashing space, or in other words the number of features.
        :type random_seed: int
        :type n: int
        """
        self.random_seed = random_seed
        self.n = n
        # NOTE: remove_from_text lower-cases each word before the lookup, so the capitalised
        # entry 'Generalization' can never match; it is kept so the attribute stays unchanged.
        self.math_stopwords = [
            'proof', 'theorem', 'proposition', 'definition', 'lemma', 'counterexample',
            'counterexamples', 'conjecture', 'conjectures', 'proofs', 'definitions',
            'theorems', 'propositions', 'lemmas',
            'Generalization', 'result', 'generalization', 'appendix', 'corollary',
            'generalizations', 'generalisation', 'generalisations',
        ]
        self.stop = stopwords.words('english') + stopwords.words('french') + self.math_stopwords
        self.porter = PorterStemmer()

    def seeded_mmh3(self, x):
        """
        Seeded non-cryptographic hash function found in the mmh3 library (a python wrapper for Murmurhash 3).

        :param x: text
        :return: hash value computed with ``self.random_seed`` as seed
        :type x: str
        :rtype: int
        """
        return mmh3.hash(x, seed=self.random_seed)

    def update_stopwords(self, x):
        """
        Adds stopwords to ``self.stop`` (in place).

        :param x: List of strings to be appended to self.stop. A single string is treated as one stopword.
        :type x: list
        """
        # Extending a list with a bare string would add its characters one by one.
        if isinstance(x, str):
            x = [x]
        self.stop += x

    def remove_from_text(self, text):
        """
        Removes all stopwords from the given string.

        Words are obtained by splitting on whitespace and compared in lower case against ``self.stop``;
        the surviving words are re-joined with single spaces.

        :param text: Text
        :return: Text with all words contained in self.stop removed.
        :type text: str
        :rtype: str
        """
        # Build the lookup set once per call instead of scanning the list for every word.
        stop_words = set(self.stop)
        return ' '.join(word for word in text.split() if word.lower() not in stop_words)

    def porter_stem(self, text):
        """
        Stems the whitespace-separated words of the input text using ``self.porter``.

        :param text: Text to be stemmed.
        :return: Stemmed words joined by single spaces.
        :type text: str
        :rtype: str
        """
        return ' '.join(self.porter.stem(word) for word in text.split())

    def stem(self, text):
        """
        Applies Porter stemming to all text not in math mode (that is, not wrapped in $ symbols).

        Math segments ``$...$`` are copied verbatim, each followed by a single space. If the text
        contains an odd number of $ symbols the whole text is stemmed with ``porter_stem`` instead.
        Note that ``$$`` is read as an empty math segment, so the content of ``$$...$$`` is stemmed.

        ::warning: This is an early version; a notice is printed on every call.

        :param text: Text to be transformed
        :return: Stemmed text
        :type text: str
        :rtype: str
        """
        print('This is an early version of stem and we advice caution')
        if text == '':
            return ''
        if text.count('$') % 2 != 0:
            print('text contains an odd number of $ symbols and we will treat it wit porter_stem')
            return self.porter_stem(text)
        txt = text
        transformed_text = ''
        while len(txt) > 0:
            first_occurrence = txt.find('$')
            if first_occurrence == -1:
                # No math left: stem the tail and stop. Without this branch both searches return
                # -1, txt is never shortened and the loop does not terminate.
                transformed_text += self.porter_stem(txt)
                break
            # The number of $ symbols is even, so a closing $ always exists here.
            second_occurrence = txt.find('$', first_occurrence + 1)
            if first_occurrence != 0:
                transformed_text += self.porter_stem(txt[:first_occurrence]) + ' '
            transformed_text += txt[first_occurrence: second_occurrence + 1] + ' '
            txt = txt[second_occurrence + 1:]
        return transformed_text

    def encode(self, text, max_words=40, hash_function=None, stemming=False):
        """
        Removes stopwords from a text and encodes the result into an array of integers using a hashing function.

        The result always has exactly ``max_words`` entries: longer texts are truncated and shorter ones
        are padded with the value ``self.n``, which vectorize_encoded later discards.

        :param text: Text to be encoded.
        :param max_words: Maximum number of words in the string to be considered.
        :param hash_function: optional, default = None. A hashing function. If no hashing function is provided
        self.seeded_mmh3 will be applied.
        :param stemming: optional, default = False. If set to True we stem the words in the text (using self.stem)
        before encoding.
        :return: ndarray of hashing values.
        :type text: str
        :type max_words: int
        :type hash_function: function
        :type stemming: bool
        :rtype: ndarray, shape = (max_words,)
        """
        text = self.remove_from_text(text)
        if stemming:
            text = self.stem(text)
        filters = '!"$\n \t'
        if hash_function is None:
            hash_function = self.seeded_mmh3
        hashed = keras.preprocessing.text.hashing_trick(
            text=text, n=self.n, hash_function=hash_function, lower=False, filters=filters)
        # Always hand back an integer ndarray, also when no padding is required.
        vec = np.asarray(hashed[:max_words], dtype=int)
        missing = max_words - len(vec)
        if missing > 0:
            # self.n serves as the padding index.
            vec = np.concatenate((vec, np.full(missing, self.n, dtype=int)))
        return vec

    def vectorize_encoded(self, df, max_words=40, mode='binary'):
        """
        Transforms a pandas Series of ndarray's to a csr-matrix.

        The non-zero entries of each row in the returned sparse matrix correspond to the values in the
        corresponding ndarray of hashing values. The padding value ``self.n`` is placed in an extra last
        column that is dropped before returning.

        :param df: Series where each entry is an ndarray of hashing values (as produced by self.encode).
        :param max_words: optional, default = 40. Not used by this method; kept for interface compatibility.
        :param mode: optional, default = 'binary'. If set to 'count' an entry d in a column means that the index
        of this column appeared d times in the corresponding ndarray. For any other value a non-zero entry is
        set to 1 if the index of the column (in the given row) appeared in the corresponding ndarray.
        :return: Sparse matrix
        :type df: pandas Series
        :type max_words: int
        :type mode: str
        :rtype: csr_matrix, shape = (df.shape[0], self.n)
        """
        n_rows = len(df)
        # For every row: the distinct column indices and how often each one occurs.
        per_row = [np.unique(np.asarray(entry).astype(int), return_counts=True) for entry in df]
        if per_row:
            cols = np.concatenate([uniq for uniq, _ in per_row])
            occurrences = np.concatenate([cnt for _, cnt in per_row])
            # Row i is repeated once for each distinct index found in entry i.
            rows = np.repeat(np.arange(n_rows), [len(uniq) for uniq, _ in per_row])
        else:
            cols = occurrences = rows = np.array([], dtype=int)
        if mode == 'count':
            # The counts line up one-to-one with cols because both come from the same np.unique call.
            data = occurrences
        else:
            data = np.ones(cols.shape[0])
        # One extra column collects the padding index self.n; it is sliced off again below.
        X = csr_matrix((data, (rows, cols)), shape=(n_rows, self.n + 1))
        return X[:, :-1]

    def vectorize(self, df, max_words=40, mode='binary'):
        """
        Transforms a pandas Series of strings to a csr-matrix suitable for training a classifier.

        The transformation is carried out by first applying self.encode to each entry of the series and then
        applying self.vectorize_encoded to the result.

        :param df: Series of (mathematical) text to be transformed.
        :param max_words: optional, default = 40. Maximum number of words to consider in each entry of the Series
        :param mode: 'binary' or 'count'. see the documentation for self.vectorize_encoded.
        :return: Sparse matrix of vectorized samples.
        :type df: pandas Series
        :type max_words: int
        :type mode: str
        :rtype: csr_matrix, shape = (df.shape[0], self.n)
        """
        encoded_df = df.apply(lambda x: self.encode(x, max_words=max_words))
        return self.vectorize_encoded(df=encoded_df, max_words=max_words, mode=mode)