-
Notifications
You must be signed in to change notification settings - Fork 5
/
features.py
129 lines (107 loc) · 4.32 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from collections import Counter
from itertools import chain
from itertools import groupby
from operator import itemgetter
#from string import punctuation
from unicodedata import category
import nltk
import numpy as np
from scipy import sparse as sp
# Previous ASCII-only definition based on string.punctuation (left disabled):
#PUNCTUATION = set(punctuation)
# First letters of the Unicode general categories treated as punctuation by
# split_on_punctuation: 'M' (marks), 'P' (punctuation), 'S' (symbols).
PUNCTUATION = {'M', 'P', 'S'}
# Integer dtype of the unweighted count matrices built by docs2bofs.
UINT = np.uint16
def split_on_punctuation(document):
    '''tokenizes a string by splitting on whitespace and at punctuation boundaries
    A character counts as punctuation when the first letter of its Unicode
    category is in PUNCTUATION; consecutive characters of the same kind stay
    together in one token.
    Args:
        document: string
    Returns:
        str generator
    '''
    def is_punctuation(char):
        return category(char)[0] in PUNCTUATION

    for piece in document.split():
        if len(piece) == 1:
            # a single character cannot be split any further
            yield piece
            continue
        # each maximal run of punctuation or of non-punctuation is one token
        for _, run in groupby(piece, key=is_punctuation):
            yield ''.join(run)
def tokenize(documents):
    '''tokenizes every document with split_on_punctuation
    Args:
        documents: iterable of strings
    Returns:
        list of list of strings, one inner list per document in input order
    '''
    tokenized = []
    for text in documents:
        tokenized.append(list(split_on_punctuation(text)))
    return tokenized
def feature_counts(documents):
    '''counts how often each feature occurs across all featurized documents
    Args:
        documents: iterable of lists of hashable features
    Returns:
        Counter (dict subclass) mapping features to counts
    '''
    # flatten the documents into a single stream of features and tally it
    all_features = chain.from_iterable(documents)
    return Counter(all_features)
def feature_vocab(documents, min_count=1, sorted_features=sorted):
    '''builds a feature-to-index vocabulary from featurized documents
    Args:
        documents: iterable of lists of hashable features
        min_count: minimum number of times feature must appear to be included in the vocabulary
        sorted_features: function that orders the retained features; indices follow that order
    Returns:
        {feature: index} dict
    '''
    counts = feature_counts(documents)
    # drop rare features before ordering
    frequent = (feat for feat, count in counts.items() if count >= min_count)
    ordered = sorted_features(frequent)
    return {feat: index for index, feat in enumerate(ordered)}
def docs2bofs(documents, vocabulary=None, weights=None, default=1.0, format='csr', **kwargs):
    '''constructs sparse BoF (bag-of-features) representations from featurized documents
    Args:
        documents: iterable of lists of hashable features
        vocabulary: dict mapping features to indices (nonnegative ints) or a list/tuple of features; if None will compute automatically from documents
        weights: dict mapping features to weights (floats) or a list/np.ndarray of weights ordered by vocabulary index; if None will compute unweighted BoFs
        default: default feature weight if not feature in weights; only used when weights is a dict
        format: sparse matrix format
        kwargs: passed to feature_vocab; ignored if not vocabulary is None
    Returns:
        sparse BoF matrix in the requested format of size (len(documents), len(vocabulary));
        unweighted matrices have dtype UINT, weighted ones are counts scaled column-wise by the weights
    '''
    # documents is traversed for the vocabulary, for counting and for the row
    # count, so one-shot iterables (e.g. generators) are materialized first
    if not hasattr(documents, '__len__'):
        documents = list(documents)

    if vocabulary is None:
        vocabulary = feature_vocab(documents, **kwargs)
    elif isinstance(vocabulary, (list, tuple)):
        vocabulary = {feat: i for i, feat in enumerate(vocabulary)}

    m = len(documents)
    V = len(vocabulary)

    # count (row, column) occurrences; out-of-vocabulary features map to the
    # sentinel column -1 and are dropped below
    pair_counts = Counter(
        (row, vocabulary.get(feat, -1))
        for row, doc in enumerate(documents)
        for feat in doc
    )
    entries = [(row, col, count) for (row, col), count in pair_counts.items() if col != -1]
    if entries:
        rows, cols, values = zip(*entries)
    else:
        # no in-vocabulary feature at all: return an all-zero matrix instead of
        # failing to unpack an empty zip
        rows, cols, values = (), (), ()

    if weights is None:
        return sp.coo_matrix((values, (rows, cols)), shape=(m, V), dtype=UINT).asformat(format)

    bofs = sp.coo_matrix((values, (rows, cols)), shape=(m, V)).tocsr()
    if isinstance(weights, dict):
        # zeros (not uninitialized memory) for any column no feature maps to
        diag = np.zeros(V)
        for feat, i in vocabulary.items():
            diag[i] = weights.get(feat, default)
    else:
        assert len(weights) == V, "if weights passed as a list/np.ndarray, length must be same as vocabulary size"
        diag = np.asarray(weights)
    # right-multiplying by a diagonal matrix scales each feature column by its weight
    return bofs.dot(sp.diags(diag, 0)).asformat(format)
def sif_weights(documents_or_counts, a=1E-2):
    '''computes SIF weights a*total/(a*total+count), where total is the sum of all counts
    Args:
        documents_or_counts: iterable of lists of hashable features or dict mapping features to counts or count vector (np.ndarray)
        a: SIF parameter
    Returns:
        if passed documents or count dict: dict mapping features to weights (floats); else a weight vector
    '''
    if isinstance(documents_or_counts, np.ndarray):
        # ndarray.sum accumulates in a wide integer type; the builtin sum adds
        # element scalars in the array's own dtype and can overflow for small
        # integer dtypes such as uint16
        axtotal = a * documents_or_counts.sum(axis=0)
        return axtotal / (axtotal + documents_or_counts)
    if not hasattr(documents_or_counts, 'items'):
        # not a mapping of counts: treat as featurized documents (list, tuple, generator, ...)
        documents_or_counts = feature_counts(documents_or_counts)
    axtotal = a * sum(documents_or_counts.values())
    return {feat: axtotal / (axtotal + count) for feat, count in documents_or_counts.items()}