-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcommon.py
97 lines (78 loc) · 2.79 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import logging
from zipfile import ZipFile
import json
import re
import numpy as np
from sklearn.datasets import fetch_20newsgroups
def _get_yelp_reviews_as_dict(label):
"""
Yield yelp reviews as a dict {x: list of tokenised sentences, y: label}
:param label: 'training' or 'test'
:return: :raise StopIteration:
"""
def _clean_yelp(text):
# cleaner (order matters)
contractions = re.compile(r"'|-|\"")
# all non alphanumeric
symbols = re.compile(r'(\W+)', re.U)
# single character removal
singles = re.compile(r'(\s\S\s)', re.I | re.U)
# separators (any whitespace)
seps = re.compile(r'\s+')
text = text.lower()
text = contractions.sub('', text)
text = symbols.sub(r' \1 ', text)
text = singles.sub(' ', text)
text = seps.sub(' ', text)
return text
# sentence splitter
alteos = re.compile(r'([!\?])')
def _yelp_sentences(l):
l = alteos.sub(r' \1 .', l).rstrip("(\.)*\n")
return l.split(".")
with ZipFile("data/yelp_%s_set.zip" % label, 'r') as zf:
with zf.open("yelp_%s_set/yelp_%s_set_review.json" % (label, label)) as f:
for i, line in enumerate(f):
if i > 1000:
raise StopIteration
rev = json.loads(line.decode())
yield {'y': rev['stars'], \
'x': [_clean_yelp(s).split() for s in _yelp_sentences(rev['text'])]}
def _get_yelp_data(three_way=False):
    """
    Return yelp data as a tuple of
      - list of training documents (untokenised, unsegmented)
      - np.array of training labels (star ratings)
      - list of test documents
      - np.array of test labels

    :param three_way: if True, collapse the 5-star labels into three classes:
        stars <= 2 -> 1, stars == 3 -> 3 (unchanged), stars >= 4 -> 4
    :rtype: tuple
    """
    def _read(setname):
        # Flatten each review's sentences back into a single space-joined string.
        X, y = [], []
        for doc in _get_yelp_reviews_as_dict(setname):
            words = []
            for sent in doc['x']:
                words.extend(sent)
            X.append(' '.join(words))
            y.append(doc['y'])
        return X, np.array(y)

    tr_text, ytr = _read('training')
    ev_text, yev = _read('test')
    if three_way:
        for y in [ytr, yev]:
            y[y <= 2] = 1
            # NOTE(review): the original also had `y[y == 2] = 2` here — dead
            # code, since no element equals 2 after the assignment above.
            y[y >= 4] = 4
    return tr_text, ytr, ev_text, yev
def _get_20ng_data():
    """
    Return a 5-category subset of 20 Newsgroups as
    (train_docs, train_labels, test_docs, test_labels).
    """
    categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
                  'rec.sport.hockey', 'talk.religion.misc']

    def _fetch(subset):
        # Download/cache under data/ like the yelp loader does.
        return fetch_20newsgroups(data_home='data', subset=subset,
                                  categories=categories)

    train = _fetch('train')
    test = _fetch('test')
    return train.data, train.target, test.data, test.target
def get_data(corpus, **kwargs):
    """
    Load a labelled corpus by name.

    :param corpus: 'yelp' or '20ng'
    :param kwargs: forwarded to the chosen loader
        (e.g. three_way=True for 'yelp'; '20ng' takes none)
    :return: (train_docs, train_labels, test_docs, test_labels)
    :raise KeyError: if corpus is not a recognised name
    """
    logging.info('Reading labelled corpus')
    loaders = {
        'yelp': _get_yelp_data,
        '20ng': _get_20ng_data
    }
    # Bug fix: kwargs were accepted but silently dropped, so e.g.
    # get_data('yelp', three_way=True) ignored three_way. Forward them.
    return loaders[corpus](**kwargs)