-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocess_reviews.py
41 lines (31 loc) · 1.03 KB
/
process_reviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import gzip
import logging
# Configure the root logger once at import time: timestamped records, INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
def parse(filename):
    """Yield records from a gzip-compressed file of ``key: value`` lines.

    Each line is stripped and split at its first colon. The text before the
    colon becomes the key; the text after the colon, minus the one character
    that immediately follows it (the space in ``key: value``), becomes the
    value. Any line without a colon (typically a blank line) ends the current
    record: the record accumulated so far is yielded and a new one is started.

    Args:
        filename: Path to the gzip-compressed input file, read as UTF-8 text.

    Yields:
        dict: One mapping of field name to field value per record. A record
        may be empty (``{}``) when separator lines are adjacent or when the
        file ends with a separator, so callers should skip falsy records.
    """
    entry = {}
    # Text mode ('rt') makes every line a ``str``; with plain 'r' gzip yields
    # bytes on Python 3 and the colon search below would raise TypeError.
    # The ``with`` block guarantees the file is closed, even if the consumer
    # stops iterating early and the generator is closed.
    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            key, colon, value = line.strip().partition(':')
            if not colon:
                # No colon on this line: treat it as a record separator.
                yield entry
                entry = {}
                continue
            # Skip the single character right after the colon.
            entry[key] = value[1:]
    # Emit whatever was accumulated after the last separator.
    yield entry
def topics(raw_data, out_folder, name, id=False):
    """Write the text of every parsed review to a file, one review per line.

    The output file is ``<name>_topics_in_id.txt`` when ``id`` is true and
    ``<name>_topics_in.txt`` otherwise. Any existing file at that path is
    overwritten.

    Args:
        raw_data: Path to the gzip-compressed reviews file handed to
            ``parse``.
        out_folder: Prefix prepended verbatim to the output file name; it is
            joined by plain concatenation, so a directory must include its
            trailing separator (e.g. ``'data/'``).
        name: Stem used to build the output file name.
        id: When true, each line is written as
            ``<review/userId> == <review/text>``; otherwise only the review
            text is written.

    Raises:
        KeyError: If a non-empty record lacks ``'review/text'`` (or
            ``'review/userId'`` when ``id`` is true).
    """
    fname = '%s_topics_in_id.txt' % name if id else '%s_topics_in.txt' % name
    # 'w' replaces the original 'wa', which Python 3 rejects with ValueError
    # because it names both write and append. The ``with`` block closes the
    # file even if a record raises part-way through.
    with open(out_folder + fname, 'w', encoding='utf-8') as fout:
        logging.info('Parsing raw data and processing it')
        for review in parse(raw_data):
            if not review:
                # parse() can yield empty records around separator lines.
                continue
            if id:
                line = review['review/userId'] + " == " + review['review/text'] + "\n"
            else:
                line = review['review/text'] + "\n"
            fout.write(line)
# Script entry point: extract review texts from the Electronics dump into data/.
if __name__ == '__main__':
    topics(
        raw_data='data/Electronics.txt.gz',
        out_folder='data/',
        name='electronics',
    )