-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_data.py
141 lines (111 loc) · 4.18 KB
/
parse_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
'''
Created on Apr 3, 2014
@author: Amod Samant
'''
import json
import pprint
import re
from string import punctuation
import csv
def create_csv(review_list, user_dict, business_dict,
               output_path='feature_review.csv'):
    """Write one row of numeric features per review to a CSV file.

    Each row combines features of the review text with attributes of the
    reviewed business and of the reviewing user.

    Args:
        review_list: Sequence of review dicts. Each one is read for the keys
            'stars', 'text', 'text_polarity', 'business_id' and 'user_id'.
        user_dict: Mapping of user id to a user dict providing
            'average_stars', 'review_count' and 'fans'.
        business_dict: Mapping of business id to a business dict providing
            'stars' and 'review_count'.
        output_path: Path of the CSV file to (over)write. Defaults to the
            file name the script has always produced.

    Raises:
        KeyError: If a review references an unknown business or user, or if
            one of the expected keys is missing.
    """
    headers = [
        'review_id', 'true_stars', 'word_count',
        'word_cap_count', 'text_polarity',
        'biz_stars', 'biz_review_count',
        'usr_avrstars', 'usr_review_count', 'usr_fans',
    ]
    with open(output_path, 'w') as csvfile:
        csvwriter = csv.DictWriter(csvfile,
                                   fieldnames=headers,
                                   delimiter=',',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL,
                                   lineterminator='\n')
        csvwriter.writeheader()
        # review_id is a 1-based running number, not the dataset's own id.
        for review_id, review_obj in enumerate(review_list, 1):
            # Text features
            num_of_words, cap_words_count = calc_words_plus_cap(
                review_obj['text'])
            # Business and user records are looked up by the ids stored
            # on the review.
            biz_obj = business_dict[review_obj['business_id']]
            usr_obj = user_dict[review_obj['user_id']]
            # TODO: Bag of Categories
            # TODO: Bag of Elites and yelping_since
            csvwriter.writerow({
                'review_id': review_id,
                'true_stars': review_obj['stars'],
                'word_count': num_of_words,
                'word_cap_count': cap_words_count,
                'text_polarity': review_obj['text_polarity'],
                'biz_stars': biz_obj['stars'],
                'biz_review_count': biz_obj['review_count'],
                'usr_avrstars': usr_obj['average_stars'],
                'usr_review_count': usr_obj['review_count'],
                'usr_fans': usr_obj['fans'],
            })
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(len(review_list))
def calc_words_plus_cap(text_string):
    """Count the words and the capitalised words in a piece of text.

    Every punctuation character is replaced by a space before the text is
    split on whitespace, so "don't" counts as the two words "don" and "t".

    Args:
        text_string: The review text to analyse.

    Returns:
        A ``(word_count, cap_word_count)`` tuple, where ``cap_word_count``
        is the number of words whose first character is uppercase.
    """
    # re.escape keeps ']', '\', '^' and '-' literal inside the character
    # class; unescaped, the '\]' pair is read as an escaped ']' and the
    # backslash itself is never treated as punctuation.
    punct_re = re.compile('[{}]'.format(re.escape(punctuation)))
    words = punct_re.sub(' ', text_string).split()
    # Iterate over words, not over the characters of the string, so that
    # "GREAT" counts once rather than five times.
    cap_word_count = sum(1 for word in words if word[0].isupper())
    return len(words), cap_word_count
def build_review_list(path='yelp_academic_dataset_review.json'):
    """Load the review dataset into a list of dicts.

    The file is expected to hold one JSON object per line.

    Args:
        path: Location of the review file. Defaults to the dataset file
            name in the current working directory.

    Returns:
        A list with one parsed JSON object per non-blank line, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not valid JSON.
    """
    review_list = []
    # The context manager closes the file even when json.loads raises.
    with open(path, 'r') as f:
        for line in f:
            # Skip blank lines instead of failing to parse them.
            if line.strip():
                review_list.append(json.loads(line))
    return review_list
def build_business_dict(path='yelp_academic_dataset_business.json'):
    """Load the business dataset and tally category usage.

    The file is expected to hold one JSON object per line.

    Args:
        path: Location of the business file. Defaults to the dataset file
            name in the current working directory.

    Returns:
        A ``(business_dict, biz_categories)`` tuple: ``business_dict`` maps
        each 'business_id' to its parsed object, and ``biz_categories`` maps
        each entry of a business's 'categories' to the number of times it
        occurs across the file.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not valid JSON.
        KeyError: If an object lacks 'business_id' or 'categories'.
    """
    biz_categories = {}
    business_dict = {}
    # The context manager closes the file even when parsing fails midway.
    with open(path, 'r') as f:
        for line in f:
            # Skip blank lines instead of failing to parse them.
            if not line.strip():
                continue
            business_obj = json.loads(line)
            business_dict[business_obj['business_id']] = business_obj
            for cat in business_obj['categories']:
                biz_categories[cat] = biz_categories.get(cat, 0) + 1
    return business_dict, biz_categories
def build_user_dict(path='yelp_academic_dataset_user.json'):
    """Load the user dataset and tally the values of the 'elite' field.

    The file is expected to hold one JSON object per line.

    Args:
        path: Location of the user file. Defaults to the dataset file name
            in the current working directory.

    Returns:
        A ``(user_dict, usr_elites)`` tuple: ``user_dict`` maps each
        'user_id' to its parsed object, and ``usr_elites`` maps each entry
        of a user's 'elite' field to the number of times it occurs across
        the file.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not valid JSON.
        KeyError: If an object lacks 'user_id' or 'elite'.
    """
    usr_elites = {}
    user_dict = {}
    # The context manager closes the file even when parsing fails midway.
    with open(path, 'r') as f:
        for line in f:
            # Skip blank lines instead of failing to parse them.
            if not line.strip():
                continue
            user_obj = json.loads(line)
            user_dict[user_obj['user_id']] = user_obj
            for elite in user_obj['elite']:
                usr_elites[elite] = usr_elites.get(elite, 0) + 1
    return user_dict, usr_elites
# Tally the 'elite' values of users and the categories of businesses, and
# keep businesses and users in dicts so reviews can look them up by id.
business_dict, biz_categories = build_business_dict()
user_dict, usr_elites = build_user_dict()
pprint.pprint(biz_categories)
pprint.pprint(usr_elites)
# Single-argument print(...) produces the same output on Python 2 and 3.
print('Categories: %d' % len(biz_categories))
print('Elites: %d' % len(usr_elites))

# List of dictionary objects (one per JSON object in the review dataset)
review_list = build_review_list()
create_csv(review_list, user_dict, business_dict)