-
Notifications
You must be signed in to change notification settings - Fork 4
/
util.py
114 lines (94 loc) · 3.75 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
util.py
Functions for loading and processing burrito data
"""
import numpy as np
import scipy as sp
import pandas as pd
import re
def load_burritos(delete_unreliable=True, delete_nonSD=True,
                  delete_extra_columns=True, source=None):
    """Load burrito rating data from Google Sheets

    Parameters
    ----------
    delete_unreliable : bool
        if True, delete ratings from the database marked as unreliable
        (an 'x' or 'X' in the 'Unreliable' column)
    delete_nonSD : bool
        if True, delete ratings from the database from outside of San Diego
        (an 'x' or 'X' in the 'NonSD' column)
    delete_extra_columns : bool
        accepted for backward compatibility; this function does not
        currently read it, so it has no effect
    source : str, path or file-like, optional
        anything accepted by `pandas.read_csv`. If None (default), the
        published Google Sheets CSV export is downloaded.

    Returns
    -------
    df : pandas.DataFrame
        information about each burrito consumed
    df_shops : pandas.DataFrame
        information about each taco shop in the database
    df_ingredients : pandas.DataFrame
        information about the ingredients in each burrito in 'df'
    """
    # Load all data
    if source is None:
        source = 'https://docs.google.com/spreadsheet/ccc?key=18HkrklYz1bKpDLeL-kaMrGjAhUM6LeJMIACwEljCgaw&output=csv'
    df = pd.read_csv(source)

    # Remove capitalization and excess spaces from the free-text columns
    for col in ('Location', 'Reviewer', 'Burrito'):
        df[col] = df[col].str.lower().str.strip()

    # Delete unreliable ratings
    flagged = ['x', 'X']
    if delete_unreliable:
        df = df[~df['Unreliable'].isin(flagged)]

    # Delete ratings outside of San Diego
    if delete_nonSD:
        df = df[~df['NonSD'].isin(flagged)]

    # Renumber rows after filtering so labels are contiguous again
    df = df.reset_index(drop=True)

    # Get restaurant info: only rows with a neighborhood filled in are kept.
    # notna() is used rather than an identity test against np.nan, which
    # only works when the missing value happens to be the NaN singleton.
    shop_columns = ['Location', 'Neighborhood', 'Address', 'URL',
                    'Yelp', 'Google', 'Chips']
    # Explicit copy so the assignments below never touch a view of `df`
    df_shops = df.loc[df['Neighborhood'].notna(), shop_columns].copy()

    # Binarize free chips data: recognised markers map to True, everything
    # else (including missing values) maps to NaN and therefore to False
    chips_marked = df_shops['Chips'].map({'x': True, 'X': True, 1: True})
    df_shops['Chips'] = chips_marked.notna()

    # Separate ingredient info from burrito info; every column from 'Beef'
    # onward is treated as an ingredient column
    beef_col = int(np.where(df.columns == 'Beef')[0][0])
    df_ingredients = df[df.columns[beef_col:]]
    df = df[df.columns[:beef_col]]
    df = df.drop(['Neighborhood', 'Address', 'URL', 'Yelp',
                  'Google', 'Chips', 'Unreliable', 'NonSD'], axis=1)

    # Ignore rows with no info about the taco shop
    df_shops = df_shops.dropna()

    return df, df_shops, df_ingredients
def burritotypes(burrito_names, types=None):
    """
    Classify each burrito name into discrete categories

    Parameters
    ----------
    burrito_names : iterable of str
        names of each burrito
    types : dict, optional
        keys indicate the burrito category
        values indicate the regular-expression fragment that must appear in
        a burrito name (case-insensitively) for it to fall in that category.
        If None, a default set of seven common categories is used.

    Returns
    -------
    burrito_category : list of str
        category of each burrito, in input order. A name matching several
        categories gets the first one in the iteration order of `types`;
        a name matching none, or a name that is not a string (e.g. a
        missing value), is labelled 'other'.
    """
    # Determine the categories that burritos can fall into
    if types is None:
        types = {'California': 'cali', 'Carnitas': 'carnita', 'Carne asada': 'carne asada',
                 'Chicken': 'chicken', 'Surf & Turf': 'surf.*turf', 'Adobada': 'adobad', 'Al Pastor': 'pastor'}

    # Compile each category pattern once instead of once per burrito name
    matchers = [(category, re.compile('.*' + fragment + '.*', re.IGNORECASE))
                for category, fragment in types.items()]

    # For each burrito name, assign it a category
    burrito_category = []
    for name in burrito_names:
        label = 'other'
        # Non-string entries cannot be matched; they stay 'other'
        if isinstance(name, str):
            for category, pattern in matchers:
                if pattern.match(name) is not None:
                    label = category
                    break  # first matching category wins
        burrito_category.append(label)
    return burrito_category