-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier.py
229 lines (201 loc) · 9.69 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import pandas as pd
import numpy as np
import pickle
import ast
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
"""**ML Model predictor**"""
def load_model(path):
    """
    Load a pickled object from disk.

    :param str path : location of the pickle file to read
    :return : whatever object was stored in the pickle file
    """
    with open(path, mode='rb') as pickled_file:
        return pickle.load(pickled_file)
def BoW_imageLabels(df, confidence_threshold=0.65):
    """
    Create a Bag-of-Words string for every row of Col 'imageLabels'.

    Each cell is parsed with ``ast.literal_eval`` and is expected to be the
    string form of a list of tuples whose first item is the tag and whose
    second item is its confidence, e.g.
    "[('neck', 0.95, 0.95), ('head', 0.72, 0.72)]".
    Tags whose confidence is strictly greater than the threshold are kept,
    each followed by one space (a non-empty result therefore ends with a
    trailing space).

    Rows are visited in positional order, so the DataFrame may carry any
    index (it does not have to be 0..n-1).

    :param pandas.DataFrame df : input dataframe of user-profile
    :param float confidence_threshold : threshold for an image tag to be considered as a valid image tag
    :return : one Bag-of-Words string per row; "" for a row that is null or cannot be parsed
    :rtype: list of str
    :raises KeyError: if df has no 'imageLabels' column
    """
    imageLabelBoWCol = []
    for raw_labels in df["imageLabels"]:
        try:
            kept_tags = [tup[0] for tup in ast.literal_eval(raw_labels)
                         if tup[1] > confidence_threshold]
            bow = "".join(tag + " " for tag in kept_tags)
        except Exception:
            # Null or malformed cell: best-effort, the row contributes no tags.
            # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate.)
            bow = ""
        imageLabelBoWCol.append(bow)
    return imageLabelBoWCol
def CheckPersonFeatures_in_imageLabels(df, confidence_threshold=0.65, personFeature_condifence=0.0):
    """
    Check whether any person-like features exist in Col 'imageLabels'.

    Each cell is parsed with ``ast.literal_eval`` and is expected to be the
    string form of a list of tuples (tag, confidence, ...). Tuples whose
    confidence is >= ``confidence_threshold`` are kept; a row is flagged as a
    person when the fraction of kept tags that are person-like is strictly
    greater than ``personFeature_condifence``.

    Rows are visited in positional order, so the DataFrame may carry any
    index (it does not have to be 0..n-1). A row that is null, cannot be
    parsed, or has no tag above the threshold yields ``[]`` and ``0`` and a
    short diagnostic on stderr.

    :param pandas.DataFrame df : input dataframe whose imageLabels Col is checked for person-like features
    :param float confidence_threshold : threshold for an image tag to be considered as a valid image tag
    :param float personFeature_condifence : threshold for the fraction of image tags belonging to person-like features
    :return : 2 lists, one entry per row:
        list1 (list of tuple) containing the imageLabel tuples at or above the confidence threshold
        list2 (int, 1 or 0) specifying whether the imageLabels col has person-like features or not
    :raises KeyError: if df has no 'imageLabels' column
    """
    ## list of person-like features extracted from imageLabels column using data-engineering
    personFeatures = {'neck', 'hair', 'eyelash', 'beard', 'human leg', 'face', 'facial expression', 'lip', 'chin',
                      'moustache', 'blond', 'human body', 'arm', 'human', 'waist', 'man', 'woman', 'wrist', 'finger',
                      'nose', 'person', 'leg', 'cheek', 'tongue', 'lady', 'muscle', 'forehead', 'shoulder', 'eye',
                      'boy', 'mouth', 'male', 'girl', 'skin', 'thigh', 'smile', 'chest', 'nail', 'head', 'hand',
                      'people', 'eyebrow', 'layered hair', 'hair coloring', 'brown hair', 'facial hair',
                      'hair accessory', 'thumb', 'jaw', 'hip'}
    newDfColumn = []
    isPerson = []
    for raw_labels in df["imageLabels"]:
        try:
            confident_labels = [tup for tup in ast.literal_eval(raw_labels)
                                if tup[1] >= confidence_threshold]
            pf_count = sum(1 for tup in confident_labels if tup[0] in personFeatures)
            pf_percent = pf_count / len(confident_labels)
            person_flag = 1 if pf_percent > personFeature_condifence else 0
        except ValueError:
            # literal_eval rejects null / non-literal cells with ValueError
            print("in ValueError", file=sys.stderr)
        except ZeroDivisionError:
            # confident_labels is empty => no image label with high confidence tag
            print("in ZeroDivisionError", file=sys.stderr)
        except Exception:
            # Any other malformed cell (bad syntax, wrong tuple shape, ...).
            # Exception rather than a bare except so Ctrl-C / SystemExit propagate.
            print("in except", file=sys.stderr)
        else:
            newDfColumn.append(confident_labels)
            isPerson.append(person_flag)
            continue
        # Failure path: exactly one placeholder per row keeps both lists aligned.
        newDfColumn.append([])
        isPerson.append(0)
    return newDfColumn, isPerson
def brand_influencer_classifier(input_dict):
    """
    Classify whether a given user-profile is a brand or not.

    The profile is turned into a one-row DataFrame, expanded with features
    derived from 'imageLabels' (person flag, Bag-of-Words counts) and with
    one-hot encodings of the categorical columns, and then scored with a saved
    XGBoost model. All fitted artifacts are read from the relative directory
    'AffableModels/' on every call.

    NOTE(review): the artifacts are loaded with pickle via load_model; only
    trusted files should ever be placed in 'AffableModels/'.

    :param dict input_dict : user-profile from UI as dict. It is not modified;
        an 'id' entry, if present, is ignored.
    :return : {"username": str, "isBrand": "True" | "False"}, or {} when the
        profile has no fields besides 'id'
    :rtype : dict
    :raises KeyError: if a column the pipeline needs (e.g. 'username',
        'imageLabels', 'category') is missing from the profile
    """
    # Build the feature row from a copy so the caller's dict is left intact
    # (the original `del input_dict['id']` mutated it and failed without 'id').
    profile = {key: value for key, value in input_dict.items() if key != 'id'}
    if not profile:
        # empty input
        return {}
    df = pd.DataFrame([profile])
    print(df, file=sys.stderr)

    # Keep the username for the response; the column is dropped from the features below.
    username = df['username'].astype(str).iloc[0]

    # Feature expansion from Col ImageLabels (isPerson)
    confidence_threshold = 0.65
    personFeature_condifence = 0.0
    df["newImageLabels"], df["isPerson"] = CheckPersonFeatures_in_imageLabels(df, confidence_threshold,
                                                                              personFeature_condifence)

    # Bag of Word (BoW) on ImageLabel tags
    df["imageLabelBoW"] = BoW_imageLabels(df, confidence_threshold)

    # Feature Selection : Removing unwanted cols based on domain knowledge
    df.drop(['biography', 'username', 'externalUrl', 'picture', 'cityId', 'latitude', 'longitude', 'zip',
             'contactPhoneNumber', 'imageLabels', 'newImageLabels'], axis=1, inplace=True)

    # Loading fitted CountVectorizer (using trained data) to apply CountVectorizer on ImageLabelBoW
    cv = load_model('AffableModels/cv_pkl.pickle')
    cv_fit = cv.transform(df['imageLabelBoW'])
    bow_df = pd.DataFrame(cv_fit.toarray(), columns=cv.get_feature_names_out())
    # Renamed so a 'brand' token column gets a distinct name in the feature frame.
    bow_df.rename(columns={'brand': 'brand_bow'}, inplace=True)

    # Handling Missing Values
    numerical_vars = ['mediaCount', 'followingCount', 'followerCount', 'usertagsCount', 'geoMediaCount',
                      'shoppablePostsCount', 'followingTagCount']
    categorical_vars = ['isBusiness', 'category', 'hasEmail', 'isPerson']
    # Loading the pickled training data to fill missing values (mean / mode per column)
    df_train = load_model('AffableModels/df_train_pkl.pickle')
    # Assign the filled column back: `df[col].fillna(..., inplace=True)` works on a
    # temporary object and does not update df under pandas Copy-on-Write.
    for col in numerical_vars:
        df[col] = df[col].fillna(df_train[col].mean())
    for col in categorical_vars:
        df[col] = df[col].fillna(df_train[col].mode()[0])

    # Concatenating CountVectorizer of ImageLabels with Original df
    df = pd.concat([df, bow_df], axis=1)
    df.drop('imageLabelBoW', axis=1, inplace=True)

    # One-hot encoding for categorical vars using the trained One-hot-Encoders.
    # The order below fixes the order of the encoded columns in the feature
    # frame, so it must stay: category, isBusiness, hasEmail, isPerson.
    encoder_paths = (
        ('category', 'AffableModels/category_ohe_pkl.pickle'),
        ('isBusiness', 'AffableModels/isBusiness_ohe_pkl.pickle'),
        ('hasEmail', 'AffableModels/hasEmail_ohe_pkl.pickle'),
        ('isPerson', 'AffableModels/isPerson_ohe_pkl.pickle'),
    )
    ohe_frames = []
    for column, encoder_path in encoder_paths:
        encoder = load_model(encoder_path)
        encoded = encoder.transform(df[[column]]).toarray()
        ohe_frames.append(pd.DataFrame(encoded, columns=encoder.get_feature_names_out([column])))

    # Expanding the input dataframe with the encoded columns, then removing the raw categorical vars
    df = pd.concat([df] + ohe_frames, axis=1)
    df.drop(categorical_vars, axis=1, inplace=True)

    # Loading saved XG Boost Model (used bin file format to support different version of XG Boost Classifiers)
    # Predicting using XG Boost ML Model (Original + No Scaling + No SVD) -> Best
    XGBoost_model = XGBClassifier()
    XGBoost_model.load_model('AffableModels/XGBoost.bin')
    predictions = XGBoost_model.predict(df)

    # Generating prediction in the requisite dict format from the model prediction.
    # The label is compared as text, exactly as before: only "1" means brand.
    brand_label = pd.Series(predictions).astype(str).iloc[-1]
    return {
        "username": username,
        "isBrand": "True" if brand_label == "1" else "False",
    }
if __name__ == '__main__':
    # Manual smoke test: classify one hand-written sample profile and print the result.
    # Key order is kept as-is because it determines the DataFrame column order.
    input_dict = {
        'id': 1,
        'username': "janish",
        'biography': "newbie at angular",
        'mediaCount': 1,
        'cityId': 1,
        'latitude': 1,
        'longitude': 1,
        'zip': 1,
        'contactPhoneNumber': 1,
        'followingCount': 1,
        'followerCount': 1,
        'usertagsCount': 1,
        'isBusiness': False,
        'externalUrl': "www.myntra.com",
        'category': "actor",
        'geoMediaCount': 1,
        'shoppablePostsCount': 1,
        'followingTagCount': 1,
        'picture': "www.google.com",
        'imageLabels': "[('neck', 0.95, 0.95), ('head', 0.72, 0.72)]",
        'hasEmail': True,
    }
    output_dict = brand_influencer_classifier(input_dict)
    print(output_dict)