-
Notifications
You must be signed in to change notification settings - Fork 2
/
predictorBulk.py
35 lines (25 loc) · 964 Bytes
/
predictorBulk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os
import sys

import numpy as np
import pandas as pd
# NOTE: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; on newer environments this must become `import joblib`.
from sklearn.externals import joblib
# Bulk prediction script: classifies every row of the CSV given as the first
# command-line argument and writes the rows, together with the predicted label
# and its ID, to ./data/predicted<input file name>.

# Fail fast with a usage hint instead of an IndexError after the models load.
if len(sys.argv) < 2:
    sys.exit('Usage: python predictorBulk.py <input.csv>')
input_path = sys.argv[1]

# get codes for prediction: normalised (lower-cased, stripped) classification
# text -> ID. A later duplicate row overwrites an earlier one. Rows whose text
# cell is not a string (e.g. blank spreadsheet rows read as NaN) are skipped.
dfLabels = pd.read_excel('./data/Contention_Dictionary.xlsx')
dLabels = {
    text.lower().strip(): label_id
    for text, label_id in zip(
        dfLabels['New Contention Classification Text'], dfLabels['IDs']
    )
    if isinstance(text, str)
}

# NOTE(review): joblib.load unpickles arbitrary objects; only load model files
# that come from a trusted location.
# load the vectorizer
vectorizer = joblib.load(filename='./modelsAndTransformations/vectorizer.pkl')
# load the classifier
clf = joblib.load(filename='./modelsAndTransformations/LRclf.pkl')

# Load Dataset
df = pd.read_csv(input_path)
# Normalise the text column-wise. Blank cells are read as NaN and would break
# a per-row .lower() call, so they are turned into empty strings first.
df['CLMANT_TXT'] = (
    df['CLMANT_TXT'].fillna('').astype(str).str.lower().str.strip()
)

# Vectorize data
X = vectorizer.transform(df['CLMANT_TXT'])

# Predict Label
df['predictedLabel'] = clf.predict(X)
# Every predicted label must have an ID; report all missing ones at once
# rather than failing on the first row with a context-free KeyError.
unknown_labels = [
    label for label in df['predictedLabel'].unique() if label not in dLabels
]
if unknown_labels:
    raise KeyError(
        'Predicted labels missing from Contention_Dictionary.xlsx: '
        + ', '.join(repr(label) for label in unknown_labels)
    )
df['predictedID'] = df['predictedLabel'].map(dLabels)

# Save file
# os.path.basename strips the directory using the platform's separators, so
# Windows-style input paths also yield just the file name.
file_name = './data/predicted' + os.path.basename(input_path)
df.to_csv(file_name)
print('Done. Results have been saved in: ' + file_name)