-
Notifications
You must be signed in to change notification settings - Fork 1
/
LSmodel_testing.py
145 lines (128 loc) · 5.54 KB
/
LSmodel_testing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import datetime
from math import sqrt

import joblib
import numpy
import pandas as pd
import requests
from google.cloud import storage
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
# Survey results are n = 20, each in {0,1,2,3,4}.

# Name of the Google Cloud Storage bucket used for both the input CSV and the
# exported model; defined once so every storage call targets the same bucket.
BUCKET_NAME = 'cs506finalprojectbucket'

# [START download-data]
# Bucket holding the sample CSV.
bucket = storage.Client().bucket(BUCKET_NAME)
# Path to the data inside the bucket
blob = bucket.blob('myCopy.csv')
# Download the data into the current working directory
blob.download_to_filename('myCopy.csv')
# [END download-data]
# Ordinary least-squares estimators: `reg` is later fitted on the immigration
# answers and `reg_healthcare` on the healthcare answers.
reg, reg_healthcare = linear_model.LinearRegression(), linear_model.LinearRegression()

# Regularised alternatives (L2 and L1 penalties) with fixed penalty strengths.
regRidge = linear_model.Ridge(alpha=0.5)
regLasso = linear_model.Lasso(alpha=0.1)
# Roman: this section reads the sample CSV and builds the plain-list
# feature/label arrays the models need. Please update to this when ready!
df = pd.read_csv('myCopy.csv')

# TODO: change this section to pull from Austin/Peyton's database instead of
# relying on this placeholder.
test_user_responses = 0

print(df.columns)

# The first 30 rows are used for training; every remaining row is held out
# for testing.
train_rows = df.iloc[:30]
test_rows = df.iloc[30:]

# Columns 11 onward are the model inputs.
train_data = train_rows.iloc[:, 11:].values.tolist()
test_data = test_rows.iloc[:, 11:].values.tolist()

# Columns 6-10 hold the answers being predicted. Within that slice, position 1
# is the immigration answer and position 2 is the healthcare answer.
train_answers = train_rows.iloc[:, 6:11].values.tolist()
test_answers = test_rows.iloc[:, 6:11].values.tolist()

train_labels = [answers[1] for answers in train_answers]
train_labels_healthcare = [answers[2] for answers in train_answers]
test_labels = [answers[1] for answers in test_answers]
test_labels_healthcare = [answers[2] for answers in test_answers]

print(train_data)
print(train_labels)
# Mock survey answers for five hypothetical respondents: one row per person,
# twenty answers per row. The trailing comment on each row names the political
# leaning that row is meant to imitate.
mock_survey_data = [
    [4, 1, 0, 4, 0, 1, 3, 4, 4, 3, 3, 2, 3, 0, 1, 0, 0, 1, 0, 3],  # "democrat"
    [0, 3, 3, 0, 4, 3, 0, 1, 0, 0, 0, 0, 1, 3, 3, 3, 4, 3, 3, 1],  # "republican"
    [1, 3, 3, 1, 4, 2, 0, 1, 0, 1, 1, 0, 1, 4, 2, 3, 4, 3, 2, 1],  # "republican"
    [2, 2, 1, 1, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 3],  # "moderate republican"
    [3, 1, 1, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 2, 1, 1, 2, 1, 2],  # "moderate democrat"
]

# Mock opinions on immigration, one value per mock respondent (same order).
mock_opinions_immigration = [2, 7, 7, 5, 3]
# Alternative labelling kept for experimentation:
# mock_opinions_immigration = [7, 2, 2, 4, 8]
# Fit every estimator on the training split. The healthcare OLS model gets the
# healthcare answers; the other three are all trained on the immigration
# answers. (For a quick experiment, mock_survey_data and
# mock_opinions_immigration can be passed to fit() instead.)
for estimator, targets in (
    (reg, train_labels),
    (reg_healthcare, train_labels_healthcare),
    (regRidge, train_labels),
    (regLasso, train_labels),
):
    estimator.fit(train_data, targets)

# Show the weights the immigration OLS model learned for each input column.
print("oLS coefficients: ")
print(reg.coef_)
#print("LS+ridge coefficients: ")
#print(regRidge.coef_)
#print("LASSO coefficients: ")
#print(regLasso.coef_)
#mock new person "democrat"
#mock_new_data1 = [4,1,0,3,1,2,3,3,4,3,3,3,3,1,1,1,1,2,1,3]
#predict their opinion on immigration
#pred1 = reg.predict(numpy.reshape(mock_new_data1, (1,-1)))
#predRidge1 = regRidge.predict(numpy.reshape(mock_new_data1, (1,-1)))
#predLasso1 = regLasso.predict(numpy.reshape(mock_new_data1, (1,-1)))
#print("new prediction oLS, mock democrat: " + str(pred1))
#print("new prediction LS+ridge, mock democrat: " + str(predRidge1))
#print("new prediction LASSO, mock democrat: " + str(predLasso1))
#mock new person "republican"
#mock_new_data2 = [0,4,3,0,3,3,1,1,0,0,0,1,1,3,3,4,3,3,2,1]
#predict their opinion on immigration
#pred2 = reg.predict(numpy.reshape(mock_new_data2, (1,-1)))
#predRidge2 = regRidge.predict(numpy.reshape(mock_new_data2, (1,-1)))
#predLasso2 = regLasso.predict(numpy.reshape(mock_new_data2, (1,-1)))
#print("new prediction oLS, mock republican: " + str(pred2))
#print("new prediction LS+ridge, mock republican: " + str(predRidge2))
#print("new prediction LASSO, mock republican: " + str(predLasso2))
#test on real people's answers from test set
#test_row1 = test_data[0]
#pred_real1 = reg.predict(numpy.reshape(test_row1, (1,-1)))
#print(test_row1)
#print(test_labels[0])
#print("new prediction oLS, row 1 of test set: " + str(pred_real1))
#print("person's real answer's: " + str(test_labels[0]))
#test_row2 = test_data[1]
#pred_real2 = reg.predict(numpy.reshape(test_row2, (1,-1)))
#print(test_row2)
#print(test_labels[1])
#print("new prediction oLS, row 2 of test set: " + str(pred_real2))
#print("person's real answer's: " + str(test_labels[1]))
# Predict the immigration and healthcare answers for every held-out
# respondent, printing each prediction next to the respondent's real answer.
# The raw predictions are collected so the error metrics can be computed
# afterwards.
total_prediction_array = []
health_pred = []
for i, person in enumerate(test_data):
    # predict() is given a single-row 2-D array, so the respondent's answers
    # are reshaped once and reused for both models.
    features = numpy.reshape(person, (1, -1))
    prediction = reg.predict(features)
    prediction_healthcare = reg_healthcare.predict(features)
    print(person)
    total_prediction_array.append(prediction)
    health_pred.append(prediction_healthcare)
    print("new prediction (immigration) oLS, row {} of test set: {}".format(i, prediction))
    print("person's real answer (immigration): {}".format(test_labels[i]))
    print("new prediction (healthcare) oLS, row {} of test set: {}".format(i, prediction_healthcare))
    print("person's real answer (healthcare): {}".format(test_labels_healthcare[i]))
# Export the fitted model to a local file.
# NOTE(review): `reg` (the OLS model fitted on the immigration answers) is the
# estimator serialised here; reg_healthcare, regRidge and regLasso are also
# fitted by this script, so confirm this is the one that should be uploaded.
model = 'model.joblib'
joblib.dump(reg, model)

# Upload the model to GCS under a timestamped folder so that successive runs
# do not overwrite each other's artifacts.
bucket = storage.Client().bucket(BUCKET_NAME)
blob = bucket.blob('{}/{}'.format(
    datetime.datetime.now().strftime('census_%Y%m%d_%H%M%S'),
    model))
blob.upload_from_filename(model)
# Report the root-mean-squared error of the OLS predictions on the held-out
# rows, first for the immigration question and then for the healthcare one.
immigration_rmse = sqrt(mean_squared_error(test_labels, total_prediction_array))
print("RMSE is {}".format(immigration_rmse))

healthcare_rmse = sqrt(mean_squared_error(test_labels_healthcare, health_pred))
print("healthcare RMSE is {}".format(healthcare_rmse))