-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmodel.py
97 lines (62 loc) · 6.94 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import numpy as np # linear algebra
import pandas as pd # data processin/microsoft/pylance-release/blob/main/DIAGNOSTIC_SEVERITY_RULES.mdg, CSV file I/O (e.g. pd.read_csv)
import pickle
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#for filename in filenames:
#print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:05:34.787811Z","iopub.execute_input":"2022-05-07T02:05:34.788486Z","iopub.status.idle":"2022-05-07T02:05:36.040999Z","shell.execute_reply.started":"2022-05-07T02:05:34.788448Z","shell.execute_reply":"2022-05-07T02:05:36.039829Z"}}
from sklearn.model_selection import train_test_split
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:05:38.289868Z","iopub.execute_input":"2022-05-07T02:05:38.290180Z","iopub.status.idle":"2022-05-07T02:05:38.478450Z","shell.execute_reply.started":"2022-05-07T02:05:38.290149Z","shell.execute_reply":"2022-05-07T02:05:38.477788Z"}}
import seaborn as sns
import matplotlib.pyplot as plt
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:05:41.775853Z","iopub.execute_input":"2022-05-07T02:05:41.776703Z","iopub.status.idle":"2022-05-07T02:05:41.863283Z","shell.execute_reply.started":"2022-05-07T02:05:41.776666Z","shell.execute_reply":"2022-05-07T02:05:41.861591Z"}}
data = pd.read_csv("housing.csv") #reading in the csv_data on housing
data.head()
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:05:44.245326Z","iopub.execute_input":"2022-05-07T02:05:44.246244Z","iopub.status.idle":"2022-05-07T02:05:44.310414Z","shell.execute_reply.started":"2022-05-07T02:05:44.246177Z","shell.execute_reply":"2022-05-07T02:05:44.309486Z"}}
data.describe()
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:07:35.741732Z","iopub.execute_input":"2022-05-07T02:07:35.742529Z","iopub.status.idle":"2022-05-07T02:07:35.749660Z","shell.execute_reply.started":"2022-05-07T02:07:35.742480Z","shell.execute_reply":"2022-05-07T02:07:35.748785Z"}}
data['total_bedrooms']=data['total_bedrooms'].fillna('bfill') #fill in the empty columns
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:05:51.621866Z","iopub.execute_input":"2022-05-07T02:05:51.622195Z","iopub.status.idle":"2022-05-07T02:05:51.627747Z","shell.execute_reply.started":"2022-05-07T02:05:51.622160Z","shell.execute_reply":"2022-05-07T02:05:51.626974Z"}}
print(np.shape(data)) #describes the number of rows and columns
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:05:54.468409Z","iopub.execute_input":"2022-05-07T02:05:54.469161Z","iopub.status.idle":"2022-05-07T02:07:19.371926Z","shell.execute_reply.started":"2022-05-07T02:05:54.469120Z","shell.execute_reply":"2022-05-07T02:07:19.371108Z"}}
#sns.lineplot(data=data, x="median_house_value", y="households")
#relationship between household desnity and median_house_value
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:07:46.828088Z","iopub.execute_input":"2022-05-07T02:07:46.828826Z","iopub.status.idle":"2022-05-07T02:09:11.684246Z","shell.execute_reply.started":"2022-05-07T02:07:46.828779Z","shell.execute_reply":"2022-05-07T02:09:11.683360Z"}}
#sns.lineplot(data=data, x="median_house_value", y="total_rooms")
# %% [markdown]
#
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:11:38.943656Z","iopub.execute_input":"2022-05-07T02:11:38.943973Z","iopub.status.idle":"2022-05-07T02:11:39.144829Z","shell.execute_reply.started":"2022-05-07T02:11:38.943924Z","shell.execute_reply":"2022-05-07T02:11:39.143733Z"}}
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# %% [markdown]
#
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:11:44.009553Z","iopub.execute_input":"2022-05-07T02:11:44.009854Z","iopub.status.idle":"2022-05-07T02:13:09.189798Z","shell.execute_reply.started":"2022-05-07T02:11:44.009819Z","shell.execute_reply":"2022-05-07T02:13:09.188902Z"}}
#sns.lineplot(data=data, x="median_house_value", y="median_income")
# line plot to show the relation between median_house_value and the median_income
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:16:52.159919Z","iopub.execute_input":"2022-05-07T02:16:52.160352Z","iopub.status.idle":"2022-05-07T02:16:52.538700Z","shell.execute_reply.started":"2022-05-07T02:16:52.160321Z","shell.execute_reply":"2022-05-07T02:16:52.537712Z"}}
#sns.displot(data=data,x="median_house_value", kind='kde')
# plotting a desnity graph to show median_house_value
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:16:41.844668Z","iopub.execute_input":"2022-05-07T02:16:41.845019Z","iopub.status.idle":"2022-05-07T02:16:41.852802Z","shell.execute_reply.started":"2022-05-07T02:16:41.844981Z","shell.execute_reply":"2022-05-07T02:16:41.852144Z"}}
features = ['longitude', 'latitude', 'total_rooms', 'median_income']
X = data[features]
y = data['median_house_value']
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:17:39.403847Z","iopub.execute_input":"2022-05-07T02:17:39.404872Z","iopub.status.idle":"2022-05-07T02:17:39.413787Z","shell.execute_reply.started":"2022-05-07T02:17:39.404812Z","shell.execute_reply":"2022-05-07T02:17:39.413022Z"}}
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.6,random_state=0)
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:17:41.395995Z","iopub.execute_input":"2022-05-07T02:17:41.396277Z","iopub.status.idle":"2022-05-07T02:17:44.180230Z","shell.execute_reply.started":"2022-05-07T02:17:41.396246Z","shell.execute_reply":"2022-05-07T02:17:44.179117Z"}}
model = RandomForestRegressor(random_state=0)
model.fit(X_train,y_train)
model2 = RandomForestRegressor(random_state=0)
print(X_train.head())
# %% [code] {"execution":{"iopub.status.busy":"2022-05-06T23:03:44.759703Z","iopub.execute_input":"2022-05-06T23:03:44.759967Z","iopub.status.idle":"2022-05-06T23:03:45.000523Z","shell.execute_reply.started":"2022-05-06T23:03:44.759932Z","shell.execute_reply":"2022-05-06T23:03:44.999323Z"}}
model.predict(X_test)
pickle.dump(model,open('model.pkl','wb'))
test_case = pd.DataFrame(data={'longitude':-118.35, 'latitude': 34.22, '1':1560.0, 'median_income':4.0000}, index=[0])
# %% [code] {"execution":{"iopub.status.busy":"2022-05-07T02:17:48.929420Z","iopub.execute_input":"2022-05-07T02:17:48.929727Z","iopub.status.idle":"2022-05-07T02:17:49.224594Z","shell.execute_reply.started":"2022-05-07T02:17:48.929696Z","shell.execute_reply":"2022-05-07T02:17:49.223715Z"}}
print(mean_absolute_error(model.predict(X_test), y_test))
print(model.predict(test_case))
print(model.predict(X_test))