-
Notifications
You must be signed in to change notification settings - Fork 0
/
house_prices_prediction_project.py
219 lines (190 loc) · 9.5 KB
/
house_prices_prediction_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#Business Problem
#
#Using a dataset that describes the properties and sale price of each house, this project aims to build a machine learning model that predicts the prices of different types of houses.
#Dataset Story
#
#This dataset of residential homes in Ames, Iowa contains 79 explanatory variables and comes from a Kaggle competition. You can access the dataset and the competition page of the project from the Kaggle website. Because the dataset belongs to a Kaggle competition, there are two different csv files, train and test. In the test dataset, house prices are left blank, and we are expected to estimate those values.
#
#38 Numeric Variables, 43 Categorical Variables, 1460 Observations
#
# MSSubClass: Identifies the type of dwelling involved in the sale
# MSZoning: Identifies the general zoning classification of the sale
# LotFrontage: Linear feet of street connected to property
# LotArea: Lot size in square feet
# Street: Type of road access to property
# Alley: Type of alley access to property
# LotShape: General shape of property
# LandContour: Flatness of the property
# Utilities: Type of utilities available
# LotConfig: Lot configuration
# LandSlope: Slope of property
# Neighborhood: Physical locations within Ames city limits
# Condition1: Proximity to various conditions
# Condition2: Proximity to various conditions (if more than one is present)
# BldgType: Type of dwelling
# HouseStyle: Style of dwelling
# OverallQual: Rates the overall material and finish of the house
# OverallCond: Rates the overall condition of the house
# YearBuilt: Original construction date
# YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
# RoofStyle: Type of roof
# RoofMatl: Roof material
# Exterior1st: Exterior covering on house
# Exterior2nd: Exterior covering on house (if more than one material)
# MasVnrType: Masonry veneer type
# MasVnrArea: Masonry veneer area in square feet
# ExterQual: Evaluates the quality of the material on the exterior
# ExterCond: Evaluates the present condition of the material on the exterior
# Foundation: Type of foundation
# BsmtQual: Evaluates the height of the basement
# BsmtCond: Evaluates the general condition of the basement
# BsmtExposure: Refers to walkout or garden level walls
# BsmtFinType1: Rating of basement finished area
# BsmtFinSF1: Type 1 finished square feet
# BsmtFinType2: Rating of basement finished area (if multiple types)
# BsmtFinSF2: Type 2 finished square feet
# BsmtUnfSF: Unfinished square feet of basement area
# TotalBsmtSF: Total square feet of basement area
# Heating: Type of heating
# HeatingQC: Heating quality and condition
# CentralAir: Central air conditioning
# Electrical: Electrical system
# 1stFlrSF: First Floor square feet
# 2ndFlrSF: Second floor square feet
# LowQualFinSF: Low quality finished square feet (all floors)
# GrLivArea: Above grade (ground) living area square feet
# BsmtFullBath: Basement full bathrooms
# BsmtHalfBath: Basement half bathrooms
# FullBath: Full bathrooms above grade
# HalfBath: Half baths above grade
# Bedroom: Bedrooms above grade (does NOT include basement bedrooms)
# Kitchen: Kitchens above grade
# KitchenQual: Kitchen quality
# TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
# Functional: Home functionality (Assume typical unless deductions are warranted)
# Fireplaces: Number of fireplaces
# FireplaceQu: Fireplace quality
# GarageType: Garage location
# GarageYrBlt: Year garage was built
# GarageFinish: Interior finish of the garage
# GarageCars: Size of garage in car capacity
# GarageArea: Size of garage in square feet
# GarageQual: Garage quality
# GarageCond: Garage condition
# PavedDrive: Paved driveway
# WoodDeckSF: Wood deck area in square feet
# OpenPorchSF: Open porch area in square feet
# EnclosedPorch: Enclosed porch area in square feet
# 3SsnPorch: Three season porch area in square feet
# ScreenPorch: Screen porch area in square feet
# PoolArea: Pool area in square feet
# PoolQC: Pool quality
# Fence: Fence quality
# MiscFeature: Miscellaneous feature not covered in other categories
# MiscVal: $Value of miscellaneous feature
# MoSold: Month Sold (MM)
# YrSold: Year Sold (YYYY)
# SaleType: Type of sale
# SaleCondition: Condition of sale
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer, KNNImputer
import missingno as msno
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Load the two competition files (test has no target values filled in).
df_test = pd.read_csv('Datasets/test.csv')
df_train = pd.read_csv('Datasets/train.csv')
# NOTE(review): axis=1 places the two frames side by side (column-wise) rather
# than stacking their rows. The result is replaced by the read_csv call on the
# next statement, so it is left unchanged here -- confirm the intended axis.
df = pd.concat([df_test, df_train], axis=1).reset_index(drop=True)
# The working frame is the previously saved combined file.
df = pd.read_csv('Datasets/combined.csv', index_col=False)
# "Unnamed: 0" is only present when the file was written together with its
# index. The file is rewritten below with index=False, so on later runs the
# column is absent; errors="ignore" keeps the drop from raising KeyError then.
df.drop("Unnamed: 0", axis=1, inplace=True, errors="ignore")
# Bare expression: shows the dtypes in an interactive session, prints nothing
# when the file is run as a script.
df.dtypes
# Persist the cleaned frame without the index column.
df.to_csv('Datasets/combined.csv', index=False)
def check_df(dataframe, head=5):
    """
    Print a quick structural overview of a dataframe.

    The report contains, in this order: shape, column dtypes, number of
    duplicated rows, number of unique values per column, the first and last
    `head` rows, missing-value counts per column, and the 0/5/50/95/99/100
    percent quantiles of the numeric columns.

    Parameters:
    dataframe: The dataframe to summarise.
    head: Number of rows shown in the head and tail previews (default is 5).

    Returns:
    None. Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Duplicated Values #####################")
    print(dataframe.duplicated().sum())
    print("##################### Number of Unique Values #####################")
    # Report on the frame that was passed in, not on the module-level `df`.
    print(dataframe.nunique())
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are only computed for the numeric columns.
    numeric_cols = dataframe.select_dtypes(include=['number'])
    print(numeric_cols.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
# Print the structural overview report for the loaded dataset.
check_df(df)
# Check the type of the DataFrame and its columns
print(
    f"Type of df: {type(df)}",
    "DataFrame columns and types:",
    df.dtypes,
    sep="\n",
)
# Function to identify categorical, numerical, and cardinal variables in the dataset.
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Split the columns of a dataframe into categorical, numerical and cardinal groups.

    Object-typed columns are treated as categorical. Numeric columns with fewer
    than `cat_th` unique values are moved from the numerical group into the
    categorical group. Object-typed columns with more than `car_th` unique
    values are reported separately as cardinal and removed from the
    categorical group. Progress and summary counts are printed along the way.

    Parameters:
    dataframe: The dataframe to analyze.
    cat_th: Numeric columns with fewer unique values than this are considered categorical (default is 10).
    car_th: Object columns with more unique values than this are considered cardinal (default is 20).

    Returns:
    cat_cols: List of categorical variable names (object columns first, then numeric-looking ones).
    num_cols: List of numerical variable names.
    cat_but_car: List of cardinal (categorical but with many unique values) variable names.
    """
    def _is_object(series):
        # Same two-step object check the column classification relies on.
        return series.dtype == "O" or pd.api.types.is_object_dtype(series)

    print("Starting grab_col_names function")
    all_columns = list(dataframe.columns)

    # Object-typed columns, in dataframe order
    object_cols = [name for name in all_columns if _is_object(dataframe[name])]
    print("Identified categorical columns:", object_cols)

    # Count unique values once per column and reuse the result below
    unique_counts = {name: dataframe[name].nunique() for name in all_columns}
    numeric_cols = [
        name for name in all_columns
        if pd.api.types.is_numeric_dtype(dataframe[name])
    ]

    # Numeric columns with few distinct values behave like categories
    num_but_cat = [name for name in numeric_cols if unique_counts[name] < cat_th]
    print("Identified numerical but categorical columns:", num_but_cat)

    # Object columns with many distinct values are cardinal
    cat_but_car = [name for name in object_cols if unique_counts[name] > car_th]
    print("Identified categorical but cardinal columns:", cat_but_car)

    # Final categorical list: object columns plus numeric-looking ones, minus cardinals
    cat_cols = [
        name for name in object_cols + num_but_cat
        if name not in cat_but_car
    ]
    print("Final categorical columns:", cat_cols)

    # Remaining numeric columns are the true numerical variables
    num_cols = [name for name in numeric_cols if name not in num_but_cat]
    print("Identified numerical columns:", num_cols)

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car
# Classify the columns of the dataset, then echo each resulting group.
cat_cols, num_cols, cat_but_car = grab_col_names(df)
for _label, _names in (
    ("Categorical columns:", cat_cols),
    ("Numerical columns:", num_cols),
    ("Cardinal columns:", cat_but_car),
):
    print(_label, _names)