# Mallesh project file.R
# Task 1 ----------------
# Exploratory data analysis
# Load the dataset and the relevant libraries

# Install (if needed) and load the readxl package
if (!requireNamespace("readxl", quietly = TRUE)) install.packages("readxl")
library(readxl)

# Read the Excel file chosen interactively
Data <- read_excel(file.choose())
head(Data)
# Convert attribute data types
str(Data)
Data$dteday <- as.Date(Data$dteday)  # store the date column as Date
str(Data)
# Missing value analysis
missing_values <- colSums(is.na(Data))
print(missing_values)  # the summary shows no missing values in the dataset
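# Optional check: the same information expressed as a percentage of rows per column
round(100 * colMeans(is.na(Data)), 2)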
# Task 2 ----------------
# Attribute distributions and trends
library(ggplot2)

# Plot the monthly distribution of the total number of bikes rented
ggplot(Data, aes(x = mnth, y = cnt, fill = factor(mnth))) +
  geom_col() +
  labs(title = "Monthly Distribution of Bikes Rented",
       x = "Month",
       y = "Total Number of Bikes Rented") +
  theme_minimal()
# Plot the yearly distribution of the total number of bikes rented
ggplot(Data, aes(x = yr, y = cnt, fill = factor(yr))) +
  geom_col() +
  labs(title = "Yearly Distribution of Bikes Rented",
       x = "Year",
       y = "Total Number of Bikes Rented") +
  theme_minimal()
# Boxplot for outlier analysis
ggplot(Data, aes(x = factor(yr), y = cnt)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot for Outlier Analysis",
       x = "Year",
       y = "Total Number of Bikes Rented") +
  theme_minimal()
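# Optional trend sketch (assumes the standard bike-sharing columns `temp` and
# `season`; adjust if your file names them differently): rentals vs. temperature.
ggplot(Data, aes(x = temp, y = cnt, colour = factor(season))) +
  geom_point(alpha = 0.5) +
  labs(title = "Rentals vs. Temperature by Season",
       x = "Normalised Temperature",
       y = "Total Number of Bikes Rented",
       colour = "Season") +
  theme_minimal()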
# Task 3 ----------------
# Split the dataset into train and test sets

# Independent variables (drop the target and the row index)
# Note: if the data also contains `casual` and `registered`, consider dropping
# them too, since they sum to `cnt` and would leak the target.
X <- Data[, !(names(Data) %in% c("cnt", "instant"))]
# Target variable
y <- Data$cnt
head(X)
head(y)
install.packages("caret")
library(caret)
set.seed(123)
# Create an index for splitting data
index <- createDataPartition(y, p = 0.8, list = FALSE)
# Split features and target into training and testing sets
X_train <- X[index, ]
X_test <- X[-index, ]
y_train <- y[index]
y_test <- y[-index]
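# Optional sanity check: confirm the 80/20 split sizes
cat("Train rows:", nrow(X_train), " | Test rows:", nrow(X_test), "\n")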
# Task 4 ----------------
# Create a model using the random forest algorithm
if (!requireNamespace("randomForest", quietly = TRUE)) install.packages("randomForest")
library(randomForest)
# Fit a random forest; the x/y interface avoids pulling y_train out of the
# global environment, as the formula `y_train ~ .` would
rf_model <- randomForest(x = X_train, y = y_train, ntree = 100)
# Print the model summary
print(rf_model)
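# Optional: inspect variable importance (node-impurity based by default)
importance(rf_model)
varImpPlot(rf_model)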
# Task 5 ----------------
# Make predictions on the test set
predictions <- predict(rf_model, newdata = X_test)
# Print or inspect the predictions
print(predictions)
# Calculate Mean Absolute Error (MAE)
mae <- mean(abs(y_test - predictions))
cat("Mean Absolute Error (MAE):", mae, "\n")
# Calculate Root Mean Squared Error (RMSE)
rmse <- sqrt(mean((y_test - predictions)^2))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
# Calculate R-squared
rsquared <- 1 - (sum((y_test - predictions)^2) / sum((y_test - mean(y_test))^2))
cat("R-squared:", rsquared, "\n")
### Test whether the model is overfitting
# Refit with the same settings so this block stands alone
model <- randomForest(x = X_train, y = y_train, ntree = 100)
# Predictions on the training set
predictions_train <- predict(model, newdata = X_train)
# Predictions on the test set
predictions_test <- predict(model, newdata = X_test)
# Evaluate performance on the training set
mae_train <- mean(abs(y_train - predictions_train))
rmse_train <- sqrt(mean((y_train - predictions_train)^2))
cat("Training Set - MAE:", mae_train, " | RMSE:", rmse_train, "\n")
# Evaluate performance on the test set
mae_test <- mean(abs(y_test - predictions_test))
rmse_test <- sqrt(mean((y_test - predictions_test)^2))
cat("Test Set - MAE:", mae_test, " | RMSE:", rmse_test, "\n")
# Refit with a constrained mtry (fewer candidate variables per split);
# not regularization in the strict sense, but it can reduce variance
model <- randomForest(x = X_train, y = y_train, mtry = floor(sqrt(ncol(X_train))))
# Predict on the test set
predictions <- predict(model, newdata = X_test)
# Evaluate performance
mae <- mean(abs(y_test - predictions))
rmse <- sqrt(mean((y_test - predictions)^2))
cat("Test Set - MAE:", mae, " | RMSE:", rmse, "\n")
## Hyperparameter tuning
# Combine features and target into a single data frame for caret
train_data <- cbind(X_train, y_train)
# Set up the train control for 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
# Tune mtry over a grid of candidate values (caret's default search;
# tuneLength controls how many values are tried)
set.seed(123)
tuned_model <- train(
  y_train ~ .,
  data = train_data,  # use the combined data
  method = "rf",
  trControl = ctrl,
  tuneLength = 15
)
# The best fit found by the search (kept for inspection)
best_model <- tuned_model$finalModel
# Assess performance on the test set; predict via the caret object
# (not $finalModel) so any preprocessing is applied consistently
predictions_test <- predict(tuned_model, newdata = X_test)
mae_test <- mean(abs(y_test - predictions_test))
rmse_test <- sqrt(mean((y_test - predictions_test)^2))
cat("Tuned Model - Test Set MAE:", mae_test, " | RMSE:", rmse_test, "\n")
# Calculate R-squared
rsquared <- 1 - (sum((y_test - predictions_test)^2) / sum((y_test - mean(y_test))^2))
cat("R-squared:", rsquared, "\n")