# -*- coding: utf-8 -*-
"""OS_LabFinal.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1Mfqw14bvlVBR-PubdpuC32GQ8yDYIFFg
# Introduction
This project is part of the Operating Systems Lab course and focuses on optimizing the Readahead feature of the Linux page cache. It involves collecting data from various benchmarks, processing that data, and applying machine-learning models to classify workload types. The ultimate goal is to tune the Readahead mechanism under varying workloads, using models such as Decision Trees, Random Forests, and Neural Networks.
# Problem Definition
Readahead can significantly impact the performance of I/O operations, especially under heavy workloads. However, if not tuned correctly, it can lead to cache pollution or unnecessary memory usage, degrading the system's overall performance. The project aims to develop a model that dynamically adjusts the Readahead size based on workload characteristics, using machine learning techniques.
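For context, Linux exposes the per-device readahead window through sysfs, so a tuned value can be applied at runtime. A minimal sketch of that adjustment step (illustrative only; the device name and value are hypothetical, and this is not code from the project itself):

```python
# Hypothetical helper: set the readahead window for one block device (requires root).
# /sys/block/<dev>/queue/read_ahead_kb is the standard sysfs knob.
def set_readahead_kb(device: str, kb: int) -> None:
    with open(f"/sys/block/{device}/queue/read_ahead_kb", "w") as f:
        f.write(str(kb))

set_readahead_kb("sda", 256)  # e.g. chosen from the predicted workload class
```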
# Objectives
- **Data Collection:** gather data on various I/O operations using RocksDB benchmarks and Linux's LTTng tracing framework (a sketch appears under Data Gathering below).
- **Feature Engineering:** process the collected data to extract relevant features.
- **Model Training:** implement and train different models (Decision Tree, Neural Network, Random Forest) to classify workload types and suggest optimal Readahead sizes.
- **Performance Evaluation:** compare the performance of the models and determine the best approach.
# Data Gathering
"""
# Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')
"""# Working with larger and bigger dataset"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import EarlyStopping
# Move into the project directory on Drive (Colab line magic)
%cd drive/MyDrive/AI/OS_lab
!ls
dataset = pd.read_csv('Final-Data.csv')
dataset
# Work with a 50% random sample to keep training times manageable
sample_fraction = 0.5
sampled_data = dataset.sample(frac=sample_fraction, random_state=42)
print("Shape of Sampled Data:", sampled_data.shape)
print("\nSampled Data Overview:")
print(sampled_data.head())
dataset = sampled_data
X = dataset.drop('workload_type', axis=1)
y = dataset['workload_type']
"""trying to find out which feature to delete using random forest classifire"""
# Fit a Random Forest on all features and rank them by impurity-based importance
model = RandomForestClassifier()
model.fit(X, y)
feature_importances = model.feature_importances_
feature_names = X.columns
feature_importance_dict = dict(zip(feature_names, feature_importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
plt.barh(range(len(sorted_features)), [val[1] for val in sorted_features], align='center')
plt.yticks(range(len(sorted_features)), [val[0] for val in sorted_features])
plt.xlabel('Feature Importance')
plt.title('Random Forest Feature Importance')
plt.show()
"""removing non-important features"""
threshold = 0.05
indices_to_keep = [i for i, importance in enumerate(feature_importances) if importance > threshold]
selected_features = X.iloc[:, indices_to_keep]
X = selected_features
print("\nShape of Updated Features (X):", X.shape)
print("\nUpdated Features (X):")
print(X.head())
print("\nShape of Updated Features (X):", y.shape)
print("\nTarget Variable (y):")
print(y.head())
X
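"""The same selection can be expressed with scikit-learn's built-in `SelectFromModel`; a minimal equivalent sketch (an illustration reusing the forest fitted above, not part of the original analysis):"""
from sklearn.feature_selection import SelectFromModel
# prefit=True reuses the already-fitted forest; note SelectFromModel keeps
# importances >= threshold, while the manual loop above used a strict >.
selector = SelectFromModel(model, threshold=threshold, prefit=True)
mask = selector.get_support()  # boolean mask over the original columns
X_alt = dataset.drop('workload_type', axis=1).loc[:, mask]
print(X_alt.shape)  # should match X.shape, up to ties exactly at the threshold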
"""using T-SNE to visualize our data
once in 2-D 😆
"""
X_data = selected_features.values
y_data = y.values
# Project the features down to two dimensions with scikit-learn's t-SNE
n_components_2d = 2
tsne_2d = TSNE(n_components=n_components_2d, random_state=40, n_jobs=-1)
X_tsne_2d = tsne_2d.fit_transform(X_data)
plt.scatter(X_tsne_2d[:, 0], X_tsne_2d[:, 1], c=y_data, cmap='viridis', alpha=0.5)
plt.title('t-SNE 2D Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()
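"""t-SNE scales poorly with the number of points, so the cell above can take a long time on hundreds of thousands of rows. A minimal optional shortcut (an addition to this write-up, not part of the original analysis): embed a random subsample instead."""
# Embed at most 10k randomly chosen points for a much faster picture
idx = np.random.RandomState(40).choice(len(X_data), size=min(10_000, len(X_data)), replace=False)
X_tsne_small = TSNE(n_components=2, random_state=40, n_jobs=-1).fit_transform(X_data[idx])
plt.scatter(X_tsne_small[:, 0], X_tsne_small[:, 1], c=y_data[idx], cmap='viridis', alpha=0.5)
plt.title('t-SNE 2D Visualization (10k subsample)')
plt.show()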
index_to_class_mapping = {1: 'readseq', 2:'readrandom', 3: 'readreverse', 4: 'readrandomwriterandom'}
"""using MPL nn models"""
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
nn_model.fit(X_train, y_train)
nn_accuracy = nn_model.score(X_test, y_test)
print(f"Neural Network Accuracy: {nn_accuracy:.4f}")
"""creating a nn model from scratch using the sequentials"""
input_dim = X_train.shape[1]
output_dim = len(label_encoder.classes_)
def create_nn_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
nn_accuracies = []
for train_index, test_index in skf.split(selected_features, y_encoded):
    X_train_fold, X_test_fold = selected_features.iloc[train_index], selected_features.iloc[test_index]
    y_train_fold, y_test_fold = y_encoded[train_index], y_encoded[test_index]
    nn_model = create_nn_model(input_dim, output_dim)
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history = nn_model.fit(X_train_fold, y_train_fold, epochs=20, batch_size=64,
                           validation_data=(X_test_fold, y_test_fold),
                           callbacks=[early_stopping], verbose=0)
    _, nn_accuracy = nn_model.evaluate(X_test_fold, y_test_fold, verbose=0)
    nn_accuracies.append(nn_accuracy)
average_nn_accuracy = sum(nn_accuracies) / len(nn_accuracies)
print(f"Average Neural Network Accuracy (10-fold CV): {average_nn_accuracy:.4f}")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_onehot = to_categorical(y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)
def create_nn_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model
input_dim = X_train.shape[1]
output_dim = y_onehot.shape[1]
nn_model = create_nn_model(input_dim, output_dim)
plot_model(nn_model, to_file='neural_network_model.png', show_shapes=True, show_layer_names=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = nn_model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=2)
nn_loss, nn_accuracy = nn_model.evaluate(X_test, y_test, verbose=0)
print(f"Neural Network Accuracy: {nn_accuracy:.4f}")
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
nn_predictions = nn_model.predict(X_test)
nn_predictions_indices = nn_predictions.argmax(axis=1)
y_test_indices = y_test.argmax(axis=1)
print("Neural Network Classification Report:")
print(classification_report(y_test_indices, nn_predictions_indices, target_names=list(index_to_class_mapping.values())))
"""
4455/4455 [==============================] - 10s 2ms/step
Neural Network Classification Report:
precision recall f1-score support
readseq 1.00 0.94 0.97 1623
readrandom 1.00 1.00 1.00 37962
readreverse 0.86 0.81 0.83 698
readrandomwriterandom 1.00 1.00 1.00 102261
accuracy 1.00 142544
macro avg 0.96 0.94 0.95 142544
weighted avg 1.00 1.00 1.00 142544
"""
"""Greattt resaults on NN models!! 🙂
let's try dt
"""
# Note: y_train is still one-hot here (from the split above), so the tree is
# trained in scikit-learn's multilabel mode and predictions come back one-hot.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_accuracy = dt_model.score(X_test, y_test)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
plt.figure(figsize=(20, 10))
plot_tree(dt_model, filled=True, feature_names=X.columns, class_names=list(index_to_class_mapping.values()), rounded=True, fontsize=10)
plt.show()
dt_predictions = dt_model.predict(X_test)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_predictions, target_names=list(index_to_class_mapping.values())))
"""
Decision Tree Classification Report:
precision recall f1-score support
readseq 1.00 1.00 1.00 1623
readrandom 1.00 1.00 1.00 37962
readreverse 1.00 1.00 1.00 698
readrandomwriterandom 1.00 1.00 1.00 102261
micro avg 1.00 1.00 1.00 142544
macro avg 1.00 1.00 1.00 142544
weighted avg 1.00 1.00 1.00 142544
samples avg 1.00 1.00 1.00 142544
"""
"""**Awesome!** 🆙
***Let's use Random forrest too***
"""
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Visualize one of the forest's 100 trees
selected_tree = rf_model.estimators_[0]
plt.figure(figsize=(20, 10))
plot_tree(selected_tree, filled=True, feature_names=X.columns, class_names=list(index_to_class_mapping.values()), rounded=True, fontsize=10)
plt.show()
"""as you see there are lots of trees for making the decision. That's why it is called Random Forrest! **🙂**"""
rf_accuracy = rf_model.score(X_test, y_test)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
rf_predictions = rf_model.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test.argmax(axis=1), rf_predictions.argmax(axis=1), target_names=list(index_to_class_mapping.values())))
"""
Random Forest Accuracy: 1.0000
Random Forest Classification Report:
precision recall f1-score support
readseq 1.00 1.00 1.00 1623
readrandom 1.00 1.00 1.00 37962
readreverse 1.00 1.00 1.00 698
readrandomwriterandom 1.00 1.00 1.00 102261
accuracy 1.00 142544
macro avg 1.00 1.00 1.00 142544
weighted avg 1.00 1.00 1.00 142544
"""
"""# Performance Comparison
| Model | Accuracy | Notes |
|------------------|-----------|---------------------------------------------|
| Decision Tree | 100.00% | Simple, interpretable, perfect accuracy |
| Neural Network | 99.85% | High accuracy, complex model with slight variability in precision |
| Random Forest | 100.00% | Combines multiple trees for perfect accuracy and generalization |"""
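"""To sanity-check this comparison in one place, here is a minimal sketch using 5-fold cross-validation over the three scikit-learn models (an addition to this write-up, assuming `X` and `y_encoded` from above; the Keras network is stood in for by its `MLPClassifier` counterpart):"""
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Neural Network (MLP)': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42),
}
for name, clf in models.items():
    scores = cross_val_score(clf, X, y_encoded, cv=5)
    print(f"{name}: mean accuracy {scores.mean():.4f} (+/- {scores.std():.4f})")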
"""
# Conclusion
Through this project, we developed and compared three models—Decision Tree, Neural Network, and Random Forest—to optimize the Readahead feature under varying workloads. Both the Decision Tree and Random Forest models achieved perfect accuracy, demonstrating their strength in handling this classification task. The Neural Network, while slightly less accurate, offered flexibility in model design and captured complex relationships within the data. Given these results, the Random Forest model stands out for its combination of accuracy and interpretability, making it a strong candidate for real-time systems that require dynamic adjustment of Readahead sizes based on current workloads.
"""