
Commit

Merge remote-tracking branch 'origin/file_save'
# Conflicts:
#	machine_learning/main.py
#	machine_learning/training.py
casparemiljensen committed Mar 18, 2024
2 parents b5821df + 98c85d7 commit d2b1e1c
Showing 5 changed files with 145 additions and 119 deletions.
77 changes: 61 additions & 16 deletions data_processing/data_cleansing.py
@@ -1,28 +1,73 @@
import os

import pandas as pd
import requests
from machine_learning.pre_processing import load_data

PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))

input_file = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights.csv")
output_file = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights_processed.csv")
model_url = "https://kilthub.cmu.edu/ndownloader/files/26385151"


def main():
    try:
        if os.path.exists("data/datasets/rodrigues/flights.csv"):
            print("Found flights.csv")
            filter_flights()
        else:
            print('''
--------------------------------------------
File 'flights.csv' not found.
Downloading the file 'flights.csv' from:
https://doi.org/10.1184/R1/12683453.v1
--------------------------------------------''')
            download_model()
            filter_flights()
    except FileNotFoundError:
        print('''
--------------------------------------------
Error: File 'flights.csv' not found.
Please download the file 'flights.csv' from:
https://doi.org/10.1184/R1/12683453.v1
--------------------------------------------''')
        raise Exception('An error occurred')


def download_model():
    response = requests.get(model_url, stream=True)
    print("Downloading flights.csv file")

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        with open(input_file, "wb") as model_file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    model_file.write(chunk)

        # Check if the file exists after writing
        if os.path.exists(input_file):
            print(f"Download successful: {input_file}")
            # Report the number of rows in the downloaded flights.csv
            df = pd.read_csv(input_file)
            num_rows = len(df)
            print("Number of rows in flights.csv:", num_rows)
            return True
        else:
            print("Error: File not found after download.")
    else:
        print(f"Error: Failed to download file. Status code: {response.status_code}")

    return False


def filter_flights():
    print("Generating flights_processed")
    data = load_data(input_file)
    # Keep only flights whose route starts with R1 through R7
    data = data[data['route'].apply(lambda x: any(x.startswith(f'R{i}') for i in range(1, 8)))]
    data_processed = data
    data_processed.to_csv(output_file, index=False)

    # Report the number of rows in the processed file
    df = pd.read_csv(output_file)
    num_rows = len(df)
    print("Number of rows in flights_processed.csv:", num_rows)


if __name__ == '__main__':
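A quick way to sanity-check the route filter in filter_flights() is to run the same predicate on a toy frame. This is a minimal sketch assuming only pandas; the route values are made up for illustration:

import pandas as pd

# Toy data to illustrate the route filter (route values here are hypothetical)
toy = pd.DataFrame({"route": ["R1", "R3", "R7", "R8", "A2"]})

# Same predicate as filter_flights(): keep routes starting with R1..R7
mask = toy["route"].apply(lambda x: any(x.startswith(f"R{i}") for i in range(1, 8)))
print(toy[mask])  # keeps R1, R3, R7; drops R8 and A2

Note that because the check is a prefix match, a value such as "R10" would also pass (it starts with "R1").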
2 changes: 2 additions & 0 deletions data_processing/requirements.txt
@@ -2,3 +2,5 @@ notebook==7.1.1
pandas==2.2.1
pyarrow==15.0.1
matplotlib==3.8.3
requests~=2.31.0
scipy~=1.12.0
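The two new pins use the compatible-release operator: requests~=2.31.0 allows any 2.31.x patch release but not 2.32, and likewise scipy~=1.12.0 stays on 1.12.x.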
29 changes: 21 additions & 8 deletions machine_learning/main.py
@@ -1,28 +1,41 @@
import os
import pickle
from machine_learning.pre_processing import pre_process_and_split_data
from machine_learning.prepare_for_training import format_data
from machine_learning.training import train_model, evaluate_model

PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
flights_processed = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights_processed.csv")

# Define the path for saving/loading the model
MODEL_FILE_PATH = os.path.join(PROJECT_ROOT, "machine_learning/model_file/trained_model.pkl")


def train():
    input_file = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights_processed.csv")

    # organizing
    print("Splitting data...")
    train_data, test_data = pre_process_and_split_data(input_file)
    print("Formatting data...")
    train_data = format_data(train_data)
    test_data = format_data(test_data)

    # Check if the model file exists
    if os.path.exists(MODEL_FILE_PATH):
        print("Loading pre-trained model...")
        with open(MODEL_FILE_PATH, 'rb') as model_file:
            model = pickle.load(model_file)
    else:
        print("Training...")
        model = train_model(train_data)

    # Save the trained model
    print("Saving trained model...")
    with open(MODEL_FILE_PATH, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Perform prediction using the trained model
    evaluate_model(model, test_data)


if __name__ == "__main__":
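The load-or-train pattern above keys everything on a single pickle path. A minimal standalone sketch of the same idea follows; load_or_train and train_fn are hypothetical names, not part of this repo:

import os
import pickle

def load_or_train(model_path, train_fn):
    # Reuse a cached model if present; otherwise train it and cache it
    if os.path.exists(model_path):
        with open(model_path, "rb") as f:
            return pickle.load(f)
    model = train_fn()
    # Unlike the code above, create the cache directory if it is missing
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    return model

One caveat with the repo's version: it assumes machine_learning/model_file/ already exists, and pickle.load will execute whatever is in the file, so the cache path should be trusted.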
148 changes: 57 additions & 91 deletions machine_learning/training.py
@@ -1,116 +1,82 @@
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from machine_learning.config import HPConfig
from machine_learning.prepare_for_training import TrainingDataset


def train_model(train_data):
    training_dataset = TrainingDataset(train_data)

    # Instantiate the decision tree model with specified hyperparameters
    model = DecisionTreeRegressor(criterion=HPConfig.criterion, max_depth=HPConfig.max_depth,
                                  max_features=HPConfig.max_features, max_leaf_nodes=HPConfig.max_leaf_nodes,
                                  random_state=42)

    # Extract features and targets from the training dataset
    train_features = []
    train_targets = []
    for index in range(len(training_dataset)):
        input_array, target_array = training_dataset[index]
        train_features.append(input_array)
        train_targets.append(target_array)

    # Concatenate the lists along the appropriate axis
    train_features_np = np.concatenate(train_features, axis=0)
    train_targets_np = np.concatenate(train_targets, axis=0)

    # Fit the decision tree model
    model.fit(train_features_np, train_targets_np)

    return model


def evaluate_model(model, test_data):
    test_dataset = TrainingDataset(test_data)

    print("Evaluating...")
    # Extract features and targets from the test dataset
    test_features = []
    test_targets = []
    for index in range(len(test_dataset)):
        test_input_array, test_target_array = test_dataset[index]
        test_features.append(test_input_array)
        test_targets.append(test_target_array)

    # Concatenate the lists along the appropriate axis
    test_features_np = np.concatenate(test_features, axis=0)
    test_targets_np = np.concatenate(test_targets, axis=0)

    # Predict on the test set
    test_predictions = model.predict(test_features_np)

    # Calculate RMSE for the two output parameters
    test_rmse = np.sqrt(mean_squared_error(test_targets_np, test_predictions))
    print(f"Test Root Mean Squared Error (RMSE) for Voltage and Current: {test_rmse}")

    # Calculate power consumption predictions by multiplying voltage and current predictions
    power_consumption_predictions = test_predictions[:, 0] * test_predictions[:, 1]

    # Calculate the time difference between each timestamp
    time_diff = np.diff(test_features_np[:, 0], prepend=0)

    # Adjust power consumption predictions by multiplying with the time difference
    adjusted_power_consumption = power_consumption_predictions * time_diff

    # Calculate cumulative power consumption for each flight
    cumulative_power = np.cumsum(adjusted_power_consumption)

    # TODO: Caspar, cumulative power consumption is also computed the simple way here.
    # It should be the integral instead; the same function as in trapezoid_integration can be used.
    # Calculate RMSE on the original cumulative power for the test set
    test_targets_cumulative_power = np.cumsum(test_targets_np[:, 0] * test_targets_np[:, 1] * time_diff)
    original_test_rmse = np.sqrt(mean_squared_error(test_targets_cumulative_power, cumulative_power))

    print(f"Original Test Root Mean Squared Error (RMSE) for Cumulative Power: {original_test_rmse}")

    # Calculate RMSE on the adjusted cumulative power for the test set
    test_rmse = np.sqrt(
        mean_squared_error(test_targets_np[:, 0] * test_targets_np[:, 1], power_consumption_predictions))
    print(f"Adjusted Test Root Mean Squared Error (RMSE) for Cumulative Power: {test_rmse}")

    print("Evaluation finished!")

    return model
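The TODO above asks for trapezoidal integration in place of the simple power * time_diff cumulative sum. A minimal sketch of that replacement, assuming the first feature column holds timestamps (as the np.diff call above already does); cumulative_power_trapezoid is a hypothetical helper, not the repo's trapezoid_integration function, and scipy is already pinned in data_processing/requirements.txt:

import numpy as np
from scipy.integrate import cumulative_trapezoid

def cumulative_power_trapezoid(predictions, timestamps):
    # Instantaneous power = voltage * current (prediction columns 0 and 1)
    power = predictions[:, 0] * predictions[:, 1]
    # Trapezoidal rule over the timestamps instead of a left-sided time_diff;
    # initial=0 keeps the output the same length as the input
    return cumulative_trapezoid(power, x=timestamps, initial=0)

# Hypothetical usage inside evaluate_model:
# cumulative_power = cumulative_power_trapezoid(test_predictions, test_features_np[:, 0])

Either way, note that test_features_np concatenates all flights, so integrating across flight boundaries mixes flights; that caveat applies equally to the existing np.cumsum version.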
8 changes: 4 additions & 4 deletions main.py
@@ -3,11 +3,11 @@
from machine_learning.main import train


if os.path.exists("data/datasets/rodrigues/flights_processed.csv") and os.path.exists("data/datasets/rodrigues/flights.csv"):
    print("Correct files found. Very nice!")
else:
    print("Correct files not found: Generating...")
    data_processing.data_cleansing.main()


if __name__ == "__main__":
