
Commit

Merge remote-tracking branch 'origin/file_save'
# Conflicts:
#	machine_learning/main.py
#	machine_learning/training.py
casparemiljensen committed Mar 18, 2024
2 parents b5821df + 98c85d7 commit d2b1e1c
Showing 5 changed files with 145 additions and 119 deletions.
77 changes: 61 additions & 16 deletions data_processing/data_cleansing.py
@@ -1,28 +1,73 @@
import os

import pandas as pd
import requests
from machine_learning.pre_processing import load_data

PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))

input_file = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights.csv")
output_file = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights_processed.csv")
model_url = "https://kilthub.cmu.edu/ndownloader/files/26385151"


def main():
    try:
        if os.path.exists("data/datasets/rodrigues/flights.csv"):
            print("Found flights.csv")
            filter_flights()
        else:
            print('''
--------------------------------------------
File 'flights.csv' not found.
Downloading the file 'flights.csv' from:
https://doi.org/10.1184/R1/12683453.v1
--------------------------------------------''')
            download_model()
            filter_flights()
    except FileNotFoundError:
        print('''
--------------------------------------------
Error: File 'flights.csv' not found.
Please download the file 'flights.csv' from:
https://doi.org/10.1184/R1/12683453.v1
--------------------------------------------''')
        raise Exception('An error occurred')


def download_model():
    response = requests.get(model_url, stream=True)
    print("Downloading flights.csv file")

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        with open(input_file, "wb") as model_file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    model_file.write(chunk)

        # Check if the file exists after writing
        if os.path.exists(input_file):
            print(f"Download successful: {input_file}")
            # Report the number of rows in the downloaded flights.csv
            df = pd.read_csv(input_file)
            num_rows = len(df)
            print("Number of rows in flights.csv:", num_rows)
            return True
        else:
            print("Error: File not found after download.")
    else:
        print(f"Error: Failed to download file. Status code: {response.status_code}")

    return False


def filter_flights():
    print("Generating flights_processed")
    data = load_data(input_file)
    # Keep only flights whose route starts with R1 through R7
    data = data[data['route'].apply(lambda x: any(x.startswith(f'R{i}') for i in range(1, 8)))]
    data_processed = data
    data_processed.to_csv(output_file, index=False)

    # Report the number of rows in the processed file
    df = pd.read_csv(output_file)
    num_rows = len(df)
    print("Number of rows in flights_processed.csv:", num_rows)


if __name__ == '__main__':
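A quick way to sanity-check the route filter in filter_flights() is to run the same predicate on a toy frame. This is a minimal sketch assuming only pandas; the route values are made up for illustration:

import pandas as pd

# Toy data to illustrate the route filter (route values here are hypothetical)
toy = pd.DataFrame({"route": ["R1", "R3", "R7", "R8", "A2"]})

# Same predicate as filter_flights(): keep routes starting with R1..R7
mask = toy["route"].apply(lambda x: any(x.startswith(f"R{i}") for i in range(1, 8)))
print(toy[mask])  # keeps R1, R3, R7; drops R8 and A2

Note that because the check is a prefix match, a value such as "R10" would also pass (it starts with "R1").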
2 changes: 2 additions & 0 deletions data_processing/requirements.txt
@@ -2,3 +2,5 @@ notebook==7.1.1
pandas==2.2.1
pyarrow==15.0.1
matplotlib==3.8.3
requests~=2.31.0
scipy~=1.12.0
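The two new pins use the compatible-release operator: requests~=2.31.0 allows any 2.31.x patch release but not 2.32, and likewise scipy~=1.12.0 stays on 1.12.x.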
29 changes: 21 additions & 8 deletions machine_learning/main.py
@@ -1,28 +1,41 @@
import os
import pickle
from machine_learning.pre_processing import pre_process_and_split_data
from machine_learning.prepare_for_training import format_data
from machine_learning.training import train_model, evaluate_model

PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
flights_processed = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights_processed.csv")

# Define the path for saving/loading the model
MODEL_FILE_PATH = os.path.join(PROJECT_ROOT, "machine_learning/model_file/trained_model.pkl")


def train():
    input_file = os.path.join(PROJECT_ROOT, "data/datasets/rodrigues/flights_processed.csv")

    # organizing
    print("Splitting data...")
    train_data, test_data = pre_process_and_split_data(input_file)
    print("Formatting data...")
    train_data = format_data(train_data)
    test_data = format_data(test_data)

    # Check if the model file exists
    if os.path.exists(MODEL_FILE_PATH):
        print("Loading pre-trained model...")
        with open(MODEL_FILE_PATH, 'rb') as model_file:
            model = pickle.load(model_file)
    else:
        print("Training...")
        model = train_model(train_data)

    # Save the trained model
    print("Saving trained model...")
    with open(MODEL_FILE_PATH, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Perform prediction using the trained model
    evaluate_model(model, test_data)


if __name__ == "__main__":
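The load-or-train pattern above keys everything on a single pickle path. A minimal standalone sketch of the same idea follows; load_or_train and train_fn are hypothetical names, not part of this repo:

import os
import pickle

def load_or_train(model_path, train_fn):
    # Reuse a cached model if present; otherwise train it and cache it
    if os.path.exists(model_path):
        with open(model_path, "rb") as f:
            return pickle.load(f)
    model = train_fn()
    # Unlike the code above, create the cache directory if it is missing
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    return model

One caveat with the repo's version: it assumes machine_learning/model_file/ already exists, and pickle.load will execute whatever is in the file, so the cache path should be trusted.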
148 changes: 57 additions & 91 deletions machine_learning/training.py
@@ -1,116 +1,82 @@
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from machine_learning.config import HPConfig
from machine_learning.prepare_for_training import TrainingDataset


def train_model(train_data):
    training_dataset = TrainingDataset(train_data)

    # Instantiate the decision tree model with specified hyperparameters
    model = DecisionTreeRegressor(criterion=HPConfig.criterion, max_depth=HPConfig.max_depth,
                                  max_features=HPConfig.max_features, max_leaf_nodes=HPConfig.max_leaf_nodes,
                                  random_state=42)

    # Extract features and targets from the training dataset
    train_features = []
    train_targets = []
    for index in range(len(training_dataset)):
        input_array, target_array = training_dataset[index]
        train_features.append(input_array)
        train_targets.append(target_array)

    # Concatenate the lists along the appropriate axis
    train_features_np = np.concatenate(train_features, axis=0)
    train_targets_np = np.concatenate(train_targets, axis=0)

    # Fit the decision tree model
    model.fit(train_features_np, train_targets_np)

    return model


def evaluate_model(model, test_data):
    test_dataset = TrainingDataset(test_data)

    print("Evaluating...")
    # Extract features and targets from the test dataset
    test_features = []
    test_targets = []
    for index in range(len(test_dataset)):
        test_input_array, test_target_array = test_dataset[index]
        test_features.append(test_input_array)
        test_targets.append(test_target_array)

    # Concatenate the lists along the appropriate axis
    test_features_np = np.concatenate(test_features, axis=0)
    test_targets_np = np.concatenate(test_targets, axis=0)

    # Predict on the test set
    test_predictions = model.predict(test_features_np)

    # Calculate RMSE for the two output parameters
    test_rmse = np.sqrt(mean_squared_error(test_targets_np, test_predictions))
    print(f"Test Root Mean Squared Error (RMSE) for Voltage and Current: {test_rmse}")

    # Calculate power consumption predictions by multiplying voltage and current predictions
    power_consumption_predictions = test_predictions[:, 0] * test_predictions[:, 1]

    # Calculate the time difference between each timestamp
    time_diff = np.diff(test_features_np[:, 0], prepend=0)

    # Adjust power consumption predictions by multiplying with the time difference
    adjusted_power_consumption = power_consumption_predictions * time_diff

    # Calculate cumulative power consumption for each flight
    cumulative_power = np.cumsum(adjusted_power_consumption)

    # TODO: Caspar, cumulative power consumption is also computed the simple way here.
    # It should be the integral instead; the same function as in trapezoid_integration can be used.
    # Calculate RMSE on the original cumulative power for the test set
    test_targets_cumulative_power = np.cumsum(test_targets_np[:, 0] * test_targets_np[:, 1] * time_diff)
    original_test_rmse = np.sqrt(mean_squared_error(test_targets_cumulative_power, cumulative_power))

    print(f"Original Test Root Mean Squared Error (RMSE) for Cumulative Power: {original_test_rmse}")

    # Calculate RMSE on the adjusted cumulative power for the test set
    test_rmse = np.sqrt(
        mean_squared_error(test_targets_np[:, 0] * test_targets_np[:, 1], power_consumption_predictions))
    print(f"Adjusted Test Root Mean Squared Error (RMSE) for Cumulative Power: {test_rmse}")

    print("Evaluation finished!")

    return model
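The TODO above asks for trapezoidal integration in place of the simple power * time_diff cumulative sum. A minimal sketch of that replacement, assuming the first feature column holds timestamps (as the np.diff call above already does); cumulative_power_trapezoid is a hypothetical helper, not the repo's trapezoid_integration function, and scipy is already pinned in data_processing/requirements.txt:

import numpy as np
from scipy.integrate import cumulative_trapezoid

def cumulative_power_trapezoid(predictions, timestamps):
    # Instantaneous power = voltage * current (prediction columns 0 and 1)
    power = predictions[:, 0] * predictions[:, 1]
    # Trapezoidal rule over the timestamps instead of a left-sided time_diff;
    # initial=0 keeps the output the same length as the input
    return cumulative_trapezoid(power, x=timestamps, initial=0)

# Hypothetical usage inside evaluate_model:
# cumulative_power = cumulative_power_trapezoid(test_predictions, test_features_np[:, 0])

Either way, note that test_features_np concatenates all flights, so integrating across flight boundaries mixes flights; that caveat applies equally to the existing np.cumsum version.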
8 changes: 4 additions & 4 deletions main.py
@@ -3,11 +3,11 @@
from machine_learning.main import train


if os.path.exists("data/datasets/rodrigues/flights_processed.csv") and os.path.exists("data/datasets/rodrigues/flights.csv"):
    print("Correct files found. Very nice!")
else:
    print("Correct files not found: Generating...")
    data_processing.data_cleansing.main()


if __name__ == "__main__":
