new trains
dil-shana committed Jul 17, 2024
1 parent fdca870 commit 5899d92
Showing 35 changed files with 5,420 additions and 1,018 deletions.
Binary file removed dfpl/.single_label_model.py.swp
239 changes: 102 additions & 137 deletions dfpl/plot.py
@@ -1,53 +1,29 @@
from typing import List
import array

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wandb
from matplotlib.axes import Axes

# for NN model functions
from tensorflow.keras.callbacks import History
from matplotlib.axes import Axes


def get_max_validation_accuracy(history: History) -> str:
validation = smooth_curve(history.history["val_accuracy"])
y_max: float = max(validation)
return "Max validation accuracy ≈ " + str(round(y_max, 3) * 100) + "%"


def get_max_validation_balanced_accuracy(history: History) -> str:
validation_bal_acc = smooth_curve(history.history["val_balanced_accuracy"])
y_max: float = max(validation_bal_acc)
return "Max validation balanced accuracy ≈ " + str(round(y_max, 3) * 100) + "%"


def get_max_training_balanced_accuracy(history: History) -> str:
training_bal_acc = smooth_curve(history.history["balanced_accuracy"])
y_max: float = max(training_bal_acc)
return "Training balanced accuracy ≈ " + str(round(y_max, 3) * 100) + "%"


def get_max_training_auc(history: History) -> str:
training_auc = smooth_curve(history.history["auc"])
y_max: float = max(training_auc)
return "Validation AUC ≈ " + str(round(y_max, 3) * 100) + "%"
# for testing in Weights & Biases


def get_max_validation_auc(history: History) -> str:
validation_auc = smooth_curve(history.history["val_auc"])
y_max: float = max(validation_auc)
return "Validation AUC ≈ " + str(round(y_max, 3) * 100) + "%"
def get_max_validation_accuracy(history: History) -> str:
validation = smooth_curve(history.history['val_accuracy'])
y_max = max(validation)
return 'Max validation accuracy ≈ ' + str(round(y_max, 3) * 100) + '%'


def get_max_training_accuracy(history: History) -> str:
training = smooth_curve(history.history["accuracy"])
y_max: float = max(training)
return "Max training accuracy ≈ " + str(round(y_max, 3) * 100) + "%"
training = smooth_curve(history.history['accuracy'])
y_max = max(training)
return 'Max training accuracy ≈ ' + str(round(y_max, 3) * 100) + '%'


def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray:
smoothed_points: List[float] = []
def smooth_curve(points: array, factor: float = 0.75) -> array:
smoothed_points = []
for point in points:
if smoothed_points:
previous = smoothed_points[-1]
@@ -60,13 +36,13 @@ def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray:
def set_plot_history_data(ax: Axes, history: History, which_graph: str) -> None:
(train, valid) = (None, None)

if which_graph == "acc":
train = smooth_curve(history.history["accuracy"])
valid = smooth_curve(history.history["val_accuracy"])
if which_graph == 'acc':
train = smooth_curve(history.history['accuracy'])
valid = smooth_curve(history.history['val_accuracy'])

if which_graph == "loss":
train = smooth_curve(history.history["loss"])
valid = smooth_curve(history.history["val_loss"])
if which_graph == 'loss':
train = smooth_curve(history.history['loss'])
valid = smooth_curve(history.history['val_loss'])

# plt.xkcd() # make plots look like xkcd

@@ -75,69 +51,78 @@ def set_plot_history_data(ax: Axes, history: History, which_graph: str) -> None:
trim = 0 # remove first 5 epochs
# when graphing loss the first few epochs may skew the (loss) graph

ax.plot(epochs[trim:], train[trim:], "dodgerblue", linewidth=15, alpha=0.1)
ax.plot(epochs[trim:], train[trim:], "dodgerblue", label="Training")
ax.plot(epochs[trim:], train[trim:], 'dodgerblue', linewidth=15, alpha=0.1)
ax.plot(epochs[trim:], train[trim:], 'dodgerblue', label='Training')

ax.plot(epochs[trim:], valid[trim:], 'g', linewidth=15, alpha=0.1)
ax.plot(epochs[trim:], valid[trim:], 'g', label='Validation')

ax.plot(epochs[trim:], valid[trim:], "g", linewidth=15, alpha=0.1)
ax.plot(epochs[trim:], valid[trim:], "g", label="Validation")

def plot_loss(hist: History, file: str) -> None:
fig, ax = plt.subplots(1)
fig.suptitle('History of loss values (regression model)')
ax.plot(hist.epoch, hist.history['loss'], 'dodgerblue', linewidth=15, alpha=0.1)
ax.plot(hist.epoch, hist.history['loss'], 'dodgerblue', label='Training')
ax.plot(hist.epoch, hist.history['val_loss'], 'g', linewidth=15, alpha=0.1)
ax.plot(hist.epoch, hist.history['val_loss'], 'g', label='Validation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend(loc="upper right")
plt.tight_layout()
plt.savefig(fname=file, format='jpg')
plt.close()


def plot_history(history: History, file: str) -> None:
fig, (ax1, ax2) = plt.subplots(
nrows=2,
ncols=1,
figsize=(10, 6),
sharex="all",
gridspec_kw={"height_ratios": [5, 2]},
)
fig, (ax1, ax2) = plt.subplots(nrows=2,
ncols=1,
figsize=(10, 6),
sharex='all',
gridspec_kw={'height_ratios': [5, 2]})

set_plot_history_data(ax1, history, "acc")
set_plot_history_data(ax1, history, 'acc')

set_plot_history_data(ax2, history, "loss")
set_plot_history_data(ax2, history, 'loss')

# Accuracy graph
ax1.set_ylabel("Accuracy")
ax1.set_ylabel('Accuracy')
ax1.set_ylim(bottom=0.5, top=1)
ax1.legend(loc="lower right")
ax1.spines["top"].set_visible(False)
ax1.spines["right"].set_visible(False)
ax1.xaxis.set_ticks_position("none")
ax1.spines["bottom"].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.xaxis.set_ticks_position('none')
ax1.spines['bottom'].set_visible(False)

# max accuracy text
plt.text(
0.5,
0.6,
get_max_validation_balanced_accuracy(history),
horizontalalignment="right",
verticalalignment="top",
transform=ax1.transAxes,
fontsize=12,
)
plt.text(
0.5,
0.8,
get_max_training_balanced_accuracy(history),
horizontalalignment="right",
verticalalignment="top",
transform=ax1.transAxes,
fontsize=12,
)
plt.text(0.5,
0.6,
get_max_validation_accuracy(history),
horizontalalignment='right',
verticalalignment='top',
transform=ax1.transAxes,
fontsize=12)
plt.text(0.5,
0.8,
get_max_training_accuracy(history),
horizontalalignment='right',
verticalalignment='top',
transform=ax1.transAxes,
fontsize=12)

# Loss graph
ax2.set_ylabel("Loss")
ax2.set_ylabel('Loss')
ax2.set_yticks([])
ax2.plot(legend=False)
ax2.set_xlabel("Epochs")
ax2.spines["top"].set_visible(False)
ax2.spines["right"].set_visible(False)
ax2.set_xlabel('Epochs')
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)

plt.tight_layout()
plt.savefig(fname=file, format="svg")
plt.savefig(fname=file, format='svg')
plt.close()


def plot_train_history(hist, target, file_accuracy, file_loss):
def plotTrainHistory(hist, target, file_accuracy, file_loss):
"""
Plot the training performance in terms of accuracy and loss values for each epoch.
:param hist: The history returned by model.fit function
@@ -149,60 +134,44 @@ def plot_train_history(hist, target, file_accuracy, file_loss):

# plot accuracy
plt.figure()
plt.plot(hist.history["accuracy"])
if "val_accuracy" in hist.history.keys():
plt.plot(hist.history["val_accuracy"])
plt.title("Model accuracy - " + target)
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
if "val_accuracy" in hist.history.keys():
plt.legend(["Train", "Test"], loc="upper left")
plt.plot(hist.history['accuracy'])
if 'val_accuracy' in hist.history.keys():
plt.plot(hist.history['val_accuracy'])
plt.title('Model accuracy - ' + target)
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
if 'val_accuracy' in hist.history.keys():
plt.legend(['Train', 'Test'], loc='upper left')
else:
plt.legend(["Train"], loc="upper_left")
plt.savefig(fname=file_accuracy, format="svg")
plt.legend(['Train'], loc='upper_left')
plt.savefig(fname=file_accuracy, format='svg')

# Plot training & validation loss values
plt.figure()
plt.plot(hist.history["loss"])
plt.plot(hist.history["val_loss"])
plt.title("Model loss - " + target)
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Train", "Test"], loc="upper left")
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('Model loss - ' + target)
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()
plt.savefig(fname=file_loss, format="svg")
plt.savefig(fname=file_loss, format='svg')
plt.close()


def plot_history_vis(
hist: History,
model_hist_plot_path: str,
model_hist_csv_path: str,
model_hist_plot_path_a: str,
model_hist_plot_path_l: str,
target: str,
) -> None:
def plot_history_vis(hist: History, model_hist_plot_path: str, model_hist_csv_path: str,
model_hist_plot_path_a: str, model_hist_plot_path_l: str, target: str) -> None:
plot_history(history=hist, file=model_hist_plot_path)
histDF = pd.DataFrame(hist.history)
histDF.to_csv(model_hist_csv_path)

# plot accuracy and loss for the training and validation during training
plot_train_history(
hist=hist,
target=target,
file_accuracy=model_hist_plot_path_a,
file_loss=model_hist_plot_path_l,
)


def plot_auc(
fpr: np.ndarray,
tpr: np.ndarray,
auc_value: float,
target: str,
filename: str,
wandb_logging: bool = False,
) -> None:
plotTrainHistory(hist=hist, target=target,
file_accuracy=model_hist_plot_path_a,
file_loss=model_hist_plot_path_l)


def plot_auc(fpr: array, tpr: array, auc_value: float, target: str, filename: str) -> None:
"""
Plot the area under the curve to the provided file
@@ -211,18 +180,14 @@ def plot_auc(
:param auc_value: The value of the area under the curve
:param target: The name of the training target
:param filename: The filename to which the plot should be stored
:param wandb_logging: Whether to log the plot to wandb
:rtype: None
"""
# Create a boolean mask to filter out zero values
plt.figure()
plt.plot([0, 1], [0, 1], "k--")
plt.plot(fpr, tpr, label=f"Keras (area = {auc_value:.3f})")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve " + target)
plt.legend(loc="best")
plt.savefig(fname=filename, format="png")
if wandb_logging:
wandb.log({"roc_plot": plt})
plt.close()
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Keras (area = {:.3f})'.format(auc_value))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve ' + target)
plt.legend(loc='best')
plt.savefig(fname=filename, format='svg')
plt.close()
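
For reference, here is a minimal, self-contained sketch of the exponential smoothing that smooth_curve applies to every metric before its maximum is reported. It is not part of this commit: the helper name ema_smooth and the sample accuracy values are invented for illustration, and the loop body elided in the hunk above is assumed to follow the standard exponential-moving-average form.

from typing import List


def ema_smooth(points: List[float], factor: float = 0.75) -> List[float]:
    # Blend each point with the previous smoothed value (EMA with weight `factor`).
    smoothed: List[float] = []
    for point in points:
        if smoothed:
            smoothed.append(smoothed[-1] * factor + point * (1 - factor))
        else:
            smoothed.append(point)  # the first point is kept unchanged
    return smoothed


# A noisy validation-accuracy curve and its smoothed version:
val_acc = [0.60, 0.72, 0.68, 0.80, 0.76, 0.83]
print(ema_smooth(val_acc))       # ≈ [0.6, 0.63, 0.6425, 0.681875, ...]
print(max(ema_smooth(val_acc)))  # the value the get_max_* helpers round and report

Because each point is blended with earlier ones, the smoothed maximum sits slightly below the raw maximum, which keeps a single lucky epoch from dominating the reported accuracy.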
3 changes: 2 additions & 1 deletion dfpl/settings.py
@@ -34,6 +34,7 @@
nn_fp_numpy_type = np.float32
nn_fp_compressed_numpy_type = np.float32
nn_target_numpy_type = np.short
nn_target_numpy_type_regression = np.float32

nn_multi_fp_numpy_type = np.float32
nn_multi_fp_compressed_numpy_type = np.float32
@@ -49,4 +50,4 @@
# Training settings of the FNN that were magic numbers in the code before.
nn_train_min_delta = 0.0001
nn_train_check_period = 10
nn_train_patience = 20
nn_train_patience = 20
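
As a quick illustration of the new constant, the snippet below shows how a regression target array could be cast with nn_target_numpy_type_regression alongside the existing classification dtype. This is an assumption about usage -- the actual call sites live in the other files changed by this commit and are not shown here.

import numpy as np

from dfpl import settings

# Classification targets stay integral (np.short); regression targets become float32.
y_cls = np.asarray([0, 1, 1, 0], dtype=settings.nn_target_numpy_type)
y_reg = np.asarray([1.2, 0.7, 3.4], dtype=settings.nn_target_numpy_type_regression)
print(y_cls.dtype, y_reg.dtype)  # e.g. int16 float32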