From 1685f48c9c1285b4196a3960b23296d08e5eec86 Mon Sep 17 00:00:00 2001
From: mastoffel
Date: Fri, 30 Aug 2024 12:21:11 +0100
Subject: [PATCH 1/8] add Xy prediction plot with uncertainty

---
 autoemulate/plotting.py | 110 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 14 deletions(-)

diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py
index c0640a25..a80210b6 100644
--- a/autoemulate/plotting.py
+++ b/autoemulate/plotting.py
@@ -1,3 +1,5 @@
+import inspect
+
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.metrics import PredictionErrorDisplay
@@ -48,6 +50,7 @@ def _plot_single_fold(
     plot="standard",
     annotation=" ",
     output_index=0,
+    input_index=0,
 ):
     """Plots a single cv fold for a given model.
@@ -74,34 +77,59 @@
         The annotation to add to the plot title. Default is an empty string.
     output_index : int, optional
         The index of the output to plot. Default is 0.
+    input_index : int, optional
+        The index of the input variable to plot. Default is 0.
     """
+    # get cv fold test indices
     test_indices = cv_results[model_name]["indices"]["test"][fold_index]
-
-    true_values = y[test_indices]
-
-    predicted_values = cv_results[model_name]["estimator"][fold_index].predict(
-        X[test_indices]
-    )
+    X_test = X[test_indices]
+    y_test = y[test_indices]
+    # get model trained on cv fold train indices
+    model = cv_results[model_name]["estimator"][fold_index]
+    # should we return and plot uncertainty?
+    predict_params = inspect.signature(model.named_steps["model"].predict).parameters
+    if "return_std" in predict_params:
+        y_test_pred, y_test_std = model.predict(X_test, return_std=True)
+    else:
+        y_test_pred = model.predict(X_test)
+        y_test_std = None
     # if y is multi-output, we need to select the correct column
     if y.ndim > 1:
-        true_values = true_values[:, output_index]
-        predicted_values = predicted_values[:, output_index]
+        y_test = y_test[:, output_index]
+        y_test_pred = y_test_pred[:, output_index]
+        if y_test_std is not None:
+            y_test_std = y_test_std[:, output_index]
 
     match plot:
         case "standard":
             plot_type = "actual_vs_predicted"
         case "residual":
            plot_type = "residual_vs_predicted"
+        case "Xy":
+            plot_type = "Xy"
         case _:
-            ValueError(f"Invalid plot type: {plot}")
+            raise ValueError(f"Invalid plot type: {plot}")
 
-    # plot
-    display = PredictionErrorDisplay.from_predictions(
-        y_true=true_values, y_pred=predicted_values, kind=plot_type, ax=ax
-    )
-    title_suffix = f"{annotation}: {fold_index}"
-    ax.set_title(f"{model_name} - {title_suffix}")
+    if plot_type == "Xy":
+        # if X is multi-dimensional, we need to select the correct column
+        if X.ndim > 1:
+            X_test = X_test[:, input_index]
+        title_suffix = f"{annotation}: {fold_index}"
+        _plot_Xy(
+            X_test,
+            y_test,
+            y_test_pred,
+            y_test_std,
+            ax,
+            title=f"{model_name} - {title_suffix}",
+        )
+    else:
+        display = PredictionErrorDisplay.from_predictions(
+            y_true=y_test, y_pred=y_test_pred, kind=plot_type, ax=ax
+        )
+        title_suffix = f"{annotation}: {fold_index}"
+        ax.set_title(f"{model_name} - {title_suffix}")
 
 
 def _plot_best_fold_per_model(
@@ -338,3 +366,57 @@ def _plot_model(model, X, y, plot="standard", n_cols=2, figsize=None):
             ax.set_visible(False)
 
     plt.show()
+
+
+def _plot_Xy(X, y, y_pred, y_std=None, ax=None, title="Xy"):
+    # Sort the data
+    sort_idx = np.argsort(X).flatten()
+    X_sorted = X[sort_idx]
+    y_sorted = y[sort_idx]
+    y_pred_sorted = y_pred[sort_idx]
+    if y_std is not None:
+        y_std_sorted = y_std[sort_idx]
+
+    org_points_color = "#4B0082"
+    pred_points_color = "#00CED1"
+    pred_line_color = "#00CED1"
+    ci_color = "#9DC183"
+
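+    # plotting steps below: scatter the held-out data, overlay the point
+    # predictions, and shade mean ± 2*std (an approximate 95% interval)
+    # whenever the emulator reports a predictive standard deviation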
ax.scatter( + X_sorted, y_sorted, color=org_points_color, alpha=0.7, label="data", s=50 + ) + ax.scatter( + X_sorted, y_pred_sorted, color=pred_points_color, alpha=1, label="pred.", s=10 + ) + if y_std is not None: + ax.fill_between( + X_sorted, + y_pred_sorted - 2 * y_std_sorted, + y_pred_sorted + 2 * y_std_sorted, + color=ci_color, + alpha=0.5, + label="95% Confidence Interval", + ) + ax.plot(X_sorted, y_pred_sorted, color=pred_line_color, label="pred.") + + ax.set_xlabel("X") + ax.set_ylabel("y") + ax.set_title(title) + ax.grid(True, alpha=0.3) + + # Add legend + if y_std is not None: + ax.legend(["data", "pred.(±2σ)"], loc="best") + else: + ax.legend(["data", "pred."], loc="best") + # Calculate R2 score + r2 = 1 - np.sum((y_sorted - y_pred_sorted) ** 2) / np.sum( + (y_sorted - np.mean(y_sorted)) ** 2 + ) + ax.text( + 0.05, + 0.95, + f"R2 score: {r2:.3f}", + transform=ax.transAxes, + verticalalignment="top", + ) From dfb0e89712be0d9b5921f2ee16a038a93d53fc1f Mon Sep 17 00:00:00 2001 From: mastoffel Date: Fri, 30 Aug 2024 16:34:17 +0100 Subject: [PATCH 2/8] add draft for Xy plot for _plot_model --- autoemulate/compare.py | 21 ++- autoemulate/plotting.py | 335 ++++++++++++++++++++++++++++++++-------- tests/test_plotting.py | 10 ++ 3 files changed, 302 insertions(+), 64 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index 2f394c4b..7ea7b4df 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -414,7 +414,7 @@ def print_results(self, model=None, sort_by="r2"): def plot_results( self, model=None, - plot="standard", + plot="Xy", n_cols=3, figsize=None, output_index=0, @@ -428,7 +428,8 @@ def plot_results( If a model name is specified, plots all folds of that model. plot_type : str, optional The type of plot to draw: - “standard” draws the observed values (y-axis) vs. the predicted values (x-axis) (default). + "Xy" observed and predicted values vs. features, including 2σ error bands where available (default). + “standard” draws the observed values (y-axis) vs. the predicted values (x-axis). “residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). n_cols : int Number of columns in the plot grid. @@ -484,7 +485,15 @@ def evaluate_model(self, model=None): return scores_df - def plot_model(self, model, plot="standard", n_cols=2, figsize=None): + def plot_model( + self, + model, + plot="standard", + n_cols=2, + figsize=None, + input_index=0, + output_index=0, + ): """Plots the model predictions vs. the true values. Parameters @@ -497,6 +506,10 @@ def plot_model(self, model, plot="standard", n_cols=2, figsize=None): “residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). n_cols : int, optional Number of columns in the plot grid for multi-output. Default is 2. + input_index : int + Index of the input to plot. Default is 0. Only used if plot_type="Xy". + output_index : int + Index of the output to plot. Default is 0. Only used if plot_type="Xy". 
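+
+        Example
+        -------
+        A usage sketch rather than a runnable doctest; assumes ``ae`` is a
+        fitted AutoEmulate instance, and the short model name "gpt" is
+        illustrative:
+
+        >>> gpt = ae.get_model(name="gpt")  # doctest: +SKIP
+        >>> ae.plot_model(gpt, plot="Xy", input_index=0, output_index=0)  # doctest: +SKIP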
""" _plot_model( model, @@ -505,4 +518,6 @@ def plot_model(self, model, plot="standard", n_cols=2, figsize=None): plot, n_cols, figsize, + input_index=input_index, + output_index=output_index, ) diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index a80210b6..dff09b6d 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -3,6 +3,7 @@ import matplotlib.pyplot as plt import numpy as np from sklearn.metrics import PredictionErrorDisplay +from sklearn.pipeline import Pipeline from autoemulate.utils import get_model_name @@ -126,7 +127,12 @@ def _plot_single_fold( ) else: display = PredictionErrorDisplay.from_predictions( - y_true=y_test, y_pred=y_test_pred, kind=plot_type, ax=ax + y_true=y_test, + y_pred=y_test_pred, + kind=plot_type, + ax=ax, + scatter_kwargs={"edgecolor": "black", "linewidth": 0.5}, + line_kwargs={"linewidth": 1, "color": "#36454F"}, ) title_suffix = f"{annotation}: {fold_index}" ax.set_title(f"{model_name} - {title_suffix}") @@ -298,7 +304,16 @@ def _plot_results( _plot_best_fold_per_model(cv_results, X, y, n_cols, plot, figsize, output_index) -def _plot_model(model, X, y, plot="standard", n_cols=2, figsize=None): +def _plot_model( + model, + X, + y, + plot="standard", + n_cols=2, + figsize=None, + input_index=None, + output_index=None, +): """Plots the model predictions vs. the true values. Parameters @@ -311,64 +326,226 @@ def _plot_model(model, X, y, plot="standard", n_cols=2, figsize=None): Simulation output. plot : str, optional The type of plot to draw: - “standard” draws the observed values (y-axis) vs. the predicted values (x-axis) (default). - “residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). + "standard" draws the observed values (y-axis) vs. the predicted values (x-axis) (default). + "residual" draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). + "Xy" draws the input features vs. the output variables, including predictions. n_cols : int, optional The number of columns in the plot. Default is 2. figsize : tuple, optional Overrides the default figure size. + input_index : int or list of int, optional + The index(es) of the input feature(s) to plot for "Xy" plots. If None, all features are used. + output_index : int or list of int, optional + The index(es) of the output variable(s) to plot. If None, all outputs are used. 
""" - match plot: - case "standard": - plot_type = "actual_vs_predicted" - case "residual": - plot_type = "residual_vs_predicted" - case _: - ValueError(f"Invalid plot type: {plot}") + # Get predictions, with uncertainty if available + if isinstance(model, Pipeline): + predict_params = inspect.signature( + model.named_steps["model"].predict + ).parameters + else: + predict_params = inspect.signature(model.predict).parameters - # figsize - if figsize is None: - if y.ndim == 1 or y.shape[1] == 1: - figsize = (6, 4) - else: # Dynamic calculation for multi-output - n_outputs = y.shape[1] - n_rows = np.ceil(n_outputs / n_cols).astype(int) - figsize = (4 * n_cols, 4 * n_rows) - # predictions - y_pred = model.predict(X) - - if y.ndim == 1 or y.shape[1] == 1: # single output - _, ax = plt.subplots(figsize=figsize) - display = PredictionErrorDisplay.from_predictions( - y_true=y, y_pred=y_pred, kind=plot_type, ax=ax - ) - ax.set_title(f"{get_model_name(model)} - Test Set") - else: # Multi-output - n_outputs = y.shape[1] - n_rows = np.ceil(n_outputs / n_cols).astype(int) - fig, axs = plt.subplots( - n_rows, n_cols, figsize=figsize, constrained_layout=True - ) - axs = axs.flatten() + if "return_std" in predict_params: + y_pred, y_std = model.predict(X, return_std=True) + else: + y_pred = model.predict(X) + y_std = None + + # Ensure y and y_pred are 2D + y = np.atleast_2d(y) + y_pred = np.atleast_2d(y_pred) + if y_std is not None: + y_std = np.atleast_2d(y_std) + + n_samples, n_features = X.shape + n_outputs = y.shape[1] - for i in range(n_outputs): - if i < len( - axs - ): # Check to avoid index error if n_cols * n_rows > n_outputs + # Handle input and output indices + if input_index is None: + input_index = list(range(n_features)) + elif isinstance(input_index, int): + input_index = [input_index] + + if output_index is None: + output_index = list(range(n_outputs)) + elif isinstance(output_index, int): + output_index = [output_index] + + # Calculate number of subplots + if plot == "Xy": + n_plots = len(input_index) * len(output_index) + else: + n_plots = len(output_index) + + # Calculate number of rows + n_rows = int(np.ceil(n_plots / n_cols)) + + # Set up the figure + if figsize is None: + figsize = (5 * n_cols, 4 * n_rows) + fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) + axs = axs.flatten() + + plot_index = 0 + for out_idx in output_index: + if plot == "Xy": + for in_idx in input_index: + if plot_index < len(axs): + _plot_Xy( + X[:, in_idx], + y[:, out_idx], + y_pred[:, out_idx], + y_std[:, out_idx] if y_std is not None else None, + ax=axs[plot_index], + title=f"X{in_idx+1} vs. 
y{out_idx+1}", + ) + plot_index += 1 + else: + if plot_index < len(axs): display = PredictionErrorDisplay.from_predictions( - y_true=y[:, i], y_pred=y_pred[:, i], kind=plot_type, ax=axs[i] + y_true=y[:, out_idx], + y_pred=y_pred[:, out_idx], + kind="actual_vs_predicted" + if plot == "standard" + else "residual_vs_predicted", + ax=axs[plot_index], + scatter_kwargs={"edgecolor": "black", "alpha": 0.7}, + line_kwargs={"color": "red"}, ) - axs[i].set_title(f"{get_model_name(model)} - Test Set - Output {i+1}") + axs[plot_index].set_title( + f"{plot.capitalize()} Plot - Output {out_idx+1}" + ) + plot_index += 1 - # Hide any unused subplots if n_cols * n_rows > n_outputs - for ax in axs[n_outputs:]: - ax.set_visible(False) + # Hide any unused subplots + for ax in axs[plot_index:]: + ax.set_visible(False) + plt.tight_layout() plt.show() +# def _plot_model(model, X, y, plot="standard", n_cols=2, figsize=None): +# """Plots the model predictions vs. the true values. + +# Parameters +# ---------- +# model : object +# A fitted model. +# X : array-like, shape (n_samples, n_features) +# Simulation input. +# y : array-like, shape (n_samples, n_outputs) +# Simulation output. +# plot : str, optional +# The type of plot to draw: +# “standard” draws the observed values (y-axis) vs. the predicted values (x-axis) (default). +# “residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). +# n_cols : int, optional +# The number of columns in the plot. Default is 2. +# figsize : tuple, optional +# Overrides the default figure size. +# """ + +# match plot: +# case "standard": +# plot_type = "actual_vs_predicted" +# case "residual": +# plot_type = "residual_vs_predicted" +# case "Xy": +# plot_type = "Xy" +# case _: +# ValueError(f"Invalid plot type: {plot}") + +# # get predictions, with uncertainty if available +# # check if model is a pipeline +# if isinstance(model, Pipeline): +# predict_params = inspect.signature(model.named_steps["model"].predict).parameters +# else: +# predict_params = inspect.signature(model.predict).parameters + +# if "return_std" in predict_params: +# y_pred, y_std = model.predict(X, return_std=True) +# else: +# y_pred = model.predict(X) +# y_std = None + +# print(f"X: {X.shape}, y: {y.shape}, y_pred: {y_pred.shape}, y_std: {y_std}") + +# if plot_type == "Xy": +# # check if y dim an x dim are 1 +# n_outputs = y.shape[1] if y.ndim > 1 else 1 +# n_inputs = X.shape[1] if X.ndim > 1 else 1 + +# if n_outputs == 1 and n_inputs == 1: +# print(f"X: {X.shape}, y: {y.shape}, y_pred: {y_pred.shape}, y_std: {y_std.shape}") +# fig, ax = plt.subplots(figsize=(6, 4)) +# _plot_Xy(X, y, y_pred, y_std, ax, title=f"{get_model_name(model)} - Test Set") +# plt.show() + +# # limit to max 3 x 3 scatter plot matrix +# n_inputs = min(n_inputs, 3) +# n_outputs = min(n_outputs, 3) + + +# # else: +# # figsize = (3 * n_inputs, 3 * n_outputs) +# # fig, axs = plt.subplots(nrows = n_outputs, ncols = n_inputs, figsize=figsize, constrained_layout=True) +# # axs = np.atleast_2d(axs) +# # for i in range(n_outputs): +# # for j in range(n_inputs): +# # ax = axs[i * n_inputs + j] +# # _plot_Xy(X[:, j], y[:, i], y_pred[:, i], y_std[:, i], ax, title=f"X{j+1} vs. 
y{i+1}") +# # fig.suptitle(f"{get_model_name(model)} - Test Set Predictions", fontsize=32) +# # plt.show() +# # return + + +# # # figsize +# # if figsize is None: +# # if y.ndim == 1 or y.shape[1] == 1: +# # figsize = (6, 4) +# # else: # Dynamic calculation for multi-output +# # n_outputs = y.shape[1] +# # n_rows = np.ceil(n_outputs / n_cols).astype(int) +# # figsize = (4 * n_cols, 4 * n_rows) + + +# # if y.ndim == 1 or y.shape[1] == 1: # single output +# # _, ax = plt.subplots(figsize=figsize) +# # display = PredictionErrorDisplay.from_predictions( +# # y_true=y, y_pred=y_pred, kind=plot_type, ax=ax +# # ) +# # ax.set_title(f"{get_model_name(model)} - Test Set") +# # else: # Multi-output +# # n_outputs = y.shape[1] +# # n_rows = np.ceil(n_outputs / n_cols).astype(int) +# # fig, axs = plt.subplots( +# # n_rows, n_cols, figsize=figsize, constrained_layout=True +# # ) +# # axs = axs.flatten() + +# # for i in range(n_outputs): +# # if i < len( +# # axs +# # ): +# # display = PredictionErrorDisplay.from_predictions( +# # y_true=y[:, i], y_pred=y_pred[:, i], kind=plot_type, ax=axs[i] +# # ) +# # axs[i].set_title(f"{get_model_name(model)} - Test Set - Output {i+1}") + +# # # Hide any unused subplots if n_cols * n_rows > n_outputs +# # for ax in axs[n_outputs:]: +# # ax.set_visible(False) + +# # plt.show() + + def _plot_Xy(X, y, y_pred, y_std=None, ax=None, title="Xy"): + """ + Plots observed and predicted values vs. features, including 2σ error bands where available. + """ # Sort the data sort_idx = np.argsort(X).flatten() X_sorted = X[sort_idx] @@ -377,46 +554,82 @@ def _plot_Xy(X, y, y_pred, y_std=None, ax=None, title="Xy"): if y_std is not None: y_std_sorted = y_std[sort_idx] - org_points_color = "#4B0082" - pred_points_color = "#00CED1" - pred_line_color = "#00CED1" - ci_color = "#9DC183" + org_points_color = "Goldenrod" + pred_points_color = "#6A5ACD" + pred_line_color = "#6A5ACD" + ci_color = "lightblue" - ax.scatter( - X_sorted, y_sorted, color=org_points_color, alpha=0.7, label="data", s=50 - ) - ax.scatter( - X_sorted, y_pred_sorted, color=pred_points_color, alpha=1, label="pred.", s=10 - ) if y_std is not None: ax.fill_between( X_sorted, y_pred_sorted - 2 * y_std_sorted, y_pred_sorted + 2 * y_std_sorted, color=ci_color, - alpha=0.5, + alpha=0.25, label="95% Confidence Interval", ) - ax.plot(X_sorted, y_pred_sorted, color=pred_line_color, label="pred.") + ax.plot( + X_sorted, + y_pred_sorted, + color=pred_line_color, + label="pred.", + alpha=0.8, + linewidth=1, + ) # , linestyle='--' + ax.scatter( + X_sorted, + y_sorted, + color=org_points_color, + alpha=0.7, + edgecolor="black", + linewidth=0.5, + label="data", + ) + ax.scatter( + X_sorted, + y_pred_sorted, + color=pred_points_color, + edgecolor="black", + linewidth=0.5, + alpha=0.7, + label="pred.", + ) ax.set_xlabel("X") ax.set_ylabel("y") ax.set_title(title) ax.grid(True, alpha=0.3) + # Get the handles and labels for the scatter plots + handles, labels = ax.get_legend_handles_labels() + # Add legend if y_std is not None: - ax.legend(["data", "pred.(±2σ)"], loc="best") + ax.legend( + handles[-2:], + ["data", "pred.(±2σ)"], + loc="best", + handletextpad=0, + columnspacing=0, + ncol=2, + ) else: - ax.legend(["data", "pred."], loc="best") + ax.legend( + handles[-2:], + ["data", "pred."], + loc="best", + handletextpad=0, + columnspacing=0, + ncol=2, + ) # Calculate R2 score r2 = 1 - np.sum((y_sorted - y_pred_sorted) ** 2) / np.sum( (y_sorted - np.mean(y_sorted)) ** 2 ) ax.text( 0.05, - 0.95, - f"R2 score: {r2:.3f}", + 0.05, + f"R\u00B2 = 
{r2:.2f}", transform=ax.transAxes, - verticalalignment="top", + verticalalignment="bottom", ) diff --git a/tests/test_plotting.py b/tests/test_plotting.py index f791270e..09576cec 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -4,6 +4,8 @@ from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split +from autoemulate.emulators import RadialBasisFunctions +from autoemulate.plotting import _plot_model from autoemulate.plotting import _plot_single_fold from autoemulate.plotting import _validate_inputs from autoemulate.plotting import check_multioutput @@ -156,3 +158,11 @@ def test_plot_single_fold_with_multioutput(): # Assert that the plot is displayed correctly assert ax.get_title() == "model1 - Test: 0" # assert ax.texts[0].get_text() == "$R^2$ = 0.900" + + +def test__plot_model_Xy(): + X, y = make_regression(n_samples=30, n_features=1, n_targets=1) + model = RadialBasisFunctions() + model.fit(X, y) + # assert that plot_model runs + # _plot_model(model, X, y, plot="Xy") From c198c529468b6838e93e528f7d07de120f507502 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Fri, 6 Sep 2024 09:32:53 +0100 Subject: [PATCH 3/8] refactor --- autoemulate/plotting.py | 44 ++++++++++++++++++++++++++----- tests/test_plotting.py | 58 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 7 deletions(-) diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index dff09b6d..cc708db8 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -41,6 +41,40 @@ def check_multioutput(y, output_index): ) +def _predict_with_optional_std(model, X_test): + """Predicts the output of the model with or without uncertainty. + + Parameters + ---------- + model : object + A fitted model. + X_test : array-like, shape (n_samples, n_features) + Simulation input. + + Returns + ------- + y_test_pred : array-like, shape (n_samples, n_outputs) + Simulation output. + y_test_std : array-like, shape (n_samples, n_outputs) + Standard deviation of the simulation output. + """ + # see whether the model is a pipeline or not + if isinstance(model, Pipeline): + predict_params = inspect.signature( + model.named_steps["model"].predict + ).parameters + else: + predict_params = inspect.signature(model.predict).parameters + # see whether the model has return_std in its predict parameters + if "return_std" in predict_params: + y_test_pred, y_test_std = model.predict(X_test, return_std=True) + else: + y_test_pred = model.predict(X_test) + y_test_std = None + + return y_test_pred, y_test_std + + def _plot_single_fold( cv_results, X, @@ -85,15 +119,11 @@ def _plot_single_fold( test_indices = cv_results[model_name]["indices"]["test"][fold_index] X_test = X[test_indices] y_test = y[test_indices] + # get model trained on cv fold train indices model = cv_results[model_name]["estimator"][fold_index] - # should we return and plot uncertainty? 
- predict_params = inspect.signature(model.named_steps["model"].predict).parameters - if "return_std" in predict_params: - y_test_pred, y_test_std = model.predict(X_test, return_std=True) - else: - y_test_pred = model.predict(X_test) - y_test_std = None + # predict + y_test_pred, y_test_std = _predict_with_optional_std(model, X_test) # if y is multi-output, we need to select the correct column if y.ndim > 1: diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 09576cec..e31333ad 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -1,16 +1,40 @@ import matplotlib.pyplot as plt import numpy as np +import pytest from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split +from autoemulate.compare import AutoEmulate from autoemulate.emulators import RadialBasisFunctions from autoemulate.plotting import _plot_model +from autoemulate.plotting import _plot_results from autoemulate.plotting import _plot_single_fold +from autoemulate.plotting import _predict_with_optional_std from autoemulate.plotting import _validate_inputs from autoemulate.plotting import check_multioutput +@pytest.fixture +def ae_single_output(): + X, y = make_regression(n_samples=50, n_features=2, noise=0.5, random_state=42) + em = AutoEmulate() + em.setup(X, y, model_subset=["gpt", "rbf"]) + em.compare() + return em + + +@pytest.fixture +def ae_multi_output(): + X, y = make_regression( + n_samples=50, n_features=2, n_targets=2, noise=0.5, random_state=42 + ) + em = AutoEmulate() + em.setup(X, y, model_subset=["gpt", "rbf"]) + em.compare() + return em + + # ------------------------------ test validate_inputs ------------------------------ def test_validate_inputs_with_empty_cv_results(): cv_results = {} @@ -75,6 +99,23 @@ def test_check_multioutput_with_invalid_output_index(): ) +# ------------------------------ test _predict_with_optional_std -------------------- +def test_predict_with_optional_std(ae_single_output): + # test whether the function correctly returns None for rbf's std + rbf = ae_single_output.get_model(name="rbf") + X = ae_single_output.X + y_pred, y_std = _predict_with_optional_std(rbf, X) + assert y_pred.shape == (X.shape[0],) + assert y_std is None + + # test whether the function correctly returns the std for gpt + gpt = ae_single_output.get_model(name="gpt") + y_pred, y_std = _predict_with_optional_std(gpt, X) + assert y_pred.shape == (X.shape[0],) + assert y_std.shape == (X.shape[0],) + assert np.all(y_std >= 0) + + # ------------------------------ test plot_single_fold ------------------------------ def test_plot_single_fold_with_single_output(): # Generate synthetic data @@ -160,6 +201,23 @@ def test_plot_single_fold_with_multioutput(): # assert ax.texts[0].get_text() == "$R^2$ = 0.900" +# ------------------------------ test cv plotting full ------------------------------ +# def test_cv_plotting_full_single_output(ae_single_output): +# ae_single_output.plot_results() + + +def test_cv_plotting_full_single_output(ae_single_output, monkeypatch): + # Mock plt.show to do nothing + monkeypatch.setattr(plt, "show", lambda: None) + + cv_results = ae_single_output.cv_results + X = ae_single_output.X + y = ae_single_output.y + + fig = _plot_results(cv_results, X, y) + assert isinstance(fig, plt.Figure) + + def test__plot_model_Xy(): X, y = make_regression(n_samples=30, n_features=1, n_targets=1) model = RadialBasisFunctions() From f02e17d480b5d4c2472356e7db47206dfe4e67a7 Mon Sep 17 00:00:00 2001 
From: mastoffel Date: Fri, 6 Sep 2024 10:02:19 +0100 Subject: [PATCH 4/8] return fig --- autoemulate/compare.py | 3 ++- autoemulate/plotting.py | 53 ++++++++++++++++++++++------------------- tests/test_plotting.py | 1 + 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index 7ea7b4df..613d7f5d 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -441,7 +441,7 @@ def plot_results( model_name = ( _get_full_model_name(model, self.model_names) if model is not None else None ) - _plot_results( + figure = _plot_results( self.cv_results, self.X, self.y, @@ -451,6 +451,7 @@ def plot_results( figsize=figsize, output_index=output_index, ) + return figure def evaluate_model(self, model=None): """ diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index cc708db8..5d934b70 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -42,22 +42,7 @@ def check_multioutput(y, output_index): def _predict_with_optional_std(model, X_test): - """Predicts the output of the model with or without uncertainty. - - Parameters - ---------- - model : object - A fitted model. - X_test : array-like, shape (n_samples, n_features) - Simulation input. - - Returns - ------- - y_test_pred : array-like, shape (n_samples, n_outputs) - Simulation output. - y_test_std : array-like, shape (n_samples, n_outputs) - Standard deviation of the simulation output. - """ + """Predicts the output of the model with or without uncertainty.""" # see whether the model is a pipeline or not if isinstance(model, Pipeline): predict_params = inspect.signature( @@ -122,7 +107,6 @@ def _plot_single_fold( # get model trained on cv fold train indices model = cv_results[model_name]["estimator"][fold_index] - # predict y_test_pred, y_test_std = _predict_with_optional_std(model, X_test) # if y is multi-output, we need to select the correct column @@ -204,25 +188,33 @@ def _plot_best_fold_per_model( if figsize is None: figsize = (4 * n_cols, 3 * n_rows) - plt.figure(figsize=figsize) + fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) + axs = axs.flatten() + # plt.figure(figsize=figsize) for i, model_name in enumerate(cv_results): best_fold_index = np.argmax(cv_results[model_name]["test_r2"]) - ax = plt.subplot(n_rows, n_cols, i + 1) _plot_single_fold( cv_results, X, y, model_name, best_fold_index, - ax, + axs[i], plot=plot, annotation="Best CV-fold", output_index=output_index, ) + + # hide unused subplots + for j in range(i + 1, len(axs)): + axs[j].set_visible(False) + plt.tight_layout() plt.show() + return fig + def _plot_model_folds( cv_results, @@ -263,24 +255,31 @@ def _plot_model_folds( if figsize is None: figsize = (4 * n_cols, 3 * n_rows) - plt.figure(figsize=figsize) + fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) + axs = axs.flatten() + # plt.figure(figsize=figsize) for i in range(n_folds): - ax = plt.subplot(n_rows, n_cols, i + 1) _plot_single_fold( cv_results, X, y, model_name, i, - ax, + axs[i], plot, annotation="CV-fold", output_index=output_index, ) + # hide unused subplots + for j in range(i + 1, len(axs)): + axs[j].set_visible(False) + plt.tight_layout() plt.show() + return fig + def _plot_results( cv_results, @@ -320,7 +319,7 @@ def _plot_results( check_multioutput(y, output_index) if model_name: - _plot_model_folds( + figure = _plot_model_folds( cv_results, X, y, @@ -331,7 +330,11 @@ def _plot_results( output_index, ) else: - _plot_best_fold_per_model(cv_results, X, y, n_cols, plot, figsize, 
output_index) + figure = _plot_best_fold_per_model( + cv_results, X, y, n_cols, plot, figsize, output_index + ) + + return figure def _plot_model( diff --git a/tests/test_plotting.py b/tests/test_plotting.py index e31333ad..a39583c9 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -215,6 +215,7 @@ def test_cv_plotting_full_single_output(ae_single_output, monkeypatch): y = ae_single_output.y fig = _plot_results(cv_results, X, y) + print(fig.axes) assert isinstance(fig, plt.Figure) From 89c457ed465c1be470c6522e249964410ae3e598 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Fri, 6 Sep 2024 16:29:47 +0100 Subject: [PATCH 5/8] fix plot_results --- autoemulate/compare.py | 4 ++ autoemulate/plotting.py | 54 ++++++++++---- tests/test_plotting.py | 152 +++++++++++++++++++++++++++++++++++----- 3 files changed, 179 insertions(+), 31 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index 613d7f5d..bbcfd028 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -418,6 +418,7 @@ def plot_results( n_cols=3, figsize=None, output_index=0, + input_index=0, ): """Plots the results of the cross-validation. @@ -437,6 +438,8 @@ def plot_results( Overrides the default figure size. output_index : int Index of the output to plot. Default is 0. + input_index : int + Index of the input to plot. Default is 0. """ model_name = ( _get_full_model_name(model, self.model_names) if model is not None else None @@ -450,6 +453,7 @@ def plot_results( plot=plot, figsize=figsize, output_index=output_index, + input_index=input_index, ) return figure diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index 5d934b70..bcc6ba90 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -1,4 +1,5 @@ import inspect +import time import matplotlib.pyplot as plt import numpy as np @@ -67,7 +68,7 @@ def _plot_single_fold( model_name, fold_index, ax, - plot="standard", + plot="Xy", annotation=" ", output_index=0, input_index=0, @@ -90,6 +91,7 @@ def _plot_single_fold( The axes on which to plot the results. plot : str, optional The type of plot to draw: + "Xy" draws the input features vs. the output variables, including predictions. “standard” draws the observed values (y-axis) vs. the predicted values (x-axis) (default). “residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). @@ -109,8 +111,16 @@ def _plot_single_fold( model = cv_results[model_name]["estimator"][fold_index] y_test_pred, y_test_std = _predict_with_optional_std(model, X_test) + # check output_index is valid and select the correct column + if y.ndim == 1: + if output_index > 0: + raise ValueError("output_index must be 0 for single-output data.") # if y is multi-output, we need to select the correct column if y.ndim > 1: + if output_index >= y.shape[1]: + raise ValueError( + f"output_index {output_index} is out of range. The index should be between 0 and {y.shape[1] - 1}." + ) y_test = y_test[:, output_index] y_test_pred = y_test_pred[:, output_index] if y_test_std is not None: @@ -127,6 +137,10 @@ def _plot_single_fold( ValueError(f"Invalid plot type: {plot}") if plot_type == "Xy": + if input_index >= X.shape[1]: + raise ValueError( + f"input_index {input_index} is out of range. The index should be between 0 and {X.shape[1] - 1}." 
+ ) # if X is multi-dimensional, we need to select the correct column if X.ndim > 1: X_test = X_test[:, input_index] @@ -157,9 +171,10 @@ def _plot_best_fold_per_model( X, y, n_cols=3, - plot="standard", + plot="Xy", figsize=None, output_index=0, + input_index=0, ): """Plots results of the best (highest R^2) cv-fold for each model in cv_results. @@ -180,6 +195,8 @@ def _plot_best_fold_per_model( Width, height in inches. Overrides the default figure size. output_index : int, optional The index of the output to plot. Default is 0. + input_index : int, optional + The index of the input to plot. Default is 0. """ n_models = len(cv_results) @@ -188,10 +205,10 @@ def _plot_best_fold_per_model( if figsize is None: figsize = (4 * n_cols, 3 * n_rows) + plt.ioff() fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) axs = axs.flatten() # plt.figure(figsize=figsize) - for i, model_name in enumerate(cv_results): best_fold_index = np.argmax(cv_results[model_name]["test_r2"]) _plot_single_fold( @@ -204,15 +221,15 @@ def _plot_best_fold_per_model( plot=plot, annotation="Best CV-fold", output_index=output_index, + input_index=input_index, ) # hide unused subplots for j in range(i + 1, len(axs)): axs[j].set_visible(False) - plt.tight_layout() - plt.show() - + plt.ion() + # plt.show() return fig @@ -222,9 +239,10 @@ def _plot_model_folds( y, model_name, n_cols=3, - plot="standard", + plot="Xy", figsize=None, output_index=0, + input_index=0, ): """Plots all the folds for a given model. @@ -247,6 +265,8 @@ def _plot_model_folds( Overrides the default figure size. output_index : int, optional The index of the output to plot. Default is 0. + input_index : int, optional + The index of the input to plot. Default is 0. """ n_folds = len(cv_results[model_name]["estimator"]) @@ -255,9 +275,9 @@ def _plot_model_folds( if figsize is None: figsize = (4 * n_cols, 3 * n_rows) + plt.ioff() fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) axs = axs.flatten() - # plt.figure(figsize=figsize) for i in range(n_folds): _plot_single_fold( @@ -270,14 +290,14 @@ def _plot_model_folds( plot, annotation="CV-fold", output_index=output_index, + input_index=input_index, ) # hide unused subplots for j in range(i + 1, len(axs)): axs[j].set_visible(False) plt.tight_layout() - plt.show() - + plt.ion() return fig @@ -287,9 +307,10 @@ def _plot_results( y, model_name=None, n_cols=3, - plot="standard", + plot="Xy", figsize=None, output_index=0, + input_index=0, ): """Plots the results of cross-validation. @@ -313,6 +334,8 @@ def _plot_results( Overrides the default figure size. output_index : int, optional For multi-output: Index of the output variable to plot. + input_index : int, optional + For multi-output: Index of the input variable to plot. 
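+
+    Examples
+    --------
+    A usage sketch; ``cv_results``, ``X`` and ``y`` are assumed to come from
+    an AutoEmulate cross-validation run, and the model name mirrors the tests:
+
+    >>> fig = _plot_results(cv_results, X, y, plot="Xy")  # doctest: +SKIP
+    >>> fig = _plot_results(cv_results, X, y, model_name="RadialBasisFunctions")  # doctest: +SKIP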
""" _validate_inputs(cv_results, model_name) @@ -328,10 +351,11 @@ def _plot_results( plot, figsize, output_index, + input_index, ) else: figure = _plot_best_fold_per_model( - cv_results, X, y, n_cols, plot, figsize, output_index + cv_results, X, y, n_cols, plot, figsize, output_index, input_index ) return figure @@ -341,7 +365,7 @@ def _plot_model( model, X, y, - plot="standard", + plot="Xy", n_cols=2, figsize=None, input_index=None, @@ -457,7 +481,9 @@ def _plot_model( ax.set_visible(False) plt.tight_layout() - plt.show() + + +# plt.show() # def _plot_model(model, X, y, plot="standard", n_cols=2, figsize=None): diff --git a/tests/test_plotting.py b/tests/test_plotting.py index a39583c9..7d6e4771 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -15,22 +15,22 @@ from autoemulate.plotting import check_multioutput -@pytest.fixture +@pytest.fixture(scope="module") def ae_single_output(): X, y = make_regression(n_samples=50, n_features=2, noise=0.5, random_state=42) em = AutoEmulate() - em.setup(X, y, model_subset=["gpt", "rbf"]) + em.setup(X, y, model_subset=["gpt", "rbf", "sop"]) em.compare() return em -@pytest.fixture +@pytest.fixture(scope="module") def ae_multi_output(): X, y = make_regression( n_samples=50, n_features=2, n_targets=2, noise=0.5, random_state=42 ) em = AutoEmulate() - em.setup(X, y, model_subset=["gpt", "rbf"]) + em.setup(X, y, model_subset=["gpt", "rbf", "sop"]) em.compare() return em @@ -201,27 +201,145 @@ def test_plot_single_fold_with_multioutput(): # assert ax.texts[0].get_text() == "$R^2$ = 0.900" -# ------------------------------ test cv plotting full ------------------------------ -# def test_cv_plotting_full_single_output(ae_single_output): -# ae_single_output.plot_results() +# ------------------------------ test _plot_results ------------------------------ -def test_cv_plotting_full_single_output(ae_single_output, monkeypatch): +def test__plot_results(ae_single_output, monkeypatch): # Mock plt.show to do nothing monkeypatch.setattr(plt, "show", lambda: None) cv_results = ae_single_output.cv_results - X = ae_single_output.X - y = ae_single_output.y + X, y = ae_single_output.X, ae_single_output.y + # without model name fig = _plot_results(cv_results, X, y) - print(fig.axes) assert isinstance(fig, plt.Figure) + assert len(fig.axes) == 3 + + # with model name + fig = _plot_results(cv_results, X, y, model_name="RadialBasisFunctions") + assert isinstance(fig, plt.Figure) + assert len(fig.axes) == 6 # 5 cv folds, but three columns so 6 subplots are made + + +def test__plot_results_output_range(ae_multi_output, monkeypatch): + # Mock plt.show to do nothing + monkeypatch.setattr(plt, "show", lambda: None) + cv_results = ae_multi_output.cv_results + X, y = ae_multi_output.X, ae_multi_output.y + + # check that output index 1 works + fig_0 = _plot_results(cv_results, X, y, output_index=0) + fig_1 = _plot_results(cv_results, X, y, output_index=1) + assert isinstance(fig_1, plt.Figure) + assert len(fig_1.axes) == 3 + + # check that fig_0 and fig_1 are different + assert fig_0 != fig_1 + + # check that output index 2 raises an error + with pytest.raises(ValueError): + _plot_results(cv_results, X, y, output_index=2) + + +def test__plot_results_input_range(ae_multi_output, monkeypatch): + # Mock plt.show to do nothing + monkeypatch.setattr(plt, "show", lambda: None) + cv_results = ae_multi_output.cv_results + X, y = ae_multi_output.X, ae_multi_output.y + + # check that input index 1 works + fig_0 = _plot_results(cv_results, X, y, input_index=0) + fig_1 = 
_plot_results(cv_results, X, y, input_index=1)
+    assert isinstance(fig_0, plt.Figure)
+    assert isinstance(fig_1, plt.Figure)
+    assert len(fig_1.axes) == 3
+
+    # check that fig_0 and fig_1 are different
+    assert fig_0 != fig_1
+
+    # check that input index 2 raises an error (2 features)
+    with pytest.raises(ValueError):
+        _plot_results(cv_results, X, y, input_index=2)
+
+
+# ------------------------------ most important tests, does it work? ----------------
+# ------------------------------ test plot_results ----------------------------------
+
+# test Xy plots
+
+
+# test plots with best cv per model, Xy plot
+def test_plot_results(ae_single_output):
+    fig = ae_single_output.plot_results(plot="Xy")
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 3
+
+
+def test_plot_results_input_index(ae_single_output):
+    fig = ae_single_output.plot_results(input_index=1)
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 3
+
+
+def test_plot_results_input_index_out_of_range(ae_single_output):
+    with pytest.raises(ValueError):
+        ae_single_output.plot_results(input_index=2)
+
+
+def test_plot_results_output_index(ae_multi_output):
+    fig = ae_multi_output.plot_results(output_index=1)
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 3
+
+
+def test_plot_results_output_index_out_of_range(ae_multi_output):
+    with pytest.raises(ValueError):
+        ae_multi_output.plot_results(output_index=2)
+
+
+# test plots with best cv per model, standard plot
+def test_plot_results_standard(ae_single_output):
+    fig = ae_single_output.plot_results(plot="standard")
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 3
+
+
+def test_plot_results_output_index_standard(ae_multi_output):
+    fig = ae_multi_output.plot_results(plot="standard", output_index=1)
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 3
+
+
+def test_plot_results_output_index_standard_out_of_range(ae_multi_output):
+    with pytest.raises(ValueError):
+        ae_multi_output.plot_results(plot="standard", output_index=2)
+
+
+# test plots with all cv folds for a single model
+def test_plot_results_model(ae_single_output):
+    fig = ae_single_output.plot_results(model="gpt")
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 6  # 5 cv folds, but three columns so 6 subplots are made
+
+
+def test_plot_results_model_input_index(ae_single_output):
+    fig = ae_single_output.plot_results(model="gpt", input_index=1)
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 6
+
+
+def test_plot_results_model_output_index(ae_multi_output):
+    fig = ae_multi_output.plot_results(model="gpt", output_index=1)
+    assert isinstance(fig, plt.Figure)
+    assert len(fig.axes) == 6
+
+
+def test_plot_results_model_input_index_out_of_range(ae_single_output):
+    with pytest.raises(ValueError):
+        ae_single_output.plot_results(model="gpt", input_index=2)
+
+
+def test_plot_results_model_output_index_out_of_range(ae_multi_output):
+    with pytest.raises(ValueError):
+        ae_multi_output.plot_results(model="gpt", output_index=2)

From f3fd69796d3d4a6baaac988a93a515421402f085 Mon Sep 17 00:00:00 2001
From: mastoffel
Date: Tue, 10 Sep 2024 09:12:47 +0100
Subject: [PATCH 6/8] fix notebook double plotting in _plot_model

---
 autoemulate/compare.py  |  14 ++--
 autoemulate/plotting.py | 176 +++++++---------------------------------
tests/test_plotting.py | 16 +++- 3 files changed, 52 insertions(+), 154 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index bbcfd028..1e8119fe 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -493,11 +493,11 @@ def evaluate_model(self, model=None): def plot_model( self, model, - plot="standard", - n_cols=2, + plot="Xy", + n_cols=3, figsize=None, - input_index=0, output_index=0, + input_index=0, ): """Plots the model predictions vs. the true values. @@ -511,12 +511,12 @@ def plot_model( “residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). n_cols : int, optional Number of columns in the plot grid for multi-output. Default is 2. + output_index : int + Index of the output to plot. Default is 0.. input_index : int Index of the input to plot. Default is 0. Only used if plot_type="Xy". - output_index : int - Index of the output to plot. Default is 0. Only used if plot_type="Xy". """ - _plot_model( + fig = _plot_model( model, self.X[self.test_idxs], self.y[self.test_idxs], @@ -526,3 +526,5 @@ def plot_model( input_index=input_index, output_index=output_index, ) + + return fig diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index bcc6ba90..18e4d70c 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -205,7 +205,6 @@ def _plot_best_fold_per_model( if figsize is None: figsize = (4 * n_cols, 3 * n_rows) - plt.ioff() fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) axs = axs.flatten() # plt.figure(figsize=figsize) @@ -228,8 +227,8 @@ def _plot_best_fold_per_model( for j in range(i + 1, len(axs)): axs[j].set_visible(False) plt.tight_layout() - plt.ion() - # plt.show() + # prevent double plotting in notebooks + plt.close(fig) return fig @@ -275,7 +274,6 @@ def _plot_model_folds( if figsize is None: figsize = (4 * n_cols, 3 * n_rows) - plt.ioff() fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) axs = axs.flatten() @@ -297,7 +295,8 @@ def _plot_model_folds( axs[j].set_visible(False) plt.tight_layout() - plt.ion() + # prevent double plotting in notebooks + plt.close(fig) return fig @@ -366,7 +365,7 @@ def _plot_model( X, y, plot="Xy", - n_cols=2, + n_cols=3, figsize=None, input_index=None, output_index=None, @@ -395,29 +394,11 @@ def _plot_model( output_index : int or list of int, optional The index(es) of the output variable(s) to plot. If None, all outputs are used. """ - # Get predictions, with uncertainty if available - if isinstance(model, Pipeline): - predict_params = inspect.signature( - model.named_steps["model"].predict - ).parameters - else: - predict_params = inspect.signature(model.predict).parameters - - if "return_std" in predict_params: - y_pred, y_std = model.predict(X, return_std=True) - else: - y_pred = model.predict(X) - y_std = None - - # Ensure y and y_pred are 2D - y = np.atleast_2d(y) - y_pred = np.atleast_2d(y_pred) - if y_std is not None: - y_std = np.atleast_2d(y_std) + y_pred, y_std = _predict_with_optional_std(model, X) n_samples, n_features = X.shape - n_outputs = y.shape[1] + n_outputs = y.shape[1] if y.ndim > 1 else 1 # Handle input and output indices if input_index is None: @@ -430,6 +411,16 @@ def _plot_model( elif isinstance(output_index, int): output_index = [output_index] + # check that input_index and output_index are valid + if any(idx >= n_features for idx in input_index): + raise ValueError( + f"input_index {input_index} is out of range. 
The index should be between 0 and {n_features - 1}." + ) + if any(idx >= n_outputs for idx in output_index): + raise ValueError( + f"output_index {output_index} is out of range. The index should be between 0 and {n_outputs - 1}." + ) + # Calculate number of subplots if plot == "Xy": n_plots = len(input_index) * len(output_index) @@ -445,12 +436,20 @@ def _plot_model( fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False) axs = axs.flatten() + # if y is 1d, we need to make it 2d + if y.ndim == 1: + y = y.reshape(-1, 1) + if y_pred.ndim == 1: + y_pred = y_pred.reshape(-1, 1) + if y_std is not None and y_std.ndim == 1: + y_std = y_std.reshape(-1, 1) + plot_index = 0 for out_idx in output_index: if plot == "Xy": for in_idx in input_index: if plot_index < len(axs): - _plot_Xy( + a = _plot_Xy( X[:, in_idx], y[:, out_idx], y_pred[:, out_idx], @@ -469,7 +468,7 @@ def _plot_model( else "residual_vs_predicted", ax=axs[plot_index], scatter_kwargs={"edgecolor": "black", "alpha": 0.7}, - line_kwargs={"color": "red"}, + # line_kwargs={"color": "red"}, ) axs[plot_index].set_title( f"{plot.capitalize()} Plot - Output {out_idx+1}" @@ -479,126 +478,11 @@ def _plot_model( # Hide any unused subplots for ax in axs[plot_index:]: ax.set_visible(False) - plt.tight_layout() - -# plt.show() - - -# def _plot_model(model, X, y, plot="standard", n_cols=2, figsize=None): -# """Plots the model predictions vs. the true values. - -# Parameters -# ---------- -# model : object -# A fitted model. -# X : array-like, shape (n_samples, n_features) -# Simulation input. -# y : array-like, shape (n_samples, n_outputs) -# Simulation output. -# plot : str, optional -# The type of plot to draw: -# “standard” draws the observed values (y-axis) vs. the predicted values (x-axis) (default). -# “residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). -# n_cols : int, optional -# The number of columns in the plot. Default is 2. -# figsize : tuple, optional -# Overrides the default figure size. 
-# """ - -# match plot: -# case "standard": -# plot_type = "actual_vs_predicted" -# case "residual": -# plot_type = "residual_vs_predicted" -# case "Xy": -# plot_type = "Xy" -# case _: -# ValueError(f"Invalid plot type: {plot}") - -# # get predictions, with uncertainty if available -# # check if model is a pipeline -# if isinstance(model, Pipeline): -# predict_params = inspect.signature(model.named_steps["model"].predict).parameters -# else: -# predict_params = inspect.signature(model.predict).parameters - -# if "return_std" in predict_params: -# y_pred, y_std = model.predict(X, return_std=True) -# else: -# y_pred = model.predict(X) -# y_std = None - -# print(f"X: {X.shape}, y: {y.shape}, y_pred: {y_pred.shape}, y_std: {y_std}") - -# if plot_type == "Xy": -# # check if y dim an x dim are 1 -# n_outputs = y.shape[1] if y.ndim > 1 else 1 -# n_inputs = X.shape[1] if X.ndim > 1 else 1 - -# if n_outputs == 1 and n_inputs == 1: -# print(f"X: {X.shape}, y: {y.shape}, y_pred: {y_pred.shape}, y_std: {y_std.shape}") -# fig, ax = plt.subplots(figsize=(6, 4)) -# _plot_Xy(X, y, y_pred, y_std, ax, title=f"{get_model_name(model)} - Test Set") -# plt.show() - -# # limit to max 3 x 3 scatter plot matrix -# n_inputs = min(n_inputs, 3) -# n_outputs = min(n_outputs, 3) - - -# # else: -# # figsize = (3 * n_inputs, 3 * n_outputs) -# # fig, axs = plt.subplots(nrows = n_outputs, ncols = n_inputs, figsize=figsize, constrained_layout=True) -# # axs = np.atleast_2d(axs) -# # for i in range(n_outputs): -# # for j in range(n_inputs): -# # ax = axs[i * n_inputs + j] -# # _plot_Xy(X[:, j], y[:, i], y_pred[:, i], y_std[:, i], ax, title=f"X{j+1} vs. y{i+1}") -# # fig.suptitle(f"{get_model_name(model)} - Test Set Predictions", fontsize=32) -# # plt.show() -# # return - - -# # # figsize -# # if figsize is None: -# # if y.ndim == 1 or y.shape[1] == 1: -# # figsize = (6, 4) -# # else: # Dynamic calculation for multi-output -# # n_outputs = y.shape[1] -# # n_rows = np.ceil(n_outputs / n_cols).astype(int) -# # figsize = (4 * n_cols, 4 * n_rows) - - -# # if y.ndim == 1 or y.shape[1] == 1: # single output -# # _, ax = plt.subplots(figsize=figsize) -# # display = PredictionErrorDisplay.from_predictions( -# # y_true=y, y_pred=y_pred, kind=plot_type, ax=ax -# # ) -# # ax.set_title(f"{get_model_name(model)} - Test Set") -# # else: # Multi-output -# # n_outputs = y.shape[1] -# # n_rows = np.ceil(n_outputs / n_cols).astype(int) -# # fig, axs = plt.subplots( -# # n_rows, n_cols, figsize=figsize, constrained_layout=True -# # ) -# # axs = axs.flatten() - -# # for i in range(n_outputs): -# # if i < len( -# # axs -# # ): -# # display = PredictionErrorDisplay.from_predictions( -# # y_true=y[:, i], y_pred=y_pred[:, i], kind=plot_type, ax=axs[i] -# # ) -# # axs[i].set_title(f"{get_model_name(model)} - Test Set - Output {i+1}") - -# # # Hide any unused subplots if n_cols * n_rows > n_outputs -# # for ax in axs[n_outputs:]: -# # ax.set_visible(False) - -# # plt.show() + # prevent double plotting in notebooks + plt.close(fig) + return fig def _plot_Xy(X, y, y_pred, y_std=None, ax=None, title="Xy"): diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 7d6e4771..a64007c3 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -266,8 +266,6 @@ def test__plot_results_input_range(ae_multi_output, monkeypatch): # ------------------------------ most important tests, does it work? 
---------------- # ------------------------------ test plot_results ---------------------------------- -# test Xy plots - # test plots with best cv per model, Xy plot def test_plot_results(ae_single_output): @@ -343,3 +341,17 @@ def test_plot_results_model_input_index_out_of_range(ae_single_output): def test_plot_results_model_output_index_out_of_range(ae_multi_output): with pytest.raises(ValueError): ae_multi_output.plot_results(model="gpt", output_index=2) + + +# test _plot_model +def test__plot_model_int(ae_single_output): + fig = _plot_model( + ae_single_output.get_model(name="gpt"), + ae_single_output.X, + ae_single_output.y, + plot="standard", + input_index=0, + output_index=0, + ) + assert isinstance(fig, plt.Figure) + print(fig.axes) From 5f30b925755b3d28a9b4c725029f4d509bb4b3e8 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Tue, 10 Sep 2024 09:22:47 +0100 Subject: [PATCH 7/8] add plotting tests --- autoemulate/plotting.py | 4 ++-- tests/test_plotting.py | 42 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index 18e4d70c..72326040 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -455,7 +455,7 @@ def _plot_model( y_pred[:, out_idx], y_std[:, out_idx] if y_std is not None else None, ax=axs[plot_index], - title=f"X{in_idx+1} vs. y{out_idx+1}", + title=f"X{in_idx} vs. y{out_idx}", ) plot_index += 1 else: @@ -471,7 +471,7 @@ def _plot_model( # line_kwargs={"color": "red"}, ) axs[plot_index].set_title( - f"{plot.capitalize()} Plot - Output {out_idx+1}" + f"{plot.capitalize()} Plot - Output {out_idx}" ) plot_index += 1 diff --git a/tests/test_plotting.py b/tests/test_plotting.py index a64007c3..6acee56f 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -343,8 +343,46 @@ def test_plot_results_model_output_index_out_of_range(ae_multi_output): ae_multi_output.plot_results(model="gpt", output_index=2) -# test _plot_model +# ------------------------------ test _plot_model ------------------------------ def test__plot_model_int(ae_single_output): + fig = _plot_model( + ae_single_output.get_model(name="gpt"), + ae_single_output.X, + ae_single_output.y, + plot="Xy", + input_index=0, + output_index=0, + ) + assert isinstance(fig, plt.Figure) + assert fig.axes[0].get_title() == "X0 vs. y0" + + +def test__plot_model_list(ae_single_output): + fig = _plot_model( + ae_single_output.get_model(name="gpt"), + ae_single_output.X, + ae_single_output.y, + plot="Xy", + input_index=[0, 1], + output_index=[0], + ) + assert isinstance(fig, plt.Figure) + assert fig.axes[1].get_title() == "X1 vs. 
y0" + + +def test__plot_model_int_out_of_range(ae_single_output): + with pytest.raises(ValueError): + _plot_model( + ae_single_output.get_model(name="gpt"), + ae_single_output.X, + ae_single_output.y, + plot="Xy", + input_index=3, + output_index=2, + ) + + +def test__plot_model_standard(ae_single_output): fig = _plot_model( ae_single_output.get_model(name="gpt"), ae_single_output.X, @@ -354,4 +392,4 @@ def test__plot_model_int(ae_single_output): output_index=0, ) assert isinstance(fig, plt.Figure) - print(fig.axes) + assert fig.axes[0].get_title() == "Standard Plot - Output 0" From decbfd0f55fb6a6849941c002678387f04d8dde8 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Tue, 10 Sep 2024 12:32:13 +0100 Subject: [PATCH 8/8] debug ci debug ci fix ci fix ci fix ci fix ci fix ci remove prints from yml --- .github/workflows/ci.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 77067e88..e9eb196d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -69,12 +69,17 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} # Required for private repos file: ./coverage.xml # Specify the coverage report file fail_ci_if_error: true - + verbose: true + + - name: Store coverage file uses: actions/upload-artifact@v3 with: name: coverage path: .coverage.${{ matrix.python-version }} + include-hidden-files: true + + # Coverage job to comment on PRs and update README badge coverage: @@ -90,7 +95,7 @@ jobs: - name: Download coverage artifact uses: actions/download-artifact@v3 with: - name: 'coverage' + name: coverage # Comment coverage details on PR - name: Coverage comment