From 045686da79706d138ef9e4f63de47b530b569e81 Mon Sep 17 00:00:00 2001
From: Becca McBrayer
Date: Fri, 4 Aug 2023 15:35:01 -0400
Subject: [PATCH] Fix MASE error by passing through y_train (#4258)

* Add y_train as an argument to score_all_objectives

---
 docs/source/release_notes.rst                 |  1 +
 .../binary_classification_pipeline.py         |  2 +-
 evalml/pipelines/pipeline_base.py             | 16 +++++--
 .../time_series_classification_pipelines.py   | 10 ++++-
 .../time_series_regression_pipeline.py        |  1 +
 evalml/tests/pipeline_tests/test_pipelines.py | 42 +++++++++++++++++++
 6 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 0f9029a5df..a2c7964bbd 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -7,6 +7,7 @@ Release Notes
         * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
     * Fixes
         * Added support for pandas 2 :pr:`4216`
+        * Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258`
         * Update s3 bucket for docs image :pr:`4260`
     * Changes
         * Unpinned sktime version :pr:`4214`
diff --git a/evalml/pipelines/binary_classification_pipeline.py b/evalml/pipelines/binary_classification_pipeline.py
index 5c8ddae075..86f8e11520 100644
--- a/evalml/pipelines/binary_classification_pipeline.py
+++ b/evalml/pipelines/binary_classification_pipeline.py
@@ -87,7 +87,7 @@ def predict_proba(self, X, X_train=None, y_train=None):
         return super().predict_proba(X)
 
     @staticmethod
-    def _score(X, y, predictions, objective):
+    def _score(X, y, predictions, objective, y_train=None):
         """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score."""
         if predictions.ndim > 1:
             predictions = predictions.iloc[:, 1]
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
index a5b71c7b46..b89638afae 100644
--- a/evalml/pipelines/pipeline_base.py
+++ b/evalml/pipelines/pipeline_base.py
@@ -351,10 +351,18 @@ def score(self, X, y, objectives, X_train=None, y_train=None):
         """
 
     @staticmethod
-    def _score(X, y, predictions, objective):
-        return objective.score(y, predictions, X=X)
+    def _score(X, y, predictions, objective, y_train=None):
+        return objective.score(y, predictions, X=X, y_train=y_train)
 
-    def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
+    def _score_all_objectives(
+        self,
+        X,
+        y,
+        y_pred,
+        y_pred_proba,
+        objectives,
+        y_train=None,
+    ):
         """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score.
 
         Will raise a PipelineScoreError if any objectives fail.
@@ -366,6 +374,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
             y_pred_proba (pd.Dataframe, pd.Series, None): The predicted probabilities for classification problems.
                 Will be a DataFrame for multiclass problems and Series otherwise. Will be None for regression problems.
             objectives (list): List of objectives to score.
+            y_train (pd.Series or None): Training labels. Only used for time series, otherwise ignored.
 
         Returns:
             dict: Ordered dictionary with objectives and their scores.
@@ -390,6 +399,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
                     y,
                     y_pred_proba if objective.score_needs_proba else y_pred,
                     objective,
+                    y_train,
                 )
                 scored_successfully.update({objective.name: score})
             except Exception as e:
diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py
index df4d8b8597..b14dac3a15 100644
--- a/evalml/pipelines/time_series_classification_pipelines.py
+++ b/evalml/pipelines/time_series_classification_pipelines.py
@@ -282,11 +282,17 @@ def predict_in_sample(self, X, y, X_train, y_train, objective=None):
         return infer_feature_types(predictions)
 
     @staticmethod
-    def _score(X, y, predictions, objective):
+    def _score(X, y, predictions, objective, y_train=None):
         """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score."""
         if predictions.ndim > 1:
             predictions = predictions.iloc[:, 1]
-        return TimeSeriesClassificationPipeline._score(X, y, predictions, objective)
+        return TimeSeriesClassificationPipeline._score(
+            X,
+            y,
+            predictions,
+            objective,
+            y_train,
+        )
 
 
 class TimeSeriesMulticlassClassificationPipeline(TimeSeriesClassificationPipeline):
diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py
index 93c4ec2a5a..fbe4ef8fc5 100644
--- a/evalml/pipelines/time_series_regression_pipeline.py
+++ b/evalml/pipelines/time_series_regression_pipeline.py
@@ -102,6 +102,7 @@ def score(self, X, y, objectives, X_train=None, y_train=None):
             y_predicted,
             y_pred_proba=None,
             objectives=objectives,
+            y_train=y_train,
         )
 
     def get_forecast_period(self, X):
diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py
index 6720f4732d..91f1e3c9f8 100644
--- a/evalml/tests/pipeline_tests/test_pipelines.py
+++ b/evalml/tests/pipeline_tests/test_pipelines.py
@@ -1,4 +1,5 @@
 import io
+import math
 import os
 import pickle
 import re
@@ -1048,6 +1049,47 @@ def test_score_with_objective_that_requires_predict_proba(
     mock_predict.assert_called()
 
 
+@patch("evalml.pipelines.components.Estimator.predict")
+@patch("evalml.pipelines.components.Estimator.fit")
+def test_score_with_objective_that_requires_y_train(
+    mock_fit,
+    mock_predict,
+    dummy_time_series_regression_pipeline_class,
+    generate_seasonal_data,
+):
+    X, y = generate_seasonal_data(real_or_synthetic="real")(period=10)
+    X = X.reset_index()
+
+    split = math.floor(0.9 * len(X))
+    X_train, X_holdout = X.iloc[:split], X.iloc[split:]
+    y_train, y_holdout = y.iloc[:split], y.iloc[split:]
+
+    parameters = {
+        "pipeline": {
+            "max_delay": 0,
+            "gap": 2,
+            "forecast_horizon": 2,
+            "time_index": "Date",
+        },
+    }
+
+    mock_regression_pipeline = dummy_time_series_regression_pipeline_class(
+        parameters=parameters,
+    )
+
+    mock_predict.return_value = pd.Series([1] * len(y_holdout))
+
+    mock_regression_pipeline.fit(X_train, y_train)
+    mock_regression_pipeline.score(
+        X_holdout,
+        y_holdout,
+        ["mean absolute scaled error"],
+        X_train=X_train,
+        y_train=y_train,
+    )
+    mock_predict.assert_called()
+
+
 def test_score_auc(X_y_binary, logistic_regression_binary_pipeline):
     X, y = X_y_binary
     lr_pipeline = logistic_regression_binary_pipeline
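
For context on why MASE in particular surfaced this failure: unlike MAE or MAPE, its denominator comes from the training series (the in-sample mean absolute error of a one-step naive forecast), so the objective cannot be scored from holdout predictions alone. A minimal illustrative sketch of the non-seasonal formula in plain numpy, using a hypothetical `mase` helper rather than evalml's objective classes:

    import numpy as np

    def mase(y_true, y_pred, y_train):
        # Denominator: in-sample MAE of a one-step naive forecast on the
        # training series; this is why scoring needs y_train.
        naive_mae = np.mean(np.abs(np.diff(np.asarray(y_train, dtype=float))))
        # Numerator: MAE of the model's holdout predictions.
        holdout_mae = np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
        return holdout_mae / naive_mae

    y_train = np.array([10.0, 12.0, 11.0, 13.0, 12.0])
    print(mase([14.0, 13.0], [13.0, 13.5], y_train))  # 0.75 / 1.5 -> 0.5

Because the denominator depends only on `y_train`, scoring a pipeline with this objective but without passing the training targets through `_score_all_objectives` would fail, which is the path this patch fixes.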