From cca34e05d860fd4967bfb331ad0c271725453060 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Fri, 13 Sep 2024 17:42:44 +0100
Subject: [PATCH 1/4] Add column type validation tests for metrics/plot

Each of these functions requires some columns to be numeric.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 tests/metrics/test_efficiency.py | 13 +++++++++++++
 tests/metrics/test_pp.py         | 13 +++++++++++++
 tests/plot/test_cascade.py       | 13 +++++++++++++
 tests/plot/test_navchart.py      | 24 ++++++++++++++++++++++++
 4 files changed, 63 insertions(+)

diff --git a/tests/metrics/test_efficiency.py b/tests/metrics/test_efficiency.py
index b004a3c..50d023b 100644
--- a/tests/metrics/test_efficiency.py
+++ b/tests/metrics/test_efficiency.py
@@ -20,6 +20,19 @@ def test_required_columns(self):
         with self.assertRaises(ValueError):
             application_efficiency(df)
 
+    def test_required_column_types(self):
+        """Check that application_efficiency() validates column types."""
+        data = {
+            "problem": ["problem"],
+            "platform": ["platform"],
+            "application": ["application"],
+            "fom": ["non-numeric"],
+        }
+        df = pd.DataFrame(data)
+
+        with self.assertRaises(TypeError):
+            application_efficiency(df)
+
     def test_foms(self):
         """p3analysis.data.efficiency.foms"""
         data = {"problem": [], "platform": [], "application": [], "fom": []}
diff --git a/tests/metrics/test_pp.py b/tests/metrics/test_pp.py
index 40bf88f..1733689 100644
--- a/tests/metrics/test_pp.py
+++ b/tests/metrics/test_pp.py
@@ -20,6 +20,19 @@ def test_required_columns(self):
         with self.assertRaises(ValueError):
             pp(df)
 
+    def test_required_column_types(self):
+        """Check that pp() validates column types."""
+        data = {
+            "problem": ["problem"],
+            "platform": ["platform"],
+            "application": ["application"],
+            "arch eff": ["non-numeric"],
+        }
+        df = pd.DataFrame(data)
+
+        with self.assertRaises(TypeError):
+            pp(df)
+
     def test_effs(self):
         """p3analysis.data.pp.effs"""
         data = {
diff --git a/tests/plot/test_cascade.py b/tests/plot/test_cascade.py
index ef41ff7..34d6729 100644
--- a/tests/plot/test_cascade.py
+++ b/tests/plot/test_cascade.py
@@ -31,6 +31,19 @@ def test_required_columns(self):
         with self.assertRaises(ValueError):
             cascade(df, eff="invalid")
 
+    def test_required_column_types(self):
+        """Check that cascade() validates column types."""
+        data = {
+            "problem": ["problem"],
+            "platform": ["platform"],
+            "application": ["application"],
+            "app eff": ["non-numeric"],
+        }
+        df = pd.DataFrame(data)
+
+        with self.assertRaises(TypeError):
+            cascade(df, eff="app")
+
     def test_options(self):
         """p3analysis.plot.cascade.options"""
 
diff --git a/tests/plot/test_navchart.py b/tests/plot/test_navchart.py
index 95f0d92..6ea1ad4 100644
--- a/tests/plot/test_navchart.py
+++ b/tests/plot/test_navchart.py
@@ -47,6 +47,30 @@ def test_required_columns(self):
         })
         navchart(pp, cd)
 
+    def test_required_column_types(self):
+        """Check that navchart() validates column types."""
+        pp = pd.DataFrame({
+            "problem": ["problem"],
+            "platform": ["platform"],
+            "application": ["application"],
+            "app pp": [1.0],
+        })
+        cd = pd.DataFrame({
+            "problem": ["problem"],
+            "application": ["application"],
+            "divergence": [0.0],
+        })
+
+        with self.assertRaises(TypeError):
+            tmp = pp
+            tmp["app pp"] = "non-numeric"
+            navchart(tmp, cd)
+
+        with self.assertRaises(TypeError):
+            tmp = cd
+            tmp["divergence"] = "non-numeric"
+            navchart(pp, tmp)
+
     def test_options(self):
         """p3analysis.plot.navchart.options"""
 

From e66cc7b8c25a9d1014ce30e738fc8daf33a1adae Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Fri, 13 Sep 2024 17:46:55 +0100
Subject: [PATCH 2/4] Add regression test for efficiency with string FOM

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 tests/metrics/test_efficiency.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/metrics/test_efficiency.py b/tests/metrics/test_efficiency.py
index 50d023b..a6948cb 100644
--- a/tests/metrics/test_efficiency.py
+++ b/tests/metrics/test_efficiency.py
@@ -121,6 +121,28 @@ def test_efficiency(self):
         with self.assertRaises(TypeError):
             application_efficiency(df, foms="higher")
 
+    def test_non_numeric(self):
+        """Check that non-numeric data is correctly cast to numeric."""
+        # This is the same correctness test as above, but values are strings.
+        data = {
+            "problem": ["test"] * 10,
+            "platform": ["A", "B", "C", "D", "E"] * 2,
+            "application": ["latest"] * 5 + ["best"] * 5,
+            "fom": ["4", "8", "4", 0, "20"] + ["4", "10", "8", "20", "100"],
+        }
+        df = pd.DataFrame(data)
+
+        result = application_efficiency(df, foms="higher")
+
+        eff_data = {
+            "app eff": [1.0, 0.8, 0.5, 0.0, 0.2] + [1.0, 1.0, 1.0, 1.0, 1.0],
+        }
+        expected_data = data.copy()
+        expected_data.update(eff_data)
+        expected_df = pd.DataFrame(expected_data)
+
+        pd.testing.assert_frame_equal(result, expected_df)
+
 
 if __name__ == "__main__":
     unittest.main()

From 02bd22976eac6c168681cdd563f3268ee9635898 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Fri, 13 Sep 2024 17:55:31 +0100
Subject: [PATCH 3/4] Replace _require_numeric with _cast_to_numeric

Whereas _require_numeric simply checked that the values in a DataFrame
could be interpreted as numbers, _cast_to_numeric actually performs the
cast and returns the result.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 p3analysis/_utils.py                  | 9 +++++----
 p3analysis/metrics/_efficiency.py     | 4 ++--
 p3analysis/metrics/_pp.py             | 4 ++--
 p3analysis/plot/_cascade.py           | 4 ++--
 p3analysis/plot/_navchart.py          | 4 ++--
 p3analysis/plot/backend/matplotlib.py | 4 ++--
 p3analysis/plot/backend/pgfplots.py   | 4 ++--
 tests/metrics/test_efficiency.py      | 2 ++
 8 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/p3analysis/_utils.py b/p3analysis/_utils.py
index 87b70f9..a062a74 100644
--- a/p3analysis/_utils.py
+++ b/p3analysis/_utils.py
@@ -19,14 +19,15 @@ def _require_columns(df, columns):
             raise ValueError(msg % (column, str(columns)))
 
 
-def _require_numeric(df, columns):
+def _cast_to_numeric(df, columns):
     """
-    Check that the named columns are numeric.
+    Check that the named columns are numeric, and cast them.
     """
-
+    result = df.copy(deep=True)
     for column in columns:
         try:
-            pd.to_numeric(df[column])
+            result[column] = pd.to_numeric(df[column])
         except Exception:
             msg = "Column '%s' must contain only numeric values."
             raise TypeError(msg % (column))
+    return result
diff --git a/p3analysis/metrics/_efficiency.py b/p3analysis/metrics/_efficiency.py
index ed86198..a0ff097 100644
--- a/p3analysis/metrics/_efficiency.py
+++ b/p3analysis/metrics/_efficiency.py
@@ -3,7 +3,7 @@
 
 import numpy
 
-from p3analysis._utils import _require_columns, _require_numeric
+from p3analysis._utils import _cast_to_numeric, _require_columns
 
 
 def application_efficiency(df, foms="lower"):
@@ -45,7 +45,7 @@ def application_efficiency(df, foms="lower"):
     """
     required_columns = ["problem", "platform", "application", "fom"]
     _require_columns(df, required_columns)
-    _require_numeric(df, ["fom"])
+    df = _cast_to_numeric(df, ["fom"])
 
     if foms not in ["lower", "higher"]:
         raise ValueError("FOM interpretation must be 'lower' or 'higher'")
diff --git a/p3analysis/metrics/_pp.py b/p3analysis/metrics/_pp.py
index ea4ed9a..a318761 100644
--- a/p3analysis/metrics/_pp.py
+++ b/p3analysis/metrics/_pp.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 
-from p3analysis._utils import _require_columns, _require_numeric
+from p3analysis._utils import _cast_to_numeric, _require_columns
 
 
 def _hmean(series):
@@ -79,7 +79,7 @@ def pp(df):
     if len(efficiencies) == 0:
         msg = "DataFrame must contain a column named 'arch eff' or 'app eff'."
         raise ValueError(msg)
-    _require_numeric(df, efficiencies)
+    df = _cast_to_numeric(df, efficiencies)
 
     # Check that efficiencies are not given in percentages
     for eff in efficiencies:
diff --git a/p3analysis/plot/_cascade.py b/p3analysis/plot/_cascade.py
index a81630b..d1de20a 100644
--- a/p3analysis/plot/_cascade.py
+++ b/p3analysis/plot/_cascade.py
@@ -8,7 +8,7 @@
 
 import copy
 
-from p3analysis._utils import _require_columns, _require_numeric
+from p3analysis._utils import _cast_to_numeric, _require_columns
 
 
 def cascade(
@@ -115,7 +115,7 @@ def cascade(
         if eff_column not in df:
             msg = "DataFrame does not contain an '%s' column."
             raise ValueError(msg % (eff_column))
-    _require_numeric(df, [eff_column])
+    df = _cast_to_numeric(df, [eff_column])
 
     # Check there is only one entry per (application, platform) pair.
     grouped = df.groupby(["platform", "application"])
diff --git a/p3analysis/plot/_navchart.py b/p3analysis/plot/_navchart.py
index 1c41773..93c7bfd 100644
--- a/p3analysis/plot/_navchart.py
+++ b/p3analysis/plot/_navchart.py
@@ -3,7 +3,7 @@
 
 import copy
 
-from p3analysis._utils import _require_columns, _require_numeric
+from p3analysis._utils import _cast_to_numeric, _require_columns
 
 
 def navchart(
@@ -86,7 +86,7 @@ def navchart(
 
     _require_columns(pp, ["problem", "application"])
     _require_columns(cd, ["problem", "application", "divergence"])
-    _require_numeric(cd, ["divergence"])
+    cd = _cast_to_numeric(cd, ["divergence"])
 
     if len(cd["problem"].unique()) > 1:
         raise NotImplementedError(
diff --git a/p3analysis/plot/backend/matplotlib.py b/p3analysis/plot/backend/matplotlib.py
index c09d005..fce4056 100644
--- a/p3analysis/plot/backend/matplotlib.py
+++ b/p3analysis/plot/backend/matplotlib.py
@@ -15,7 +15,7 @@
 from matplotlib.path import Path
 
 import p3analysis.metrics
-from p3analysis._utils import _require_numeric
+from p3analysis._utils import _cast_to_numeric
 from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle
 from p3analysis.plot.backend import CascadePlot, NavChart
 
@@ -471,7 +471,7 @@ def __init__(
             if pp_column not in pp:
                 msg = "DataFrame does not contain an '%s' column."
                 raise ValueError(msg % (pp_column))
-        _require_numeric(pp, [pp_column])
+        pp = _cast_to_numeric(pp, [pp_column])
 
         # If the size is unset, default to 5 x 5
         if not size:
diff --git a/p3analysis/plot/backend/pgfplots.py b/p3analysis/plot/backend/pgfplots.py
index 030563a..d2ba1e8 100644
--- a/p3analysis/plot/backend/pgfplots.py
+++ b/p3analysis/plot/backend/pgfplots.py
@@ -14,7 +14,7 @@
 import pandas as pd
 
 import p3analysis.metrics
-from p3analysis._utils import _require_numeric
+from p3analysis._utils import _cast_to_numeric
 from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle
 from p3analysis.plot.backend import CascadePlot, NavChart
 
@@ -293,7 +293,7 @@ def __init__(
             if pp_column not in pp:
                 msg = "DataFrame does not contain an '%s' column."
                 raise ValueError(msg % (pp_column))
-        _require_numeric(pp, [pp_column])
+        pp = _cast_to_numeric(pp, [pp_column])
 
         # If the size is unset, default to 200pt x 200pt, otherwise set size
         plotwidth = "200pt"
diff --git a/tests/metrics/test_efficiency.py b/tests/metrics/test_efficiency.py
index a6948cb..ec3e356 100644
--- a/tests/metrics/test_efficiency.py
+++ b/tests/metrics/test_efficiency.py
@@ -5,6 +5,7 @@
 
 import pandas as pd
 
+from p3analysis._utils import _cast_to_numeric
 from p3analysis.metrics import application_efficiency
 
 
@@ -141,6 +142,7 @@ def test_non_numeric(self):
         expected_data.update(eff_data)
         expected_df = pd.DataFrame(expected_data)
 
+        expected_df = _cast_to_numeric(expected_df, ["fom", "app eff"])
         pd.testing.assert_frame_equal(result, expected_df)
 
 

From 8ed24540e6c2569b32ea93dfea76d2fc23d6939c Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Thu, 19 Sep 2024 10:12:13 +0100
Subject: [PATCH 4/4] Avoid false positives in _cast_to_numeric

Previously, _cast_to_numeric would raise a TypeError if a column was
missing. We now check that a column exists before trying to cast it.

This does result in some redundant checking, as some functions call both
_require_columns and _cast_to_numeric. We may want to consider
refactoring to combine all of this functionality into a single DataFrame
validation step.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 p3analysis/_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/p3analysis/_utils.py b/p3analysis/_utils.py
index a062a74..c656448 100644
--- a/p3analysis/_utils.py
+++ b/p3analysis/_utils.py
@@ -23,6 +23,7 @@ def _cast_to_numeric(df, columns):
     """
     Check that the named columns are numeric, and cast them.
     """
+    _require_columns(df, columns)
     result = df.copy(deep=True)
     for column in columns:
         try: