From cca34e05d860fd4967bfb331ad0c271725453060 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Fri, 13 Sep 2024 17:42:44 +0100 Subject: [PATCH 1/4] Add column type validation tests for metrics/plot Each of these functions requires some columns to be numeric. Signed-off-by: John Pennycook --- tests/metrics/test_efficiency.py | 13 +++++++++++++ tests/metrics/test_pp.py | 13 +++++++++++++ tests/plot/test_cascade.py | 13 +++++++++++++ tests/plot/test_navchart.py | 24 ++++++++++++++++++++++++ 4 files changed, 63 insertions(+) diff --git a/tests/metrics/test_efficiency.py b/tests/metrics/test_efficiency.py index b004a3c..50d023b 100644 --- a/tests/metrics/test_efficiency.py +++ b/tests/metrics/test_efficiency.py @@ -20,6 +20,19 @@ def test_required_columns(self): with self.assertRaises(ValueError): application_efficiency(df) + def test_required_column_types(self): + """Check that application_efficiency() validates column types.""" + data = { + "problem": ["problem"], + "platform": ["platform"], + "application": ["application"], + "fom": ["non-numeric"], + } + df = pd.DataFrame(data) + + with self.assertRaises(TypeError): + application_efficiency(df) + def test_foms(self): """p3analysis.data.efficiency.foms""" data = {"problem": [], "platform": [], "application": [], "fom": []} diff --git a/tests/metrics/test_pp.py b/tests/metrics/test_pp.py index 40bf88f..1733689 100644 --- a/tests/metrics/test_pp.py +++ b/tests/metrics/test_pp.py @@ -20,6 +20,19 @@ def test_required_columns(self): with self.assertRaises(ValueError): pp(df) + def test_required_column_types(self): + """Check that pp() validates column types.""" + data = { + "problem": ["problem"], + "platform": ["platform"], + "application": ["application"], + "arch eff": ["non-numeric"], + } + df = pd.DataFrame(data) + + with self.assertRaises(TypeError): + pp(df) + def test_effs(self): """p3analysis.data.pp.effs""" data = { diff --git a/tests/plot/test_cascade.py b/tests/plot/test_cascade.py index ef41ff7..34d6729 100644 --- a/tests/plot/test_cascade.py +++ b/tests/plot/test_cascade.py @@ -31,6 +31,19 @@ def test_required_columns(self): with self.assertRaises(ValueError): cascade(df, eff="invalid") + def test_required_column_types(self): + """Check that cascade() validates column types.""" + data = { + "problem": ["problem"], + "platform": ["platform"], + "application": ["application"], + "app eff": ["non-numeric"], + } + df = pd.DataFrame(data) + + with self.assertRaises(TypeError): + cascade(df, eff="app") + def test_options(self): """p3analysis.plot.cascade.options""" diff --git a/tests/plot/test_navchart.py b/tests/plot/test_navchart.py index 95f0d92..6ea1ad4 100644 --- a/tests/plot/test_navchart.py +++ b/tests/plot/test_navchart.py @@ -47,6 +47,30 @@ def test_required_columns(self): }) navchart(pp, cd) + def test_required_column_types(self): + """Check that navchart() validates column types.""" + pp = pd.DataFrame({ + "problem": ["problem"], + "platform": ["platform"], + "application": ["application"], + "app pp": [1.0], + }) + cd = pd.DataFrame({ + "problem": ["problem"], + "application": ["application"], + "divergence": [0.0], + }) + + with self.assertRaises(TypeError): + tmp = pp + tmp["app pp"] = "non-numeric" + navchart(tmp, cd) + + with self.assertRaises(TypeError): + tmp = cd + tmp["divergence"] = "non-numeric" + navchart(pp, tmp) + def test_options(self): """p3analysis.plot.navchart.options""" From e66cc7b8c25a9d1014ce30e738fc8daf33a1adae Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Fri, 13 Sep 2024 17:46:55 +0100 Subject: [PATCH 2/4] Add regression test for efficiency with string FOM Signed-off-by: John Pennycook --- tests/metrics/test_efficiency.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/metrics/test_efficiency.py b/tests/metrics/test_efficiency.py index 50d023b..a6948cb 100644 --- a/tests/metrics/test_efficiency.py +++ b/tests/metrics/test_efficiency.py @@ -121,6 +121,28 @@ def test_efficiency(self): with self.assertRaises(TypeError): application_efficiency(df, foms="higher") + def test_non_numeric(self): + """Check that non-numeric data is correctly cast to numeric.""" + # This is the same correctness test as above, but values are strings. + data = { + "problem": ["test"] * 10, + "platform": ["A", "B", "C", "D", "E"] * 2, + "application": ["latest"] * 5 + ["best"] * 5, + "fom": ["4", "8", "4", 0, "20"] + ["4", "10", "8", "20", "100"], + } + df = pd.DataFrame(data) + + result = application_efficiency(df, foms="higher") + + eff_data = { + "app eff": [1.0, 0.8, 0.5, 0.0, 0.2] + [1.0, 1.0, 1.0, 1.0, 1.0], + } + expected_data = data.copy() + expected_data.update(eff_data) + expected_df = pd.DataFrame(expected_data) + + pd.testing.assert_frame_equal(result, expected_df) + if __name__ == "__main__": unittest.main() From 02bd22976eac6c168681cdd563f3268ee9635898 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Fri, 13 Sep 2024 17:55:31 +0100 Subject: [PATCH 3/4] Replace _require_numeric with _cast_to_numeric Whereas _require_numeric simply checked that the values in a DataFrame could be interpreted as numbers, _cast_to_numeric actually performs the cast and returns the result. Signed-off-by: John Pennycook --- p3analysis/_utils.py | 9 +++++---- p3analysis/metrics/_efficiency.py | 4 ++-- p3analysis/metrics/_pp.py | 4 ++-- p3analysis/plot/_cascade.py | 4 ++-- p3analysis/plot/_navchart.py | 4 ++-- p3analysis/plot/backend/matplotlib.py | 4 ++-- p3analysis/plot/backend/pgfplots.py | 4 ++-- tests/metrics/test_efficiency.py | 2 ++ 8 files changed, 19 insertions(+), 16 deletions(-) diff --git a/p3analysis/_utils.py b/p3analysis/_utils.py index 87b70f9..a062a74 100644 --- a/p3analysis/_utils.py +++ b/p3analysis/_utils.py @@ -19,14 +19,15 @@ def _require_columns(df, columns): raise ValueError(msg % (column, str(columns))) -def _require_numeric(df, columns): +def _cast_to_numeric(df, columns): """ - Check that the named columns are numeric. + Check that the named columns are numeric, and cast them. """ - + result = df.copy(deep=True) for column in columns: try: - pd.to_numeric(df[column]) + result[column] = pd.to_numeric(df[column]) except Exception: msg = "Column '%s' must contain only numeric values." raise TypeError(msg % (column)) + return result diff --git a/p3analysis/metrics/_efficiency.py b/p3analysis/metrics/_efficiency.py index ed86198..a0ff097 100644 --- a/p3analysis/metrics/_efficiency.py +++ b/p3analysis/metrics/_efficiency.py @@ -3,7 +3,7 @@ import numpy -from p3analysis._utils import _require_columns, _require_numeric +from p3analysis._utils import _cast_to_numeric, _require_columns def application_efficiency(df, foms="lower"): @@ -45,7 +45,7 @@ def application_efficiency(df, foms="lower"): """ required_columns = ["problem", "platform", "application", "fom"] _require_columns(df, required_columns) - _require_numeric(df, ["fom"]) + df = _cast_to_numeric(df, ["fom"]) if foms not in ["lower", "higher"]: raise ValueError("FOM interpretation must be 'lower' or 'higher'") diff --git a/p3analysis/metrics/_pp.py b/p3analysis/metrics/_pp.py index ea4ed9a..a318761 100644 --- a/p3analysis/metrics/_pp.py +++ b/p3analysis/metrics/_pp.py @@ -6,7 +6,7 @@ import pandas as pd -from p3analysis._utils import _require_columns, _require_numeric +from p3analysis._utils import _cast_to_numeric, _require_columns def _hmean(series): @@ -79,7 +79,7 @@ def pp(df): if len(efficiencies) == 0: msg = "DataFrame must contain a column named 'arch eff' or 'app eff'." raise ValueError(msg) - _require_numeric(df, efficiencies) + df = _cast_to_numeric(df, efficiencies) # Check that efficiencies are not given in percentages for eff in efficiencies: diff --git a/p3analysis/plot/_cascade.py b/p3analysis/plot/_cascade.py index a81630b..d1de20a 100644 --- a/p3analysis/plot/_cascade.py +++ b/p3analysis/plot/_cascade.py @@ -8,7 +8,7 @@ import copy -from p3analysis._utils import _require_columns, _require_numeric +from p3analysis._utils import _cast_to_numeric, _require_columns def cascade( @@ -115,7 +115,7 @@ def cascade( if eff_column not in df: msg = "DataFrame does not contain an '%s' column." raise ValueError(msg % (eff_column)) - _require_numeric(df, [eff_column]) + df = _cast_to_numeric(df, [eff_column]) # Check there is only one entry per (application, platform) pair. grouped = df.groupby(["platform", "application"]) diff --git a/p3analysis/plot/_navchart.py b/p3analysis/plot/_navchart.py index 1c41773..93c7bfd 100644 --- a/p3analysis/plot/_navchart.py +++ b/p3analysis/plot/_navchart.py @@ -3,7 +3,7 @@ import copy -from p3analysis._utils import _require_columns, _require_numeric +from p3analysis._utils import _cast_to_numeric, _require_columns def navchart( @@ -86,7 +86,7 @@ def navchart( _require_columns(pp, ["problem", "application"]) _require_columns(cd, ["problem", "application", "divergence"]) - _require_numeric(cd, ["divergence"]) + cd = _cast_to_numeric(cd, ["divergence"]) if len(cd["problem"].unique()) > 1: raise NotImplementedError( diff --git a/p3analysis/plot/backend/matplotlib.py b/p3analysis/plot/backend/matplotlib.py index c09d005..fce4056 100644 --- a/p3analysis/plot/backend/matplotlib.py +++ b/p3analysis/plot/backend/matplotlib.py @@ -15,7 +15,7 @@ from matplotlib.path import Path import p3analysis.metrics -from p3analysis._utils import _require_numeric +from p3analysis._utils import _cast_to_numeric from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle from p3analysis.plot.backend import CascadePlot, NavChart @@ -471,7 +471,7 @@ def __init__( if pp_column not in pp: msg = "DataFrame does not contain an '%s' column." raise ValueError(msg % (pp_column)) - _require_numeric(pp, [pp_column]) + pp = _cast_to_numeric(pp, [pp_column]) # If the size is unset, default to 5 x 5 if not size: diff --git a/p3analysis/plot/backend/pgfplots.py b/p3analysis/plot/backend/pgfplots.py index 030563a..d2ba1e8 100644 --- a/p3analysis/plot/backend/pgfplots.py +++ b/p3analysis/plot/backend/pgfplots.py @@ -14,7 +14,7 @@ import pandas as pd import p3analysis.metrics -from p3analysis._utils import _require_numeric +from p3analysis._utils import _cast_to_numeric from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle from p3analysis.plot.backend import CascadePlot, NavChart @@ -293,7 +293,7 @@ def __init__( if pp_column not in pp: msg = "DataFrame does not contain an '%s' column." raise ValueError(msg % (pp_column)) - _require_numeric(pp, [pp_column]) + pp = _cast_to_numeric(pp, [pp_column]) # If the size is unset, default to 200pt x 200pt, otherwise set size plotwidth = "200pt" diff --git a/tests/metrics/test_efficiency.py b/tests/metrics/test_efficiency.py index a6948cb..ec3e356 100644 --- a/tests/metrics/test_efficiency.py +++ b/tests/metrics/test_efficiency.py @@ -5,6 +5,7 @@ import pandas as pd +from p3analysis._utils import _cast_to_numeric from p3analysis.metrics import application_efficiency @@ -141,6 +142,7 @@ def test_non_numeric(self): expected_data.update(eff_data) expected_df = pd.DataFrame(expected_data) + expected_df = _cast_to_numeric(expected_df, ["fom", "app eff"]) pd.testing.assert_frame_equal(result, expected_df) From 8ed24540e6c2569b32ea93dfea76d2fc23d6939c Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 19 Sep 2024 10:12:13 +0100 Subject: [PATCH 4/4] Avoid false positives in _cast_to_numeric Previously, _cast_to_numeric would raise a TypeError if a column was missing. We now check that a column exists before trying to cast it. This does result in some redundant checking, as some functions call both _require_columns and _cast_to_numeric. We may want to consider refactoring to combine all of this functionality into a single DataFrame validation step. Signed-off-by: John Pennycook --- p3analysis/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/p3analysis/_utils.py b/p3analysis/_utils.py index a062a74..c656448 100644 --- a/p3analysis/_utils.py +++ b/p3analysis/_utils.py @@ -23,6 +23,7 @@ def _cast_to_numeric(df, columns): """ Check that the named columns are numeric, and cast them. """ + _require_columns(df, columns) result = df.copy(deep=True) for column in columns: try: