Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix validation of numeric types #69

Merged
merged 4 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions p3analysis/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@ def _require_columns(df, columns):
raise ValueError(msg % (column, str(columns)))


def _require_numeric(df, columns):
def _cast_to_numeric(df, columns):
"""
Check that the named columns are numeric.
Check that the named columns are numeric, and cast them.
"""

_require_columns(df, columns)
result = df.copy(deep=True)
for column in columns:
try:
pd.to_numeric(df[column])
result[column] = pd.to_numeric(df[column])
except Exception:
laserkelvin marked this conversation as resolved.
Show resolved Hide resolved
msg = "Column '%s' must contain only numeric values."
raise TypeError(msg % (column))
return result
4 changes: 2 additions & 2 deletions p3analysis/metrics/_efficiency.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import numpy

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def application_efficiency(df, foms="lower"):
Expand Down Expand Up @@ -45,7 +45,7 @@ def application_efficiency(df, foms="lower"):
"""
required_columns = ["problem", "platform", "application", "fom"]
_require_columns(df, required_columns)
_require_numeric(df, ["fom"])
df = _cast_to_numeric(df, ["fom"])

if foms not in ["lower", "higher"]:
raise ValueError("FOM interpretation must be 'lower' or 'higher'")
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/metrics/_pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pandas as pd

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def _hmean(series):
Expand Down Expand Up @@ -79,7 +79,7 @@ def pp(df):
if len(efficiencies) == 0:
msg = "DataFrame must contain a column named 'arch eff' or 'app eff'."
raise ValueError(msg)
_require_numeric(df, efficiencies)
df = _cast_to_numeric(df, efficiencies)

# Check that efficiencies are not given in percentages
for eff in efficiencies:
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/_cascade.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import copy

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def cascade(
Expand Down Expand Up @@ -115,7 +115,7 @@ def cascade(
if eff_column not in df:
msg = "DataFrame does not contain an '%s' column."
raise ValueError(msg % (eff_column))
_require_numeric(df, [eff_column])
df = _cast_to_numeric(df, [eff_column])

# Check there is only one entry per (application, platform) pair.
grouped = df.groupby(["platform", "application"])
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/_navchart.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import copy

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def navchart(
Expand Down Expand Up @@ -86,7 +86,7 @@ def navchart(

_require_columns(pp, ["problem", "application"])
_require_columns(cd, ["problem", "application", "divergence"])
_require_numeric(cd, ["divergence"])
cd = _cast_to_numeric(cd, ["divergence"])

if len(cd["problem"].unique()) > 1:
raise NotImplementedError(
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/backend/matplotlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from matplotlib.path import Path

import p3analysis.metrics
from p3analysis._utils import _require_numeric
from p3analysis._utils import _cast_to_numeric
from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle
from p3analysis.plot.backend import CascadePlot, NavChart

Expand Down Expand Up @@ -471,7 +471,7 @@ def __init__(
if pp_column not in pp:
msg = "DataFrame does not contain an '%s' column."
raise ValueError(msg % (pp_column))
_require_numeric(pp, [pp_column])
pp = _cast_to_numeric(pp, [pp_column])

# If the size is unset, default to 5 x 5
if not size:
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/backend/pgfplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pandas as pd

import p3analysis.metrics
from p3analysis._utils import _require_numeric
from p3analysis._utils import _cast_to_numeric
from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle
from p3analysis.plot.backend import CascadePlot, NavChart

Expand Down Expand Up @@ -293,7 +293,7 @@ def __init__(
if pp_column not in pp:
msg = "DataFrame does not contain an '%s' column."
raise ValueError(msg % (pp_column))
_require_numeric(pp, [pp_column])
pp = _cast_to_numeric(pp, [pp_column])

# If the size is unset, default to 200pt x 200pt, otherwise set size
plotwidth = "200pt"
Expand Down
37 changes: 37 additions & 0 deletions tests/metrics/test_efficiency.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd

from p3analysis._utils import _cast_to_numeric
from p3analysis.metrics import application_efficiency


Expand All @@ -20,6 +21,19 @@ def test_required_columns(self):
with self.assertRaises(ValueError):
application_efficiency(df)

def test_required_column_types(self):
"""Check that application_efficiency() validates column types."""
data = {
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"fom": ["non-numeric"],
}
df = pd.DataFrame(data)

with self.assertRaises(TypeError):
application_efficiency(df)

def test_foms(self):
"""p3analysis.data.efficiency.foms"""
data = {"problem": [], "platform": [], "application": [], "fom": []}
Expand Down Expand Up @@ -108,6 +122,29 @@ def test_efficiency(self):
with self.assertRaises(TypeError):
application_efficiency(df, foms="higher")

def test_non_numeric(self):
"""Check that non-numeric data is correctly cast to numeric."""
# This is the same correctness test as above, but values are strings.
data = {
"problem": ["test"] * 10,
"platform": ["A", "B", "C", "D", "E"] * 2,
"application": ["latest"] * 5 + ["best"] * 5,
"fom": ["4", "8", "4", 0, "20"] + ["4", "10", "8", "20", "100"],
}
df = pd.DataFrame(data)

result = application_efficiency(df, foms="higher")

eff_data = {
"app eff": [1.0, 0.8, 0.5, 0.0, 0.2] + [1.0, 1.0, 1.0, 1.0, 1.0],
}
expected_data = data.copy()
expected_data.update(eff_data)
expected_df = pd.DataFrame(expected_data)

expected_df = _cast_to_numeric(expected_df, ["fom", "app eff"])
pd.testing.assert_frame_equal(result, expected_df)


if __name__ == "__main__":
unittest.main()
13 changes: 13 additions & 0 deletions tests/metrics/test_pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ def test_required_columns(self):
with self.assertRaises(ValueError):
pp(df)

def test_required_column_types(self):
"""Check that pp() validates column types."""
data = {
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"arch eff": ["non-numeric"],
}
df = pd.DataFrame(data)

with self.assertRaises(TypeError):
pp(df)

def test_effs(self):
"""p3analysis.data.pp.effs"""
data = {
Expand Down
13 changes: 13 additions & 0 deletions tests/plot/test_cascade.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ def test_required_columns(self):
with self.assertRaises(ValueError):
cascade(df, eff="invalid")

def test_required_column_types(self):
"""Check that cascade() validates column types."""
data = {
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"app eff": ["non-numeric"],
}
df = pd.DataFrame(data)

with self.assertRaises(TypeError):
cascade(df, eff="app")

def test_options(self):
"""p3analysis.plot.cascade.options"""

Expand Down
24 changes: 24 additions & 0 deletions tests/plot/test_navchart.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,30 @@ def test_required_columns(self):
})
navchart(pp, cd)

def test_required_column_types(self):
"""Check that navchart() validates column types."""
pp = pd.DataFrame({
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"app pp": [1.0],
})
cd = pd.DataFrame({
"problem": ["problem"],
"application": ["application"],
"divergence": [0.0],
})

with self.assertRaises(TypeError):
tmp = pp
tmp["app pp"] = "non-numeric"
navchart(tmp, cd)

with self.assertRaises(TypeError):
tmp = cd
tmp["divergence"] = "non-numeric"
navchart(pp, tmp)

def test_options(self):
"""p3analysis.plot.navchart.options"""

Expand Down
Loading