Skip to content

Commit

Permalink
Merge pull request #69 from Pennycook/cast-to-numeric
Browse files Browse the repository at this point in the history
Fix validation of numeric types
  • Loading branch information
Pennycook authored Sep 25, 2024
2 parents d615e04 + 8ed2454 commit 2a5210e
Show file tree
Hide file tree
Showing 11 changed files with 105 additions and 16 deletions.
10 changes: 6 additions & 4 deletions p3analysis/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@ def _require_columns(df, columns):
raise ValueError(msg % (column, str(columns)))


def _require_numeric(df, columns):
def _cast_to_numeric(df, columns):
"""
Check that the named columns are numeric.
Check that the named columns are numeric, and cast them.
"""

_require_columns(df, columns)
result = df.copy(deep=True)
for column in columns:
try:
pd.to_numeric(df[column])
result[column] = pd.to_numeric(df[column])
except Exception:
msg = "Column '%s' must contain only numeric values."
raise TypeError(msg % (column))
return result
4 changes: 2 additions & 2 deletions p3analysis/metrics/_efficiency.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import numpy

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def application_efficiency(df, foms="lower"):
Expand Down Expand Up @@ -45,7 +45,7 @@ def application_efficiency(df, foms="lower"):
"""
required_columns = ["problem", "platform", "application", "fom"]
_require_columns(df, required_columns)
_require_numeric(df, ["fom"])
df = _cast_to_numeric(df, ["fom"])

if foms not in ["lower", "higher"]:
raise ValueError("FOM interpretation must be 'lower' or 'higher'")
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/metrics/_pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pandas as pd

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def _hmean(series):
Expand Down Expand Up @@ -79,7 +79,7 @@ def pp(df):
if len(efficiencies) == 0:
msg = "DataFrame must contain a column named 'arch eff' or 'app eff'."
raise ValueError(msg)
_require_numeric(df, efficiencies)
df = _cast_to_numeric(df, efficiencies)

# Check that efficiencies are not given in percentages
for eff in efficiencies:
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/_cascade.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import copy

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def cascade(
Expand Down Expand Up @@ -115,7 +115,7 @@ def cascade(
if eff_column not in df:
msg = "DataFrame does not contain an '%s' column."
raise ValueError(msg % (eff_column))
_require_numeric(df, [eff_column])
df = _cast_to_numeric(df, [eff_column])

# Check there is only one entry per (application, platform) pair.
grouped = df.groupby(["platform", "application"])
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/_navchart.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import copy

from p3analysis._utils import _require_columns, _require_numeric
from p3analysis._utils import _cast_to_numeric, _require_columns


def navchart(
Expand Down Expand Up @@ -86,7 +86,7 @@ def navchart(

_require_columns(pp, ["problem", "application"])
_require_columns(cd, ["problem", "application", "divergence"])
_require_numeric(cd, ["divergence"])
cd = _cast_to_numeric(cd, ["divergence"])

if len(cd["problem"].unique()) > 1:
raise NotImplementedError(
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/backend/matplotlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from matplotlib.path import Path

import p3analysis.metrics
from p3analysis._utils import _require_numeric
from p3analysis._utils import _cast_to_numeric
from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle
from p3analysis.plot.backend import CascadePlot, NavChart

Expand Down Expand Up @@ -471,7 +471,7 @@ def __init__(
if pp_column not in pp:
msg = "DataFrame does not contain an '%s' column."
raise ValueError(msg % (pp_column))
_require_numeric(pp, [pp_column])
pp = _cast_to_numeric(pp, [pp_column])

# If the size is unset, default to 5 x 5
if not size:
Expand Down
4 changes: 2 additions & 2 deletions p3analysis/plot/backend/pgfplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pandas as pd

import p3analysis.metrics
from p3analysis._utils import _require_numeric
from p3analysis._utils import _cast_to_numeric
from p3analysis.plot._common import ApplicationStyle, Legend, PlatformStyle
from p3analysis.plot.backend import CascadePlot, NavChart

Expand Down Expand Up @@ -293,7 +293,7 @@ def __init__(
if pp_column not in pp:
msg = "DataFrame does not contain an '%s' column."
raise ValueError(msg % (pp_column))
_require_numeric(pp, [pp_column])
pp = _cast_to_numeric(pp, [pp_column])

# If the size is unset, default to 200pt x 200pt, otherwise set size
plotwidth = "200pt"
Expand Down
37 changes: 37 additions & 0 deletions tests/metrics/test_efficiency.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd

from p3analysis._utils import _cast_to_numeric
from p3analysis.metrics import application_efficiency


Expand All @@ -20,6 +21,19 @@ def test_required_columns(self):
with self.assertRaises(ValueError):
application_efficiency(df)

def test_required_column_types(self):
"""Check that application_efficiency() validates column types."""
data = {
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"fom": ["non-numeric"],
}
df = pd.DataFrame(data)

with self.assertRaises(TypeError):
application_efficiency(df)

def test_foms(self):
"""p3analysis.data.efficiency.foms"""
data = {"problem": [], "platform": [], "application": [], "fom": []}
Expand Down Expand Up @@ -108,6 +122,29 @@ def test_efficiency(self):
with self.assertRaises(TypeError):
application_efficiency(df, foms="higher")

def test_non_numeric(self):
"""Check that non-numeric data is correctly cast to numeric."""
# This is the same correctness test as above, but values are strings.
data = {
"problem": ["test"] * 10,
"platform": ["A", "B", "C", "D", "E"] * 2,
"application": ["latest"] * 5 + ["best"] * 5,
"fom": ["4", "8", "4", 0, "20"] + ["4", "10", "8", "20", "100"],
}
df = pd.DataFrame(data)

result = application_efficiency(df, foms="higher")

eff_data = {
"app eff": [1.0, 0.8, 0.5, 0.0, 0.2] + [1.0, 1.0, 1.0, 1.0, 1.0],
}
expected_data = data.copy()
expected_data.update(eff_data)
expected_df = pd.DataFrame(expected_data)

expected_df = _cast_to_numeric(expected_df, ["fom", "app eff"])
pd.testing.assert_frame_equal(result, expected_df)


if __name__ == "__main__":
unittest.main()
13 changes: 13 additions & 0 deletions tests/metrics/test_pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ def test_required_columns(self):
with self.assertRaises(ValueError):
pp(df)

def test_required_column_types(self):
"""Check that pp() validates column types."""
data = {
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"arch eff": ["non-numeric"],
}
df = pd.DataFrame(data)

with self.assertRaises(TypeError):
pp(df)

def test_effs(self):
"""p3analysis.data.pp.effs"""
data = {
Expand Down
13 changes: 13 additions & 0 deletions tests/plot/test_cascade.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ def test_required_columns(self):
with self.assertRaises(ValueError):
cascade(df, eff="invalid")

def test_required_column_types(self):
"""Check that cascade() validates column types."""
data = {
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"app eff": ["non-numeric"],
}
df = pd.DataFrame(data)

with self.assertRaises(TypeError):
cascade(df, eff="app")

def test_options(self):
"""p3analysis.plot.cascade.options"""

Expand Down
24 changes: 24 additions & 0 deletions tests/plot/test_navchart.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,30 @@ def test_required_columns(self):
})
navchart(pp, cd)

def test_required_column_types(self):
"""Check that navchart() validates column types."""
pp = pd.DataFrame({
"problem": ["problem"],
"platform": ["platform"],
"application": ["application"],
"app pp": [1.0],
})
cd = pd.DataFrame({
"problem": ["problem"],
"application": ["application"],
"divergence": [0.0],
})

with self.assertRaises(TypeError):
tmp = pp
tmp["app pp"] = "non-numeric"
navchart(tmp, cd)

with self.assertRaises(TypeError):
tmp = cd
tmp["divergence"] = "non-numeric"
navchart(pp, tmp)

def test_options(self):
"""p3analysis.plot.navchart.options"""

Expand Down

0 comments on commit 2a5210e

Please sign in to comment.