Column as Model: Add tests and some fixes
janezd committed Jul 16, 2024
1 parent 73f6125 commit 62d47f2
Showing 6 changed files with 473 additions and 53 deletions.
1 change: 1 addition & 0 deletions Orange/classification/__init__.py
@@ -20,6 +20,7 @@
from .sgd import *
from .neural_network import *
from .calibration import *
from .column import *
try:
from .catgb import *
except ModuleNotFoundError:
42 changes: 26 additions & 16 deletions Orange/classification/column.py
@@ -2,11 +2,11 @@

import numpy as np

from Orange.data import Variable, DiscreteVariable, Domain
from Orange.data import Variable, DiscreteVariable, Domain, Table
from Orange.classification import Model, Learner


__all__ = ["ColumnClassifier"]
__all__ = ["ColumnLearner", "ColumnClassifier"]


class ColumnLearner(Learner):
@@ -36,16 +36,21 @@ def __init__(self,
super().__init__(Domain([column], class_var))
assert class_var.is_discrete
if column.is_continuous:
assert len(class_var.values) == 2
self.value_mapping = np.array([0, 1])
if len(class_var.values) != 2:
raise ValueError("Numeric column can only be used with "
"binary class variable")
self.value_mapping = None
else:
assert column.is_discrete
assert isinstance(column, DiscreteVariable)
assert offset is None and k is None
if not self.check_value_sets(class_var, column):
raise ValueError(
"Column contains values that are not in class variable")
self.value_mapping = np.array(
[class_var.to_val(x) for x in column.values])
if class_var.values[:len(column.values)] == column.values:
self.value_mapping = None
else:
self.value_mapping = np.array(
[class_var.to_val(x) for x in column.values])
self.class_var = class_var
self.column = column
self.offset = offset
@@ -61,27 +66,32 @@ def check_value_sets(class_var: DiscreteVariable,
column_var: DiscreteVariable):
return set(column_var.values) <= set(class_var.values)

def predict_storage(self, data):
def predict_storage(self, data: Table):
vals = data.get_column(self.column)
rows = np.isfinite(vals)
nclasses = len(self.class_var.values)
proba = np.full((len(data), nclasses), 1 / nclasses)
if self.column.is_discrete:
proba = np.zeros((len(data), len(self.class_var.values)))
vals = self.value_mapping[vals[rows].astype(int)]
proba[rows, vals] = 1
mapped = vals[rows].astype(int)
if self.value_mapping is not None:
mapped = self.value_mapping[mapped]
vals = vals.copy()
vals[rows] = mapped
proba[rows] = 0
proba[rows, mapped] = 1
else:
proba = np.full((len(data), len(self.class_var.values)), 0.5)
if self.k is None:
if not self.check_prob_range(vals):
raise ValueError("Column values must be in [0, 1] range "
"unless logistic function is applied")
proba[rows, 1] = vals[rows]
proba[rows, 0] = 1 - vals[rows]
vals = vals > 0.5
else:
proba[rows, 1] = (
1 / (1 + np.exp(-self.k * (vals[rows] - self.offset))))
proba[rows, 0] = 1 - proba[:, 1]
vals = vals > self.offset

proba[rows, 0] = 1 - proba[rows, 1]
vals = (proba[:, 1] > 0.5).astype(float)
vals[~rows] = np.nan
return vals, proba

def __str__(self):
165 changes: 165 additions & 0 deletions Orange/classification/tests/test_column.py
@@ -0,0 +1,165 @@
import unittest
from unittest.mock import patch

import numpy as np

from Orange.classification import ColumnLearner, ColumnClassifier
from Orange.data import DiscreteVariable, ContinuousVariable, Domain, Table


class ColumnTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.domain = Domain([DiscreteVariable("d1", values=["a", "b"]),
DiscreteVariable("d2", values=["c", "d"]),
DiscreteVariable("d3", values=["d", "c"]),
ContinuousVariable("c1"),
ContinuousVariable("c2")
],
DiscreteVariable("cls", values=["c", "d"]),
[DiscreteVariable("m1", values=["a", "b"]),
DiscreteVariable("m2", values=["d"]),
ContinuousVariable("c3")]
)
cls.data = Table.from_numpy(
cls.domain,
np.array([[0, 0, 0, 1, 0.5],
[0, 1, 1, 0.25, -3],
[1, 0, np.nan, np.nan, np.nan]]),
np.array([0, 1, 1]),
np.array([[0, 0, 2],
[1, 0, 8],
[np.nan, np.nan, 5]])
)

@patch("Orange.classification.column.ColumnClassifier")
def test_fit_storage(self, clsfr):
learner = ColumnLearner(self.domain.class_var, self.domain["d2"])
self.assertEqual(learner.name, "column 'd2'")
learner.fit_storage(self.data)
clsfr.assert_called_with(self.domain.class_var, self.domain["d2"], None, None)

learner = ColumnLearner(self.domain.class_var, self.domain["c3"])
learner.fit_storage(self.data)
clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], None, None)

learner = ColumnLearner(self.domain.class_var, self.domain["c3"], 42, 3.5)
self.assertEqual(learner.name, "column 'c3'")
learner.fit_storage(self.data)
clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], 42, 3.5)

def test_classifier_init_checks(self):
cls = ColumnClassifier(self.domain.class_var, self.domain["d2"])
cls.name = "column 'd2'"

cls = ColumnClassifier(self.domain.class_var, self.domain["d3"])
cls.name = "column 'd3'"

cls = ColumnClassifier(self.domain.class_var, self.domain["c3"])
cls.name = "column 'c3'"

self.assertRaises(
ValueError,
ColumnClassifier,
self.domain.class_var, self.domain["d1"])

self.assertRaises(
ValueError,
ColumnClassifier,
DiscreteVariable("x", values=("a", "b", "c")), self.domain["c3"])

def test_check_prob_range(self):
self.assertTrue(
ColumnClassifier.check_prob_range(np.array([0, 0.5, 1]))
)
self.assertTrue(
ColumnClassifier.check_prob_range(np.array([0, 0.5, np.nan]))
)
self.assertFalse(
ColumnClassifier.check_prob_range(np.array([0, 0.5, 1.5]))
)
self.assertFalse(
ColumnClassifier.check_prob_range(np.array([0, 0.5, -1]))
)

def test_check_value_sets(self):
d1, d2, d3, *_ = self.domain.attributes
c = self.domain.class_var
m2: DiscreteVariable = self.domain["m2"]
self.assertFalse(ColumnClassifier.check_value_sets(c, d1))
self.assertTrue(ColumnClassifier.check_value_sets(c, d2))
self.assertTrue(ColumnClassifier.check_value_sets(c, d3))
self.assertTrue(ColumnClassifier.check_value_sets(c, m2))
self.assertFalse(ColumnClassifier.check_value_sets(m2, c))

def test_predict_discrete(self):
# Just copy
model = ColumnClassifier(self.domain.class_var, self.domain["d2"])
self.assertEqual(model.name, "column 'd2'")
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [0, 1, 0])
np.testing.assert_equal(probs, [[1, 0], [0, 1], [1, 0]])

# Values are not in the same order -> map
model = ColumnClassifier(self.domain.class_var, self.domain["d3"])
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 0, np.nan])
np.testing.assert_equal(probs, [[0, 1], [1, 0], [0.5, 0.5]])

# Not in the same order, and one is missing -> map
model = ColumnClassifier(self.domain.class_var, self.domain["m2"])
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 1, np.nan])
np.testing.assert_equal(probs, [[0, 1], [0, 1], [0.5, 0.5]])

# Non-binary class
domain = Domain(
self.domain.attributes,
DiscreteVariable("cls", values=["a", "c", "b", "d", "e"]))
data = Table.from_numpy(domain, self.data.X, self.data.Y)
model = ColumnClassifier(domain.class_var, domain["d3"])
classes, probs = model(data, model.ValueProbs)
np.testing.assert_equal(classes, [3, 1, np.nan])
np.testing.assert_almost_equal(
probs,
np.array([[0, 0, 0, 1, 0],
[0, 1, 0, 0, 0],
[0.2, 0.2, 0.2, 0.2, 0.2]]))

def test_predict_as_direct_probs(self):
model = ColumnClassifier(self.domain.class_var, self.domain["c1"])
self.assertEqual(model.name, "column 'c1'")
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 0, np.nan])
np.testing.assert_equal(probs, [[0, 1], [0.75, 0.25], [0.5, 0.5]])

model = ColumnClassifier(self.domain.class_var, self.domain["c2"])
self.assertRaises(ValueError, model, self.data)

model = ColumnClassifier(self.domain.class_var, self.domain["c3"])
self.assertRaises(ValueError, model, self.data)

def test_predict_with_logistic(self):
model = ColumnClassifier(
self.domain.class_var, self.domain["c1"], 0.5, 3)
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 0, np.nan])
np.testing.assert_almost_equal(
probs[:, 1], [1 / (1 + np.exp(-3 * (1 - 0.5))),
1 / (1 + np.exp(-3 * (0.25 - 0.5))),
0.5])
np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1])

model = ColumnClassifier(
self.domain.class_var, self.domain["c2"], 0.5, 3)
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [0, 0, np.nan])
np.testing.assert_almost_equal(
probs[:, 1], [1 / (1 + np.exp(-3 * (0.5 - 0.5))),
1 / (1 + np.exp(-3 * (-3 - 0.5))),
0.5])
np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1])


if __name__ == "__main__":
unittest.main()
29 changes: 17 additions & 12 deletions Orange/tests/test_classification.py
@@ -22,6 +22,7 @@
SVMLearner, LinearSVMLearner, OneClassSVMLearner, TreeLearner, KNNLearner,
SimpleRandomForestLearner, EllipticEnvelopeLearner, ThresholdLearner,
CalibratedLearner)
from Orange.classification.column import ColumnLearner
from Orange.classification.rules import _RuleLearner
from Orange.data import (ContinuousVariable, DiscreteVariable,
Domain, Table)
@@ -30,6 +31,10 @@
from Orange.tests.dummy_learners import DummyLearner, DummyMulticlassLearner
from Orange.tests import test_filename

# While this could be determined automatically from __init__ signatures,
# it is better to do it explicitly
LEARNERS_WITH_ARGUMENTS = (ThresholdLearner, CalibratedLearner, ColumnLearner)


def all_learners():
classification_modules = pkgutil.walk_packages(
@@ -214,8 +219,7 @@ def test_result_shape(self):
"""
iris = Table('iris')
for learner in all_learners():
# calibration, threshold learners' __init__ requires arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue

with self.subTest(learner.__name__):
@@ -256,6 +260,8 @@ def test_result_shape_numpy(self):
args = []
if learner in (ThresholdLearner, CalibratedLearner):
args = [LogisticRegressionLearner()]
elif learner in LEARNERS_WITH_ARGUMENTS:
continue
data = iris_bin if learner is ThresholdLearner else iris
model = learner(*args)(data)
transformed_iris = model.data_to_model_domain(data)
@@ -277,6 +283,10 @@ def test_predict_proba(self):
continue
if learner in (ThresholdLearner, CalibratedLearner):
model = learner(LogisticRegressionLearner())(data)
elif learner in LEARNERS_WITH_ARGUMENTS:
# note that the above two also require arguments, but we
# provide them
continue
else:
model = learner()(data)
probs = model.predict_proba(data)
@@ -385,8 +395,7 @@ def test_unknown(self):
def test_missing_class(self):
table = Table(test_filename("datasets/adult_sample_missing"))
for learner in all_learners():
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
# Skip slow tests
if isinstance(learner, _RuleLearner):
@@ -414,8 +423,7 @@ def test_all_learners_accessible_in_Orange_classification_namespace(self):
def test_all_models_work_after_unpickling(self):
datasets = [Table('iris'), Table('titanic')]
for learner in list(all_learners()):
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
# Skip slow tests
if issubclass(learner, _RuleLearner):
@@ -438,8 +446,7 @@ def test_all_models_work_after_unpickling(self):
def test_all_models_work_after_unpickling_pca(self):
datasets = [Table('iris'), Table('titanic')]
for learner in list(all_learners()):
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
# Skip slow tests
if issubclass(learner, _RuleLearner):
@@ -462,8 +469,7 @@ def test_all_models_work_after_unpickling_pca(self):

def test_adequacy_all_learners(self):
for learner in all_learners():
# calibration, threshold learners' __init__ requires arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
with self.subTest(learner.__name__):
learner = learner()
@@ -472,8 +478,7 @@ def test_adequacy_all_learners(self):

def test_adequacy_all_learners_multiclass(self):
for learner in all_learners():
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
with self.subTest(learner.__name__):
learner = learner()