From e645b771a77ba0cda1c48e1ac1658a3d7b44cabe Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 16:02:54 +0200 Subject: [PATCH 01/13] Move conversion preprocessor tests into test_preprocess --- .../spectroscopy/tests/test_conversion.py | 65 ++---------------- .../spectroscopy/tests/test_preprocess.py | 67 +++++++++++++++++++ 2 files changed, 71 insertions(+), 61 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_conversion.py b/orangecontrib/spectroscopy/tests/test_conversion.py index 7f1f20618..c445c2323 100644 --- a/orangecontrib/spectroscopy/tests/test_conversion.py +++ b/orangecontrib/spectroscopy/tests/test_conversion.py @@ -10,19 +10,17 @@ from Orange.evaluation.scoring import AUC from Orange.data.table import DomainTransformationError -from orangecontrib.spectroscopy.tests.test_preprocess import \ - PREPROCESSORS_INDEPENDENT_SAMPLES, \ - PREPROCESSORS - -from orangecontrib.spectroscopy.tests.test_preprocess import SMALL_COLLAGEN, preprocessor_data - from orangecontrib.spectroscopy.preprocess import Interpolate, \ Cut, SavitzkyGolayFiltering from orangecontrib.spectroscopy.data import getx +from orangecontrib.spectroscopy.tests.util import smaller_data logreg = LogisticRegressionLearner(max_iter=1000) +COLLAGEN = Orange.data.Table("collagen") +SMALL_COLLAGEN = smaller_data(COLLAGEN, 2, 2) + def separate_learn_test(data): sf = ms.ShuffleSplit(n_splits=1, test_size=0.2, random_state=np.random.RandomState(0)) @@ -80,20 +78,6 @@ def test_predict_different_domain_interpolation(self): # the more we cut the lower precision we get self.assertTrue(aucorig > auccut1 > auccut2 > auccut3) - def test_whole_and_train_separete(self): - """ Applying a preprocessor before spliting data into train and test - and applying is just on train data should yield the same transformation of - the test data. """ - for proc in PREPROCESSORS_INDEPENDENT_SAMPLES: - with self.subTest(proc): - data = preprocessor_data(proc) - _, test1 = separate_learn_test(proc(data)) - train, test = separate_learn_test(data) - train = proc(train) - test_transformed = test.transform(train.domain) - np.testing.assert_almost_equal(test_transformed.X, test1.X, - err_msg="Preprocessor " + str(proc)) - def test_predict_savgov_same_domain(self): data = SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2)(self.collagen) train, test = separate_learn_test(data) @@ -107,44 +91,3 @@ def test_predict_savgol_another_interpolate(self): train = Interpolate(points=getx(train))(train) aucai = AUC(TestOnTestData()(train, test, [logreg])) self.assertAlmostEqual(auc, aucai, delta=0.02) - - def test_slightly_different_domain(self): - """ If test data has a slightly different domain then (with interpolation) - we should obtain a similar classification score. """ - # rows full of unknowns make LogisticRegression undefined - # we can obtain them, for example, with EMSC, if one of the badspectra - # is a spectrum from the data - learner = LogisticRegressionLearner(max_iter=1000, preprocessors=[_RemoveNaNRows()]) - - for proc in PREPROCESSORS: - if hasattr(proc, "skip_add_zeros"): - continue - with self.subTest(proc): - # LR that can not handle unknown values - train, test = separate_learn_test(preprocessor_data(proc)) - train1 = proc(train) - aucorig = AUC(TestOnTestData()(train1, test, [learner])) - test = slightly_change_wavenumbers(test, 0.00001) - test = odd_attr(test) - # a subset of points for training so that all test sets points - # are within the train set points, which gives no unknowns - train = Interpolate(points=getx(train)[1:-3])(train) # interpolatable train - train = proc(train) - # explicit domain conversion test to catch exceptions that would - # otherwise be silently handled in TestOnTestData - _ = test.transform(train.domain) - aucnow = AUC(TestOnTestData()(train, test, [learner])) - self.assertAlmostEqual(aucnow, aucorig, delta=0.03, msg="Preprocessor " + str(proc)) - test = Interpolate(points=getx(test) - 1.)(test) # also do a shift - _ = test.transform(train.domain) # explicit call again - aucnow = AUC(TestOnTestData()(train, test, [learner])) - # the difference should be slight - self.assertAlmostEqual(aucnow, aucorig, delta=0.05, msg="Preprocessor " + str(proc)) - - -class _RemoveNaNRows(Orange.preprocess.preprocess.Preprocess): - - def __call__(self, data): - mask = np.isnan(data.X) - mask = np.any(mask, axis=1) - return data[~mask] diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index 98e13466e..8127d10ce 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -4,7 +4,9 @@ import numpy as np import Orange +from Orange.classification import LogisticRegressionLearner from Orange.data import Table +from Orange.evaluation import TestOnTestData, AUC from Orange.preprocess.preprocess import PreprocessorList from orangecontrib.spectroscopy.data import getx @@ -18,6 +20,7 @@ from orangecontrib.spectroscopy.preprocess.me_emsc import ME_EMSC from orangecontrib.spectroscopy.preprocess.atm_corr import AtmCorr from orangecontrib.spectroscopy.preprocess.utils import replacex +from orangecontrib.spectroscopy.tests.test_conversion import separate_learn_test, slightly_change_wavenumbers, odd_attr from orangecontrib.spectroscopy.tests.util import smaller_data @@ -485,6 +488,70 @@ def test_reference_exceptions(self): NormalizeReference(reference=Table.from_numpy(None, [[2], [6]])) +class TestConversion(unittest.TestCase): + + preprocessors = PREPROCESSORS + + def test_slightly_different_domain(self): + """ If test data has a slightly different domain then (with interpolation) + we should obtain a similar classification score. """ + # rows full of unknowns make LogisticRegression undefined + # we can obtain them, for example, with EMSC, if one of the badspectra + # is a spectrum from the data + learner = LogisticRegressionLearner(max_iter=1000, preprocessors=[_RemoveNaNRows()]) + + for proc in self.preprocessors: + if hasattr(proc, "skip_add_zeros"): + continue + with self.subTest(proc): + # LR that can not handle unknown values + train, test = separate_learn_test(preprocessor_data(proc)) + train1 = proc(train) + aucorig = AUC(TestOnTestData()(train1, test, [learner])) + test = slightly_change_wavenumbers(test, 0.00001) + test = odd_attr(test) + # a subset of points for training so that all test sets points + # are within the train set points, which gives no unknowns + train = Interpolate(points=getx(train)[1:-3])(train) # interpolatable train + train = proc(train) + # explicit domain conversion test to catch exceptions that would + # otherwise be silently handled in TestOnTestData + _ = test.transform(train.domain) + aucnow = AUC(TestOnTestData()(train, test, [learner])) + self.assertAlmostEqual(aucnow, aucorig, delta=0.03, msg="Preprocessor " + str(proc)) + test = Interpolate(points=getx(test) - 1.)(test) # also do a shift + _ = test.transform(train.domain) # explicit call again + aucnow = AUC(TestOnTestData()(train, test, [learner])) + # the difference should be slight + self.assertAlmostEqual(aucnow, aucorig, delta=0.05, msg="Preprocessor " + str(proc)) + + +class TestConversionIndpSamples(TestConversion, unittest.TestCase): + + preprocessors = PREPROCESSORS_INDEPENDENT_SAMPLES + + def test_whole_and_train_separate(self): + """ Applying a preprocessor before spliting data into train and test + and applying is just on train data should yield the same transformation of + the test data. """ + for proc in self.preprocessors: + with self.subTest(proc): + data = preprocessor_data(proc) + _, test1 = separate_learn_test(proc(data)) + train, test = separate_learn_test(data) + train = proc(train) + test_transformed = test.transform(train.domain) + np.testing.assert_almost_equal(test_transformed.X, test1.X, + err_msg="Preprocessor " + str(proc)) + +class _RemoveNaNRows(Orange.preprocess.preprocess.Preprocess): + + def __call__(self, data): + mask = np.isnan(data.X) + mask = np.any(mask, axis=1) + return data[~mask] + + class TestCommon(unittest.TestCase): def test_no_samples(self): From 62c94af62d59f46c202803462e81f3c8b0a16cfd Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 16:15:43 +0200 Subject: [PATCH 02/13] TestCommon -> TestStrangeData --- .../spectroscopy/tests/test_preprocess.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index 8127d10ce..660fd4913 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -550,14 +550,16 @@ def __call__(self, data): mask = np.isnan(data.X) mask = np.any(mask, axis=1) return data[~mask] - -class TestCommon(unittest.TestCase): + +class TestStrangeData(unittest.TestCase): + + preprocessors = PREPROCESSORS def test_no_samples(self): """ Preprocessors should not crash when there are no input samples. """ data = SMALL_COLLAGEN[:0] - for proc in PREPROCESSORS: + for proc in self.preprocessors: with self.subTest(proc): _ = proc(data) @@ -567,13 +569,13 @@ def test_no_attributes(self): data = data.transform(Orange.data.Domain([], class_vars=data.domain.class_vars, metas=data.domain.metas)) - for proc in PREPROCESSORS: + for proc in self.preprocessors: with self.subTest(proc): _ = proc(data) def test_all_nans(self): """ Preprocessors should not crash when there are all-nan samples. """ - for proc in PREPROCESSORS: + for proc in self.preprocessors: with self.subTest(proc): data = preprocessor_data(proc).copy() with data.unlocked(): @@ -584,7 +586,7 @@ def test_all_nans(self): continue # allow explicit preprocessor exception def test_unordered_features(self): - for proc in PREPROCESSORS: + for proc in self.preprocessors: with self.subTest(proc): data = preprocessor_data(proc) data_reversed = reverse_attr(data) @@ -599,7 +601,7 @@ def test_unordered_features(self): np.testing.assert_almost_equal(X, X_shuffle, err_msg="Preprocessor " + str(proc)) def test_unknown_no_propagate(self): - for proc in PREPROCESSORS: + for proc in self.preprocessors: with self.subTest(proc): data = preprocessor_data(proc).copy() # one unknown in line @@ -615,7 +617,7 @@ def test_unknown_no_propagate(self): def test_no_infs(self): """ Preprocessors should not return (-)inf """ - for proc in PREPROCESSORS: + for proc in self.preprocessors: with self.subTest(proc): data = preprocessor_data(proc).copy() # add some zeros to the dataset From 4e370b274386e66d090caebe51c5043128c18a4c Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 16:41:58 +0200 Subject: [PATCH 03/13] test_tile_reader: no need to test all preprocessors Because tile reading uses domain transformations that are already tested in TestConversionIndpSamplesMixin --- .../spectroscopy/tests/test_tile_reader.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_tile_reader.py b/orangecontrib/spectroscopy/tests/test_tile_reader.py index 8a92fec4a..0e5a32d65 100644 --- a/orangecontrib/spectroscopy/tests/test_tile_reader.py +++ b/orangecontrib/spectroscopy/tests/test_tile_reader.py @@ -9,7 +9,8 @@ from Orange.widgets.tests.base import WidgetTest from orangecontrib.spectroscopy import get_sample_datasets_dir -from orangecontrib.spectroscopy.tests.test_preprocess import PREPROCESSORS_INDEPENDENT_SAMPLES +from orangecontrib.spectroscopy.preprocess import Interpolate, SavitzkyGolayFiltering, Cut, \ + GaussianSmoothing, Absorbance, Transmittance, Integrate from orangecontrib.spectroscopy.widgets.owintegrate import OWIntegrate from orangecontrib.spectroscopy.widgets.owpreprocess import OWPreprocess, \ create_preprocessor @@ -18,11 +19,18 @@ AGILENT_TILE = "agilent/5_mosaic_agg1024.dmt" -# EMSC test fails on this dataset with -# "ValueError: On entry to DLASCL parameter number 4 had an illegal value" -PREPROCESSORS_INDEPENDENT_SAMPLES_NO_EMSC = [ - p for p in PREPROCESSORS_INDEPENDENT_SAMPLES - if type(p).__name__ not in ["EMSC", "ExtractEXAFSUsage", "ME_EMSC"]] +# no need to test all preprocessors here because tile reading uses domain +# transformations that are already tested in TestConversionIndpSamplesMixin +PREPROCESSORS_SEQUENCE = [ + Interpolate(np.linspace(1000, 1700, 100)), + SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2), + Cut(lowlim=1000, highlim=1800), + GaussianSmoothing(sd=3.), + Absorbance(), + Transmittance(), + Integrate(limits=[[900, 100], [1100, 1200], [1200, 1300]]) +] + class TestTileReaders(unittest.TestCase): @@ -51,7 +59,7 @@ def test_single_preproc(self): # TODO problematic interface design: should be able to use Orange.data.Table directly path = os.path.join(get_sample_datasets_dir(), AGILENT_TILE) reader = OWTilefile.get_tile_reader(path) - for p in PREPROCESSORS_INDEPENDENT_SAMPLES_NO_EMSC: + for p in PREPROCESSORS_SEQUENCE: reader.set_preprocessor(p) reader.read() @@ -59,7 +67,7 @@ def test_preprocessor_list(self): # TODO problematic interface design: should be able to use Orange.data.Table directly path = os.path.join(get_sample_datasets_dir(), AGILENT_TILE) reader = OWTilefile.get_tile_reader(path) - pp = PreprocessorList(PREPROCESSORS_INDEPENDENT_SAMPLES[0:7]) + pp = PreprocessorList(PREPROCESSORS_SEQUENCE) reader.set_preprocessor(pp) t = reader.read() assert len(t.domain.attributes) == 3 From 80141dd2e328feeb1303acf39c6e75519f9848dd Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 16:49:23 +0200 Subject: [PATCH 04/13] Common preprocess tests into Mixins, converted 2 --- .../spectroscopy/tests/test_preprocess.py | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index 660fd4913..a1a54860f 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -29,16 +29,6 @@ SMALLER_COLLAGEN = smaller_data(COLLAGEN[195:621], 40, 4) # only glycogen and lipids -def preprocessor_data(preproc): - """ - Rerturn appropriate test file for a preprocessor. - - Very slow preprocessors should get smaller files. - """ - if isinstance(preproc, ME_EMSC): - return SMALLER_COLLAGEN - return SMALL_COLLAGEN - # Preprocessors that work per sample and should return the same # result for a sample independent of the other samples PREPROCESSORS_INDEPENDENT_SAMPLES = [ @@ -61,7 +51,6 @@ def preprocessor_data(preproc): Normalize(method=Normalize.Vector), Normalize(method=Normalize.Area, int_method=Integrate.PeakMax, lower=0, upper=10000), Normalize(method=Normalize.MinMax), - ShiftAndScale(1, 2), Despike(threshold=5, cutoff=60, dis=5), ALSP(lam=100E+6, itermax=5, p=0.5), ARPLS(lam=100E+5, itermax=5, ratio=0.5), @@ -168,9 +157,7 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * # Preprocessors that use groups of input samples to infer # internal parameters. -PREPROCESSORS_GROUPS_OF_SAMPLES = [ - PCADenoising(components=2), -] +PREPROCESSORS_GROUPS_OF_SAMPLES = [] PREPROCESSORS_INDEPENDENT_SAMPLES += list( add_edge_case_data_parameter(ME_EMSC, "reference", SMALLER_COLLAGEN[0:1], max_iter=4)) @@ -488,9 +475,7 @@ def test_reference_exceptions(self): NormalizeReference(reference=Table.from_numpy(None, [[2], [6]])) -class TestConversion(unittest.TestCase): - - preprocessors = PREPROCESSORS +class TestConversionMixin: def test_slightly_different_domain(self): """ If test data has a slightly different domain then (with interpolation) @@ -505,7 +490,7 @@ def test_slightly_different_domain(self): continue with self.subTest(proc): # LR that can not handle unknown values - train, test = separate_learn_test(preprocessor_data(proc)) + train, test = separate_learn_test(self.data) train1 = proc(train) aucorig = AUC(TestOnTestData()(train1, test, [learner])) test = slightly_change_wavenumbers(test, 0.00001) @@ -526,9 +511,7 @@ def test_slightly_different_domain(self): self.assertAlmostEqual(aucnow, aucorig, delta=0.05, msg="Preprocessor " + str(proc)) -class TestConversionIndpSamples(TestConversion, unittest.TestCase): - - preprocessors = PREPROCESSORS_INDEPENDENT_SAMPLES +class TestConversionIndpSamplesMixin(TestConversionMixin): def test_whole_and_train_separate(self): """ Applying a preprocessor before spliting data into train and test @@ -536,7 +519,7 @@ def test_whole_and_train_separate(self): the test data. """ for proc in self.preprocessors: with self.subTest(proc): - data = preprocessor_data(proc) + data = self.data _, test1 = separate_learn_test(proc(data)) train, test = separate_learn_test(data) train = proc(train) @@ -552,20 +535,18 @@ def __call__(self, data): return data[~mask] -class TestStrangeData(unittest.TestCase): - - preprocessors = PREPROCESSORS +class TestStrangeDataMixin: def test_no_samples(self): """ Preprocessors should not crash when there are no input samples. """ - data = SMALL_COLLAGEN[:0] + data = self.data[:0] for proc in self.preprocessors: with self.subTest(proc): _ = proc(data) def test_no_attributes(self): """ Preprocessors should not crash when samples have no attributes. """ - data = SMALL_COLLAGEN + data = self.data data = data.transform(Orange.data.Domain([], class_vars=data.domain.class_vars, metas=data.domain.metas)) @@ -577,7 +558,7 @@ def test_all_nans(self): """ Preprocessors should not crash when there are all-nan samples. """ for proc in self.preprocessors: with self.subTest(proc): - data = preprocessor_data(proc).copy() + data = self.data.copy() with data.unlocked(): data.X[0, :] = np.nan try: @@ -588,7 +569,7 @@ def test_all_nans(self): def test_unordered_features(self): for proc in self.preprocessors: with self.subTest(proc): - data = preprocessor_data(proc) + data = self.data data_reversed = reverse_attr(data) data_shuffle = shuffle_attr(data) pdata = proc(data) @@ -603,7 +584,7 @@ def test_unordered_features(self): def test_unknown_no_propagate(self): for proc in self.preprocessors: with self.subTest(proc): - data = preprocessor_data(proc).copy() + data = self.data.copy() # one unknown in line with data.unlocked(): for i in range(min(len(data), len(data.domain.attributes))): @@ -619,7 +600,7 @@ def test_no_infs(self): """ Preprocessors should not return (-)inf """ for proc in self.preprocessors: with self.subTest(proc): - data = preprocessor_data(proc).copy() + data = self.data.copy() # add some zeros to the dataset with data.unlocked(): for i in range(min(len(data), len(data.domain.attributes))): @@ -634,7 +615,18 @@ def test_no_infs(self): self.assertFalse(anyinfs, msg="Preprocessor " + str(proc)) -class TestPCADenoising(unittest.TestCase): +class TestCommonMixin(TestStrangeDataMixin, TestConversionMixin): + pass + + +class TestCommonIndpSamplesMixin(TestStrangeDataMixin, TestConversionIndpSamplesMixin): + pass + + +class TestPCADenoising(unittest.TestCase, TestCommonMixin): + + preprocessors = [PCADenoising(components=2)] + data = SMALL_COLLAGEN def test_no_samples(self): data = Orange.data.Table("iris") @@ -657,7 +649,10 @@ def test_iris(self): [4.75015528, 3.15366444, 1.46254138, 0.23693223]]) -class TestShiftAndScale(unittest.TestCase): +class TestShiftAndScale(unittest.TestCase, TestConversionIndpSamplesMixin): + + preprocessors = [ShiftAndScale(1, 2)] + data = SMALL_COLLAGEN def test_simple(self): data = Table.from_numpy(None, [[1.0, 2.0, 3.0, 4.0]]) From d440f91dd8a5fb49a19860fd40e9ab7d0a0fe565 Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 16:53:18 +0200 Subject: [PATCH 05/13] test_preprocess: move common utilities to the top --- .../spectroscopy/tests/test_preprocess.py | 296 +++++++++--------- 1 file changed, 148 insertions(+), 148 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index a1a54860f..96b2d14cd 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -165,6 +165,154 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * PREPROCESSORS = PREPROCESSORS_INDEPENDENT_SAMPLES + PREPROCESSORS_GROUPS_OF_SAMPLES +class TestConversionMixin: + + def test_slightly_different_domain(self): + """ If test data has a slightly different domain then (with interpolation) + we should obtain a similar classification score. """ + # rows full of unknowns make LogisticRegression undefined + # we can obtain them, for example, with EMSC, if one of the badspectra + # is a spectrum from the data + learner = LogisticRegressionLearner(max_iter=1000, preprocessors=[_RemoveNaNRows()]) + + for proc in self.preprocessors: + if hasattr(proc, "skip_add_zeros"): + continue + with self.subTest(proc): + # LR that can not handle unknown values + train, test = separate_learn_test(self.data) + train1 = proc(train) + aucorig = AUC(TestOnTestData()(train1, test, [learner])) + test = slightly_change_wavenumbers(test, 0.00001) + test = odd_attr(test) + # a subset of points for training so that all test sets points + # are within the train set points, which gives no unknowns + train = Interpolate(points=getx(train)[1:-3])(train) # interpolatable train + train = proc(train) + # explicit domain conversion test to catch exceptions that would + # otherwise be silently handled in TestOnTestData + _ = test.transform(train.domain) + aucnow = AUC(TestOnTestData()(train, test, [learner])) + self.assertAlmostEqual(aucnow, aucorig, delta=0.03, msg="Preprocessor " + str(proc)) + test = Interpolate(points=getx(test) - 1.)(test) # also do a shift + _ = test.transform(train.domain) # explicit call again + aucnow = AUC(TestOnTestData()(train, test, [learner])) + # the difference should be slight + self.assertAlmostEqual(aucnow, aucorig, delta=0.05, msg="Preprocessor " + str(proc)) + + +class TestConversionIndpSamplesMixin(TestConversionMixin): + + def test_whole_and_train_separate(self): + """ Applying a preprocessor before spliting data into train and test + and applying is just on train data should yield the same transformation of + the test data. """ + for proc in self.preprocessors: + with self.subTest(proc): + data = self.data + _, test1 = separate_learn_test(proc(data)) + train, test = separate_learn_test(data) + train = proc(train) + test_transformed = test.transform(train.domain) + np.testing.assert_almost_equal(test_transformed.X, test1.X, + err_msg="Preprocessor " + str(proc)) + +class _RemoveNaNRows(Orange.preprocess.preprocess.Preprocess): + + def __call__(self, data): + mask = np.isnan(data.X) + mask = np.any(mask, axis=1) + return data[~mask] + + +class TestStrangeDataMixin: + + def test_no_samples(self): + """ Preprocessors should not crash when there are no input samples. """ + data = self.data[:0] + for proc in self.preprocessors: + with self.subTest(proc): + _ = proc(data) + + def test_no_attributes(self): + """ Preprocessors should not crash when samples have no attributes. """ + data = self.data + data = data.transform(Orange.data.Domain([], + class_vars=data.domain.class_vars, + metas=data.domain.metas)) + for proc in self.preprocessors: + with self.subTest(proc): + _ = proc(data) + + def test_all_nans(self): + """ Preprocessors should not crash when there are all-nan samples. """ + for proc in self.preprocessors: + with self.subTest(proc): + data = self.data.copy() + with data.unlocked(): + data.X[0, :] = np.nan + try: + _ = proc(data) + except PreprocessException: + continue # allow explicit preprocessor exception + + def test_unordered_features(self): + for proc in self.preprocessors: + with self.subTest(proc): + data = self.data + data_reversed = reverse_attr(data) + data_shuffle = shuffle_attr(data) + pdata = proc(data) + X = pdata.X[:, np.argsort(getx(pdata))] + pdata_reversed = proc(data_reversed) + X_reversed = pdata_reversed.X[:, np.argsort(getx(pdata_reversed))] + np.testing.assert_almost_equal(X, X_reversed, err_msg="Preprocessor " + str(proc)) + pdata_shuffle = proc(data_shuffle) + X_shuffle = pdata_shuffle.X[:, np.argsort(getx(pdata_shuffle))] + np.testing.assert_almost_equal(X, X_shuffle, err_msg="Preprocessor " + str(proc)) + + def test_unknown_no_propagate(self): + for proc in self.preprocessors: + with self.subTest(proc): + data = self.data.copy() + # one unknown in line + with data.unlocked(): + for i in range(min(len(data), len(data.domain.attributes))): + data.X[i, i] = np.nan + + if hasattr(proc, "skip_add_zeros"): + continue + pdata = proc(data) + sumnans = np.sum(np.isnan(pdata.X), axis=1) + self.assertFalse(np.any(sumnans > 1), msg="Preprocessor " + str(proc)) + + def test_no_infs(self): + """ Preprocessors should not return (-)inf """ + for proc in self.preprocessors: + with self.subTest(proc): + data = self.data.copy() + # add some zeros to the dataset + with data.unlocked(): + for i in range(min(len(data), len(data.domain.attributes))): + data.X[i, i] = 0 + data.X[0, :] = 0 + data.X[:, 0] = 0 + try: + pdata = proc(data) + except PreprocessException: + continue # allow explicit preprocessor exception + anyinfs = np.any(np.isinf(pdata.X)) + self.assertFalse(anyinfs, msg="Preprocessor " + str(proc)) + + +class TestCommonMixin(TestStrangeDataMixin, TestConversionMixin): + pass + + +class TestCommonIndpSamplesMixin(TestStrangeDataMixin, TestConversionIndpSamplesMixin): + pass + + class TestSpSubtract(unittest.TestCase): def test_simple(self): @@ -475,154 +623,6 @@ def test_reference_exceptions(self): NormalizeReference(reference=Table.from_numpy(None, [[2], [6]])) -class TestConversionMixin: - - def test_slightly_different_domain(self): - """ If test data has a slightly different domain then (with interpolation) - we should obtain a similar classification score. """ - # rows full of unknowns make LogisticRegression undefined - # we can obtain them, for example, with EMSC, if one of the badspectra - # is a spectrum from the data - learner = LogisticRegressionLearner(max_iter=1000, preprocessors=[_RemoveNaNRows()]) - - for proc in self.preprocessors: - if hasattr(proc, "skip_add_zeros"): - continue - with self.subTest(proc): - # LR that can not handle unknown values - train, test = separate_learn_test(self.data) - train1 = proc(train) - aucorig = AUC(TestOnTestData()(train1, test, [learner])) - test = slightly_change_wavenumbers(test, 0.00001) - test = odd_attr(test) - # a subset of points for training so that all test sets points - # are within the train set points, which gives no unknowns - train = Interpolate(points=getx(train)[1:-3])(train) # interpolatable train - train = proc(train) - # explicit domain conversion test to catch exceptions that would - # otherwise be silently handled in TestOnTestData - _ = test.transform(train.domain) - aucnow = AUC(TestOnTestData()(train, test, [learner])) - self.assertAlmostEqual(aucnow, aucorig, delta=0.03, msg="Preprocessor " + str(proc)) - test = Interpolate(points=getx(test) - 1.)(test) # also do a shift - _ = test.transform(train.domain) # explicit call again - aucnow = AUC(TestOnTestData()(train, test, [learner])) - # the difference should be slight - self.assertAlmostEqual(aucnow, aucorig, delta=0.05, msg="Preprocessor " + str(proc)) - - -class TestConversionIndpSamplesMixin(TestConversionMixin): - - def test_whole_and_train_separate(self): - """ Applying a preprocessor before spliting data into train and test - and applying is just on train data should yield the same transformation of - the test data. """ - for proc in self.preprocessors: - with self.subTest(proc): - data = self.data - _, test1 = separate_learn_test(proc(data)) - train, test = separate_learn_test(data) - train = proc(train) - test_transformed = test.transform(train.domain) - np.testing.assert_almost_equal(test_transformed.X, test1.X, - err_msg="Preprocessor " + str(proc)) - -class _RemoveNaNRows(Orange.preprocess.preprocess.Preprocess): - - def __call__(self, data): - mask = np.isnan(data.X) - mask = np.any(mask, axis=1) - return data[~mask] - - -class TestStrangeDataMixin: - - def test_no_samples(self): - """ Preprocessors should not crash when there are no input samples. """ - data = self.data[:0] - for proc in self.preprocessors: - with self.subTest(proc): - _ = proc(data) - - def test_no_attributes(self): - """ Preprocessors should not crash when samples have no attributes. """ - data = self.data - data = data.transform(Orange.data.Domain([], - class_vars=data.domain.class_vars, - metas=data.domain.metas)) - for proc in self.preprocessors: - with self.subTest(proc): - _ = proc(data) - - def test_all_nans(self): - """ Preprocessors should not crash when there are all-nan samples. """ - for proc in self.preprocessors: - with self.subTest(proc): - data = self.data.copy() - with data.unlocked(): - data.X[0, :] = np.nan - try: - _ = proc(data) - except PreprocessException: - continue # allow explicit preprocessor exception - - def test_unordered_features(self): - for proc in self.preprocessors: - with self.subTest(proc): - data = self.data - data_reversed = reverse_attr(data) - data_shuffle = shuffle_attr(data) - pdata = proc(data) - X = pdata.X[:, np.argsort(getx(pdata))] - pdata_reversed = proc(data_reversed) - X_reversed = pdata_reversed.X[:, np.argsort(getx(pdata_reversed))] - np.testing.assert_almost_equal(X, X_reversed, err_msg="Preprocessor " + str(proc)) - pdata_shuffle = proc(data_shuffle) - X_shuffle = pdata_shuffle.X[:, np.argsort(getx(pdata_shuffle))] - np.testing.assert_almost_equal(X, X_shuffle, err_msg="Preprocessor " + str(proc)) - - def test_unknown_no_propagate(self): - for proc in self.preprocessors: - with self.subTest(proc): - data = self.data.copy() - # one unknown in line - with data.unlocked(): - for i in range(min(len(data), len(data.domain.attributes))): - data.X[i, i] = np.nan - - if hasattr(proc, "skip_add_zeros"): - continue - pdata = proc(data) - sumnans = np.sum(np.isnan(pdata.X), axis=1) - self.assertFalse(np.any(sumnans > 1), msg="Preprocessor " + str(proc)) - - def test_no_infs(self): - """ Preprocessors should not return (-)inf """ - for proc in self.preprocessors: - with self.subTest(proc): - data = self.data.copy() - # add some zeros to the dataset - with data.unlocked(): - for i in range(min(len(data), len(data.domain.attributes))): - data.X[i, i] = 0 - data.X[0, :] = 0 - data.X[:, 0] = 0 - try: - pdata = proc(data) - except PreprocessException: - continue # allow explicit preprocessor exception - anyinfs = np.any(np.isinf(pdata.X)) - self.assertFalse(anyinfs, msg="Preprocessor " + str(proc)) - - -class TestCommonMixin(TestStrangeDataMixin, TestConversionMixin): - pass - - -class TestCommonIndpSamplesMixin(TestStrangeDataMixin, TestConversionIndpSamplesMixin): - pass - - class TestPCADenoising(unittest.TestCase, TestCommonMixin): preprocessors = [PCADenoising(components=2)] From c75c17fb330ca385693048b98a83c469559bfeb4 Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 17:11:14 +0200 Subject: [PATCH 06/13] test_preprocess: all preprocess tests in this file use the new mixin --- .../spectroscopy/tests/test_preprocess.py | 95 ++++++++++--------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index 96b2d14cd..0228580d5 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -33,11 +33,7 @@ # result for a sample independent of the other samples PREPROCESSORS_INDEPENDENT_SAMPLES = [ Interpolate(np.linspace(1000, 1700, 100)), - SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2), Cut(lowlim=1000, highlim=1800), - GaussianSmoothing(sd=3.), - Absorbance(), - Transmittance(), Integrate(limits=[[900, 100], [1100, 1200], [1200, 1300]]), Integrate(methods=Integrate.Simple, limits=[[1100, 1200]]), Integrate(methods=Integrate.Baseline, limits=[[1100, 1200]]), @@ -46,11 +42,6 @@ Integrate(methods=Integrate.PeakAt, limits=[[1100]]), Integrate(methods=Integrate.PeakX, limits=[[1100, 1200]]), Integrate(methods=Integrate.PeakXBaseline, limits=[[1100, 1200]]), - RubberbandBaseline(), - LinearBaseline(), - Normalize(method=Normalize.Vector), - Normalize(method=Normalize.Area, int_method=Integrate.PeakMax, lower=0, upper=10000), - Normalize(method=Normalize.MinMax), Despike(threshold=5, cutoff=60, dis=5), ALSP(lam=100E+6, itermax=5, p=0.5), ARPLS(lam=100E+5, itermax=5, ratio=0.5), @@ -131,10 +122,6 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * yield p -for p in [Absorbance, Transmittance]: - # single reference - PREPROCESSORS_INDEPENDENT_SAMPLES += list(add_edge_case_data_parameter(p, "reference", SMALL_COLLAGEN[0:1])) - # EMSC with different kinds of reference PREPROCESSORS_INDEPENDENT_SAMPLES += list( add_edge_case_data_parameter(EMSC, "reference", SMALL_COLLAGEN[0:1])) @@ -148,12 +135,6 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * add_edge_case_data_parameter(AtmCorr, "reference", SMALL_COLLAGEN[0:1], correct_ranges=[(1300, 2100)], smooth_win=5)) -PREPROCESSORS_INDEPENDENT_SAMPLES += \ - list(add_edge_case_data_parameter(NormalizeReference, "reference", SMALL_COLLAGEN[:1])) - -PREPROCESSORS_INDEPENDENT_SAMPLES += \ - list(add_edge_case_data_parameter(SpSubtract, "reference", SMALL_COLLAGEN[:1], amount=0.1)) - # Preprocessors that use groups of input samples to infer # internal parameters. @@ -313,7 +294,11 @@ class TestCommonIndpSamplesMixin(TestStrangeDataMixin, TestConversionIndpSamples pass -class TestSpSubtract(unittest.TestCase): +class TestSpSubtract(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = list(add_edge_case_data_parameter( + SpSubtract, "reference", SMALL_COLLAGEN[:1], amount=0.1)) + data = SMALL_COLLAGEN def test_simple(self): data = Table.from_numpy(None, [[1.0, 2.0, 3.0, 4.0]]) @@ -323,7 +308,12 @@ def test_simple(self): np.testing.assert_almost_equal(fdata.X, [[-1.0, -2.0, -3.0, -4.0]]) -class TestTransmittance(unittest.TestCase): +class TestTransmittance(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [Transmittance()] + \ + list(add_edge_case_data_parameter( + Transmittance, "reference", SMALL_COLLAGEN[0:1])) + data = SMALL_COLLAGEN def test_domain_conversion(self): """Test whether a domain can be used for conversion.""" @@ -357,7 +347,13 @@ def disabled_test_eq(self): self.assertNotEqual(a.domain, t1.domain) -class TestAbsorbance(unittest.TestCase): +class TestAbsorbance(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [Absorbance()] + \ + list(add_edge_case_data_parameter( + Absorbance, "reference", SMALL_COLLAGEN[0:1])) + data = SMALL_COLLAGEN + def test_domain_conversion(self): """Test whether a domain can be used for conversion.""" @@ -390,17 +386,10 @@ def disabled_test_eq(self): self.assertEqual(t4.domain, t5.domain) -class TestSavitzkyGolay(unittest.TestCase): +class TestSavitzkyGolay(unittest.TestCase, TestCommonIndpSamplesMixin): - def test_unknown_no_propagate(self): - data = Orange.data.Table("iris")[:5].copy() - f = SavitzkyGolayFiltering() - with data.unlocked(): - for i in range(4): - data.X[i, i] = np.nan - data.X[4] = np.nan - fdata = f(data) - np.testing.assert_equal(np.sum(np.isnan(fdata.X), axis=1), [1, 1, 1, 1, 4]) + preprocessors = [SavitzkyGolayFiltering(window=9, polyorder=2, deriv=2)] + data = SMALL_COLLAGEN def test_simple(self): data = Orange.data.Table("iris") @@ -429,17 +418,10 @@ def disabled_test_eq(self): self.assertEqual(p1.domain, s2.domain) -class TestGaussian(unittest.TestCase): +class TestGaussian(unittest.TestCase, TestCommonIndpSamplesMixin): - def test_unknown_no_propagate(self): - data = Orange.data.Table("iris")[:5].copy() - f = GaussianSmoothing() - with data.unlocked(): - for i in range(4): - data.X[i, i] = np.nan - data.X[4] = np.nan - fdata = f(data) - np.testing.assert_equal(np.sum(np.isnan(fdata.X), axis=1), [1, 1, 1, 1, 4]) + preprocessors = [GaussianSmoothing(sd=3.)] + data = SMALL_COLLAGEN def test_simple(self): data = Orange.data.Table("iris") @@ -450,7 +432,10 @@ def test_simple(self): [[4.4907066, 3.2794677, 1.7641664, 0.6909083]]) -class TestRubberbandBaseline(unittest.TestCase): +class TestRubberbandBaseline(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [RubberbandBaseline()] + data = SMALL_COLLAGEN def test_whole(self): """ Every point belongs in the convex region. """ @@ -471,7 +456,10 @@ def test_simple(self): np.testing.assert_equal(i.X, [[0, 0, -0.5, 0]]) -class TestLinearBaseline(unittest.TestCase): +class TestLinearBaseline(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [LinearBaseline()] + data = SMALL_COLLAGEN def test_whole(self): data = Table.from_numpy(None, [[1, 5, 1]]) @@ -503,7 +491,14 @@ def test_edgepoints_out_of_data(self): np.testing.assert_almost_equal(i.X, [[0, 4, 0]]) -class TestNormalize(unittest.TestCase): +class TestNormalize(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [Normalize(method=Normalize.Vector), + Normalize(method=Normalize.Area, + int_method=Integrate.PeakMax, lower=0, upper=10000), + Normalize(method=Normalize.MinMax)] + + data = SMALL_COLLAGEN def test_vector_norm(self): data = Table.from_numpy(None, [[2, 1, 2, 2, 3]]) @@ -606,7 +601,13 @@ def disabled_test_eq(self): self.assertEqual(p1.domain, p4.domain) -class TestNormalizeReference(unittest.TestCase): +class TestNormalizeReference(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = (list(add_edge_case_data_parameter(NormalizeReference, + "reference", SMALL_COLLAGEN[:1])) + + list(add_edge_case_data_parameter(NormalizePhaseReference, + "reference", SMALL_COLLAGEN[:1]))) + data = SMALL_COLLAGEN def test_reference(self): data = Table.from_numpy(None, [[2, 1, 3], [4, 2, 6]]) @@ -649,7 +650,7 @@ def test_iris(self): [4.75015528, 3.15366444, 1.46254138, 0.23693223]]) -class TestShiftAndScale(unittest.TestCase, TestConversionIndpSamplesMixin): +class TestShiftAndScale(unittest.TestCase, TestCommonIndpSamplesMixin): preprocessors = [ShiftAndScale(1, 2)] data = SMALL_COLLAGEN From 120135d6666afd746b5adc860d7b3f05b53e760b Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 21:03:17 +0200 Subject: [PATCH 07/13] emsc: test with new mixin --- orangecontrib/spectroscopy/tests/test_emsc.py | 12 +++++++++++- orangecontrib/spectroscopy/tests/test_me_emsc.py | 8 +++++++- .../spectroscopy/tests/test_preprocess.py | 14 +------------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_emsc.py b/orangecontrib/spectroscopy/tests/test_emsc.py index 94bcfe0e4..9e701705e 100644 --- a/orangecontrib/spectroscopy/tests/test_emsc.py +++ b/orangecontrib/spectroscopy/tests/test_emsc.py @@ -8,9 +8,19 @@ SelectionFunction, SmoothedSelectionFunction from orangecontrib.spectroscopy.preprocess.npfunc import Sum from orangecontrib.spectroscopy.tests.util import spectra_table +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALL_COLLAGEN, add_edge_case_data_parameter -class TestEMSC(unittest.TestCase): +class TestEMSC(unittest.TestCase, TestCommonIndpSamplesMixin): + + different_reference = list( + add_edge_case_data_parameter(EMSC, "reference", SMALL_COLLAGEN[0:1])) + different_badspectra = list( + add_edge_case_data_parameter(EMSC, "badspectra", SMALL_COLLAGEN[0:2], + reference=SMALL_COLLAGEN[-1:])) + preprocessors = different_reference + different_badspectra + data = SMALL_COLLAGEN def test_ab(self): data = Table.from_numpy(None, [[1.0, 2.0, 1.0, 1.0], diff --git a/orangecontrib/spectroscopy/tests/test_me_emsc.py b/orangecontrib/spectroscopy/tests/test_me_emsc.py index 5adac5556..c68236282 100644 --- a/orangecontrib/spectroscopy/tests/test_me_emsc.py +++ b/orangecontrib/spectroscopy/tests/test_me_emsc.py @@ -8,6 +8,8 @@ from orangecontrib.spectroscopy.preprocess.me_emsc import ME_EMSC from orangecontrib.spectroscopy.preprocess.emsc import SelectionFunction, SmoothedSelectionFunction from orangecontrib.spectroscopy.preprocess.npfunc import Sum +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALLER_COLLAGEN, add_edge_case_data_parameter def weights_from_inflection_points_legacy(points, kappa, wavenumbers): @@ -82,7 +84,11 @@ def weights_from_inflection_points_legacy(points, kappa, wavenumbers): return data -class TestME_EMSC(unittest.TestCase): +class TestME_EMSC(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = list( + add_edge_case_data_parameter(ME_EMSC, "reference", SMALLER_COLLAGEN[0:1], max_iter=4)) + data = SMALLER_COLLAGEN @classmethod def setUpClass(cls): diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index 0228580d5..18b2e4c95 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -13,11 +13,10 @@ from orangecontrib.spectroscopy.preprocess import Absorbance, Transmittance, \ Integrate, Interpolate, Cut, SavitzkyGolayFiltering, \ GaussianSmoothing, PCADenoising, RubberbandBaseline, \ - Normalize, LinearBaseline, ShiftAndScale, EMSC, MissingReferenceException, \ + Normalize, LinearBaseline, ShiftAndScale, MissingReferenceException, \ WrongReferenceException, NormalizeReference, XASnormalization, ExtractEXAFS, \ PreprocessException, NormalizePhaseReference, Despike, SpSubtract from orangecontrib.spectroscopy.preprocess.als import ALSP, ARPLS, AIRPLS -from orangecontrib.spectroscopy.preprocess.me_emsc import ME_EMSC from orangecontrib.spectroscopy.preprocess.atm_corr import AtmCorr from orangecontrib.spectroscopy.preprocess.utils import replacex from orangecontrib.spectroscopy.tests.test_conversion import separate_learn_test, slightly_change_wavenumbers, odd_attr @@ -122,14 +121,6 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * yield p -# EMSC with different kinds of reference -PREPROCESSORS_INDEPENDENT_SAMPLES += list( - add_edge_case_data_parameter(EMSC, "reference", SMALL_COLLAGEN[0:1])) -# EMSC with different kinds of bad spectra -PREPROCESSORS_INDEPENDENT_SAMPLES += list( - add_edge_case_data_parameter(EMSC, "badspectra", SMALL_COLLAGEN[0:2], - reference=SMALL_COLLAGEN[-1:])) - # AtmCorr with different kinds of reference PREPROCESSORS_INDEPENDENT_SAMPLES += list( add_edge_case_data_parameter(AtmCorr, "reference", SMALL_COLLAGEN[0:1], @@ -140,9 +131,6 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * # internal parameters. PREPROCESSORS_GROUPS_OF_SAMPLES = [] -PREPROCESSORS_INDEPENDENT_SAMPLES += list( - add_edge_case_data_parameter(ME_EMSC, "reference", SMALLER_COLLAGEN[0:1], max_iter=4)) - PREPROCESSORS = PREPROCESSORS_INDEPENDENT_SAMPLES + PREPROCESSORS_GROUPS_OF_SAMPLES From 93d0d1016dfda9ec85962c97225b23c12ef03511 Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 21:06:07 +0200 Subject: [PATCH 08/13] atmcorr: test with new mixin --- orangecontrib/spectroscopy/tests/test_atm_corr.py | 9 ++++++++- orangecontrib/spectroscopy/tests/test_preprocess.py | 7 ------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_atm_corr.py b/orangecontrib/spectroscopy/tests/test_atm_corr.py index 4b1ab6413..8fd0fcced 100755 --- a/orangecontrib/spectroscopy/tests/test_atm_corr.py +++ b/orangecontrib/spectroscopy/tests/test_atm_corr.py @@ -4,9 +4,16 @@ from orangecontrib.spectroscopy.preprocess.atm_corr import AtmCorr from orangecontrib.spectroscopy.tests.util import spectra_table +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALL_COLLAGEN, add_edge_case_data_parameter +class TestAtmCorr(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = list( + add_edge_case_data_parameter(AtmCorr, "reference", SMALL_COLLAGEN[0:1], + correct_ranges=[(1300, 2100)], smooth_win=5)) + data = SMALL_COLLAGEN -class TestAtmCorr(unittest.TestCase): def test_atm_corr(self): # Fake atmospheric spectrum def atm(wn): diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index 18b2e4c95..d87f892fe 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -17,7 +17,6 @@ WrongReferenceException, NormalizeReference, XASnormalization, ExtractEXAFS, \ PreprocessException, NormalizePhaseReference, Despike, SpSubtract from orangecontrib.spectroscopy.preprocess.als import ALSP, ARPLS, AIRPLS -from orangecontrib.spectroscopy.preprocess.atm_corr import AtmCorr from orangecontrib.spectroscopy.preprocess.utils import replacex from orangecontrib.spectroscopy.tests.test_conversion import separate_learn_test, slightly_change_wavenumbers, odd_attr from orangecontrib.spectroscopy.tests.util import smaller_data @@ -121,12 +120,6 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * yield p -# AtmCorr with different kinds of reference -PREPROCESSORS_INDEPENDENT_SAMPLES += list( - add_edge_case_data_parameter(AtmCorr, "reference", SMALL_COLLAGEN[0:1], - correct_ranges=[(1300, 2100)], smooth_win=5)) - - # Preprocessors that use groups of input samples to infer # internal parameters. PREPROCESSORS_GROUPS_OF_SAMPLES = [] From 0f09f738f20e26e45d1d341d45f67416a875beec Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 21:08:27 +0200 Subject: [PATCH 09/13] cut: test with new mixin --- orangecontrib/spectroscopy/tests/test_cut.py | 7 ++++++- orangecontrib/spectroscopy/tests/test_preprocess.py | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_cut.py b/orangecontrib/spectroscopy/tests/test_cut.py index 92e8053f8..0fa657632 100644 --- a/orangecontrib/spectroscopy/tests/test_cut.py +++ b/orangecontrib/spectroscopy/tests/test_cut.py @@ -3,9 +3,14 @@ import unittest from orangecontrib.spectroscopy.preprocess import Cut from orangecontrib.spectroscopy.data import getx +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALL_COLLAGEN -class TestCut(unittest.TestCase): +class TestCut(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [Cut(lowlim=1000, highlim=1800)] + data = SMALL_COLLAGEN @classmethod def setUpClass(cls): diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index d87f892fe..97ab7ec7f 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -11,7 +11,7 @@ from orangecontrib.spectroscopy.data import getx from orangecontrib.spectroscopy.preprocess import Absorbance, Transmittance, \ - Integrate, Interpolate, Cut, SavitzkyGolayFiltering, \ + Integrate, Interpolate, SavitzkyGolayFiltering, \ GaussianSmoothing, PCADenoising, RubberbandBaseline, \ Normalize, LinearBaseline, ShiftAndScale, MissingReferenceException, \ WrongReferenceException, NormalizeReference, XASnormalization, ExtractEXAFS, \ @@ -31,7 +31,6 @@ # result for a sample independent of the other samples PREPROCESSORS_INDEPENDENT_SAMPLES = [ Interpolate(np.linspace(1000, 1700, 100)), - Cut(lowlim=1000, highlim=1800), Integrate(limits=[[900, 100], [1100, 1200], [1200, 1300]]), Integrate(methods=Integrate.Simple, limits=[[1100, 1200]]), Integrate(methods=Integrate.Baseline, limits=[[1100, 1200]]), From be5406784fc54490761925e56bd03948014f4052 Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 21:15:06 +0200 Subject: [PATCH 10/13] interpolate, integrate: test with new mixin --- .../spectroscopy/tests/test_integrate.py | 17 ++++++++++++++++- .../spectroscopy/tests/test_interpolate.py | 7 ++++++- .../spectroscopy/tests/test_preprocess.py | 9 --------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_integrate.py b/orangecontrib/spectroscopy/tests/test_integrate.py index 928914d02..21f429c08 100644 --- a/orangecontrib/spectroscopy/tests/test_integrate.py +++ b/orangecontrib/spectroscopy/tests/test_integrate.py @@ -4,9 +4,24 @@ import numpy as np from orangecontrib.spectroscopy.preprocess import Integrate +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALL_COLLAGEN -class TestIntegrate(unittest.TestCase): +class TestIntegrate(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [ + Integrate(limits=[[900, 100], [1100, 1200], [1200, 1300]]), + Integrate(methods=Integrate.Simple, limits=[[1100, 1200]]), + Integrate(methods=Integrate.Baseline, limits=[[1100, 1200]]), + Integrate(methods=Integrate.PeakMax, limits=[[1100, 1200]]), + Integrate(methods=Integrate.PeakBaseline, limits=[[1100, 1200]]), + Integrate(methods=Integrate.PeakAt, limits=[[1100]]), + Integrate(methods=Integrate.PeakX, limits=[[1100, 1200]]), + Integrate(methods=Integrate.PeakXBaseline, limits=[[1100, 1200]]) + ] + data = SMALL_COLLAGEN + def test_simple(self): data = Table.from_numpy(None, [[1, 2, 3, 1, 1, 1], diff --git a/orangecontrib/spectroscopy/tests/test_interpolate.py b/orangecontrib/spectroscopy/tests/test_interpolate.py index ee57e8ded..51743c8db 100644 --- a/orangecontrib/spectroscopy/tests/test_interpolate.py +++ b/orangecontrib/spectroscopy/tests/test_interpolate.py @@ -16,9 +16,14 @@ nan_extend_edges_and_interpolate from orangecontrib.spectroscopy.data import getx from orangecontrib.spectroscopy.tests.util import spectra_table +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALL_COLLAGEN -class TestInterpolate(unittest.TestCase): +class TestInterpolate(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [Interpolate(np.linspace(1000, 1700, 100))] + data = SMALL_COLLAGEN @classmethod def setUpClass(cls): diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index 97ab7ec7f..bd2d56d96 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -30,15 +30,6 @@ # Preprocessors that work per sample and should return the same # result for a sample independent of the other samples PREPROCESSORS_INDEPENDENT_SAMPLES = [ - Interpolate(np.linspace(1000, 1700, 100)), - Integrate(limits=[[900, 100], [1100, 1200], [1200, 1300]]), - Integrate(methods=Integrate.Simple, limits=[[1100, 1200]]), - Integrate(methods=Integrate.Baseline, limits=[[1100, 1200]]), - Integrate(methods=Integrate.PeakMax, limits=[[1100, 1200]]), - Integrate(methods=Integrate.PeakBaseline, limits=[[1100, 1200]]), - Integrate(methods=Integrate.PeakAt, limits=[[1100]]), - Integrate(methods=Integrate.PeakX, limits=[[1100, 1200]]), - Integrate(methods=Integrate.PeakXBaseline, limits=[[1100, 1200]]), Despike(threshold=5, cutoff=60, dis=5), ALSP(lam=100E+6, itermax=5, p=0.5), ARPLS(lam=100E+5, itermax=5, ratio=0.5), From a079d56ec368a3f337447eb95ee9e19d0023a4ca Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 21:20:17 +0200 Subject: [PATCH 11/13] despike, als: test with new mixin --- orangecontrib/spectroscopy/tests/test_als.py | 12 +++++++++++- orangecontrib/spectroscopy/tests/test_despike.py | 8 +++++++- orangecontrib/spectroscopy/tests/test_preprocess.py | 10 ++-------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_als.py b/orangecontrib/spectroscopy/tests/test_als.py index 48b1dafe1..5d7ba7389 100644 --- a/orangecontrib/spectroscopy/tests/test_als.py +++ b/orangecontrib/spectroscopy/tests/test_als.py @@ -4,9 +4,19 @@ from Orange.data import Table from orangecontrib.spectroscopy.preprocess.als import ALSP, ARPLS, AIRPLS +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALLER_COLLAGEN -class Testals(unittest.TestCase): +class TestAls(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [ + ALSP(lam=100E+6, itermax=5, p=0.5), + ARPLS(lam=100E+5, itermax=5, ratio=0.5), + AIRPLS(lam=100, itermax=5, porder=1), + ] + data = SMALLER_COLLAGEN + def test_als_Basic(self): data = Table.from_numpy(None, [[1.0, 2.0, 10.0, 5.0], [3.0, 5.0, 9.0, 4.0]]) diff --git a/orangecontrib/spectroscopy/tests/test_despike.py b/orangecontrib/spectroscopy/tests/test_despike.py index 56aa4987e..eefb2ccf2 100644 --- a/orangecontrib/spectroscopy/tests/test_despike.py +++ b/orangecontrib/spectroscopy/tests/test_despike.py @@ -2,9 +2,15 @@ import numpy as np from Orange.data import Table from orangecontrib.spectroscopy.preprocess import Despike +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALL_COLLAGEN -class TestSpikeremoval(unittest.TestCase): +class TestSpikeRemoval(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [Despike(threshold=5, cutoff=60, dis=5)] + data = SMALL_COLLAGEN + def test_spikes(self): data = Table.from_numpy(None, [[1000, 1, 1, 1, 1, 10, 1, 1, 1000, 1000, 1000, 1, 1000, 1, 1, 1, 1000, 1000, 1000, 1000], diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index bd2d56d96..e8f759009 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -15,8 +15,7 @@ GaussianSmoothing, PCADenoising, RubberbandBaseline, \ Normalize, LinearBaseline, ShiftAndScale, MissingReferenceException, \ WrongReferenceException, NormalizeReference, XASnormalization, ExtractEXAFS, \ - PreprocessException, NormalizePhaseReference, Despike, SpSubtract -from orangecontrib.spectroscopy.preprocess.als import ALSP, ARPLS, AIRPLS + PreprocessException, NormalizePhaseReference, SpSubtract from orangecontrib.spectroscopy.preprocess.utils import replacex from orangecontrib.spectroscopy.tests.test_conversion import separate_learn_test, slightly_change_wavenumbers, odd_attr from orangecontrib.spectroscopy.tests.util import smaller_data @@ -29,12 +28,7 @@ # Preprocessors that work per sample and should return the same # result for a sample independent of the other samples -PREPROCESSORS_INDEPENDENT_SAMPLES = [ - Despike(threshold=5, cutoff=60, dis=5), - ALSP(lam=100E+6, itermax=5, p=0.5), - ARPLS(lam=100E+5, itermax=5, ratio=0.5), - AIRPLS(lam=100, itermax=5, porder=1), -] +PREPROCESSORS_INDEPENDENT_SAMPLES = [] xas_norm_collagen = XASnormalization(edge=1630, preedge_dict={'from': 1000, 'to': 1300, 'deg': 1}, From 1e69ea7bfc129505d4ab9e21084a4e02eb41b6ab Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 21:26:35 +0200 Subject: [PATCH 12/13] test_preprocess: finish porting tests to the new mixin --- .../spectroscopy/tests/test_preprocess.py | 35 +++---------------- orangecontrib/spectroscopy/tests/test_xas.py | 22 +++++++++++- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index e8f759009..cf3001d2c 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -7,14 +7,13 @@ from Orange.classification import LogisticRegressionLearner from Orange.data import Table from Orange.evaluation import TestOnTestData, AUC -from Orange.preprocess.preprocess import PreprocessorList from orangecontrib.spectroscopy.data import getx from orangecontrib.spectroscopy.preprocess import Absorbance, Transmittance, \ Integrate, Interpolate, SavitzkyGolayFiltering, \ GaussianSmoothing, PCADenoising, RubberbandBaseline, \ Normalize, LinearBaseline, ShiftAndScale, MissingReferenceException, \ - WrongReferenceException, NormalizeReference, XASnormalization, ExtractEXAFS, \ + WrongReferenceException, NormalizeReference, \ PreprocessException, NormalizePhaseReference, SpSubtract from orangecontrib.spectroscopy.preprocess.utils import replacex from orangecontrib.spectroscopy.tests.test_conversion import separate_learn_test, slightly_change_wavenumbers, odd_attr @@ -26,27 +25,6 @@ SMALLER_COLLAGEN = smaller_data(COLLAGEN[195:621], 40, 4) # only glycogen and lipids -# Preprocessors that work per sample and should return the same -# result for a sample independent of the other samples -PREPROCESSORS_INDEPENDENT_SAMPLES = [] - -xas_norm_collagen = XASnormalization(edge=1630, - preedge_dict={'from': 1000, 'to': 1300, 'deg': 1}, - postedge_dict={'from': 1650, 'to': 1700, 'deg': 1}) -extract_exafs = ExtractEXAFS(edge=1630, extra_from=1630, extra_to=1800, - poly_deg=1, kweight=0, m=0) - - -class ExtractEXAFSUsage(PreprocessorList): - """ExtractEXAFS needs previous XAS normalization""" - def __init__(self): - super().__init__(preprocessors=[xas_norm_collagen, - extract_exafs]) - - -PREPROCESSORS_INDEPENDENT_SAMPLES += [xas_norm_collagen, ExtractEXAFSUsage()] - - def add_zeros(data): """ Every 5th value is zero """ s = data.copy() @@ -104,13 +82,6 @@ def add_edge_case_data_parameter(class_, data_arg_name, data_to_modify, *args, * yield p -# Preprocessors that use groups of input samples to infer -# internal parameters. -PREPROCESSORS_GROUPS_OF_SAMPLES = [] - -PREPROCESSORS = PREPROCESSORS_INDEPENDENT_SAMPLES + PREPROCESSORS_GROUPS_OF_SAMPLES - - class TestConversionMixin: def test_slightly_different_domain(self): @@ -148,6 +119,10 @@ def test_slightly_different_domain(self): class TestConversionIndpSamplesMixin(TestConversionMixin): + """ + Testing mixin for preprocessors that work per sample and should + return the same result for a sample independent of the other samples + """ def test_whole_and_train_separate(self): """ Applying a preprocessor before spliting data into train and test diff --git a/orangecontrib/spectroscopy/tests/test_xas.py b/orangecontrib/spectroscopy/tests/test_xas.py index 19582a989..e5c91c5d3 100644 --- a/orangecontrib/spectroscopy/tests/test_xas.py +++ b/orangecontrib/spectroscopy/tests/test_xas.py @@ -2,11 +2,31 @@ import numpy import Orange from Orange.data import Table +from Orange.preprocess import PreprocessorList from orangecontrib.spectroscopy.preprocess import XASnormalization, ExtractEXAFS, NoEdgejumpProvidedException +from orangecontrib.spectroscopy.tests.test_preprocess import TestCommonIndpSamplesMixin, \ + SMALLER_COLLAGEN -class TestXASnormalization(unittest.TestCase): +xas_norm_collagen = XASnormalization(edge=1630, + preedge_dict={'from': 1000, 'to': 1300, 'deg': 1}, + postedge_dict={'from': 1650, 'to': 1700, 'deg': 1}) +extract_exafs = ExtractEXAFS(edge=1630, extra_from=1630, extra_to=1800, + poly_deg=1, kweight=0, m=0) + + +class ExtractEXAFSUsage(PreprocessorList): + """ExtractEXAFS needs previous XAS normalization""" + def __init__(self): + super().__init__(preprocessors=[xas_norm_collagen, + extract_exafs]) + + +class TestXASnormalization(unittest.TestCase, TestCommonIndpSamplesMixin): + + preprocessors = [xas_norm_collagen, ExtractEXAFSUsage()] + data = SMALLER_COLLAGEN def test_flat(self): domain = Orange.data.Domain([Orange.data.ContinuousVariable(str(w)) From d856bf34ed1c6c4af9a48a6c0e04f4f2b04ae9b3 Mon Sep 17 00:00:00 2001 From: Marko Toplak Date: Mon, 19 Aug 2024 21:35:44 +0200 Subject: [PATCH 13/13] test_preprocess: speedups --- .../spectroscopy/tests/test_preprocess.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/orangecontrib/spectroscopy/tests/test_preprocess.py b/orangecontrib/spectroscopy/tests/test_preprocess.py index cf3001d2c..a6e7412e8 100644 --- a/orangecontrib/spectroscopy/tests/test_preprocess.py +++ b/orangecontrib/spectroscopy/tests/test_preprocess.py @@ -237,8 +237,8 @@ class TestCommonIndpSamplesMixin(TestStrangeDataMixin, TestConversionIndpSamples class TestSpSubtract(unittest.TestCase, TestCommonIndpSamplesMixin): preprocessors = list(add_edge_case_data_parameter( - SpSubtract, "reference", SMALL_COLLAGEN[:1], amount=0.1)) - data = SMALL_COLLAGEN + SpSubtract, "reference", SMALLER_COLLAGEN[:1], amount=0.1)) + data = SMALLER_COLLAGEN def test_simple(self): data = Table.from_numpy(None, [[1.0, 2.0, 3.0, 4.0]]) @@ -252,12 +252,12 @@ class TestTransmittance(unittest.TestCase, TestCommonIndpSamplesMixin): preprocessors = [Transmittance()] + \ list(add_edge_case_data_parameter( - Transmittance, "reference", SMALL_COLLAGEN[0:1])) - data = SMALL_COLLAGEN + Transmittance, "reference", SMALLER_COLLAGEN[0:1])) + data = SMALLER_COLLAGEN def test_domain_conversion(self): """Test whether a domain can be used for conversion.""" - data = SMALL_COLLAGEN + data = self.data transmittance = Transmittance()(data) nt = Orange.data.Table.from_table(transmittance.domain, data) self.assertEqual(transmittance.domain, nt.domain) @@ -266,12 +266,12 @@ def test_domain_conversion(self): def test_roundtrip(self): """Test AB -> TR -> AB calculation""" - data = SMALL_COLLAGEN + data = self.data calcdata = Absorbance()(Transmittance()(data)) np.testing.assert_allclose(data.X, calcdata.X) def disabled_test_eq(self): - data = SMALL_COLLAGEN + data = self.data t1 = Transmittance()(data) t2 = Transmittance()(data) self.assertEqual(t1.domain, t2.domain) @@ -291,13 +291,13 @@ class TestAbsorbance(unittest.TestCase, TestCommonIndpSamplesMixin): preprocessors = [Absorbance()] + \ list(add_edge_case_data_parameter( - Absorbance, "reference", SMALL_COLLAGEN[0:1])) - data = SMALL_COLLAGEN + Absorbance, "reference", SMALLER_COLLAGEN[0:1])) + data = SMALLER_COLLAGEN def test_domain_conversion(self): """Test whether a domain can be used for conversion.""" - data = Transmittance()(SMALL_COLLAGEN) + data = Transmittance()(self.data) absorbance = Absorbance()(data) nt = Orange.data.Table.from_table(absorbance.domain, data) self.assertEqual(absorbance.domain, nt.domain) @@ -307,12 +307,12 @@ def test_domain_conversion(self): def test_roundtrip(self): """Test TR -> AB -> TR calculation""" # actually AB -> TR -> AB -> TR - data = Transmittance()(SMALL_COLLAGEN) + data = Transmittance()(self.data) calcdata = Transmittance()(Absorbance()(data)) np.testing.assert_allclose(data.X, calcdata.X) def disabled_test_eq(self): - data = SMALL_COLLAGEN + data = self.data t1 = Absorbance()(data) t2 = Absorbance()(data) self.assertEqual(t1.domain, t2.domain) @@ -375,7 +375,7 @@ def test_simple(self): class TestRubberbandBaseline(unittest.TestCase, TestCommonIndpSamplesMixin): preprocessors = [RubberbandBaseline()] - data = SMALL_COLLAGEN + data = SMALLER_COLLAGEN def test_whole(self): """ Every point belongs in the convex region. """ @@ -544,10 +544,10 @@ def disabled_test_eq(self): class TestNormalizeReference(unittest.TestCase, TestCommonIndpSamplesMixin): preprocessors = (list(add_edge_case_data_parameter(NormalizeReference, - "reference", SMALL_COLLAGEN[:1])) + + "reference", SMALLER_COLLAGEN[:1])) + list(add_edge_case_data_parameter(NormalizePhaseReference, - "reference", SMALL_COLLAGEN[:1]))) - data = SMALL_COLLAGEN + "reference", SMALLER_COLLAGEN[:1]))) + data = SMALLER_COLLAGEN def test_reference(self): data = Table.from_numpy(None, [[2, 1, 3], [4, 2, 6]]) @@ -567,10 +567,10 @@ def test_reference_exceptions(self): class TestPCADenoising(unittest.TestCase, TestCommonMixin): preprocessors = [PCADenoising(components=2)] - data = SMALL_COLLAGEN + data = SMALLER_COLLAGEN def test_no_samples(self): - data = Orange.data.Table("iris") + data = self.data proc = PCADenoising() d1 = proc(data[:0]) newdata = data.transform(d1.domain)