Merge remote-tracking branch 'origin/feature/gui' into feature/gui

# Conflicts:
#	demo/DemoClassification.ipynb
#	gui.py
#	psyke/__init__.py
#	psyke/clustering/__init__.py
#	psyke/clustering/exact/__init__.py
#	psyke/extraction/hypercubic/__init__.py
#	psyke/extraction/hypercubic/gridex/__init__.py
#	psyke/extraction/real/__init__.py
#	requirements.txt
#	test/psyke/classification/real/test_real.py
#	test/psyke/regression/gridex/test_gridex.py
#	test/psyke/regression/iter/test_iter.py
Author: psykei · Date: Aug 10, 2022
Commit: 2f8c1c5
Parents: 04bc402 + e2f8c2c (merge commit)
Show file tree

Hide file tree

Showing 12 changed files with 320 additions and 331 deletions.
diff --git a/demo/DemoClassification.ipynb b/demo/DemoClassification.ipynb
diff --git a/gui.py b/gui.py
@@ -153,7 +153,7 @@ def select_dataset(self, widget):
         print(f'Loading {dataset}... ', end='')
         if dataset == 'Iris':
             x, y = load_iris(return_X_y=True, as_frame=True)
-            self.data = (x, y.replace({0: 'setosa', 1: 'versicolor', 2: 'virginica'}))
+            self.data = (x, y.replace({0: 'setosa', 1: 'virginica', 2: 'versicolor'}))
         elif dataset == 'Wine':
             self.data = load_wine(return_X_y=True, as_frame=True)
         elif dataset == "House":

diff --git a/psyke/__init__.py b/psyke/__init__.py
@@ -1,15 +1,16 @@
 from __future__ import annotations
+
 import numpy as np
 import pandas as pd
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score, accuracy_score
 
+import psyke
 from psyke.schema import DiscreteFeature
 from psyke.utils import get_default_random_seed
 from tuprolog.theory import Theory
 from typing import Iterable
 import logging
 
-
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger('psyke')
 
@@ -56,7 +57,7 @@ def mae(self, dataframe: pd.DataFrame, predictor=None) -> float:
         :return: the mean absolute error (MAE) of the predictions.
         """
         predictions = np.array(self.predict(dataframe.iloc[:, :-1]))
-        idx = [prediction is not None for prediction in predictions]
+        idx = ~np.isnan(predictions)
         return mean_absolute_error(dataframe.iloc[idx, -1] if predictor is None else
                                    predictor.predict(dataframe.iloc[idx, :-1]).flatten(),
                                    predictions[idx])
@@ -70,7 +71,7 @@ def mse(self, dataframe: pd.DataFrame, predictor=None) -> float:
         :return: the mean squared error (MSE) of the predictions.
         """
         predictions = np.array(self.predict(dataframe.iloc[:, :-1]))
-        idx = [prediction is not None for prediction in predictions]
+        idx = ~np.isnan(predictions)
         return mean_squared_error(dataframe.iloc[idx, -1] if predictor is None else
                                   predictor.predict(dataframe.iloc[idx, :-1]).flatten(),
                                   predictions[idx])
@@ -84,7 +85,7 @@ def r2(self, dataframe: pd.DataFrame, predictor=None) -> float:
         :return: the R2 score of the predictions.
         """
         predictions = np.array(self.predict(dataframe.iloc[:, :-1]))
-        idx = [prediction is not None for prediction in predictions]
+        idx = ~np.isnan(predictions)
         return r2_score(dataframe.iloc[idx, -1] if predictor is None else
                         predictor.predict(dataframe.iloc[idx, :-1]).flatten(),
                         predictions[idx])
@@ -98,10 +99,9 @@ def accuracy(self, dataframe: pd.DataFrame, predictor=None) -> float:
         :return: the accuracy classification score of the predictions.
         """
         predictions = np.array(self.predict(dataframe.iloc[:, :-1]))
-        idx = [prediction is not None for prediction in predictions]
-        return accuracy_score(dataframe.iloc[idx, -1] if predictor is None else
-                              predictor.predict(dataframe.iloc[idx, :-1]).flatten(),
-                              predictions[idx])
+        return accuracy_score(dataframe.iloc[:, -1] if predictor is None else
+                              predictor.predict(dataframe.iloc[:, :-1]).flatten(),
+                              predictions)
 
     def f1(self, dataframe: pd.DataFrame, predictor=None) -> float:
         """
@@ -112,43 +112,26 @@ def f1(self, dataframe: pd.DataFrame, predictor=None) -> float:
         :return: the F1 score of the predictions.
         """
         predictions = np.array(self.predict(dataframe.iloc[:, :-1]))
-        idx = [prediction is not None for prediction in predictions]
-        return f1_score(dataframe.iloc[idx, -1] if predictor is None else
-                        predictor.predict(dataframe.iloc[idx, :-1]).flatten(),
-                        predictions[idx])
-
-    @staticmethod
-    def exact(depth: int, error_threshold: float, output, gauss_components: int = 2):
-        """
-        Creates a new ExACT instance.
-        """
-        from psyke.clustering.exact import ExACT
-        return ExACT(depth, error_threshold, output, gauss_components)
+        return f1_score(dataframe.iloc[:, -1] if predictor is None else
+                        predictor.predict(dataframe.iloc[:, :-1]).flatten(),
+                        predictions)
 
     @staticmethod
-    def cream(depth: int, error_threshold: float, output, gauss_components: int = 2):
-        """
-        Creates a new CREAM instance.
-        """
-        from psyke.clustering.cream import CREAM
-        return CREAM(depth, error_threshold, output, gauss_components)
-
-    @staticmethod
-    def cart(predictor, max_depth: int = 3, max_leaves: int = 3,
-             discretization: Iterable[DiscreteFeature] = None, simplify: bool = True) -> Extractor:
+    def cart(predictor: psyke.cart.CartPredictor, discretization: Iterable[DiscreteFeature] = None,
+             simplify: bool = True) -> Extractor:
         """
         Creates a new Cart extractor.
         """
-        from psyke.extraction.cart import Cart
-        return Cart(predictor, max_depth, max_leaves, discretization=discretization, simplify=simplify)
+        from psyke.cart import Cart
+        return Cart(predictor, discretization, simplify)
 
     @staticmethod
     def iter(predictor, min_update: float = 0.1, n_points: int = 1, max_iterations: int = 600, min_examples: int = 250,
              threshold: float = 0.1, fill_gaps: bool = True, seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new ITER extractor.
         """
-        from psyke.extraction.hypercubic.iter import ITER
+        from psyke.regression.iter import ITER
         return ITER(predictor, min_update, n_points, max_iterations, min_examples, threshold, fill_gaps, seed)
 
     @staticmethod
@@ -157,7 +140,7 @@ def gridex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
         """
         Creates a new GridEx extractor.
         """
-        from psyke.extraction.hypercubic.gridex import GridEx
+        from psyke.regression.gridex import GridEx
         return GridEx(predictor, grid, min_examples, threshold, seed)
 
     @staticmethod
@@ -166,33 +149,31 @@ def gridrex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
         """
         Creates a new GridREx extractor.
         """
-        from psyke.extraction.hypercubic.gridrex import GridREx
+        from psyke.regression.gridrex import GridREx
         return GridREx(predictor, grid, min_examples, threshold, seed)
 
     @staticmethod
-    def creepy(predictor, depth: int, error_threshold: float, output, gauss_components: int = 2,
-               ranks: [(str, float)] = [], ignore_threshold: float = 0.0) -> Extractor:
+    def cream(predictor, depth: int, error_threshold: float, output, gauss_components: int = 10) -> Extractor:
         """
-        Creates a new CReEPy extractor.
+        Creates a new CREAM extractor.
         """
-        from psyke.extraction.hypercubic.creepy import CReEPy
-        return CReEPy(predictor, depth, error_threshold, output, gauss_components, ranks, ignore_threshold)
+        from psyke.clustering.cream import CREAM
+        return CREAM(predictor, depth, error_threshold, output, gauss_components)
 
     @staticmethod
-    def orchid(predictor, depth: int, error_threshold: float, output, gauss_components: int = 2,
-               ranks: [(str, float)] = [], ignore_threshold: float = 0.0) -> Extractor:
+    def creepy(predictor, depth: int, error_threshold: float, output, gauss_components: int = 10) -> Extractor:
         """
-        Creates a new ORCHiD extractor.
+        Creates a new CReEPy extractor.
         """
-        from psyke.extraction.hypercubic.orchid import ORCHiD
-        return ORCHiD(predictor, depth, error_threshold, output, gauss_components, ranks, ignore_threshold)
+        from psyke.clustering.creepy import CReEPy
+        return CReEPy(predictor, depth, error_threshold, output, gauss_components)
 
     @staticmethod
     def real(predictor, discretization=None) -> Extractor:
         """
         Creates a new REAL extractor.
         """
-        from psyke.extraction.real import REAL
+        from psyke.classification.real import REAL
         return REAL(predictor, [] if discretization is None else discretization)
 
     @staticmethod
@@ -201,7 +182,7 @@ def trepan(predictor, discretization=None, min_examples: int = 0, max_depth: int
         """
         Creates a new Trepan extractor.
         """
-        from psyke.extraction.trepan import Trepan, SplitLogic
+        from psyke.classification.trepan import Trepan, SplitLogic
         if split_logic is None:
             split_logic = SplitLogic.DEFAULT
         return Trepan(predictor, [] if discretization is None else discretization, min_examples, max_depth, split_logic)
diff --git a/psyke/clustering/__init__.py b/psyke/clustering/__init__.py
@@ -3,25 +3,26 @@
 import pandas as pd
 from tuprolog.theory import Theory
 
-from psyke.extraction.hypercubic.hypercube import ClosedRegressionCube, ClosedClassificationCube, ClosedCube
-from psyke.utils import Target
+from psyke.regression import HyperCubeExtractor
+from psyke.regression.hypercube import ClosedClassificationCube, ClosedCube, ClosedRegressionCube
 
 
-class InterpretableClustering:
+class ClusterExtractor(HyperCubeExtractor):
 
-    def __init__(self, depth: int, error_threshold: float, output: Target = Target.CONSTANT, gauss_components: int = 2):
+    def __init__(self, predictor, depth: int, error_threshold: float,
+                 output: HyperCubeExtractor.Target = HyperCubeExtractor.Target.CONSTANT, gauss_components: int = 2):
+        super().__init__(predictor)
         self.depth = depth
         self.error_threshold = error_threshold
         self.gauss_components = gauss_components
-        self._output = output
-        self._hypercubes = []
+        self.output = output
 
     def extract(self, dataframe: pd.DataFrame) -> Theory:
         raise NotImplementedError('extract')
 
     def _default_cube(self) -> Union[ClosedCube, ClosedRegressionCube, ClosedClassificationCube]:
-        if self._output == Target.CONSTANT:
+        if self.output == ClusterExtractor.Target.CONSTANT:
             return ClosedCube()
-        if self._output == Target.REGRESSION:
+        if self.output == ClusterExtractor.Target.REGRESSION:
             return ClosedRegressionCube()
         return ClosedClassificationCube()
diff --git a/psyke/clustering/exact/__init__.py b/psyke/clustering/exact/__init__.py
@@ -6,23 +6,26 @@
 import numpy as np
 import pandas as pd
 from sklearn.cluster import DBSCAN
-from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
-
-from psyke.clustering import InterpretableClustering
-from psyke.extraction.hypercubic import Node, ClosedCube, HyperCube
+from tuprolog.theory import Theory
+from psyke.clustering import ClusterExtractor
+from psyke.regression import Node, ClosedCube, HyperCube
 from psyke.clustering.utils import select_gaussian_mixture, select_dbscan_epsilon
-from psyke.utils import Target
 
 
-class ExACT(InterpretableClustering):
+class CReEPy(ClusterExtractor):
     """
-    Explanator implementing ExACT algorithm.
+    Explanator implementing CReEPy algorithm.
     """
 
-    def __init__(self, depth: int, error_threshold: float, output: Target = Target.CONSTANT, gauss_components: int = 5):
-        super().__init__(depth, error_threshold, output, gauss_components)
-        self._predictor = KNeighborsClassifier() if output == Target.CLASSIFICATION else KNeighborsRegressor()
-        self._predictor.n_neighbors = 1
+    def __init__(self, predictor, depth: int, error_threshold: float,
+                 output: ClusterExtractor.Target = ClusterExtractor.Target.CONSTANT, gauss_components: int = 5):
+        super().__init__(predictor, depth, error_threshold, output, gauss_components)
+
+    def _split(self, right: ClosedCube, outer_cube: ClosedCube, data: pd.DataFrame, indices: np.ndarray):
+        right.update(data.iloc[indices], self.predictor)
+        left = outer_cube.copy()
+        left.update(data.iloc[~indices], self.predictor)
+        return right, left
 
     def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int):
         cubes = []
@@ -42,25 +45,17 @@ def _indices(cube: ClosedCube, data: pd.DataFrame) -> np.ndarray | None:
         return indices
 
     def _create_cube(self, dataframe: pd.DataFrame, clusters: int) -> ClosedCube:
-        data = ExACT._remove_string_label(dataframe)
+        data = CReEPy._remove_string_label(dataframe)
         dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1])
         return HyperCube.create_surrounding_cube(
             dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])],
-            True, self._output
+            True, self.output
         )
 
-    def extract(self, dataframe: pd.DataFrame) -> Iterable[HyperCube]:
-        self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1])
+    def extract(self, dataframe: pd.DataFrame) -> Theory:
         self._hypercubes = \
-            self._iterate(Node(dataframe, HyperCube.create_surrounding_cube(dataframe, True, self._output)))
-        return list(self._hypercubes)
-
-    def print(self):
-        for cube in self._hypercubes:
-            print(f'Output is {cube.output} if:')
-            for feature in cube.dimensions:
-                lower, upper = cube[feature]
-                print(f'    {feature} is in [{lower:.2f}, {upper:.2f}]')
+            self._iterate(Node(dataframe, HyperCube.create_surrounding_cube(dataframe, True, self.output)))
+        return self._create_theory(dataframe)
 
     @staticmethod
     def _remove_string_label(dataframe: pd.DataFrame):
@@ -73,7 +68,7 @@ def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         while len(to_split) > 0:
             to_split.sort(reverse=True)
             (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
+            data = CReEPy._remove_string_label(node.dataframe)
             gauss_params = select_gaussian_mixture(data, self.gauss_components)
             gauss_pred = gauss_params[2].predict(data)
             cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
@@ -83,9 +78,9 @@ def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
                 continue
             _, _, _, indices, cube = max(cubes)
 
-            cube.update(node.dataframe[indices], self._predictor)
+            cube.update(node.dataframe[indices], self.predictor)
             node.right = Node(node.dataframe[indices], cube)
-            node.cube.update(node.dataframe[~indices], self._predictor)
+            node.cube.update(node.dataframe[~indices], self.predictor)
             node.left = Node(node.dataframe[~indices], node.cube)
 
             if depth < self.depth and cube.diversity > self.error_threshold:

diff --git a/psyke/extraction/hypercubic/__init__.py b/psyke/extraction/hypercubic/__init__.py
@@ -1,4 +1,6 @@
 from __future__ import annotations
+
+from enum import Enum
 from typing import Iterable
 import numpy as np
 import pandas as pd
@@ -7,18 +9,23 @@
 from tuprolog.core import Var, Struct, clause
 from tuprolog.theory import Theory, mutable_theory
 from psyke import Extractor, logger
-from psyke.extraction.hypercubic.hypercube import HyperCube, RegressionCube, ClassificationCube, ClosedCube
+from psyke.regression.strategy import FixedStrategy, Strategy
+from psyke.regression.utils import Limit, MinUpdate, ZippedDimension, Expansion
 from psyke.utils.logic import create_variable_list, create_head, to_var
-from psyke.utils import Target, get_int_precision
-from psyke.extraction.hypercubic.strategy import Strategy, FixedStrategy
+from psyke.regression.hypercube import HyperCube, ClosedCube, RegressionCube, ClosedRegressionCube, ClassificationCube
 
 
 class HyperCubeExtractor(Extractor):
 
+    class Target(Enum):
+        CLASSIFICATION = 1,
+        CONSTANT = 2,
+        REGRESSION = 3
+
     def __init__(self, predictor):
         super().__init__(predictor)
         self._hypercubes = []
-        self._output = Target.CONSTANT
+        self.output = HyperCubeExtractor.Target.CONSTANT
 
     def extract(self, dataframe: pd.DataFrame) -> Theory:
         raise NotImplementedError('extract')
@@ -29,17 +36,14 @@ def predict(self, dataframe: pd.DataFrame) -> Iterable:
     def _predict(self, data: dict[str, float]) -> float | None:
         data = {k: v for k, v in data.items()}
         for cube in self._hypercubes:
-            if cube.__contains__(data):
-                if self._output == Target.CLASSIFICATION:
-                    return HyperCubeExtractor._get_cube_output(cube, data)
-                else:
-                    return round(HyperCubeExtractor._get_cube_output(cube, data), get_int_precision())
+            if data in cube:
+                return HyperCubeExtractor._get_cube_output(cube, data)
         return None
 
     def _default_cube(self) -> HyperCube | RegressionCube | ClassificationCube:
-        if self._output == Target.CONSTANT:
+        if self.output == HyperCubeExtractor.Target.CONSTANT:
             return HyperCube()
-        if self._output == Target.REGRESSION:
+        if self.output == HyperCubeExtractor.Target.REGRESSION:
             return RegressionCube()
         return ClassificationCube()
 
@@ -49,7 +53,7 @@ def _get_cube_output(cube: HyperCube | RegressionCube, data: dict[str, float]) -
             isinstance(cube, RegressionCube) else cube.output
 
     @staticmethod
-    def _create_head(dataframe: pd.DataFrame, variables: list[Var], output: float | LinearRegression) -> Struct:
+    def __create_head(dataframe: pd.DataFrame, variables: list[Var], output: float | LinearRegression) -> Struct:
         return create_head(dataframe.columns[-1], variables[:-1], output) \
             if not isinstance(output, LinearRegression) else \
             create_head(dataframe.columns[-1], variables[:-1], variables[-1])
@@ -59,13 +63,14 @@ def _ignore_dimensions(self) -> Iterable[str]:
 
     def _create_theory(self, dataframe: pd.DataFrame) -> Theory:
         new_theory = mutable_theory()
+        ignore_dimensions = self._ignore_dimensions()
         for cube in self._hypercubes:
             logger.info(cube.output)
             logger.info(cube.dimensions)
             variables = create_variable_list([], dataframe)
             variables[dataframe.columns[-1]] = to_var(dataframe.columns[-1])
-            head = HyperCubeExtractor._create_head(dataframe, list(variables.values()), cube.output)
-            body = cube.body(variables, self._ignore_dimensions())
+            head = HyperCubeExtractor.__create_head(dataframe, list(variables.values()), cube.output)
+            body = cube.body(variables, ignore_dimensions)
             new_theory.assertZ(clause(head, body))
         return new_theory