add selector between integer and one-hot encoding for categoricals (#76)

pplonski committed Jul 13, 2020
1 parent 818292f commit c44df82
Showing 20 changed files with 79 additions and 33 deletions.
6 changes: 3 additions & 3 deletions examples/scripts/binary_classifier.py
@@ -19,11 +19,11 @@
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

 automl = AutoML(
-    # results_path="AutoML_8",
-    algorithms=["Xgboost"],
+    #results_path="AutoML_11",
+    #algorithms=["Xgboost"],
     # total_time_limit=200,
     # explain_level=0
-    mode="Explain"
+    #mode="Perform"
 )
 automl.fit(X_train, y_train)

21 changes: 21 additions & 0 deletions supervised/preprocessing/encoding_selector.py
@@ -0,0 +1,21 @@
+import pandas as pd
+import numpy as np
+
+from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
+
+
+class EncodingSelector:
+
+    """
+    EncodingSelector object decides which method should be used for categorical encoding.
+    Please keep it fast and simple. Thank you.
+    """
+
+    @staticmethod
+    def get(X, y, column):
+        unique_cnt = len(np.unique(X.loc[~pd.isnull(X[column]), column]))
+        if unique_cnt <= 2 or unique_cnt > 25:
+            return PreprocessingCategorical.CONVERT_INTEGER
+
+        return PreprocessingCategorical.CONVERT_ONE_HOT
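For illustration only (not part of this commit), the new rule can be exercised directly; the toy column names below are invented:

import numpy as np
import pandas as pd

from supervised.preprocessing.encoding_selector import EncodingSelector
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical

X = pd.DataFrame(
    {
        "subscribed": ["yes", "no", "yes", np.nan],  # 2 unique non-null values
        "city": ["Warsaw", "Paris", "Oslo", "Paris"],  # 3 unique values
    }
)
y = pd.Series([0, 1, 0, 1])  # accepted by get() but unused by the current rule

# 2 or fewer (or more than 25) unique values -> integer encoding
assert EncodingSelector.get(X, y, "subscribed") == PreprocessingCategorical.CONVERT_INTEGER
# 3 to 25 unique values -> one-hot encoding
assert EncodingSelector.get(X, y, "city") == PreprocessingCategorical.CONVERT_ONE_HOT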
5 changes: 4 additions & 1 deletion supervised/preprocessing/preprocessing.py
@@ -133,7 +133,10 @@ def fit_and_transform(self, X_train, y_train):
         X_train = missing.transform(X_train)
         self._missing_values += [missing]

-        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
+        for convert_method in [
+            PreprocessingCategorical.CONVERT_INTEGER,
+            PreprocessingCategorical.CONVERT_ONE_HOT,
+        ]:
             cols_to_process = list(
                 filter(
                     lambda k: convert_method in columns_preprocessing[k],
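As context for the loop above, a minimal standalone sketch of the same filtering pattern; the columns_preprocessing mapping and its string labels are invented stand-ins for the real PreprocessingCategorical constants:

# column -> preprocessing steps chosen by the tuner (hypothetical values)
columns_preprocessing = {
    "gender": ["categorical_to_int"],
    "city": ["categorical_to_onehot"],
    "age": ["scale"],
}

for convert_method in ["categorical_to_int", "categorical_to_onehot"]:
    cols_to_process = list(
        filter(lambda k: convert_method in columns_preprocessing[k], columns_preprocessing)
    )
    print(convert_method, cols_to_process)
# prints: categorical_to_int ['gender']
#         categorical_to_onehot ['city']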
2 changes: 2 additions & 0 deletions supervised/tuner/data_info.py
@@ -4,6 +4,7 @@
 from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
 from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues
 from supervised.preprocessing.scale import Scale
+from supervised.preprocessing.encoding_selector import EncodingSelector

 from supervised.algorithms.registry import (
     REGRESSION,
@@ -35,6 +36,7 @@ def compute(X, y, machinelearning_task):
             #
             if PreprocessingUtils.is_categorical(X[col]):
                 columns_info[col] += ["categorical"]
+                columns_info[col] += [EncodingSelector.get(X, y, col)]
             else:
                 # numeric type, check if scale needed
                 if PreprocessingUtils.is_scale_needed(X[col]):
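The net effect of this change: each categorical column's info list now carries the selected encoding next to the "categorical" tag. A hypothetical result (column names invented):

from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical

# Sketch of columns_info after DataInfo.compute: "city" has a handful of
# levels, "user_id" has thousands, so they get different encodings.
columns_info = {
    "city": ["categorical", PreprocessingCategorical.CONVERT_ONE_HOT],
    "user_id": ["categorical", PreprocessingCategorical.CONVERT_INTEGER],
}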
12 changes: 8 additions & 4 deletions supervised/tuner/preprocessing_tuner.py
@@ -45,11 +45,15 @@ def get(required_preprocessing, data_info, machinelearning_task):
         # convert to categorical only for categorical types
         convert_to_integer_will_be_applied = False
         if (
-            "convert_categorical" in required_preprocessing
-            and "categorical" in preprocessing_needed
+            "convert_categorical"
+            in required_preprocessing  # the algorithm needs converted categoricals
+            and "categorical" in preprocessing_needed  # the feature is categorical
         ):
-            preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
-            convert_to_integer_will_be_applied = True
+            if PreprocessingCategorical.CONVERT_ONE_HOT in preprocessing_needed:
+                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_ONE_HOT]
+            else:
+                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
+                convert_to_integer_will_be_applied = True  # maybe scale needed

         if "scale" in required_preprocessing:
             if convert_to_integer_will_be_applied:
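A standalone sketch of the branch above, runnable outside the tuner; the two input lists are invented examples of what data_info and the algorithm's requirements could contain:

from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical

required_preprocessing = ["convert_categorical", "scale"]  # what the algorithm asks for
preprocessing_needed = ["categorical", PreprocessingCategorical.CONVERT_ONE_HOT]  # from data_info

preprocessing_to_apply = []
convert_to_integer_will_be_applied = False
if (
    "convert_categorical" in required_preprocessing
    and "categorical" in preprocessing_needed
):
    if PreprocessingCategorical.CONVERT_ONE_HOT in preprocessing_needed:
        preprocessing_to_apply += [PreprocessingCategorical.CONVERT_ONE_HOT]
    else:
        preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
        convert_to_integer_will_be_applied = True  # integer codes may still need scaling

print(preprocessing_to_apply)  # the one-hot constant is chosen here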
1 change: 1 addition & 0 deletions tests/tests_algorithms/test_catboost.py
@@ -14,6 +14,7 @@

additional["max_rounds"] = 1


class CatBoostRegressorAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
3 changes: 2 additions & 1 deletion tests/tests_algorithms/test_extra_trees.py
@@ -11,7 +11,7 @@
     ExtraTreesAlgorithm,
     ExtraTreesRegressorAlgorithm,
     additional,
-    regression_additional
+    regression_additional,
 )
from supervised.utils.metric import Metric

@@ -22,6 +22,7 @@
additional["max_steps"] = 1
regression_additional["max_steps"] = 1


class ExtraTreesRegressorAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
1 change: 0 additions & 1 deletion tests/tests_algorithms/test_knn.py
@@ -70,4 +70,3 @@ def test_fit_predict(self):
         la.fit(self.X, self.y)
         y_predicted = la.predict(self.X)
         self.assertTrue(metric(self.y, y_predicted) < 0.6)
-
1 change: 1 addition & 0 deletions tests/tests_algorithms/test_lightgbm.py
@@ -13,6 +13,7 @@

additional["max_rounds"] = 1


class LightgbmAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
10 changes: 5 additions & 5 deletions tests/tests_algorithms/test_nn.py
@@ -35,7 +35,7 @@ def setUpClass(cls):
"learning_rate": 0.01,
"momentum": 0.9,
"decay": 0.001,
"ml_task": "binary_classification"
"ml_task": "binary_classification",
}

    def test_fit_predict(self):
@@ -111,7 +111,7 @@ def setUpClass(cls):
"learning_rate": 0.01,
"momentum": 0.9,
"decay": 0.001,
"ml_task": "regression"
"ml_task": "regression",
}

        cls.y = preprocessing.scale(cls.y)
@@ -128,6 +128,7 @@ def test_fit_predict(self):
                self.assertTrue(loss + 0.000001 < loss_prev)
            loss_prev = loss


class MultiClassNeuralNetworkAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -152,13 +153,12 @@ def setUpClass(cls):
"momentum": 0.9,
"decay": 0.001,
"ml_task": "multiclass_classification",
"num_class": 3
"num_class": 3,
}

lb = preprocessing.LabelBinarizer()
lb.fit(cls.y)
cls.y = lb.transform(cls.y)


def test_fit_predict(self):
metric = Metric({"name": "logloss"})
Expand All @@ -170,4 +170,4 @@ def test_fit_predict(self):
             loss = metric(self.y, y_predicted)
             if loss_prev is not None:
                 self.assertTrue(loss + 0.000001 < loss_prev)
-            loss_prev = loss
\ No newline at end of file
+            loss_prev = loss
5 changes: 3 additions & 2 deletions tests/tests_algorithms/test_random_forest.py
@@ -10,8 +10,8 @@
 from supervised.algorithms.random_forest import (
     RandomForestAlgorithm,
     RandomForestRegressorAlgorithm,
-    additional,
-    regression_additional
+    additional,
+    regression_additional,
 )
from supervised.utils.metric import Metric

@@ -22,6 +22,7 @@
additional["max_steps"] = 1
regression_additional["max_steps"] = 1


class RandomForestRegressorAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
2 changes: 1 addition & 1 deletion tests/tests_algorithms/test_registry.py
@@ -18,7 +18,7 @@ class Model1:
"model_params": {},
"required_preprocessing": {},
"additional": {},
"default_params": {}
"default_params": {},
}
AlgorithmsRegistry.add(**model1)

1 change: 1 addition & 0 deletions tests/tests_algorithms/test_xgboost.py
@@ -14,6 +14,7 @@

additional["max_rounds"] = 1


class XgboostAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
14 changes: 7 additions & 7 deletions tests/tests_automl/test_automl_time_constraints.py
@@ -23,25 +23,25 @@ def test_set_total_time_limit(self):
         automl = AutoML(
             results_path=self.automl_dir, total_time_limit=100, algorithms=[model_type]
         )
-        automl._time_spend['simple_algorithms'] = 0
-        automl._time_spend['default_algorithms'] = 0
-        automl._fit_level = 'not_so_random'
+        automl._time_spend["simple_algorithms"] = 0
+        automl._time_spend["default_algorithms"] = 0
+        automl._fit_level = "not_so_random"

         time_spend = 0
         for _ in range(12):
             automl._start_time -= 10
             automl.log_train_time(model_type, 10)
             if automl._enough_time_to_train(model_type):
                 time_spend += 10

         self.assertTrue(time_spend < 100)

     def test_set_model_time_limit(self):
         model_type = "Xgboost"
         automl = AutoML(
             results_path=self.automl_dir, model_time_limit=10, algorithms=[model_type]
         )

         for _ in range(12):
             automl.log_train_time(model_type, 10)
             # should be always true
@@ -55,7 +55,7 @@ def test_set_model_time_limit_omit_total_time(self):
             total_time_limit=10, # this parameter setting should be omitted
             algorithms=[model_type],
         )

         for _ in range(12):
             automl.log_train_time(model_type, 10)
             # should be always true
10 changes: 7 additions & 3 deletions tests/tests_automl/test_explain_levels.py
@@ -10,12 +10,15 @@
from supervised import AutoML

from supervised.algorithms.random_forest import additional

additional["max_steps"] = 1
additional["trees_in_step"] = 1

from supervised.algorithms.xgboost import additional

additional["max_rounds"] = 1


class AutoMLExplainLevelsTest(unittest.TestCase):

automl_dir = "automl_1"
@@ -53,7 +56,9 @@ def test_explain_default(self):

         a.fit(X, y)

-        result_files = os.listdir(os.path.join(self.automl_dir, "1_Default_RandomForest"))
+        result_files = os.listdir(
+            os.path.join(self.automl_dir, "1_Default_RandomForest")
+        )

         # There should be files with:
         # - permutation importance
@@ -241,7 +246,7 @@ def test_build_decision_tree(self):
        a.fit(X, y)

        result_files = os.listdir(os.path.join(self.automl_dir, "1_DecisionTree"))

        # There should be files with:
        # - decision tree visualization
        # - permutation importance
Expand Down Expand Up @@ -284,4 +289,3 @@ def test_build_decision_tree(self):
                 produced = True
                 break
         self.assertTrue(produced)
-
2 changes: 2 additions & 0 deletions tests/tests_automl/test_targets.py
@@ -11,8 +11,10 @@
from supervised.exceptions import AutoMLException

from supervised.algorithms.xgboost import additional

additional["max_rounds"] = 1


class AutoMLTargetsTest(unittest.TestCase):

automl_dir = "automl_tests"
2 changes: 2 additions & 0 deletions tests/tests_automl/test_tuning.py
@@ -11,8 +11,10 @@
from supervised.exceptions import AutoMLException

from supervised.algorithms.xgboost import additional

additional["max_rounds"] = 1


class AutoMLTuningTest(unittest.TestCase):

automl_dir = "automl_tests"
2 changes: 1 addition & 1 deletion tests/tests_preprocessing/test_label_binarizer.py
@@ -190,7 +190,7 @@ def test_inverse_transform(self):
self.assertTrue(np.sum(bb["col2_w"]) == 1)
bb = lb.inverse_transform(bb)
self.assertTrue("col2_w" not in bb.columns)


if __name__ == "__main__":
    unittest.main()
1 change: 0 additions & 1 deletion tests/tests_validation/test_validator_kfold.py
@@ -140,4 +140,3 @@ def test_create_with_target_as_labels(self):
         self.assertEqual(y_train.shape[0], 2)
         self.assertEqual(X_validation.shape[0], 2)
         self.assertEqual(y_validation.shape[0], 2)
-
11 changes: 8 additions & 3 deletions tests/tests_validation/test_validator_split.py
@@ -6,6 +6,7 @@
import os
import shutil


class SplitValidatorTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -21,9 +22,14 @@ def test_create(self):
         data = {
             "train": {
                 "X": pd.DataFrame(
-                    np.array([[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
+                    np.array(
+                        [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
+                    ),
+                    columns=["a", "b"],
                 ),
-                "y": pd.DataFrame(np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]),
+                "y": pd.DataFrame(
+                    np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
+                ),
             }
         }

@@ -136,4 +142,3 @@ def test_create_with_target_as_labels(self):
         self.assertEqual(y_train.shape[0], 2)
         self.assertEqual(X_validation.shape[0], 2)
         self.assertEqual(y_validation.shape[0], 2)
-