add selector between integer and one-hot encoding for categoricals (#76)

pplonski committed Jul 13, 2020
1 parent 818292f commit c44df82
Showing 20 changed files with 79 additions and 33 deletions.
6 changes: 3 additions & 3 deletions examples/scripts/binary_classifier.py
@@ -19,11 +19,11 @@
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

 automl = AutoML(
-    # results_path="AutoML_8",
-    algorithms=["Xgboost"],
+    #results_path="AutoML_11",
+    #algorithms=["Xgboost"],
     # total_time_limit=200,
     # explain_level=0
-    mode="Explain"
+    #mode="Perform"
 )
 automl.fit(X_train, y_train)

21 changes: 21 additions & 0 deletions supervised/preprocessing/encoding_selector.py
@@ -0,0 +1,21 @@
+import pandas as pd
+import numpy as np
+
+from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
+
+
+class EncodingSelector:
+
+    """
+    EncodingSelector object decides which method should be used for categorical encoding.
+    Please keep it fast and simple. Thank you.
+    """
+
+    @staticmethod
+    def get(X, y, column):
+        unique_cnt = len(np.unique(X.loc[~pd.isnull(X[column]), column]))
+        if unique_cnt <= 2 or unique_cnt > 25:
+            return PreprocessingCategorical.CONVERT_INTEGER
+
+        return PreprocessingCategorical.CONVERT_ONE_HOT
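For illustration only (not part of this commit), the new rule can be exercised directly; the toy column names below are invented:

import numpy as np
import pandas as pd

from supervised.preprocessing.encoding_selector import EncodingSelector
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical

X = pd.DataFrame(
    {
        "subscribed": ["yes", "no", "yes", np.nan],  # 2 unique non-null values
        "city": ["Warsaw", "Paris", "Oslo", "Paris"],  # 3 unique values
    }
)
y = pd.Series([0, 1, 0, 1])  # accepted by get() but unused by the current rule

# 2 or fewer (or more than 25) unique values -> integer encoding
assert EncodingSelector.get(X, y, "subscribed") == PreprocessingCategorical.CONVERT_INTEGER
# 3 to 25 unique values -> one-hot encoding
assert EncodingSelector.get(X, y, "city") == PreprocessingCategorical.CONVERT_ONE_HOT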
5 changes: 4 additions & 1 deletion supervised/preprocessing/preprocessing.py
@@ -133,7 +133,10 @@ def fit_and_transform(self, X_train, y_train):
         X_train = missing.transform(X_train)
         self._missing_values += [missing]

-        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
+        for convert_method in [
+            PreprocessingCategorical.CONVERT_INTEGER,
+            PreprocessingCategorical.CONVERT_ONE_HOT,
+        ]:
             cols_to_process = list(
                 filter(
                     lambda k: convert_method in columns_preprocessing[k],
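As context for the loop above, a minimal standalone sketch of the same filtering pattern; the columns_preprocessing mapping and its string labels are invented stand-ins for the real PreprocessingCategorical constants:

# column -> preprocessing steps chosen by the tuner (hypothetical values)
columns_preprocessing = {
    "gender": ["categorical_to_int"],
    "city": ["categorical_to_onehot"],
    "age": ["scale"],
}

for convert_method in ["categorical_to_int", "categorical_to_onehot"]:
    cols_to_process = list(
        filter(lambda k: convert_method in columns_preprocessing[k], columns_preprocessing)
    )
    print(convert_method, cols_to_process)
# prints: categorical_to_int ['gender']
#         categorical_to_onehot ['city']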
2 changes: 2 additions & 0 deletions supervised/tuner/data_info.py
@@ -4,6 +4,7 @@
 from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
 from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues
 from supervised.preprocessing.scale import Scale
+from supervised.preprocessing.encoding_selector import EncodingSelector

 from supervised.algorithms.registry import (
     REGRESSION,
@@ -35,6 +36,7 @@ def compute(X, y, machinelearning_task):
             #
             if PreprocessingUtils.is_categorical(X[col]):
                 columns_info[col] += ["categorical"]
+                columns_info[col] += [EncodingSelector.get(X, y, col)]
             else:
                 # numeric type, check if scale needed
                 if PreprocessingUtils.is_scale_needed(X[col]):
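The net effect of this change: each categorical column's info list now carries the selected encoding next to the "categorical" tag. A hypothetical result (column names invented):

from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical

# Sketch of columns_info after DataInfo.compute: "city" has a handful of
# levels, "user_id" has thousands, so they get different encodings.
columns_info = {
    "city": ["categorical", PreprocessingCategorical.CONVERT_ONE_HOT],
    "user_id": ["categorical", PreprocessingCategorical.CONVERT_INTEGER],
}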
12 changes: 8 additions & 4 deletions supervised/tuner/preprocessing_tuner.py
@@ -45,11 +45,15 @@ def get(required_preprocessing, data_info, machinelearning_task):
         # convert to categorical only for categorical types
         convert_to_integer_will_be_applied = False
         if (
-            "convert_categorical" in required_preprocessing
-            and "categorical" in preprocessing_needed
+            "convert_categorical"
+            in required_preprocessing  # the algorithm needs converted categoricals
+            and "categorical" in preprocessing_needed  # the feature is categorical
         ):
-            preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
-            convert_to_integer_will_be_applied = True
+            if PreprocessingCategorical.CONVERT_ONE_HOT in preprocessing_needed:
+                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_ONE_HOT]
+            else:
+                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
+                convert_to_integer_will_be_applied = True  # maybe scale needed

         if "scale" in required_preprocessing:
             if convert_to_integer_will_be_applied:
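A standalone sketch of the branch above, runnable outside the tuner; the two input lists are invented examples of what data_info and the algorithm's requirements could contain:

from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical

required_preprocessing = ["convert_categorical", "scale"]  # what the algorithm asks for
preprocessing_needed = ["categorical", PreprocessingCategorical.CONVERT_ONE_HOT]  # from data_info

preprocessing_to_apply = []
convert_to_integer_will_be_applied = False
if (
    "convert_categorical" in required_preprocessing
    and "categorical" in preprocessing_needed
):
    if PreprocessingCategorical.CONVERT_ONE_HOT in preprocessing_needed:
        preprocessing_to_apply += [PreprocessingCategorical.CONVERT_ONE_HOT]
    else:
        preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
        convert_to_integer_will_be_applied = True  # integer codes may still need scaling

print(preprocessing_to_apply)  # the one-hot constant is chosen here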
1 change: 1 addition & 0 deletions tests/tests_algorithms/test_catboost.py
@@ -14,6 +14,7 @@

additional["max_rounds"] = 1


class CatBoostRegressorAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
3 changes: 2 additions & 1 deletion tests/tests_algorithms/test_extra_trees.py
@@ -11,7 +11,7 @@
     ExtraTreesAlgorithm,
     ExtraTreesRegressorAlgorithm,
     additional,
-    regression_additional
+    regression_additional,
 )
from supervised.utils.metric import Metric

@@ -22,6 +22,7 @@
additional["max_steps"] = 1
regression_additional["max_steps"] = 1


class ExtraTreesRegressorAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
1 change: 0 additions & 1 deletion tests/tests_algorithms/test_knn.py
@@ -70,4 +70,3 @@ def test_fit_predict(self):
         la.fit(self.X, self.y)
         y_predicted = la.predict(self.X)
         self.assertTrue(metric(self.y, y_predicted) < 0.6)
-
1 change: 1 addition & 0 deletions tests/tests_algorithms/test_lightgbm.py
@@ -13,6 +13,7 @@

additional["max_rounds"] = 1


class LightgbmAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
10 changes: 5 additions & 5 deletions tests/tests_algorithms/test_nn.py
@@ -35,7 +35,7 @@ def setUpClass(cls):
"learning_rate": 0.01,
"momentum": 0.9,
"decay": 0.001,
"ml_task": "binary_classification"
"ml_task": "binary_classification",
}

    def test_fit_predict(self):
@@ -111,7 +111,7 @@ def setUpClass(cls):
"learning_rate": 0.01,
"momentum": 0.9,
"decay": 0.001,
"ml_task": "regression"
"ml_task": "regression",
}

        cls.y = preprocessing.scale(cls.y)
@@ -128,6 +128,7 @@ def test_fit_predict(self):
                self.assertTrue(loss + 0.000001 < loss_prev)
            loss_prev = loss


class MultiClassNeuralNetworkAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -152,13 +153,12 @@ def setUpClass(cls):
"momentum": 0.9,
"decay": 0.001,
"ml_task": "multiclass_classification",
"num_class": 3
"num_class": 3,
}

lb = preprocessing.LabelBinarizer()
lb.fit(cls.y)
cls.y = lb.transform(cls.y)


def test_fit_predict(self):
metric = Metric({"name": "logloss"})
Expand All @@ -170,4 +170,4 @@ def test_fit_predict(self):
             loss = metric(self.y, y_predicted)
             if loss_prev is not None:
                 self.assertTrue(loss + 0.000001 < loss_prev)
-            loss_prev = loss
\ No newline at end of file
+            loss_prev = loss
5 changes: 3 additions & 2 deletions tests/tests_algorithms/test_random_forest.py
@@ -10,8 +10,8 @@
 from supervised.algorithms.random_forest import (
     RandomForestAlgorithm,
     RandomForestRegressorAlgorithm,
-    additional,
-    regression_additional
+    additional,
+    regression_additional,
 )
from supervised.utils.metric import Metric

@@ -22,6 +22,7 @@
additional["max_steps"] = 1
regression_additional["max_steps"] = 1


class RandomForestRegressorAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
2 changes: 1 addition & 1 deletion tests/tests_algorithms/test_registry.py
@@ -18,7 +18,7 @@ class Model1:
"model_params": {},
"required_preprocessing": {},
"additional": {},
"default_params": {}
"default_params": {},
}
AlgorithmsRegistry.add(**model1)

1 change: 1 addition & 0 deletions tests/tests_algorithms/test_xgboost.py
@@ -14,6 +14,7 @@

additional["max_rounds"] = 1


class XgboostAlgorithmTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
14 changes: 7 additions & 7 deletions tests/tests_automl/test_automl_time_constraints.py
@@ -23,25 +23,25 @@ def test_set_total_time_limit(self):
         automl = AutoML(
             results_path=self.automl_dir, total_time_limit=100, algorithms=[model_type]
         )
-        automl._time_spend['simple_algorithms'] = 0
-        automl._time_spend['default_algorithms'] = 0
-        automl._fit_level = 'not_so_random'
+        automl._time_spend["simple_algorithms"] = 0
+        automl._time_spend["default_algorithms"] = 0
+        automl._fit_level = "not_so_random"

         time_spend = 0
         for _ in range(12):
             automl._start_time -= 10
             automl.log_train_time(model_type, 10)
             if automl._enough_time_to_train(model_type):
                 time_spend += 10

         self.assertTrue(time_spend < 100)

     def test_set_model_time_limit(self):
         model_type = "Xgboost"
         automl = AutoML(
             results_path=self.automl_dir, model_time_limit=10, algorithms=[model_type]
         )

         for _ in range(12):
             automl.log_train_time(model_type, 10)
             # should be always true
@@ -55,7 +55,7 @@ def test_set_model_time_limit_omit_total_time(self):
             total_time_limit=10, # this parameter setting should be omitted
             algorithms=[model_type],
         )

         for _ in range(12):
             automl.log_train_time(model_type, 10)
             # should be always true
10 changes: 7 additions & 3 deletions tests/tests_automl/test_explain_levels.py
@@ -10,12 +10,15 @@
from supervised import AutoML

from supervised.algorithms.random_forest import additional

additional["max_steps"] = 1
additional["trees_in_step"] = 1

from supervised.algorithms.xgboost import additional

additional["max_rounds"] = 1


class AutoMLExplainLevelsTest(unittest.TestCase):

automl_dir = "automl_1"
@@ -53,7 +56,9 @@ def test_explain_default(self):

         a.fit(X, y)

-        result_files = os.listdir(os.path.join(self.automl_dir, "1_Default_RandomForest"))
+        result_files = os.listdir(
+            os.path.join(self.automl_dir, "1_Default_RandomForest")
+        )

         # There should be files with:
         # - permutation importance
@@ -241,7 +246,7 @@ def test_build_decision_tree(self):
        a.fit(X, y)

        result_files = os.listdir(os.path.join(self.automl_dir, "1_DecisionTree"))

        # There should be files with:
        # - decision tree visualization
        # - permutation importance
Expand Down Expand Up @@ -284,4 +289,3 @@ def test_build_decision_tree(self):
                 produced = True
                 break
         self.assertTrue(produced)
-
2 changes: 2 additions & 0 deletions tests/tests_automl/test_targets.py
@@ -11,8 +11,10 @@
from supervised.exceptions import AutoMLException

from supervised.algorithms.xgboost import additional

additional["max_rounds"] = 1


class AutoMLTargetsTest(unittest.TestCase):

automl_dir = "automl_tests"
2 changes: 2 additions & 0 deletions tests/tests_automl/test_tuning.py
@@ -11,8 +11,10 @@
from supervised.exceptions import AutoMLException

from supervised.algorithms.xgboost import additional

additional["max_rounds"] = 1


class AutoMLTuningTest(unittest.TestCase):

automl_dir = "automl_tests"
2 changes: 1 addition & 1 deletion tests/tests_preprocessing/test_label_binarizer.py
@@ -190,7 +190,7 @@ def test_inverse_transform(self):
self.assertTrue(np.sum(bb["col2_w"]) == 1)
bb = lb.inverse_transform(bb)
self.assertTrue("col2_w" not in bb.columns)


if __name__ == "__main__":
    unittest.main()
1 change: 0 additions & 1 deletion tests/tests_validation/test_validator_kfold.py
@@ -140,4 +140,3 @@ def test_create_with_target_as_labels(self):
         self.assertEqual(y_train.shape[0], 2)
         self.assertEqual(X_validation.shape[0], 2)
         self.assertEqual(y_validation.shape[0], 2)
-
11 changes: 8 additions & 3 deletions tests/tests_validation/test_validator_split.py
@@ -6,6 +6,7 @@
import os
import shutil


class SplitValidatorTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -21,9 +22,14 @@ def test_create(self):
         data = {
             "train": {
                 "X": pd.DataFrame(
-                    np.array([[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
+                    np.array(
+                        [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
+                    ),
+                    columns=["a", "b"],
                 ),
-                "y": pd.DataFrame(np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]),
+                "y": pd.DataFrame(
+                    np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
+                ),
             }
         }

@@ -136,4 +142,3 @@ def test_create_with_target_as_labels(self):
         self.assertEqual(y_train.shape[0], 2)
         self.assertEqual(X_validation.shape[0], 2)
         self.assertEqual(y_validation.shape[0], 2)
-