Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the SuperVectorizer and dirty_cat's encoders to the search space #169

Open
wants to merge 8 commits into
base: typed_data_terminals
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 2 additions & 14 deletions gama/configuration/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
MinMaxScaler,
Normalizer,
PolynomialFeatures,
RobustScaler,
StandardScaler,
Binarizer,
)
from sklearn.kernel_approximation import Nystroem, RBFSampler
Expand All @@ -29,23 +27,15 @@
VarianceThreshold,
)


class SuperEncoder(StandardScaler):
# For testing purposes only
pass
from .preprocessing import preproc_conf


# For comparison, this selection of operators and hyperparameters is
# currently most of what TPOT supports.

clf_config = {
"data": ["data"],
SuperEncoder: {
"_input": "data",
"_output": "numeric_data",
"with_std": [True, False],
"with_mean": [True, False],
},
**preproc_conf,
"alpha": [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0],
"fit_prior": [True, False],
"min_samples_split": range(2, 21),
Expand Down Expand Up @@ -147,8 +137,6 @@ class SuperEncoder(StandardScaler):
"interaction_only": [False],
},
RBFSampler: {"gamma": np.arange(0.0, 1.01, 0.05)},
RobustScaler: {},
StandardScaler: {},
# Selectors
SelectFwe: {"alpha": np.arange(0, 0.05, 0.001), "score_func": {f_classif: None}},
SelectPercentile: {"percentile": range(1, 100), "score_func": {f_classif: None}},
Expand Down
32 changes: 30 additions & 2 deletions gama/configuration/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,33 @@ def pset_from_config(
if isinstance(param_values, list) and not param_values:
# An empty list indicates a shared hyperparameter
hyperparameter_types.append(name)
elif isinstance(param_values, dict) \
and all([isinstance(_k, type) for _k in param_values.keys()]):
for sub_key, sub_hyperparameters in param_values.items():
sub_hps_for_encoder = []

for enc_param, sub_hyperparams in sub_hyperparameters.items():
hp_name = f"{key.__name__}.{name}.{sub_key.__name__}.{enc_param}"
sub_hps_for_encoder.append(hp_name)
for sub_param_value in sub_hyperparams:
pset[hp_name].append(
Terminal(
value=sub_param_value,
output=enc_param,
identifier=hp_name,
)
)
hp_name = f"{key.__name__}.{name}"
if hp_name not in hyperparameter_types:
hyperparameter_types.append(hp_name)
pset[hp_name].append(
Primitive(
input=tuple(sub_hps_for_encoder),
output=name,
identifier=sub_key,
data_input="dont_remove",
)
)
elif name == "param_check":
# This allows users to define illegal hyperparameter combinations,
# but is not a terminal.
Expand Down Expand Up @@ -192,12 +219,13 @@ def remove_primitives_with_unreachable_input(
for return_type, prims_and_terms in pset.items():
for pt in prims_and_terms:
if isinstance(pt, Primitive) and pt.data_input not in reachability:
print(pt)
pass
#print(pt) # FIXME
return {
return_type: [
pt
for pt in prims_and_terms
if not (isinstance(pt, Primitive) and pt.data_input not in reachability)
if not (isinstance(pt, Primitive) and pt.data_input not in reachability and pt.data_input != "dont_remove")
]
for return_type, prims_and_terms in pset.items()
}
48 changes: 48 additions & 0 deletions gama/configuration/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from dirty_cat import (
SuperVectorizer,
SimilarityEncoder,
GapEncoder,
MinHashEncoder,
)
from sklearn.preprocessing import (
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
RobustScaler,
)


# Search-space configuration for the dirty_cat SuperVectorizer preprocessing
# step. Top-level key is the estimator class; string keys are hyperparameters
# whose list values enumerate the candidate settings, and nested class-keyed
# dicts declare sub-estimators (with their own hyperparameter grids) that can
# be plugged into the corresponding SuperVectorizer parameter.
preproc_conf = {
    SuperVectorizer: {
        # "_input" marks the pipeline input type this step consumes
        # (consumed by the config parser, not passed to the estimator).
        "_input": "data",
        "impute_missing": ["force"],
        # Columns with more unique values than this threshold are treated as
        # high-cardinality categorical features.
        "cardinality_threshold": [20, 40, 60],
        # Candidate encoders for low-cardinality categorical columns.
        "low_card_cat_transformer": {
            OneHotEncoder: {
                "handle_unknown": ["ignore"],
            },
        },
        # Candidate encoders for high-cardinality categorical columns.
        "high_card_cat_transformer": {
            OrdinalEncoder: {
                "categories": ["auto"],
                # Map categories unseen during fit to -1 instead of raising.
                "handle_unknown": ["use_encoded_value"],
                "unknown_value": [-1],
                # NOTE(review): `encoded_missing_value` requires
                # scikit-learn >= 1.1 — confirm the pinned sklearn version.
                "encoded_missing_value": [-2],
            },
            SimilarityEncoder: {
                "n_prototypes": [10, 25, 50, 100],
            },
            GapEncoder: {
                "analyzer": ["word", "char", "char_wb"],
            },
            MinHashEncoder: {
                "n_components": [10, 30, 50, 100],
                "hashing": ["fast", "murmur"],
            },
        },
        # Candidate scalers for numerical columns; empty grids mean the
        # estimator is used with its default hyperparameters.
        "numerical_transformer": {
            RobustScaler: {},
            StandardScaler: {},
        }
    },
}
8 changes: 5 additions & 3 deletions gama/configuration/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,15 @@
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR

from .preprocessing import preproc_conf


# For comparison, this selection of operators and hyperparameters is
# currently most of what TPOT supports.

reg_config = {
"numeric_data": ["data"],
"data": ["data"],
**preproc_conf,
ElasticNetCV: {
"l1_ratio": np.arange(0.0, 1.01, 0.05),
"tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
Expand Down Expand Up @@ -125,8 +129,6 @@
"interaction_only": [False],
},
RBFSampler: {"gamma": np.arange(0.0, 1.01, 0.05)},
RobustScaler: {},
StandardScaler: {},
# Selectors
SelectFwe: {"alpha": np.arange(0, 0.05, 0.001), "score_func": {f_regression: None}},
SelectPercentile: {"percentile": range(1, 100), "score_func": {f_regression: None}},
Expand Down
18 changes: 3 additions & 15 deletions gama/gama.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from gama.utilities.generic.timekeeper import TimeKeeper
from gama.logging.utility_functions import register_stream_log
from gama.utilities.preprocessing import (
basic_encoding,
basic_pipeline_extension,
)
from gama.genetic_programming.mutation import random_valid_mutation_in_place
Expand Down Expand Up @@ -255,7 +254,6 @@ def __init__(

self._x: Optional[pd.DataFrame] = None
self._y: Optional[pd.DataFrame] = None
self._basic_encoding_pipeline: Optional[Pipeline] = None
self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
self._inferred_dtypes: List[Type] = []
self.model: object = None
Expand Down Expand Up @@ -359,8 +357,6 @@ def _prepare_for_prediction(
) -> pd.DataFrame:
if isinstance(x, np.ndarray):
x = self._np_to_matching_dataframe(x)
if self._basic_encoding_pipeline:
x = self._basic_encoding_pipeline.transform(x)
return x

def _predict(self, x: pd.DataFrame) -> np.ndarray:
Expand Down Expand Up @@ -530,14 +526,11 @@ def fit(
with self._time_manager.start_activity(
"preprocessing", activity_meta=["default"]
):
x, self._y = format_x_y(x, y)
self._x, self._y = format_x_y(x, y)
self._inferred_dtypes = x.dtypes
is_classification = hasattr(self, "_label_encoder")
self._x, self._basic_encoding_pipeline = basic_encoding(
x, is_classification
)
self._fixed_pipeline_extension = basic_pipeline_extension(
self._x, is_classification
is_classification
)
self._operator_set._safe_compile = partial(
self._operator_set._compile,
Expand Down Expand Up @@ -688,12 +681,7 @@ def export_script(
if raise_if_exists and file is not None and os.path.isfile(file):
raise FileExistsError(f"File {file} already exists.")

if self._basic_encoding_pipeline is not None:
script_text = self._post_processing.to_code(
self._basic_encoding_pipeline.steps + self._fixed_pipeline_extension
)
else:
script_text = self._post_processing.to_code(self._fixed_pipeline_extension)
script_text = self._post_processing.to_code(self._fixed_pipeline_extension)

if file:
with open(file, "w") as fh:
Expand Down
21 changes: 16 additions & 5 deletions gama/genetic_programming/compilers/scikitlearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,23 @@


def primitive_node_to_sklearn(primitive_node: PrimitiveNode) -> object:
hyperparameters = {
terminal.output: terminal.value
for terminal in primitive_node._children
hyperparameters = {}
for terminal in primitive_node._children:
# BANDAGE
if isinstance(terminal, Terminal) and terminal.value != "data"
}
if isinstance(terminal, Terminal) and terminal.value == "data":
continue
# Check if primitive is hyperparameter or different step in the pipeline
# if it is a hyperparameter it has input type dont_remove
if isinstance(terminal, PrimitiveNode) and terminal._primitive.data_input == "dont_remove":
hps = {
hp.output: hp.value
for hp in terminal._children
}
value = terminal._primitive.identifier(**hps)
hyperparameters.update({terminal._primitive.output: value})
if isinstance(terminal, Terminal):
value = terminal.value
hyperparameters.update({terminal.output: value})
return primitive_node._primitive.identifier(**hyperparameters)


Expand Down
9 changes: 7 additions & 2 deletions gama/genetic_programming/components/individual.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,17 @@ def pipeline_str(self) -> str:
@property
def primitives(self) -> List[PrimitiveNode]:
"""Lists all primitive nodes, starting with the Individual's main node."""

def is_data_primitive(child) -> bool:
return isinstance(child, PrimitiveNode) and child._primitive.data_input != "dont_remove"

primitives = [self.main_node]
current_children = self.main_node._children
while any(isinstance(child, PrimitiveNode) for child in current_children):
while any(is_data_primitive(child) for child in current_children):
# Only data input can be a primitive node, so there is never more than one.
child_node = next(
child for child in current_children if isinstance(child, PrimitiveNode)
child for child in current_children
if is_data_primitive(child)
)
primitives.append(child_node)
current_children = child_node._children
Expand Down
6 changes: 3 additions & 3 deletions gama/genetic_programming/components/primitive_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class PrimitiveNode:
The Primitive type of this PrimitiveNode.
data_node: PrimitiveNode
The PrimitiveNode that specifies all preprocessing before this PrimitiveNode.
terminals: List[Union["PrimitiveNode", Terminal]]
children: List[Union["PrimitiveNode", Terminal]]
A non-empty list of terminals and primitivenodes matching the `primitive` input.
"""

Expand All @@ -37,8 +37,8 @@ def __str__(self) -> str:
input_str = f"{self.input_node!r}" if self.input_node else ""
terminal_str = ", ".join(
[
repr(terminal)
for terminal in self.terminals
repr(terminal) if isinstance(terminal, Terminal) else str(terminal)
for terminal in self.terminals + self.primitives
if terminal != self.input_node
]
)
Expand Down
2 changes: 1 addition & 1 deletion gama/genetic_programming/crossover.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,6 @@ def _valid_crossover_functions(ind1: Individual, ind2: Individual) -> List[Calla
crossover_choices = []
if list(_shared_terminals(ind1, ind2)):
crossover_choices.append(crossover_terminals)
if len(list(ind1.primitives)) >= 2 and len(list(ind2.primitives)) >= 2:
if len(list(ind1.primitives)) >= 3 and len(list(ind2.primitives)) >= 3:
crossover_choices.append(crossover_primitives)
return crossover_choices
18 changes: 15 additions & 3 deletions gama/genetic_programming/mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import Callable, Optional, List, Dict

from gama.genetic_programming.components.terminal import Terminal
from .components import Individual
from .components import Individual, Primitive
from .operations import random_primitive_node


Expand Down Expand Up @@ -105,7 +105,10 @@ def mut_shrink(
if shrink_by is not None and n_primitives <= shrink_by:
raise ValueError(f"Can't shrink size {n_primitives} individual by {shrink_by}.")
if shrink_by is None:
shrink_by = random.randint(1, n_primitives)
if n_primitives > 1:
shrink_by = random.randint(1, n_primitives)
else:
shrink_by = 0

i = len(individual.primitives) - 1
while shrink_by > 0 and i > 0:
Expand Down Expand Up @@ -136,7 +139,16 @@ def mut_insert(individual: Individual, primitive_set: dict) -> None:
Individual to mutate in-place.
primitive_set: dict
"""
parent_node = random.choice(list(individual.primitives))
candidate_primitives = [
primitive for primitive in individual.primitives
if any(
isinstance(p, Primitive)
for p in primitive_set[primitive._primitive.data_input]
)
]
if not candidate_primitives:
raise Exception(f'No candidate primitives')
parent_node = random.choice(candidate_primitives)
new_primitive_node = random_primitive_node(
output_type=parent_node._primitive.data_input,
primitive_set=primitive_set,
Expand Down
9 changes: 9 additions & 0 deletions gama/genetic_programming/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
DATA_TERMINAL,
)

from dirty_cat import SuperVectorizer


def random_terminals_for_primitive(
primitive_set: dict, primitive: Primitive
Expand Down Expand Up @@ -85,6 +87,11 @@ def random_primitive_node(
data_input_type: Optional[str] = None,
) -> PrimitiveNode:
"""Create a PrimitiveNode with specified output_type and random terminals."""
# Hotfix
# otherwise, the function tries to replace the SuperVectorizer
# with something else (but none match).
if isinstance(exclude, Primitive) and exclude.identifier is SuperVectorizer:
exclude = None
candidates = [
p
for p in primitive_set[output_type]
Expand All @@ -102,6 +109,8 @@ def random_primitive_node(
c for c in candidates if reachability[c.data_input] == with_depth - 1
]

if len(candidates) == 0:
raise Exception('No candidates to chose from')
primitive = random.choice(candidates)
remaining_depth = with_depth - 1 if with_depth else None
children = random_children_for_primitive(
Expand Down
4 changes: 3 additions & 1 deletion gama/genetic_programming/operator_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,12 @@ def wait_next(self, async_evaluator):
log.warning(f"Error raised during evaluation: {str(future.exception)}.")
return future

def try_until_new(self, operator, *args, **kwargs):
def try_until_new(self, operator: Callable[..., Individual], *args, **kwargs):
"""Keep executing `operator` until a new individual is created."""
for _ in range(self._max_retry):
individual = operator(*args, **kwargs)
if str(individual.main_node) == 'SuperVectorizer':
return individual
if str(individual.main_node) not in self._completed_evaluations:
return individual
else:
Expand Down
2 changes: 2 additions & 0 deletions gama/postprocessing/best_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def __init__(self, time_fraction: float = 0.1):
def post_process(
self, x: pd.DataFrame, y: pd.Series, timeout: float, selection: List[Individual]
) -> object:
if len(selection) == 0:
raise Exception('No individual to choose from')
self._selected_individual = selection[0]
return self._selected_individual.pipeline.fit(x, y)

Expand Down
Loading