Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the SuperVectorizer and dirty_cat's encoders to the search space #169

Open
wants to merge 8 commits into
base: typed_data_terminals
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 2 additions & 14 deletions gama/configuration/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
MinMaxScaler,
Normalizer,
PolynomialFeatures,
RobustScaler,
StandardScaler,
Binarizer,
)
from sklearn.kernel_approximation import Nystroem, RBFSampler
Expand All @@ -29,23 +27,15 @@
VarianceThreshold,
)


class SuperEncoder(StandardScaler):
# For testing purposes only
pass
from .preprocessing import preproc_conf


# For comparison, this selection of operators and hyperparameters is
# currently most of what TPOT supports.

clf_config = {
"data": ["data"],
SuperEncoder: {
"_input": "data",
"_output": "numeric_data",
"with_std": [True, False],
"with_mean": [True, False],
},
**preproc_conf,
"alpha": [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0],
"fit_prior": [True, False],
"min_samples_split": range(2, 21),
Expand Down Expand Up @@ -147,8 +137,6 @@ class SuperEncoder(StandardScaler):
"interaction_only": [False],
},
RBFSampler: {"gamma": np.arange(0.0, 1.01, 0.05)},
RobustScaler: {},
StandardScaler: {},
# Selectors
SelectFwe: {"alpha": np.arange(0, 0.05, 0.001), "score_func": {f_classif: None}},
SelectPercentile: {"percentile": range(1, 100), "score_func": {f_classif: None}},
Expand Down
32 changes: 30 additions & 2 deletions gama/configuration/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,33 @@ def pset_from_config(
if isinstance(param_values, list) and not param_values:
# An empty list indicates a shared hyperparameter
hyperparameter_types.append(name)
elif isinstance(param_values, dict) \
and all([isinstance(_k, type) for _k in param_values.keys()]):
for sub_key, sub_hyperparameters in param_values.items():
sub_hps_for_encoder = []

for enc_param, sub_hyperparams in sub_hyperparameters.items():
hp_name = f"{key.__name__}.{name}.{sub_key.__name__}.{enc_param}"
sub_hps_for_encoder.append(hp_name)
for sub_param_value in sub_hyperparams:
pset[hp_name].append(
Terminal(
value=sub_param_value,
output=enc_param,
identifier=hp_name,
)
)
hp_name = f"{key.__name__}.{name}"
if hp_name not in hyperparameter_types:
hyperparameter_types.append(hp_name)
pset[hp_name].append(
Primitive(
input=tuple(sub_hps_for_encoder),
output=name,
identifier=sub_key,
data_input="dont_remove",
)
)
elif name == "param_check":
# This allows users to define illegal hyperparameter combinations,
# but is not a terminal.
Expand Down Expand Up @@ -192,12 +219,13 @@ def remove_primitives_with_unreachable_input(
for return_type, prims_and_terms in pset.items():
for pt in prims_and_terms:
if isinstance(pt, Primitive) and pt.data_input not in reachability:
print(pt)
pass
#print(pt) # FIXME
return {
return_type: [
pt
for pt in prims_and_terms
if not (isinstance(pt, Primitive) and pt.data_input not in reachability)
if not (isinstance(pt, Primitive) and pt.data_input not in reachability and pt.data_input != "dont_remove")
]
for return_type, prims_and_terms in pset.items()
}
48 changes: 48 additions & 0 deletions gama/configuration/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from dirty_cat import (
SuperVectorizer,
SimilarityEncoder,
GapEncoder,
MinHashEncoder,
)
from sklearn.preprocessing import (
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
RobustScaler,
)


# Search-space configuration for the dirty_cat SuperVectorizer preprocessing
# step. Top-level key is the estimator class; string keys are hyperparameters
# whose list values enumerate the candidate settings, and nested class-keyed
# dicts declare sub-estimators (with their own hyperparameter grids) that can
# be plugged into the corresponding SuperVectorizer parameter.
preproc_conf = {
    SuperVectorizer: {
        # "_input" marks the pipeline input type this step consumes
        # (consumed by the config parser, not passed to the estimator).
        "_input": "data",
        "impute_missing": ["force"],
        # Columns with more unique values than this threshold are treated as
        # high-cardinality categorical features.
        "cardinality_threshold": [20, 40, 60],
        # Candidate encoders for low-cardinality categorical columns.
        "low_card_cat_transformer": {
            OneHotEncoder: {
                "handle_unknown": ["ignore"],
            },
        },
        # Candidate encoders for high-cardinality categorical columns.
        "high_card_cat_transformer": {
            OrdinalEncoder: {
                "categories": ["auto"],
                # Map categories unseen during fit to -1 instead of raising.
                "handle_unknown": ["use_encoded_value"],
                "unknown_value": [-1],
                # NOTE(review): `encoded_missing_value` requires
                # scikit-learn >= 1.1 — confirm the pinned sklearn version.
                "encoded_missing_value": [-2],
            },
            SimilarityEncoder: {
                "n_prototypes": [10, 25, 50, 100],
            },
            GapEncoder: {
                "analyzer": ["word", "char", "char_wb"],
            },
            MinHashEncoder: {
                "n_components": [10, 30, 50, 100],
                "hashing": ["fast", "murmur"],
            },
        },
        # Candidate scalers for numerical columns; empty grids mean the
        # estimator is used with its default hyperparameters.
        "numerical_transformer": {
            RobustScaler: {},
            StandardScaler: {},
        }
    },
}
8 changes: 5 additions & 3 deletions gama/configuration/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,15 @@
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR

from .preprocessing import preproc_conf


# For comparison, this selection of operators and hyperparameters is
# currently most of what TPOT supports.

reg_config = {
"numeric_data": ["data"],
"data": ["data"],
**preproc_conf,
ElasticNetCV: {
"l1_ratio": np.arange(0.0, 1.01, 0.05),
"tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
Expand Down Expand Up @@ -125,8 +129,6 @@
"interaction_only": [False],
},
RBFSampler: {"gamma": np.arange(0.0, 1.01, 0.05)},
RobustScaler: {},
StandardScaler: {},
# Selectors
SelectFwe: {"alpha": np.arange(0, 0.05, 0.001), "score_func": {f_regression: None}},
SelectPercentile: {"percentile": range(1, 100), "score_func": {f_regression: None}},
Expand Down
18 changes: 3 additions & 15 deletions gama/gama.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from gama.utilities.generic.timekeeper import TimeKeeper
from gama.logging.utility_functions import register_stream_log
from gama.utilities.preprocessing import (
basic_encoding,
basic_pipeline_extension,
)
from gama.genetic_programming.mutation import random_valid_mutation_in_place
Expand Down Expand Up @@ -255,7 +254,6 @@ def __init__(

self._x: Optional[pd.DataFrame] = None
self._y: Optional[pd.DataFrame] = None
self._basic_encoding_pipeline: Optional[Pipeline] = None
self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
self._inferred_dtypes: List[Type] = []
self.model: object = None
Expand Down Expand Up @@ -359,8 +357,6 @@ def _prepare_for_prediction(
) -> pd.DataFrame:
if isinstance(x, np.ndarray):
x = self._np_to_matching_dataframe(x)
if self._basic_encoding_pipeline:
x = self._basic_encoding_pipeline.transform(x)
return x

def _predict(self, x: pd.DataFrame) -> np.ndarray:
Expand Down Expand Up @@ -530,14 +526,11 @@ def fit(
with self._time_manager.start_activity(
"preprocessing", activity_meta=["default"]
):
x, self._y = format_x_y(x, y)
self._x, self._y = format_x_y(x, y)
self._inferred_dtypes = x.dtypes
is_classification = hasattr(self, "_label_encoder")
self._x, self._basic_encoding_pipeline = basic_encoding(
x, is_classification
)
self._fixed_pipeline_extension = basic_pipeline_extension(
self._x, is_classification
is_classification
)
self._operator_set._safe_compile = partial(
self._operator_set._compile,
Expand Down Expand Up @@ -688,12 +681,7 @@ def export_script(
if raise_if_exists and file is not None and os.path.isfile(file):
raise FileExistsError(f"File {file} already exists.")

if self._basic_encoding_pipeline is not None:
script_text = self._post_processing.to_code(
self._basic_encoding_pipeline.steps + self._fixed_pipeline_extension
)
else:
script_text = self._post_processing.to_code(self._fixed_pipeline_extension)
script_text = self._post_processing.to_code(self._fixed_pipeline_extension)

if file:
with open(file, "w") as fh:
Expand Down
21 changes: 16 additions & 5 deletions gama/genetic_programming/compilers/scikitlearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,23 @@


def primitive_node_to_sklearn(primitive_node: PrimitiveNode) -> object:
hyperparameters = {
terminal.output: terminal.value
for terminal in primitive_node._children
hyperparameters = {}
for terminal in primitive_node._children:
# BANDAGE
if isinstance(terminal, Terminal) and terminal.value != "data"
}
if isinstance(terminal, Terminal) and terminal.value == "data":
continue
# Check if primitive is hyperparameter or different step in the pipeline
# if it is a hyperparameter it has input type dont_remove
if isinstance(terminal, PrimitiveNode) and terminal._primitive.data_input == "dont_remove":
hps = {
hp.output: hp.value
for hp in terminal._children
}
value = terminal._primitive.identifier(**hps)
hyperparameters.update({terminal._primitive.output: value})
if isinstance(terminal, Terminal):
value = terminal.value
hyperparameters.update({terminal.output: value})
return primitive_node._primitive.identifier(**hyperparameters)


Expand Down
9 changes: 7 additions & 2 deletions gama/genetic_programming/components/individual.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,17 @@ def pipeline_str(self) -> str:
@property
def primitives(self) -> List[PrimitiveNode]:
"""Lists all primitive nodes, starting with the Individual's main node."""

def is_data_primitive(child) -> bool:
return isinstance(child, PrimitiveNode) and child._primitive.data_input != "dont_remove"

primitives = [self.main_node]
current_children = self.main_node._children
while any(isinstance(child, PrimitiveNode) for child in current_children):
while any(is_data_primitive(child) for child in current_children):
# Only data input can be a primitive node, so there is never more than one.
child_node = next(
child for child in current_children if isinstance(child, PrimitiveNode)
child for child in current_children
if is_data_primitive(child)
)
primitives.append(child_node)
current_children = child_node._children
Expand Down
6 changes: 3 additions & 3 deletions gama/genetic_programming/components/primitive_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class PrimitiveNode:
The Primitive type of this PrimitiveNode.
data_node: PrimitiveNode
The PrimitiveNode that specifies all preprocessing before this PrimitiveNode.
terminals: List[Union["PrimitiveNode", Terminal]]
children: List[Union["PrimitiveNode", Terminal]]
A non-empty list of terminals and primitivenodes matching the `primitive` input.
"""

Expand All @@ -37,8 +37,8 @@ def __str__(self) -> str:
input_str = f"{self.input_node!r}" if self.input_node else ""
terminal_str = ", ".join(
[
repr(terminal)
for terminal in self.terminals
repr(terminal) if isinstance(terminal, Terminal) else str(terminal)
for terminal in self.terminals + self.primitives
if terminal != self.input_node
]
)
Expand Down
2 changes: 1 addition & 1 deletion gama/genetic_programming/crossover.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,6 @@ def _valid_crossover_functions(ind1: Individual, ind2: Individual) -> List[Calla
crossover_choices = []
if list(_shared_terminals(ind1, ind2)):
crossover_choices.append(crossover_terminals)
if len(list(ind1.primitives)) >= 2 and len(list(ind2.primitives)) >= 2:
if len(list(ind1.primitives)) >= 3 and len(list(ind2.primitives)) >= 3:
crossover_choices.append(crossover_primitives)
return crossover_choices
18 changes: 15 additions & 3 deletions gama/genetic_programming/mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import Callable, Optional, List, Dict

from gama.genetic_programming.components.terminal import Terminal
from .components import Individual
from .components import Individual, Primitive
from .operations import random_primitive_node


Expand Down Expand Up @@ -105,7 +105,10 @@ def mut_shrink(
if shrink_by is not None and n_primitives <= shrink_by:
raise ValueError(f"Can't shrink size {n_primitives} individual by {shrink_by}.")
if shrink_by is None:
shrink_by = random.randint(1, n_primitives)
if n_primitives > 1:
shrink_by = random.randint(1, n_primitives)
else:
shrink_by = 0

i = len(individual.primitives) - 1
while shrink_by > 0 and i > 0:
Expand Down Expand Up @@ -136,7 +139,16 @@ def mut_insert(individual: Individual, primitive_set: dict) -> None:
Individual to mutate in-place.
primitive_set: dict
"""
parent_node = random.choice(list(individual.primitives))
candidate_primitives = [
primitive for primitive in individual.primitives
if any(
isinstance(p, Primitive)
for p in primitive_set[primitive._primitive.data_input]
)
]
if not candidate_primitives:
raise Exception(f'No candidate primitives')
parent_node = random.choice(candidate_primitives)
new_primitive_node = random_primitive_node(
output_type=parent_node._primitive.data_input,
primitive_set=primitive_set,
Expand Down
9 changes: 9 additions & 0 deletions gama/genetic_programming/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
DATA_TERMINAL,
)

from dirty_cat import SuperVectorizer


def random_terminals_for_primitive(
primitive_set: dict, primitive: Primitive
Expand Down Expand Up @@ -85,6 +87,11 @@ def random_primitive_node(
data_input_type: Optional[str] = None,
) -> PrimitiveNode:
"""Create a PrimitiveNode with specified output_type and random terminals."""
# Hotfix
# otherwise, the function tries to replace the SuperVectorizer
# with something else (but none match).
if isinstance(exclude, Primitive) and exclude.identifier is SuperVectorizer:
exclude = None
candidates = [
p
for p in primitive_set[output_type]
Expand All @@ -102,6 +109,8 @@ def random_primitive_node(
c for c in candidates if reachability[c.data_input] == with_depth - 1
]

if len(candidates) == 0:
raise Exception('No candidates to chose from')
primitive = random.choice(candidates)
remaining_depth = with_depth - 1 if with_depth else None
children = random_children_for_primitive(
Expand Down
4 changes: 3 additions & 1 deletion gama/genetic_programming/operator_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,12 @@ def wait_next(self, async_evaluator):
log.warning(f"Error raised during evaluation: {str(future.exception)}.")
return future

def try_until_new(self, operator, *args, **kwargs):
def try_until_new(self, operator: Callable[..., Individual], *args, **kwargs):
"""Keep executing `operator` until a new individual is created."""
for _ in range(self._max_retry):
individual = operator(*args, **kwargs)
if str(individual.main_node) == 'SuperVectorizer':
return individual
if str(individual.main_node) not in self._completed_evaluations:
return individual
else:
Expand Down
2 changes: 2 additions & 0 deletions gama/postprocessing/best_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def __init__(self, time_fraction: float = 0.1):
def post_process(
self, x: pd.DataFrame, y: pd.Series, timeout: float, selection: List[Individual]
) -> object:
if len(selection) == 0:
raise Exception('No individual to choose from')
self._selected_individual = selection[0]
return self._selected_individual.pipeline.fit(x, y)

Expand Down
Loading