Merge pull request #130 from perib/new_search_space_def
fix num_nodes test, add update checker, setup.py change
perib authored Apr 24, 2024
2 parents bb8a919 + 47b7fd0 commit 8de41fb
Showing 11 changed files with 105 additions and 42 deletions.
2 changes: 0 additions & 2 deletions setup.py
@@ -42,12 +42,10 @@ def calculate_version():
'lightgbm>=3.3.3',
'optuna>=3.0.5',
'baikal>=0.4.2',
'jupyter>=1.0.0',
'networkx>=3.0',
'dask>=2024.4.2',
'distributed>=2024.4.2',
'dask-expr>=1.0.12',
'dask-ml>=2023.4.20',
'dask-jobqueue>=0.8.5',
'func_timeout>=4.3.5',
'configspace>=0.7.1',
4 changes: 4 additions & 0 deletions tpot2/__init__.py
@@ -18,3 +18,7 @@


from .tpot_estimator import TPOTClassifier, TPOTRegressor, TPOTEstimator, TPOTEstimatorSteadyState

from update_checker import update_check
from ._version import __version__
update_check("tpot2",__version__)
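
For reference, update_check contacts PyPI and prints an upgrade notice when a release newer than the given version exists. A minimal standalone sketch of the same call (assuming only that the update_checker package is installed):

from update_checker import update_check

# Prints a notice if PyPI hosts a release newer than the stated version;
# otherwise it stays silent, so importing tpot2 remains quiet when up to date.
update_check("tpot2", "0.1.7-alpha")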
2 changes: 1 addition & 1 deletion tpot2/_version.py
@@ -1 +1 @@
__version__ = '0.1.5-alpha'
__version__ = '0.1.7-alpha'
18 changes: 10 additions & 8 deletions tpot2/config/classifiers.py
@@ -406,7 +406,6 @@ def get_HistGradientBoostingClassifier_ConfigurationSpace(n_features, random_state):
validation_fraction_cond = EqualsCondition(validation_fraction, early_stop, "valid")

space = {
'loss': Categorical("loss", ['log_loss', 'exponential']),
'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True),
'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)),
'max_features': Float("max_features", bounds=(0.1,1.0)),
@@ -432,7 +431,6 @@ def HistGradientBoostingClassifier_hyperparameter_parser(params):
def HistGradientBoostingClassifier_hyperparameter_parser(params):

final_params = {
'loss': params['loss'],
'learning_rate': params['learning_rate'],
'min_samples_leaf': params['min_samples_leaf'],
'max_features': params['max_features'],
@@ -447,7 +445,7 @@ def HistGradientBoostingClassifier_hyperparameter_parser(params):


if params['early_stop'] == 'off':
final_params['n_iter_no_change'] = None
final_params['n_iter_no_change'] = 0
final_params['validation_fraction'] = None
final_params['early_stopping'] = False
elif params['early_stop'] == 'valid':
@@ -477,12 +475,14 @@ def get_MLPClassifier_ConfigurationSpace(random_state):

n_hidden_layers = Integer("n_hidden_layers", bounds=(1, 3))
n_nodes_per_layer = Integer("n_nodes_per_layer", bounds=(16, 512))
activation = Categorical("activation", ['tanh', 'relu'])
alpha = Float("alpha", bounds=(1e-7, 1e-1), log=True)
learning_rate = Float("learning_rate", bounds=(1e-4, 1e-1), log=True)
activation = Categorical("activation", ["identity", "logistic",'tanh', 'relu'])
alpha = Float("alpha", bounds=(1e-4, 1e-1), log=True)
early_stopping = Categorical("early_stopping", [True,False])

cs.add_hyperparameters([n_hidden_layers, n_nodes_per_layer, activation, alpha, learning_rate, early_stopping])
learning_rate_init = Float("learning_rate_init", bounds=(1e-4, 1e-1), log=True)
learning_rate = Categorical("learning_rate", ['constant', 'invscaling', 'adaptive'])

cs.add_hyperparameters([n_hidden_layers, n_nodes_per_layer, activation, alpha, learning_rate, early_stopping, learning_rate_init])

return cs

@@ -492,8 +492,10 @@ def MLPClassifier_hyperparameter_parser(params):
'hidden_layer_sizes' : [params['n_nodes_per_layer']]*params['n_hidden_layers'],
'activation': params['activation'],
'alpha': params['alpha'],
'learning_rate': params['learning_rate'],
'early_stopping': params['early_stopping'],

'learning_rate_init': params['learning_rate_init'],
'learning_rate': params['learning_rate'],
}

if 'random_state' in params:
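
Note the split above: in sklearn's MLPClassifier, learning_rate names the schedule ('constant', 'invscaling', 'adaptive') while learning_rate_init is the numeric step size, which is why the single float hyperparameter became two. A hedged sketch of how a sampled configuration (hypothetical values) maps onto the estimator:

from sklearn.neural_network import MLPClassifier

# Hypothetical sample from the configuration space above
params = {"n_hidden_layers": 2, "n_nodes_per_layer": 64, "activation": "relu",
          "alpha": 1e-3, "early_stopping": True,
          "learning_rate_init": 1e-3, "learning_rate": "adaptive"}

clf = MLPClassifier(
    hidden_layer_sizes=[params["n_nodes_per_layer"]] * params["n_hidden_layers"],  # [64, 64]
    activation=params["activation"],
    alpha=params["alpha"],
    early_stopping=params["early_stopping"],
    learning_rate_init=params["learning_rate_init"],
    learning_rate=params["learning_rate"],  # schedule name, not a float
)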
14 changes: 10 additions & 4 deletions tpot2/config/get_configspace.py
@@ -104,12 +104,15 @@


# not including "PassiveAggressiveClassifier" in classifiers since it is mainly for larger than memory datasets/online use cases

# TODO need to subclass "GaussianProcessClassifier" and 'GaussianProcessRegressor'. These require n_features as a parameter for the kernel, but n_features may be different depending on selection functions or transformations previously in the pipeline.
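
One hedged sketch of the subclassing this TODO describes (hypothetical, not part of this commit): construct the kernel inside fit, where the post-transformation feature count is finally known:

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

class LateKernelGPC(GaussianProcessClassifier):
    # Hypothetical wrapper: defer kernel construction to fit time so the
    # anisotropic length scales match however many features survive
    # upstream selectors/transformers.
    def fit(self, X, y):
        self.kernel = RBF(length_scale=[1.0] * X.shape[1])
        return super().fit(X, y)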

GROUPNAMES = {
"selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",],
"selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"],
"selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"],
"classifiers" : ["LGBMRegressor", "BaggingClassifier", "GaussianProcessClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB', "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],
"regressors" : ['AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcessRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],
"classifiers" : ["LGBMRegressor", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB', "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],
"regressors" : ['AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],


"transformers": ["Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
@@ -263,7 +266,7 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_state=None):
case "FastICA":
return transformers.get_FastICA_configspace(n_features=n_features, random_state=random_state)
case "FeatureAgglomeration":
return transformers.get_FeatureAgglomeration_configspace(n_features=n_features,)
return transformers.get_FeatureAgglomeration_configspace(n_samples=n_samples)
case "Nystroem":
return transformers.get_Nystroem_configspace(n_features=n_features, random_state=random_state)
case "RBFSampler":
@@ -435,9 +438,12 @@ def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None):
if name == "HistGradientBoostingClassifier":
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.HistGradientBoostingClassifier_hyperparameter_parser)
if name == "GradientBoostingRegressor" or name == "HistGradientBoostingRegressor":
if name == "GradientBoostingRegressor":
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser)
if name == "HistGradientBoostingRegressor":
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.HistGradientBoostingRegressor_hyperparameter_parser)
if name == "MLPClassifier":
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser)
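
For orientation, get_node pairs each estimator class with its ConfigSpace and, where sampled values do not map one-to-one onto constructor arguments, a hyperparameter parser. A usage sketch (module path assumed from this file's location):

from tpot2.config.get_configspace import get_node

# EstimatorNode bundling MLPClassifier, its ConfigSpace, and the parser
# that reshapes sampled values into valid constructor kwargs.
node = get_node("MLPClassifier", n_classes=3, n_samples=100, n_features=20, random_state=1)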
53 changes: 44 additions & 9 deletions tpot2/config/regressors.py
@@ -412,6 +412,36 @@ def get_GradientBoostingRegressor_ConfigurationSpace(n_features, random_state):
cs.add_conditions([validation_fraction_cond, n_iter_no_change_cond])
return cs

def GradientBoostingRegressor_hyperparameter_parser(params):

final_params = {
'loss': params['loss'],
'learning_rate': params['learning_rate'],
'min_samples_leaf': params['min_samples_leaf'],
'min_samples_split': params['min_samples_split'],
'max_features': params['max_features'],
'max_leaf_nodes': params['max_leaf_nodes'],
'max_depth': params['max_depth'],
'tol': params['tol'],
'subsample': params['subsample']
}

if 'random_state' in params:
final_params['random_state'] = params['random_state']

if params['early_stop'] == 'off':
final_params['n_iter_no_change'] = None
final_params['validation_fraction'] = None
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = None


return final_params

#only difference is l2_regularization
def get_HistGradientBoostingRegressor_ConfigurationSpace(n_features, random_state):
early_stop = Categorical("early_stop", ["off", "valid", "train"])
@@ -443,39 +473,40 @@ def get_HistGradientBoostingRegressor_ConfigurationSpace(n_features, random_state):

return cs

def GradientBoostingRegressor_hyperparameter_parser(params):

def HistGradientBoostingRegressor_hyperparameter_parser(params):

final_params = {
'loss': params['loss'],
'learning_rate': params['learning_rate'],
'min_samples_leaf': params['min_samples_leaf'],
'max_features': params['max_features'],
'max_leaf_nodes': params['max_leaf_nodes'],
'max_depth': params['max_depth'],
'tol': params['tol'],
'l2_regularization': params['l2_regularization']
}

if "l2_regularization" in params:
final_params['l2_regularization'] = params['l2_regularization']

if 'random_state' in params:
final_params['random_state'] = params['random_state']


if params['early_stop'] == 'off':
final_params['n_iter_no_change'] = None
final_params['n_iter_no_change'] = 0
final_params['validation_fraction'] = None
final_params['early_stopping'] = False
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
final_params['early_stopping'] = True
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = None
final_params['early_stopping'] = True


return final_params
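
The early_stop switch thus expands into sklearn's three early-stopping arguments; for instance, the 'train' branch monitors the training loss, which sklearn selects via validation_fraction=None:

from sklearn.ensemble import HistGradientBoostingRegressor

# "train" mode as produced by the parser above: early stopping is on,
# but no validation split is held out, so the training loss is monitored.
reg = HistGradientBoostingRegressor(early_stopping=True,
                                    n_iter_no_change=10,
                                    validation_fraction=None)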



###

def get_MLPRegressor_ConfigurationSpace(random_state):
@@ -495,7 +526,10 @@ def get_MLPRegressor_ConfigurationSpace(random_state):
learning_rate = Float("learning_rate", bounds=(1e-4, 1e-1), log=True)
early_stopping = Categorical("early_stopping", [True,False])

cs.add_hyperparameters([n_hidden_layers, n_nodes_per_layer, activation, alpha, learning_rate, early_stopping])
learning_rate_init = Float("learning_rate_init", bounds=(1e-4, 1e-1), log=True)
learning_rate = Categorical("learning_rate", ['constant', 'invscaling', 'adaptive'])

cs.add_hyperparameters([n_hidden_layers, n_nodes_per_layer, activation, alpha, learning_rate, early_stopping, learning_rate_init])

return cs

@@ -505,8 +539,9 @@ def MLPRegressor_hyperparameter_parser(params):
'hidden_layer_sizes' : [params['n_nodes_per_layer']]*params['n_hidden_layers'],
'activation': params['activation'],
'alpha': params['alpha'],
'learning_rate': params['learning_rate'],
'early_stopping': params['early_stopping'],
'learning_rate_init': params['learning_rate_init'],
'learning_rate': params['learning_rate'],
}

if 'random_state' in params:
4 changes: 2 additions & 2 deletions tpot2/config/transformers.py
@@ -44,11 +44,11 @@ def get_FastICA_configspace(n_features=100, random_state=None):

)

def get_FeatureAgglomeration_configspace(n_features=100):
def get_FeatureAgglomeration_configspace(n_samples):

linkage = Categorical('linkage', ['ward', 'complete', 'average'])
metric = Categorical('metric', ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'])
n_clusters = Integer('n_clusters', bounds=(2, 400))
n_clusters = Integer('n_clusters', bounds=(2, min(n_samples,400)))
pooling_func = Categorical('pooling_func', ['mean', 'median', 'max'])

metric_condition = NotEqualsCondition(metric, linkage, 'ward')
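
As context for the NotEqualsCondition above: ward linkage supports only euclidean distances, so metric is sampled only when linkage is not 'ward'. A minimal sketch of such a conditional space with the ConfigSpace package (n_samples=100 assumed for illustration):

from ConfigSpace import Categorical, ConfigurationSpace, Integer
from ConfigSpace.conditions import NotEqualsCondition

cs = ConfigurationSpace()
linkage = Categorical("linkage", ["ward", "complete", "average"])
metric = Categorical("metric", ["euclidean", "l1", "l2", "manhattan", "cosine"])
n_clusters = Integer("n_clusters", bounds=(2, min(100, 400)))  # mirrors min(n_samples, 400)
cs.add_hyperparameters([linkage, metric, n_clusters])
cs.add_conditions([NotEqualsCondition(metric, linkage, "ward")])
print(cs.sample_configuration())  # metric present only when linkage != "ward"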
2 changes: 1 addition & 1 deletion tpot2/objectives/number_of_nodes.py
@@ -4,7 +4,7 @@

def number_of_nodes_objective(est):
if isinstance(est, GraphPipeline):
return sum(node["instance"] for node in est.graph.nodes)
return sum(number_of_nodes_objective(est.graph.nodes[node]["instance"]) for node in est.graph.nodes)
if isinstance(est, Pipeline):
return sum(number_of_nodes_objective(estimator) for _,estimator in est.steps)
if isinstance(est, sklearn.pipeline.FeatureUnion):
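
The corrected objective recurses into each graph node's "instance" rather than summing the estimator objects themselves, which is what the old line attempted. A small illustration with a plain sklearn pipeline, assuming the elided base case returns 1 for a bare estimator:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([("scale", StandardScaler()),
                 ("clf", LogisticRegression())])
# number_of_nodes_objective(pipe) == 2: one per step; a step that is itself
# a Pipeline, FeatureUnion, or GraphPipeline is counted recursively.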
1 change: 0 additions & 1 deletion tpot2/objectives/tests/test_complexity_objective.py
@@ -1 +0,0 @@
from ..complexity import BernoulliNB_Complexity, GaussianNB_Complexity, MultinomialNB_Complexity
6 changes: 5 additions & 1 deletion tpot2/search_spaces/pipelines/graph.py
@@ -750,7 +750,11 @@ def generate(self, rng=None):
self.cross_val_predict_cv, self.method, self.memory, self.use_label_encoder, rng=rng)
# if user specified limit, grab a random number between that limit

n_nodes = min(rng.integers(1, self.max_size), 5)
if self.max_size is None or self.max_size == np.inf:
n_nodes = rng.integers(1, 5)
else:
n_nodes = min(rng.integers(1, self.max_size), 5)

starting_ops = []
if self.inner_search_space is not None:
starting_ops.append(ind._mutate_insert_inner_node)
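
The new guard is needed because numpy's Generator.integers requires a finite high bound; a quick sketch of the failure mode being avoided:

import numpy as np

rng = np.random.default_rng(0)
rng.integers(1, 5)         # fine: an int drawn from [1, 5)
# rng.integers(1, np.inf)  # raises, since high must be a finite integer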
41 changes: 28 additions & 13 deletions tpot2/tpot_estimator/templates/tpottemplates.py
@@ -30,6 +30,7 @@ def __init__( self,
memory_limit = "4GB",
client = None,
random_state=None,
allow_inner_regressors=True,
**tpotestimator_kwargs,
):
"""
@@ -58,6 +59,7 @@ def __init__( self,
self.memory_limit = memory_limit
self.client = client
self.random_state = random_state
self.allow_inner_regressors = allow_inner_regressors
self.tpotestimator_kwargs = tpotestimator_kwargs

self.initialized = False
@@ -71,13 +73,18 @@ def fit(self, X, y):
"n_features":X.shape[1],
"random_state":self.random_state}

search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("regressors", **get_search_space_params),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","regressors","scalers"],**get_search_space_params),
max_size = 10,
)

if self.allow_inner_regressors:
search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("regressors", **get_search_space_params),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","regressors","scalers"],**get_search_space_params),
)
else:
search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("regressors", **get_search_space_params),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","scalers"],**get_search_space_params),
)

super(TPOTRegressor,self).__init__(
search_space=search_space,
@@ -134,6 +141,7 @@ def __init__( self,
memory_limit = "4GB",
client = None,
random_state=None,
allow_inner_classifiers=True,
**tpotestimator_kwargs,

):
@@ -164,6 +172,7 @@
self.client = client
self.random_state = random_state
self.tpotestimator_kwargs = tpotestimator_kwargs
self.allow_inner_classifiers = allow_inner_classifiers

self.initialized = False

@@ -176,12 +185,18 @@ def fit(self, X, y):
"n_features":X.shape[1],
"random_state":self.random_state}

search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("classifiers", **get_search_space_params),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","classifiers", "scalers"], **get_search_space_params),
max_size = 10,
)
if self.allow_inner_classifiers:
search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("classifiers", **get_search_space_params),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","classifiers", "scalers"], **get_search_space_params),
)
else:
search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("classifiers", **get_search_space_params),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","scalers"], **get_search_space_params),
)


super(TPOTClassifier,self).__init__(
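
From the user's side, the new flags control whether classifiers/regressors may appear as inner nodes of the evolved graph (i.e., stacked models) or only at the root. A hedged usage sketch:

import tpot2

# Inner nodes restricted to selectors, transformers, and scalers;
# only the root of the evolved graph is a classifier.
est = tpot2.TPOTClassifier(allow_inner_classifiers=False, random_state=42)
# est.fit(X_train, y_train)  # X_train / y_train assumed to exist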
