Merge pull request #962 from mindsdb/staging
Release 22.8.1.0
paxcema authored Aug 5, 2022
2 parents (5cf15bb, 258ad71) · commit 31e9925
Showing 26 changed files with 1,996 additions and 1,403 deletions.
.flake8 (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 120
-ignore = E402,F821,W503,W504,C408,W391
+ignore = E275,E402,F821,W503,W504,C408,W391
 exclude = .git,__pycache__,docs,docssrc
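
For context, E275 is pycodestyle's "missing whitespace after keyword" check, added in newer pycodestyle releases; putting it on the ignore list keeps flake8 from flagging lines like the hypothetical one below:

flag = True
result = not(flag)   # would normally be reported as E275 (missing whitespace after keyword)
result = not flag    # conventionally spaced form; never flagged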
docssrc/source/tutorials/custom_explainer/custom_explainer.ipynb (2,476 changes: 1,238 additions & 1,238 deletions)

Large diffs are not rendered by default.

lightwood/__about__.py (2 changes: 1 addition & 1 deletion)

@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.7.4.0'
+__version__ = '22.8.1.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "[email protected]"
 __author__ = 'MindsDB Inc'
lightwood/analysis/__init__.py (4 changes: 2 additions & 2 deletions)

@@ -8,7 +8,7 @@
 from lightwood.analysis.helpers.acc_stats import AccStats
 from lightwood.analysis.helpers.conf_stats import ConfStats
 from lightwood.analysis.nn_conf.temp_scale import TempScaler
-from lightwood.analysis.helpers.feature_importance import GlobalFeatureImportance
+from lightwood.analysis.helpers.feature_importance import PermutationFeatureImportance

 try:
     from lightwood.analysis.helpers.shap import ShapleyValues
@@ -17,4 +17,4 @@


 __all__ = ['model_analyzer', 'explain', 'BaseAnalysisBlock', 'TempScaler',
-           'ICP', 'AccStats', 'ConfStats', 'GlobalFeatureImportance', 'ShapleyValues']
+           'ICP', 'AccStats', 'ConfStats', 'PermutationFeatureImportance', 'ShapleyValues']
lightwood/analysis/helpers/acc_stats.py (7 changes: 4 additions & 3 deletions)

@@ -15,7 +15,8 @@ class AccStats(BaseAnalysisBlock):
     """ Computes accuracy stats and a confusion matrix for the validation dataset """

     def __init__(self, deps=('ICP',)):
-        super().__init__(deps=deps)  # @TODO: enforce that this actually prevents early execution somehow
+        super().__init__(deps=deps)
+        self.n_decimals = 3

     def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)
@@ -29,7 +30,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         info['score_dict'] = evaluate_accuracy(ns.data, ns.normal_predictions['prediction'],
                                                ns.target, accuracy_functions, ts_analysis=ns.ts_analysis)

-        info['normal_accuracy'] = np.mean(list(info['score_dict'].values()))
+        info['normal_accuracy'] = round(np.mean(list(info['score_dict'].values())), self.n_decimals)
         self.fit(ns, info['result_df'])
         info['val_overall_acc'], info['acc_histogram'], info['cm'], info['acc_samples'] = self.get_accuracy_stats()
         return info
@@ -99,7 +100,7 @@ def get_accuracy_stats(self, is_classification=None, is_numerical=None):
         for counts in list(bucket_acc_counts.values()):
             accuracy_count += counts

-        overall_accuracy = sum(accuracy_count) / len(accuracy_count)
+        overall_accuracy = round(sum(accuracy_count) / len(accuracy_count), self.n_decimals)

         for bucket in range(len(self.buckets)):
             if bucket not in bucket_accuracy:
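
Both rounded values above flow through the new n_decimals attribute. A toy illustration of the effect on the reported aggregate (metric names and scores are hypothetical):

import numpy as np

score_dict = {'r2_score': 0.87342, 'balanced_accuracy_score': 0.91017}   # hypothetical per-metric accuracies
n_decimals = 3
normal_accuracy = round(np.mean(list(score_dict.values())), n_decimals)  # mirrors AccStats.analyze above
print(normal_accuracy)  # 0.892, instead of the unrounded 0.891795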
lightwood/analysis/helpers/conf_stats.py (7 changes: 4 additions & 3 deletions)

@@ -18,6 +18,7 @@ def __init__(self, deps=('ICP',), ece_bins: int = 10):
         super().__init__(deps=deps)
         self.ece_bins = ece_bins
         self.ordenc = OrdinalEncoder()
+        self.n_decimals = 3

     def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)
@@ -38,10 +39,10 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
                                                 ns.data,
                                                 ns.target,
                                                 task_type)
-        info['maximum_calibration_error'] = mce
-        info['expected_calibration_error'] = ece
+        info['maximum_calibration_error'] = round(mce, self.n_decimals)
+        info['expected_calibration_error'] = round(ece, self.n_decimals)
         info['binned_conf_acc_difference'] = ces
-        info['global_calibration_score'] = gscore
+        info['global_calibration_score'] = round(gscore, self.n_decimals)
         return info

     def _get_stats(self, confs, preds, data, target, task_type='categorical'):
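
The rounded quantities come from _get_stats. For reference, a minimal sketch of the standard binned expected calibration error (the textbook formulation, not necessarily the exact code inside _get_stats); MCE is the same construction with a max over bins instead of the weighted sum:

import numpy as np

def expected_calibration_error(confs, correct, n_bins=10):
    """Binned ECE: sample-weighted average gap between per-bin accuracy and confidence."""
    confs = np.asarray(confs, dtype=float)        # predicted confidence per sample
    correct = np.asarray(correct, dtype=float)    # 1.0 if the prediction was right, else 0.0
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confs > lo) & (confs <= hi)
        if mask.any():
            gap = abs(correct[mask].mean() - confs[mask].mean())
            ece += mask.mean() * gap              # weight by the share of samples in this bin
    return ece

print(expected_calibration_error([0.8] * 5, [1, 1, 1, 1, 0]))  # 0.0: perfectly calibrated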
lightwood/analysis/helpers/feature_importance.py (100 changes: 66 additions & 34 deletions)

@@ -2,69 +2,101 @@
 from types import SimpleNamespace
 from typing import Dict

-import torch
 import numpy as np
+from sklearn.utils import shuffle

+from lightwood.helpers.log import log
+from lightwood.data.encoded_ds import EncodedDs
 from lightwood.analysis.base import BaseAnalysisBlock
 from lightwood.helpers.general import evaluate_accuracy
-from lightwood.analysis.nc.util import t_softmax
 from lightwood.api.types import PredictionArguments


-class GlobalFeatureImportance(BaseAnalysisBlock):
+class PermutationFeatureImportance(BaseAnalysisBlock):
     """
-    Analysis block that estimates column importance with a variant of the LOCO (leave-one-covariate-out) algorithm.
+    Analysis block that estimates column importances via permutation.

     Roughly speaking, the procedure:
         - iterates over all input columns
-        - if the input column is optional, then make a predict with its values set to None
-        - compare this accuracy with the accuracy obtained using all data
-        - all accuracy differences are passed through a softmax and reported as estimated column importance scores
+        - if the input column is optional, shuffle its values, then generate predictions for the input data
+        - compare this accuracy with the accuracy obtained using unshuffled data
+        - all accuracy differences are normalized with respect to the original accuracy (clipped at zero if negative)
+        - report these as estimated column importance scores

     Note that, crucially, this method does not refit the predictor at any point.

+    :param row_limit: Set to 0 to use the entire validation dataset.
+    :param col_limit: Set to 0 to consider all possible columns.
+
+    Reference:
+        https://scikit-learn.org/stable/modules/permutation_importance.html
+        https://compstat-lmu.github.io/iml_methods_limitations/pfi.html
     """
-    def __init__(self, disable_column_importance):
-        super().__init__()
+    def __init__(self, disable_column_importance=False, row_limit=1000, col_limit=10, deps=tuple('AccStats',)):
+        super().__init__(deps=deps)
         self.disable_column_importance = disable_column_importance
+        self.row_limit = row_limit
+        self.col_limit = col_limit
+        self.n_decimals = 3

     def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)

-        if self.disable_column_importance or ns.tss.is_timeseries or ns.has_pretrained_text_enc:
+        if self.disable_column_importance:
             info['column_importances'] = None
+        elif ns.tss.is_timeseries or ns.has_pretrained_text_enc:
+            log.warning(f"Block 'PermutationFeatureImportance' does not support time series nor text encoding, skipping...")  # noqa
+            info['column_importances'] = None
         else:
-            empty_input_accuracy = {}
-            ignorable_input_cols = [x for x in ns.input_cols if (not ns.tss.is_timeseries or
-                                                                 (x != ns.tss.order_by and
-                                                                  x not in ns.tss.historical_columns))]
-            for col in ignorable_input_cols:
-                partial_data = deepcopy(ns.encoded_val_data)
-                partial_data.clear_cache()
-                partial_data.data_frame[col] = [None] * len(partial_data.data_frame[col])
-
-                args = {'predict_proba': True} if ns.is_classification else {}
-                empty_input_preds = ns.predictor(partial_data, args=PredictionArguments.from_dict(args))
-
-                empty_input_accuracy[col] = np.mean(list(evaluate_accuracy(
-                                                          ns.data,
-                                                          empty_input_preds['prediction'],
+            if self.row_limit:
+                log.info(f"[PFI] Using a random sample ({self.row_limit} rows out of {len(ns.encoded_val_data.data_frame)}).")  # noqa
+                ref_df = ns.encoded_val_data.data_frame.sample(frac=1).reset_index(drop=True).iloc[:self.row_limit]
+            else:
+                log.info(f"[PFI] Using complete validation set ({len(ns.encoded_val_data.data_frame)} rows).")
+                ref_df = deepcopy(ns.encoded_val_data.data_frame)
+
+            ref_data = EncodedDs(ns.encoded_val_data.encoders, ref_df, ns.target)
+
+            args = {'predict_proba': True} if ns.is_classification else {}
+            ref_preds = ns.predictor(ref_data, args=PredictionArguments.from_dict(args))
+            ref_score = np.mean(list(evaluate_accuracy(ref_data.data_frame,
+                                                       ref_preds['prediction'],
                                                        ns.target,
                                                        ns.accuracy_functions
                                                        ).values()))
+            shuffled_col_accuracy = {}
+            shuffled_cols = []
+            for x in ns.input_cols:
+                if ('__mdb' not in x) and \
+                        (not ns.tss.is_timeseries or (x != ns.tss.order_by and x not in ns.tss.historical_columns)):
+                    shuffled_cols.append(x)
+
+            if self.col_limit:
+                shuffled_cols = shuffled_cols[:min(self.col_limit, len(ns.encoded_val_data.data_frame.columns))]
+                log.info(f"[PFI] Set to consider first {self.col_limit} columns out of {len(shuffled_cols)}: {shuffled_cols}.")  # noqa
+            else:
+                log.info(f"[PFI] Computing importance for all {len(shuffled_cols)} columns: {shuffled_cols}")
+
+            for col in shuffled_cols:
+                shuffle_data = deepcopy(ref_data)
+                shuffle_data.clear_cache()
+                shuffle_data.data_frame[col] = shuffle(shuffle_data.data_frame[col].values)
+
+                shuffled_preds = ns.predictor(shuffle_data, args=PredictionArguments.from_dict(args))
+                shuffled_col_accuracy[col] = np.mean(list(evaluate_accuracy(
+                    shuffle_data.data_frame,
+                    shuffled_preds['prediction'],
+                    ns.target,
+                    ns.accuracy_functions
+                ).values()))

             column_importances = {}
-            acc_increases = []
-            for col in ignorable_input_cols:
-                accuracy_increase = (info['normal_accuracy'] - empty_input_accuracy[col])
-                acc_increases.append(accuracy_increase)
-
-            # low 0.2 temperature to accentuate differences
-            acc_increases = t_softmax(torch.Tensor([acc_increases]), t=0.2).tolist()[0]
-            for col, inc in zip(ignorable_input_cols, acc_increases):
-                column_importances[col] = inc  # scores go from 0 to 1
+            acc_increases = np.zeros((len(shuffled_cols),))
+            for i, col in enumerate(shuffled_cols):
+                accuracy_increase = (ref_score - shuffled_col_accuracy[col])
+                acc_increases[i] = round(accuracy_increase, self.n_decimals)
+            for col, inc in zip(shuffled_cols, acc_increases):
+                column_importances[col] = inc

             info['column_importances'] = column_importances
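
The block above follows the standard permutation-importance recipe from the scikit-learn reference linked in the docstring. A self-contained sketch of the core loop, detached from lightwood's EncodedDs/predictor machinery (predict_fn and score_fn are hypothetical stand-ins for any model and accuracy metric):

import pandas as pd
from sklearn.utils import shuffle

def permutation_importances(predict_fn, score_fn, df: pd.DataFrame, target: str) -> dict:
    ref_score = score_fn(df[target], predict_fn(df))     # baseline accuracy on intact data
    importances = {}
    for col in df.columns.drop(target):
        shuffled = df.copy()
        shuffled[col] = shuffle(shuffled[col].values)    # break the column's link to the target
        drop = ref_score - score_fn(df[target], predict_fn(shuffled))
        importances[col] = round(max(drop, 0.0), 3)      # clip negatives, as the docstring describes
    return importances

Columns whose shuffling barely moves the score end up with importances near zero; because nothing is refit, the whole procedure costs one extra predict pass per inspected column.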
lightwood/analysis/nc/base.py (29 changes: 29 additions & 0 deletions)

@@ -26,6 +26,15 @@ def get_problem_type(cls) -> str:
         return 'classification'


+class TSMixin(object):
+    def __init__(self) -> None:
+        super(TSMixin, self).__init__()
+
+    @classmethod
+    def get_problem_type(cls):
+        return 'time-series'
+
+
 class BaseModelAdapter(BaseEstimator):
     __metaclass__ = abc.ABCMeta

@@ -114,6 +123,14 @@ def _underlying_predict(self, x: np.array) -> np.array:
         return self.model.predict(x)


+class TSAdapter(BaseModelAdapter):
+    def __init__(self, model: object, fit_params: Dict[str, object] = None) -> None:
+        super(TSAdapter, self).__init__(model, fit_params)
+
+    def _underlying_predict(self, x: np.array) -> np.array:
+        return self.model.predict(x)
+
+
 class CachedRegressorAdapter(RegressorAdapter):
     def __init__(self, model, fit_params=None):
         super(CachedRegressorAdapter, self).__init__(model, fit_params)
@@ -148,3 +165,15 @@ def predict(self, x=None):
             return t_softmax(self.prediction_cache, t=0.5)
         else:
             return self.prediction_cache
+
+
+class CachedTSAdapter(TSAdapter):
+    def __init__(self, model, fit_params=None):
+        super(CachedTSAdapter, self).__init__(model, fit_params)
+        self.prediction_cache = None
+
+    def fit(self, x=None, y=None):
+        pass
+
+    def predict(self, x=None):
+        return self.prediction_cache
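
The cached adapters exist so the conformal machinery can consume precomputed predictions rather than invoke a model: fit() is a deliberate no-op and predict() returns whatever was stashed in prediction_cache. A hedged usage sketch (the surrounding calibration code is assumed, not shown in this diff):

import numpy as np
from lightwood.analysis.nc.base import CachedTSAdapter  # class added in this commit

adapter = CachedTSAdapter(model=None)                   # the underlying model is never called
adapter.fit()                                           # no-op by design
adapter.prediction_cache = np.array([1.2, 3.4, 5.6])    # forecasts computed elsewhere
assert (adapter.predict() == adapter.prediction_cache).all()  # cache is returned verbatim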