Merge pull request #962 from mindsdb/staging
Release 22.8.1.0
paxcema authored Aug 5, 2022
2 parents (5cf15bb, 258ad71) · commit 31e9925
Showing 26 changed files with 1,996 additions and 1,403 deletions.
.flake8 (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 120
-ignore = E402,F821,W503,W504,C408,W391
+ignore = E275,E402,F821,W503,W504,C408,W391
 exclude = .git,__pycache__,docs,docssrc
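
For context, E275 is pycodestyle's "missing whitespace after keyword" check, added in newer pycodestyle releases; putting it on the ignore list keeps flake8 from flagging lines like the hypothetical one below:

flag = True
result = not(flag)   # would normally be reported as E275 (missing whitespace after keyword)
result = not flag    # conventionally spaced form; never flagged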
docssrc/source/tutorials/custom_explainer/custom_explainer.ipynb (2,476 changes: 1,238 additions & 1,238 deletions)

Large diffs are not rendered by default.

lightwood/__about__.py (2 changes: 1 addition & 1 deletion)

@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.7.4.0'
+__version__ = '22.8.1.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "[email protected]"
 __author__ = 'MindsDB Inc'
lightwood/analysis/__init__.py (4 changes: 2 additions & 2 deletions)

@@ -8,7 +8,7 @@
 from lightwood.analysis.helpers.acc_stats import AccStats
 from lightwood.analysis.helpers.conf_stats import ConfStats
 from lightwood.analysis.nn_conf.temp_scale import TempScaler
-from lightwood.analysis.helpers.feature_importance import GlobalFeatureImportance
+from lightwood.analysis.helpers.feature_importance import PermutationFeatureImportance

 try:
     from lightwood.analysis.helpers.shap import ShapleyValues
@@ -17,4 +17,4 @@


 __all__ = ['model_analyzer', 'explain', 'BaseAnalysisBlock', 'TempScaler',
-           'ICP', 'AccStats', 'ConfStats', 'GlobalFeatureImportance', 'ShapleyValues']
+           'ICP', 'AccStats', 'ConfStats', 'PermutationFeatureImportance', 'ShapleyValues']
lightwood/analysis/helpers/acc_stats.py (7 changes: 4 additions & 3 deletions)

@@ -15,7 +15,8 @@ class AccStats(BaseAnalysisBlock):
     """ Computes accuracy stats and a confusion matrix for the validation dataset """

     def __init__(self, deps=('ICP',)):
-        super().__init__(deps=deps)  # @TODO: enforce that this actually prevents early execution somehow
+        super().__init__(deps=deps)
+        self.n_decimals = 3

     def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)
@@ -29,7 +30,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         info['score_dict'] = evaluate_accuracy(ns.data, ns.normal_predictions['prediction'],
                                                ns.target, accuracy_functions, ts_analysis=ns.ts_analysis)

-        info['normal_accuracy'] = np.mean(list(info['score_dict'].values()))
+        info['normal_accuracy'] = round(np.mean(list(info['score_dict'].values())), self.n_decimals)
         self.fit(ns, info['result_df'])
         info['val_overall_acc'], info['acc_histogram'], info['cm'], info['acc_samples'] = self.get_accuracy_stats()
         return info
@@ -99,7 +100,7 @@ def get_accuracy_stats(self, is_classification=None, is_numerical=None):
         for counts in list(bucket_acc_counts.values()):
             accuracy_count += counts

-        overall_accuracy = sum(accuracy_count) / len(accuracy_count)
+        overall_accuracy = round(sum(accuracy_count) / len(accuracy_count), self.n_decimals)

         for bucket in range(len(self.buckets)):
             if bucket not in bucket_accuracy:
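
Both rounded values above flow through the new n_decimals attribute. A toy illustration of the effect on the reported aggregate (metric names and scores are hypothetical):

import numpy as np

score_dict = {'r2_score': 0.87342, 'balanced_accuracy_score': 0.91017}   # hypothetical per-metric accuracies
n_decimals = 3
normal_accuracy = round(np.mean(list(score_dict.values())), n_decimals)  # mirrors AccStats.analyze above
print(normal_accuracy)  # 0.892, instead of the unrounded 0.891795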
lightwood/analysis/helpers/conf_stats.py (7 changes: 4 additions & 3 deletions)

@@ -18,6 +18,7 @@ def __init__(self, deps=('ICP',), ece_bins: int = 10):
         super().__init__(deps=deps)
         self.ece_bins = ece_bins
         self.ordenc = OrdinalEncoder()
+        self.n_decimals = 3

     def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)
@@ -38,10 +39,10 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
                                                 ns.data,
                                                 ns.target,
                                                 task_type)
-        info['maximum_calibration_error'] = mce
-        info['expected_calibration_error'] = ece
+        info['maximum_calibration_error'] = round(mce, self.n_decimals)
+        info['expected_calibration_error'] = round(ece, self.n_decimals)
         info['binned_conf_acc_difference'] = ces
-        info['global_calibration_score'] = gscore
+        info['global_calibration_score'] = round(gscore, self.n_decimals)
         return info

     def _get_stats(self, confs, preds, data, target, task_type='categorical'):
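
The rounded quantities come from _get_stats. For reference, a minimal sketch of the standard binned expected calibration error (the textbook formulation, not necessarily the exact code inside _get_stats); MCE is the same construction with a max over bins instead of the weighted sum:

import numpy as np

def expected_calibration_error(confs, correct, n_bins=10):
    """Binned ECE: sample-weighted average gap between per-bin accuracy and confidence."""
    confs = np.asarray(confs, dtype=float)        # predicted confidence per sample
    correct = np.asarray(correct, dtype=float)    # 1.0 if the prediction was right, else 0.0
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confs > lo) & (confs <= hi)
        if mask.any():
            gap = abs(correct[mask].mean() - confs[mask].mean())
            ece += mask.mean() * gap              # weight by the share of samples in this bin
    return ece

print(expected_calibration_error([0.8] * 5, [1, 1, 1, 1, 0]))  # 0.0: perfectly calibrated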
lightwood/analysis/helpers/feature_importance.py (100 changes: 66 additions & 34 deletions)

@@ -2,69 +2,101 @@
 from types import SimpleNamespace
 from typing import Dict

-import torch
 import numpy as np
+from sklearn.utils import shuffle

+from lightwood.helpers.log import log
+from lightwood.data.encoded_ds import EncodedDs
 from lightwood.analysis.base import BaseAnalysisBlock
 from lightwood.helpers.general import evaluate_accuracy
-from lightwood.analysis.nc.util import t_softmax
 from lightwood.api.types import PredictionArguments


-class GlobalFeatureImportance(BaseAnalysisBlock):
+class PermutationFeatureImportance(BaseAnalysisBlock):
     """
-    Analysis block that estimates column importance with a variant of the LOCO (leave-one-covariate-out) algorithm.
+    Analysis block that estimates column importances via permutation.

     Roughly speaking, the procedure:
         - iterates over all input columns
-        - if the input column is optional, then make a predict with its values set to None
-        - compare this accuracy with the accuracy obtained using all data
-        - all accuracy differences are passed through a softmax and reported as estimated column importance scores
+        - if the input column is optional, shuffle its values, then generate predictions for the input data
+        - compare this accuracy with the accuracy obtained using unshuffled data
+        - all accuracy differences are normalized with respect to the original accuracy (clipped at zero if negative)
+        - report these as estimated column importance scores

     Note that, crucially, this method does not refit the predictor at any point.

+    :param row_limit: Set to 0 to use the entire validation dataset.
+    :param col_limit: Set to 0 to consider all possible columns.
+
+    Reference:
+        https://scikit-learn.org/stable/modules/permutation_importance.html
+        https://compstat-lmu.github.io/iml_methods_limitations/pfi.html
     """
-    def __init__(self, disable_column_importance):
-        super().__init__()
+    def __init__(self, disable_column_importance=False, row_limit=1000, col_limit=10, deps=tuple('AccStats',)):
+        super().__init__(deps=deps)
         self.disable_column_importance = disable_column_importance
+        self.row_limit = row_limit
+        self.col_limit = col_limit
+        self.n_decimals = 3

     def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)

-        if self.disable_column_importance or ns.tss.is_timeseries or ns.has_pretrained_text_enc:
+        if self.disable_column_importance:
             info['column_importances'] = None
+        elif ns.tss.is_timeseries or ns.has_pretrained_text_enc:
+            log.warning(f"Block 'PermutationFeatureImportance' does not support time series nor text encoding, skipping...")  # noqa
+            info['column_importances'] = None
         else:
-            empty_input_accuracy = {}
-            ignorable_input_cols = [x for x in ns.input_cols if (not ns.tss.is_timeseries or
-                                                                 (x != ns.tss.order_by and
-                                                                  x not in ns.tss.historical_columns))]
-            for col in ignorable_input_cols:
-                partial_data = deepcopy(ns.encoded_val_data)
-                partial_data.clear_cache()
-                partial_data.data_frame[col] = [None] * len(partial_data.data_frame[col])
-
-                args = {'predict_proba': True} if ns.is_classification else {}
-                empty_input_preds = ns.predictor(partial_data, args=PredictionArguments.from_dict(args))
-
-                empty_input_accuracy[col] = np.mean(list(evaluate_accuracy(
-                                                          ns.data,
-                                                          empty_input_preds['prediction'],
+            if self.row_limit:
+                log.info(f"[PFI] Using a random sample ({self.row_limit} rows out of {len(ns.encoded_val_data.data_frame)}).")  # noqa
+                ref_df = ns.encoded_val_data.data_frame.sample(frac=1).reset_index(drop=True).iloc[:self.row_limit]
+            else:
+                log.info(f"[PFI] Using complete validation set ({len(ns.encoded_val_data.data_frame)} rows).")
+                ref_df = deepcopy(ns.encoded_val_data.data_frame)
+
+            ref_data = EncodedDs(ns.encoded_val_data.encoders, ref_df, ns.target)
+
+            args = {'predict_proba': True} if ns.is_classification else {}
+            ref_preds = ns.predictor(ref_data, args=PredictionArguments.from_dict(args))
+            ref_score = np.mean(list(evaluate_accuracy(ref_data.data_frame,
+                                                       ref_preds['prediction'],
                                                        ns.target,
                                                        ns.accuracy_functions
                                                        ).values()))
+            shuffled_col_accuracy = {}
+            shuffled_cols = []
+            for x in ns.input_cols:
+                if ('__mdb' not in x) and \
+                        (not ns.tss.is_timeseries or (x != ns.tss.order_by and x not in ns.tss.historical_columns)):
+                    shuffled_cols.append(x)
+
+            if self.col_limit:
+                shuffled_cols = shuffled_cols[:min(self.col_limit, len(ns.encoded_val_data.data_frame.columns))]
+                log.info(f"[PFI] Set to consider first {self.col_limit} columns out of {len(shuffled_cols)}: {shuffled_cols}.")  # noqa
+            else:
+                log.info(f"[PFI] Computing importance for all {len(shuffled_cols)} columns: {shuffled_cols}")
+
+            for col in shuffled_cols:
+                shuffle_data = deepcopy(ref_data)
+                shuffle_data.clear_cache()
+                shuffle_data.data_frame[col] = shuffle(shuffle_data.data_frame[col].values)
+
+                shuffled_preds = ns.predictor(shuffle_data, args=PredictionArguments.from_dict(args))
+                shuffled_col_accuracy[col] = np.mean(list(evaluate_accuracy(
+                    shuffle_data.data_frame,
+                    shuffled_preds['prediction'],
+                    ns.target,
+                    ns.accuracy_functions
+                ).values()))

             column_importances = {}
-            acc_increases = []
-            for col in ignorable_input_cols:
-                accuracy_increase = (info['normal_accuracy'] - empty_input_accuracy[col])
-                acc_increases.append(accuracy_increase)
-
-            # low 0.2 temperature to accentuate differences
-            acc_increases = t_softmax(torch.Tensor([acc_increases]), t=0.2).tolist()[0]
-            for col, inc in zip(ignorable_input_cols, acc_increases):
-                column_importances[col] = inc  # scores go from 0 to 1
+            acc_increases = np.zeros((len(shuffled_cols),))
+            for i, col in enumerate(shuffled_cols):
+                accuracy_increase = (ref_score - shuffled_col_accuracy[col])
+                acc_increases[i] = round(accuracy_increase, self.n_decimals)
+            for col, inc in zip(shuffled_cols, acc_increases):
+                column_importances[col] = inc

             info['column_importances'] = column_importances
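
The block above follows the standard permutation-importance recipe from the scikit-learn reference linked in the docstring. A self-contained sketch of the core loop, detached from lightwood's EncodedDs/predictor machinery (predict_fn and score_fn are hypothetical stand-ins for any model and accuracy metric):

import pandas as pd
from sklearn.utils import shuffle

def permutation_importances(predict_fn, score_fn, df: pd.DataFrame, target: str) -> dict:
    ref_score = score_fn(df[target], predict_fn(df))     # baseline accuracy on intact data
    importances = {}
    for col in df.columns.drop(target):
        shuffled = df.copy()
        shuffled[col] = shuffle(shuffled[col].values)    # break the column's link to the target
        drop = ref_score - score_fn(df[target], predict_fn(shuffled))
        importances[col] = round(max(drop, 0.0), 3)      # clip negatives, as the docstring describes
    return importances

Columns whose shuffling barely moves the score end up with importances near zero; because nothing is refit, the whole procedure costs one extra predict pass per inspected column.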
lightwood/analysis/nc/base.py (29 changes: 29 additions & 0 deletions)

@@ -26,6 +26,15 @@ def get_problem_type(cls) -> str:
         return 'classification'


+class TSMixin(object):
+    def __init__(self) -> None:
+        super(TSMixin, self).__init__()
+
+    @classmethod
+    def get_problem_type(cls):
+        return 'time-series'
+
+
 class BaseModelAdapter(BaseEstimator):
     __metaclass__ = abc.ABCMeta

@@ -114,6 +123,14 @@ def _underlying_predict(self, x: np.array) -> np.array:
         return self.model.predict(x)


+class TSAdapter(BaseModelAdapter):
+    def __init__(self, model: object, fit_params: Dict[str, object] = None) -> None:
+        super(TSAdapter, self).__init__(model, fit_params)
+
+    def _underlying_predict(self, x: np.array) -> np.array:
+        return self.model.predict(x)
+
+
 class CachedRegressorAdapter(RegressorAdapter):
     def __init__(self, model, fit_params=None):
         super(CachedRegressorAdapter, self).__init__(model, fit_params)
@@ -148,3 +165,15 @@ def predict(self, x=None):
             return t_softmax(self.prediction_cache, t=0.5)
         else:
             return self.prediction_cache
+
+
+class CachedTSAdapter(TSAdapter):
+    def __init__(self, model, fit_params=None):
+        super(CachedTSAdapter, self).__init__(model, fit_params)
+        self.prediction_cache = None
+
+    def fit(self, x=None, y=None):
+        pass
+
+    def predict(self, x=None):
+        return self.prediction_cache
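
The cached adapters exist so the conformal machinery can consume precomputed predictions rather than invoke a model: fit() is a deliberate no-op and predict() returns whatever was stashed in prediction_cache. A hedged usage sketch (the surrounding calibration code is assumed, not shown in this diff):

import numpy as np
from lightwood.analysis.nc.base import CachedTSAdapter  # class added in this commit

adapter = CachedTSAdapter(model=None)                   # the underlying model is never called
adapter.fit()                                           # no-op by design
adapter.prediction_cache = np.array([1.2, 3.4, 5.6])    # forecasts computed elsewhere
assert (adapter.predict() == adapter.prediction_cache).all()  # cache is returned verbatim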