diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 0824279d0..e96085631 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,7 +1,7 @@ --- name: Bug report about: Create a report to help us improve -labels: +labels: Bug --- ## Your Environment @@ -13,3 +13,5 @@ labels: ## How can we replicate it? +* What dataset did you use (link to it please) +* What was the code you ran \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 000000000..862c13842 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,5 @@ +--- +name: Question +about: Ask a question +labels: question +--- \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/suggestion.md b/.github/ISSUE_TEMPLATE/suggestion.md new file mode 100644 index 000000000..ccdf67811 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/suggestion.md @@ -0,0 +1,8 @@ +--- +name: Suggestion +about: Suggest a feature, improvement, doc change, etc. +labels: enhancement +--- + + + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 00f389251..d56c4e991 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,26 +10,51 @@ We love to receive contributions from the community and hear your opinions! We w * Submit a bug fix * Propose new features * Test Lightwood +* Solve an issue # Code contributions In general, we follow the "fork-and-pull" Git workflow. 1. Fork the Lightwood repository -2. Clone the repository -3. Make changes and commit them -4. Push your local branch to your fork -5. Submit a Pull request so that we can review your changes -6. Write a commit message -7. Make sure that the CI tests are GREEN +2. Check out the `staging` branch; this is the development version that gets released weekly +3. Make changes and commit them +4. Make sure that the CI tests pass +5. 
Submit a Pull request from your repo to the `staging` branch of mindsdb/lightwood so that we can review your changes ->NOTE: Be sure to merge the latest from "upstream" before making a pull request! +> You will need to sign a CLA (Contributor License Agreement) for the code since lightwood is under a GPL license +> Be sure to merge the latest from `staging` before making a pull request! +> You can run the test suite locally by running `flake8 .` to check style and `python -m unittest discover tests` to run the automated tests. This doesn't guarantee it will pass remotely since we run on multiple envs, but should work in most cases. # Feature and Bug reports We use GitHub issues to track bugs and features. Report them by opening a [new issue](https://github.com/mindsdb/lightwood/issues/new/choose) and fill out all of the required inputs. # Code review process The Pull Request reviews are done on a regular basis. -Please, make sure you respond to our feedback/questions. + +If your change has a chance of affecting performance, we will run our private benchmark suite to validate it. + +Please, make sure you respond to our feedback and questions. # Community -If you have additional questions or you want to chat with MindsDB core team, you can join our community [![Discourse posts](https://img.shields.io/discourse/posts?server=https%3A%2F%2Fcommunity.mindsdb.com%2F)](https://community.mindsdb.com/). To get updates on MindsDB’s latest announcements, releases, and events, [sign up for our newsletter](https://mindsdb.us20.list-manage.com/subscribe/post?u=5174706490c4f461e54869879&id=242786942a). +If you have additional questions or you want to chat with the MindsDB core team, you can join our community Slack. + +# Setting up a dev environment + +- Clone lightwood +- `cd lightwood && pip install -r requirements.txt` +- Add it to your python path (e.g. 
by adding `export PYTHONPATH="/where/you/cloned/lightwood:$PYTHONPATH"` as a new line at the end of your `~/.bashrc` file) +- Check that the unit tests are passing by going into the directory where you cloned lightwood and running: `python -m unittest discover tests` + +> If `python` defaults to python2.x in your environment, use `python3` and `pip3` instead + +## Setting up a vscode environment + +Currently, the preferred environment for working with lightwood is vscode, a very popular python IDE. Any IDE should work, however; while we don't have guides for those, please use the following as a template. + +* Install and enable setting sync using github account (if you use multiple machines) +* Install pylance (for types) and make sure to disable pyright +* Go to `Python > Lint: Enabled` and disable everything *but* flake8 +* Set `python.linting.flake8Path` to the full path to flake8 (which flake8) +* Set `Python › Formatting: Provider` to autopep8 +* Add `--global-config=/lightwood/.flake8` and `--experimental` to `Python › Formatting: Autopep8 Args` +* Install live share and live share whiteboard \ No newline at end of file diff --git a/dev/README.md b/dev/README.md deleted file mode 100644 index f2cef68fc..000000000 --- a/dev/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Contributor guide - -## Setting up vscode environment - -* Install and enable setting sync using github account (if you use multiple machines) -* Install pylance (for types) and make sure to disable pyright -* Go to `Python > Lint: Enabled` and disable everything *but* flake8 -* Set `python.linting.flake8Path` to the full path to flake8 (which flake8) -* Set `Python › Formatting: Provider` to autopep8 -* Add `--global-config=/lightwood/.flake8` and `--experimental` to `Python › Formatting: Autopep8 Args` -* Install live share and live share whiteboard \ No newline at end of file diff --git a/dev/requirements.txt b/dev/requirements.txt deleted file mode 100644 index 230c9c827..000000000 --- 
a/dev/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -reindent -flake8 \ No newline at end of file diff --git a/lightwood/__about__.py b/lightwood/__about__.py index 2d58696f7..9150d1ec6 100755 --- a/lightwood/__about__.py +++ b/lightwood/__about__.py @@ -1,6 +1,6 @@ __title__ = 'lightwood' __package_name__ = 'lightwood' -__version__ = '1.2.0' +__version__ = '1.3.0' __description__ = "Lightwood is a toolkit for automatic machine learning model building" __email__ = "community@mindsdb.com" __author__ = 'MindsDB Inc' diff --git a/lightwood/analysis/__init__.py b/lightwood/analysis/__init__.py index 64d283074..3887ae999 100644 --- a/lightwood/analysis/__init__.py +++ b/lightwood/analysis/__init__.py @@ -1,4 +1,12 @@ -from lightwood.analysis.model_analyzer import model_analyzer +# Base +from lightwood.analysis.analyze import model_analyzer from lightwood.analysis.explain import explain -__all__ = ['model_analyzer', 'explain'] +# Blocks +from lightwood.analysis.base import BaseAnalysisBlock +from lightwood.analysis.nc.calibrate import ICP +from lightwood.analysis.helpers.acc_stats import AccStats +from lightwood.analysis.helpers.feature_importance import GlobalFeatureImportance + + +__all__ = ['model_analyzer', 'explain', 'ICP', 'AccStats', 'GlobalFeatureImportance', 'BaseAnalysisBlock'] diff --git a/lightwood/analysis/analyze.py b/lightwood/analysis/analyze.py new file mode 100644 index 000000000..fa5d7ffd8 --- /dev/null +++ b/lightwood/analysis/analyze.py @@ -0,0 +1,96 @@ +from typing import Dict, List, Tuple, Optional + +from lightwood.api import dtype +from lightwood.ensemble import BaseEnsemble +from lightwood.analysis.base import BaseAnalysisBlock +from lightwood.data.encoded_ds import EncodedDs +from lightwood.encoder.text.pretrained import PretrainedLangEncoder +from lightwood.api.types import ModelAnalysis, StatisticalAnalysis, TimeseriesSettings + + +def model_analyzer( + predictor: BaseEnsemble, + data: EncodedDs, + train_data: EncodedDs, + stats_info: 
StatisticalAnalysis, + target: str, + ts_cfg: TimeseriesSettings, + dtype_dict: Dict[str, str], + accuracy_functions, + analysis_blocks: Optional[List[BaseAnalysisBlock]] = [] +) -> Tuple[ModelAnalysis, Dict[str, object]]: + """ + Analyses model on a validation subset to evaluate accuracy, estimate feature importance and generate a + calibration model to estimating confidence in future predictions. + + Additionally, any user-specified analysis blocks (see class `BaseAnalysisBlock`) are also called here. + + :return: + runtime_analyzer: This dictionary object gets populated in a sequential fashion with data generated from + any `.analyze()` block call. This dictionary object is stored in the predictor itself, and used when + calling the `.explain()` method of all analysis blocks when generating predictions. + + model_analysis: `ModelAnalysis` object that contains core analysis metrics, not necessarily needed when predicting. + """ + + runtime_analyzer = {} + data_type = dtype_dict[target] + + # retrieve encoded data representations + encoded_train_data = train_data + encoded_val_data = data + data = encoded_val_data.data_frame + input_cols = list([col for col in data.columns if col != target]) + + # predictive task + is_numerical = data_type in (dtype.integer, dtype.float, dtype.array, dtype.tsarray, dtype.quantity) + is_classification = data_type in (dtype.categorical, dtype.binary) + is_multi_ts = ts_cfg.is_timeseries and ts_cfg.nr_predictions > 1 + has_pretrained_text_enc = any([isinstance(enc, PretrainedLangEncoder) + for enc in encoded_train_data.encoders.values()]) + + # raw predictions for validation dataset + normal_predictions = predictor(encoded_val_data) if not is_classification else predictor(encoded_val_data, + predict_proba=True) + normal_predictions = normal_predictions.set_index(data.index) + + # ------------------------- # + # Run analysis blocks, both core and user-defined + # ------------------------- # + kwargs = { + 'predictor': predictor, + 
'target': target, + 'input_cols': input_cols, + 'dtype_dict': dtype_dict, + 'normal_predictions': normal_predictions, + 'data': data, + 'train_data': train_data, + 'encoded_val_data': encoded_val_data, + 'is_classification': is_classification, + 'is_numerical': is_numerical, + 'is_multi_ts': is_multi_ts, + 'stats_info': stats_info, + 'ts_cfg': ts_cfg, + 'accuracy_functions': accuracy_functions, + 'has_pretrained_text_enc': has_pretrained_text_enc + } + + for block in analysis_blocks: + runtime_analyzer = block.analyze(runtime_analyzer, **kwargs) + + # ------------------------- # + # Populate ModelAnalysis object + # ------------------------- # + model_analysis = ModelAnalysis( + accuracies=runtime_analyzer['score_dict'], + accuracy_histogram=runtime_analyzer['acc_histogram'], + accuracy_samples=runtime_analyzer['acc_samples'], + train_sample_size=len(encoded_train_data), + test_sample_size=len(encoded_val_data), + confusion_matrix=runtime_analyzer['cm'], + column_importances=runtime_analyzer['column_importances'], + histograms=stats_info.histograms, + dtypes=dtype_dict + ) + + return model_analysis, runtime_analyzer diff --git a/lightwood/analysis/base.py b/lightwood/analysis/base.py new file mode 100644 index 000000000..869236bae --- /dev/null +++ b/lightwood/analysis/base.py @@ -0,0 +1,46 @@ +from typing import Tuple, Dict, Optional + +import pandas as pd +from lightwood.helpers.log import log + + +class BaseAnalysisBlock: + """Class to be inherited by any analysis/explainer block.""" + def __init__(self, + deps: Optional[Tuple] = () + ): + + self.dependencies = deps # can be parallelized when there are no dependencies @TODO enforce + + def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + """ + This method should be called once during the analysis phase, or not called at all. 
+ It computes any information that the block may either output to the model analysis object, + or use at inference time when `.explain()` is called (in this case, make sure all needed + objects are added to the runtime analyzer so that `.explain()` can access them). + + :param info: Dictionary where any new information or objects are added. The next analysis block will use + the output of the previous block as a starting point. + :param kwargs: Dictionary with named variables from either the core analysis or the rest of the prediction + pipeline. + """ + log.info(f"{self.__class__.__name__}.analyze() has not been implemented, no modifications will be done to the model analysis.") # noqa + return info + + def explain(self, + row_insights: pd.DataFrame, + global_insights: Dict[str, object], **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]: + """ + This method should be called once during the explaining phase at inference time, or not called at all. + Additional explanations can be at an instance level (row-wise) or global. + For the former, return a data frame with any new insights. For the latter, a dictionary is required. + + :param row_insights: dataframe with previously computed row-level explanations. + :param global_insights: dict() with any explanations that concern all predicted instances or the model itself. + + :returns: + - row_insights: modified input dataframe with any new row insights added here. + - global_insights: dict() with any explanations that concern all predicted instances or the model itself. 
+ """ + log.info(f"{self.__class__.__name__}.explain() has not been implemented, no modifications will be done to the data insights.") # noqa + return row_insights, global_insights diff --git a/lightwood/analysis/explain.py b/lightwood/analysis/explain.py index 207803aa1..7979a0815 100644 --- a/lightwood/analysis/explain.py +++ b/lightwood/analysis/explain.py @@ -1,23 +1,21 @@ -from copy import deepcopy - +from typing import Optional, List, Dict import torch -import numpy as np import pandas as pd -from lightwood.analysis.nc.util import get_numerical_conf_range, get_categorical_conf, get_anomalies -from lightwood.helpers.ts import get_inferred_timestamps, add_tn_conf_bounds -from lightwood.api.dtype import dtype from lightwood.api.types import TimeseriesSettings +from lightwood.helpers.ts import get_inferred_timestamps +from lightwood.analysis.base import BaseAnalysisBlock def explain(data: pd.DataFrame, encoded_data: torch.Tensor, predictions: pd.DataFrame, timeseries_settings: TimeseriesSettings, - analysis: dict, + analysis: Dict, target_name: str, target_dtype: str, - positive_domain: bool, + + positive_domain: bool, # @TODO: pass inside a {} with params for each block to avoid signature overload fixed_confidence: float, anomaly_detection: bool, @@ -29,205 +27,64 @@ def explain(data: pd.DataFrame, # implicitly assumes series are regularly spaced anomaly_cooldown: int, - ts_analysis: dict = None + explainer_blocks: Optional[List[BaseAnalysisBlock]] = [], + ts_analysis: Optional[Dict] = {} ): + """ + This procedure runs at the end of every normal `.predict()` call. Its goal is to generate prediction insights, + potentially using information generated at the model analysis stage (e.g. confidence estimation). + + As in `analysis()`, any user-specified analysis blocks (see class `BaseAnalysisBlock`) are also called here. - # @TODO: check not quick_predict + :return: + row_insights: a DataFrame containing predictions and all generated insights at a row-level. 
+ """ + + # ------------------------- # + # Setup base insights + # ------------------------- # data = data.reset_index(drop=True) - insights = pd.DataFrame() + row_insights = pd.DataFrame() + global_insights = {} + row_insights['prediction'] = predictions['prediction'] + if target_name in data.columns: - insights['truth'] = data[target_name] + row_insights['truth'] = data[target_name] else: - insights['truth'] = [None] * len(predictions['prediction']) - insights['prediction'] = predictions['prediction'] + row_insights['truth'] = [None] * len(predictions['prediction']) if timeseries_settings.is_timeseries: - if timeseries_settings.group_by: for col in timeseries_settings.group_by: - insights[f'group_{col}'] = data[col] + row_insights[f'group_{col}'] = data[col] for col in timeseries_settings.order_by: - insights[f'order_{col}'] = data[col] + row_insights[f'order_{col}'] = data[col] for col in timeseries_settings.order_by: - insights[f'order_{col}'] = get_inferred_timestamps( - insights, col, ts_analysis['deltas'], timeseries_settings) - - # confidence estimation using calibrated inductive conformal predictors (ICPs) - if analysis['icp']['__mdb_active']: - icp_X = deepcopy(data) - - # replace observed data w/predictions - preds = predictions['prediction'] - if timeseries_settings.is_timeseries and timeseries_settings.nr_predictions > 1: - preds = [p[0] for p in preds] - - for col in [f'timestep_{i}' for i in range(1, timeseries_settings.nr_predictions)]: - if col in icp_X.columns: - icp_X.pop(col) # erase ignorable columns - - icp_X[target_name] = preds - - is_categorical = target_dtype in (dtype.binary, dtype.categorical) - is_numerical = target_dtype in (dtype.integer, dtype.float, dtype.array, dtype.tsarray) - is_anomaly_task = is_numerical and timeseries_settings.is_timeseries and anomaly_detection - - if (is_numerical or is_categorical) and analysis['icp'].get('__mdb_active', False): - - # reorder DF index - index = analysis['icp']['__default'].index.values - 
index = np.append(index, target_name) if target_name not in index else index - icp_X = icp_X.reindex(columns=index) # important, else bounds can be invalid - - # only one normalizer, even if it's a grouped time series task - normalizer = analysis['icp']['__default'].nc_function.normalizer - if normalizer: - normalizer.prediction_cache = normalizer(encoded_data) - icp_X['__mdb_selfaware_scores'] = normalizer.prediction_cache - - # get ICP predictions - result_cols = ['lower', 'upper', 'significance'] if is_numerical else ['significance'] - result = pd.DataFrame(index=icp_X.index, columns=result_cols) - - # base ICP - X = deepcopy(icp_X) - # Calling `values` multiple times increased runtime of this function; referenced var is faster - icp_values = X.values - - # get all possible ranges - if timeseries_settings.is_timeseries and timeseries_settings.nr_predictions > 1 and is_numerical: - - # bounds in time series are only given for the first forecast - analysis['icp']['__default'].nc_function.model.prediction_cache = \ - [p[0] for p in predictions['prediction']] - all_confs = analysis['icp']['__default'].predict(icp_values) - - elif is_numerical: - analysis['icp']['__default'].nc_function.model.prediction_cache = predictions['prediction'] - all_confs = analysis['icp']['__default'].predict(icp_values) - - # categorical - else: - predicted_proba = True if any(['__mdb_proba' in col for col in predictions.columns]) else False - if predicted_proba: - all_cat_cols = [col for col in predictions.columns if '__mdb_proba' in col] - class_dists = predictions[all_cat_cols].values - for icol, cat_col in enumerate(all_cat_cols): - insights.loc[X.index, cat_col] = class_dists[:, icol] - else: - class_dists = pd.get_dummies(predictions['prediction']).values - - analysis['icp']['__default'].nc_function.model.prediction_cache = class_dists - - conf_candidates = list(range(20)) + list(range(20, 100, 10)) - all_ranges = np.array( - [analysis['icp']['__default'].predict(icp_values, 
significance=s / 100) - for s in conf_candidates]) - all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) - - # convert (B, 2, 99) into (B, 2) given width or error rate constraints - if is_numerical: - significances = fixed_confidence - if significances is not None: - confs = all_confs[:, :, int(100 * (1 - significances)) - 1] - else: - error_rate = anomaly_error_rate if is_anomaly_task else None - significances, confs = get_numerical_conf_range(all_confs, - df_std_dev=analysis['df_std_dev'], - positive_domain=positive_domain, - error_rate=error_rate) - result.loc[X.index, 'lower'] = confs[:, 0] - result.loc[X.index, 'upper'] = confs[:, 1] - else: - conf_candidates = list(range(20)) + list(range(20, 100, 10)) - significances = get_categorical_conf(all_confs, conf_candidates) - - result.loc[X.index, 'significance'] = significances - - # grouped time series, we replace bounds in rows that have a trained ICP - if analysis['icp'].get('__mdb_groups', False): - icps = analysis['icp'] - group_keys = icps['__mdb_group_keys'] - - for group in icps['__mdb_groups']: - icp = icps[frozenset(group)] - - # check ICP has calibration scores - if icp.cal_scores[0].shape[0] > 0: - - # filter rows by group - X = deepcopy(icp_X) - for key, val in zip(group_keys, group): - X = X[X[key] == val] - - if X.size > 0: - # set ICP caches - icp.nc_function.model.prediction_cache = X.pop(target_name).values - if icp.nc_function.normalizer: - icp.nc_function.normalizer.prediction_cache = X.pop('__mdb_selfaware_scores').values - - # predict and get confidence level given width or error rate constraints - if is_numerical: - all_confs = icp.predict(X.values) - error_rate = anomaly_error_rate if is_anomaly_task else None - significances, confs = get_numerical_conf_range(all_confs, - df_std_dev=analysis['df_std_dev'], - positive_domain=positive_domain, - group=frozenset(group), - error_rate=error_rate) - - # only replace where grouped ICP is more informative (i.e. 
tighter) - default_icp_widths = result.loc[X.index, 'upper'] - result.loc[X.index, 'lower'] - grouped_widths = np.subtract(confs[:, 1], confs[:, 0]) - insert_index = (default_icp_widths > grouped_widths)[lambda x: x.isin([True])].index - conf_index = (default_icp_widths.reset_index(drop=True) > - grouped_widths)[lambda x: x.isin([True])].index - - result.loc[insert_index, 'lower'] = confs[conf_index, 0] - result.loc[insert_index, 'upper'] = confs[conf_index, 1] - result.loc[insert_index, 'significance'] = significances[conf_index] - - else: - conf_candidates = list(range(20)) + list(range(20, 100, 10)) - all_ranges = np.array( - [icp.predict(X.values, significance=s / 100) - for s in conf_candidates]) - all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) - significances = get_categorical_conf(all_confs, conf_candidates) - result.loc[X.index, 'significance'] = significances - - insights['confidence'] = result['significance'].astype(float).tolist() - - if is_numerical: - insights['lower'] = result['lower'].astype(float) - insights['upper'] = result['upper'].astype(float) - - # anomaly detection - if is_anomaly_task: - anomalies = get_anomalies(insights, - data[target_name], - cooldown=anomaly_cooldown) - insights['anomaly'] = anomalies - - if timeseries_settings.is_timeseries and timeseries_settings.nr_predictions > 1 and is_numerical: - insights = add_tn_conf_bounds(insights, timeseries_settings) - - # Make sure the target and realted values are of an appropriate type - if timeseries_settings.is_timeseries and timeseries_settings.nr_predictions > 1: - # Array output that are not of type originally are odd and I'm not sure how to handle them - # Or if they even need handling yet - pass - elif target_dtype in (dtype.integer): - insights['prediction'] = insights['prediction'].astype(int) - insights['upper'] = insights['upper'].astype(int) - insights['lower'] = insights['lower'].astype(int) - elif target_dtype in (dtype.float): - insights['prediction'] = 
insights['prediction'].astype(float) - insights['upper'] = insights['upper'].astype(float) - insights['lower'] = insights['lower'].astype(float) - elif target_dtype in (dtype.short_text, dtype.rich_text, dtype.binary, dtype.categorical): - insights['prediction'] = insights['prediction'].astype(str) - - return insights + row_insights[f'order_{col}'] = get_inferred_timestamps( + row_insights, col, ts_analysis['deltas'], timeseries_settings) + + kwargs = { + 'data': data, + 'encoded_data': encoded_data, + 'predictions': predictions, + 'analysis': analysis, + 'target_name': target_name, + 'target_dtype': target_dtype, + 'tss': timeseries_settings, + 'positive_domain': positive_domain, + 'fixed_confidence': fixed_confidence, + 'anomaly_detection': anomaly_detection, + 'anomaly_error_rate': anomaly_error_rate, + 'anomaly_cooldown': anomaly_cooldown + } + + # ------------------------- # + # Call explanation blocks + # ------------------------- # + for block in explainer_blocks: + row_insights, global_insights = block.explain(row_insights, global_insights, **kwargs) + + return row_insights, global_insights diff --git a/lightwood/analysis/helpers/__init__.py b/lightwood/analysis/helpers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lightwood/analysis/acc_stats.py b/lightwood/analysis/helpers/acc_stats.py similarity index 76% rename from lightwood/analysis/acc_stats.py rename to lightwood/analysis/helpers/acc_stats.py index e6f81cc8b..a05597d8b 100644 --- a/lightwood/analysis/acc_stats.py +++ b/lightwood/analysis/helpers/acc_stats.py @@ -1,55 +1,70 @@ import random -from typing import Union +from types import SimpleNamespace +from typing import Dict, Optional import numpy as np -import pandas as pd from sklearn.metrics import confusion_matrix + from lightwood.api.dtype import dtype +from lightwood.analysis.base import BaseAnalysisBlock +from lightwood.helpers.general import evaluate_accuracy -class AccStats: - """ - Computes accuracy stats and a 
confusion matrix for the validation dataset - """ +class AccStats(BaseAnalysisBlock): + """ Computes accuracy stats and a confusion matrix for the validation dataset """ + + def __init__(self, deps=('ICP',)): + super().__init__(deps=deps) # @TODO: enforce that this actually prevents early execution somehow + + def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + ns = SimpleNamespace(**kwargs) + + # @TODO: maybe pass ts_analysis to trigger group-wise MASE instead of R2 mean, though it wouldn't be 0-1 bounded + info['score_dict'] = evaluate_accuracy(ns.data, ns.normal_predictions['prediction'], + ns.target, ns.accuracy_functions) + info['normal_accuracy'] = np.mean(list(info['score_dict'].values())) + + self.fit(ns, info['result_df']) + info['val_overall_acc'], info['acc_histogram'], info['cm'], info['acc_samples'] = self.get_accuracy_stats() + return info - def __init__(self, dtype_dict: dict, target: str, buckets: Union[None, dict]): - self.col_stats = dtype_dict - self.target = target - self.input_cols = list(dtype_dict.keys()) - self.buckets = buckets if buckets else {} + def fit(self, ns: SimpleNamespace, conf=Optional[np.ndarray]): + self.col_stats = ns.dtype_dict + self.target = ns.target + self.input_cols = list(ns.dtype_dict.keys()) + self.buckets = ns.stats_info.buckets if ns.stats_info.buckets else {} self.normal_predictions_bucketized = [] self.real_values_bucketized = [] self.numerical_samples_arr = [] - def fit(self, input_df: pd.DataFrame, predictions: pd.DataFrame, conf=Union[None, np.ndarray]): column_indexes = {} for i, col in enumerate(self.input_cols): column_indexes[col] = i real_present_inputs_arr = [] - for _, row in input_df.iterrows(): + for _, row in ns.data.iterrows(): present_inputs = [1] * len(self.input_cols) for i, col in enumerate(self.input_cols): if str(row[col]) in ('None', 'nan', '', 'Nan', 'NAN', 'NaN'): present_inputs[i] = 0 real_present_inputs_arr.append(present_inputs) - for n in range(len(predictions)): 
- row = input_df.iloc[n] + for n in range(len(ns.normal_predictions)): + row = ns.data.iloc[n] real_value = row[self.target] - predicted_value = predictions.iloc[n]['prediction'] + predicted_value = ns.normal_predictions.iloc[n]['prediction'] if isinstance(predicted_value, list): # T+N time series, for now we compare the T+1 prediction only @TODO: generalize predicted_value = predicted_value[0] predicted_value = predicted_value \ - if self.col_stats[self.target] not in [dtype.integer, dtype.float] \ + if self.col_stats[self.target] not in [dtype.integer, dtype.float, dtype.quantity] \ else float(predicted_value) real_value = real_value \ - if self.col_stats[self.target] not in [dtype.integer, dtype.float] \ + if self.col_stats[self.target] not in [dtype.integer, dtype.float, dtype.quantity] \ else float(real_value) if self.buckets: @@ -60,14 +75,14 @@ def fit(self, input_df: pd.DataFrame, predictions: pd.DataFrame, conf=Union[None predicted_value_b = predicted_value real_value_b = real_value - if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float]: + if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float, dtype.quantity]: predicted_range = conf.iloc[n][['lower', 'upper']].tolist() else: predicted_range = (predicted_value_b, predicted_value_b) self.real_values_bucketized.append(real_value_b) self.normal_predictions_bucketized.append(predicted_value_b) - if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float]: + if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float, dtype.quantity]: self.numerical_samples_arr.append((real_value, predicted_range)) def get_accuracy_stats(self, is_classification=None, is_numerical=None): @@ -148,7 +163,7 @@ def get_value_bucket(value, buckets, target_dtype): else: bucket = len(buckets) # for null values - elif target_dtype in (dtype.integer, dtype.float): + elif target_dtype in (dtype.integer, dtype.float, dtype.quantity): 
bucket = closest(buckets, value) else: bucket = len(buckets) # for null values diff --git a/lightwood/analysis/helpers/feature_importance.py b/lightwood/analysis/helpers/feature_importance.py new file mode 100644 index 000000000..d56fba132 --- /dev/null +++ b/lightwood/analysis/helpers/feature_importance.py @@ -0,0 +1,72 @@ +from copy import deepcopy +from types import SimpleNamespace +from typing import Dict + +import torch +import numpy as np + +from lightwood.analysis.base import BaseAnalysisBlock +from lightwood.helpers.general import evaluate_accuracy +from lightwood.analysis.nc.util import t_softmax + + +class GlobalFeatureImportance(BaseAnalysisBlock): + """ + Analysis block that estimates column importance with a variant of the LOCO (leave-one-covariate-out) algorithm. + + Roughly speaking, the procedure: + - iterates over all input columns + - if the input column is optional, then make a predict with its values set to None + - compare this accuracy with the accuracy obtained using all data + - all accuracy differences are passed through a softmax and reported as estimated column importance scores + + Note that, crucially, this method does not refit the predictor at any point. 
+ + Reference: + https://compstat-lmu.github.io/iml_methods_limitations/pfi.html + """ + def __init__(self, disable_column_importance): + super().__init__() + self.disable_column_importance = disable_column_importance + + def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + ns = SimpleNamespace(**kwargs) + + if self.disable_column_importance or ns.ts_cfg.is_timeseries or ns.has_pretrained_text_enc: + info['column_importances'] = None + else: + empty_input_accuracy = {} + ignorable_input_cols = [x for x in ns.input_cols if (not ns.ts_cfg.is_timeseries or + (x not in ns.ts_cfg.order_by and + x not in ns.ts_cfg.historical_columns))] + for col in ignorable_input_cols: + partial_data = deepcopy(ns.encoded_val_data) + partial_data.clear_cache() + partial_data.data_frame[col] = [None] * len(partial_data.data_frame[col]) + + if not ns.is_classification: + empty_input_preds = ns.predictor(partial_data) + else: + empty_input_preds = ns.predictor(partial_data, predict_proba=True) + + empty_input_accuracy[col] = np.mean(list(evaluate_accuracy( + ns.data, + empty_input_preds['prediction'], + ns.target, + ns.accuracy_functions + ).values())) + + column_importances = {} + acc_increases = [] + for col in ignorable_input_cols: + accuracy_increase = (info['normal_accuracy'] - empty_input_accuracy[col]) + acc_increases.append(accuracy_increase) + + # low 0.2 temperature to accentuate differences + acc_increases = t_softmax(torch.Tensor([acc_increases]), t=0.2).tolist()[0] + for col, inc in zip(ignorable_input_cols, acc_increases): + column_importances[col] = 10 * inc # scores go from 0 to 10 in GUI + + info['column_importances'] = column_importances + + return info diff --git a/lightwood/analysis/model_analyzer.py b/lightwood/analysis/model_analyzer.py deleted file mode 100644 index f9df286ae..000000000 --- a/lightwood/analysis/model_analyzer.py +++ /dev/null @@ -1,294 +0,0 @@ -from typing import Dict, List - -import torch -import numpy as np -import pandas as 
pd -from copy import deepcopy -from itertools import product -from sklearn.preprocessing import OneHotEncoder - -from lightwood.api import dtype -from lightwood.api.types import ModelAnalysis, StatisticalAnalysis, TimeseriesSettings -from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs -from lightwood.helpers.general import evaluate_accuracy -from lightwood.ensemble import BaseEnsemble -from lightwood.encoder.text.pretrained import PretrainedLangEncoder - -from lightwood.analysis.acc_stats import AccStats -from lightwood.analysis.nc.norm import Normalizer -from lightwood.analysis.nc.nc import BoostedAbsErrorErrFunc -from lightwood.analysis.nc.util import clean_df, set_conf_range -from lightwood.analysis.nc.icp import IcpRegressor, IcpClassifier -from lightwood.analysis.nc.nc import RegressorNc, ClassifierNc, MarginErrFunc -from lightwood.analysis.nc.wrappers import ConformalClassifierAdapter, ConformalRegressorAdapter, t_softmax - - -""" -Pending: - - [] simplify nonconformist custom implementation to deprecate wrappers - - [] reimplement caching for faster analysis? 
- - [] confidence for T+N <- active research question -""" - - -def model_analyzer( - predictor: BaseEnsemble, - data: List[EncodedDs], - train_data: List[EncodedDs], - stats_info: StatisticalAnalysis, - target: str, - ts_cfg: TimeseriesSettings, - dtype_dict: Dict[str, str], - disable_column_importance: bool, - fixed_significance: float, - positive_domain: bool, - confidence_normalizer: bool, - accuracy_functions -): - """Analyses model on a validation subset to evaluate accuracy and confidence of future predictions""" - - data_type = dtype_dict[target] - data_subtype = data_type - - is_numerical = data_type in (dtype.integer, dtype.float, dtype.array, dtype.tsarray) - is_classification = data_type in (dtype.categorical, dtype.binary) - is_multi_ts = ts_cfg.is_timeseries and ts_cfg.nr_predictions > 1 - - encoded_train_data = ConcatedEncodedDs(train_data) - encoded_data = ConcatedEncodedDs(data) - - has_pretrained_text_enc = any([isinstance(enc, PretrainedLangEncoder) - for enc in encoded_train_data.encoders.values()]) - disable_column_importance = disable_column_importance or ts_cfg.is_timeseries or has_pretrained_text_enc - - data = encoded_data.data_frame - runtime_analyzer = {} - predictions = {} - input_cols = list([col for col in data.columns if col != target]) - normal_predictions = predictor(encoded_data) if not is_classification else predictor( - encoded_data, predict_proba=True) - normal_predictions = normal_predictions.set_index(data.index) - - # confidence estimation with inductive conformal predictors (ICPs) - runtime_analyzer['icp'] = {'__mdb_active': False} - - fit_params = {'nr_preds': ts_cfg.nr_predictions or 0, 'columns_to_ignore': []} - fit_params['columns_to_ignore'].extend([f'timestep_{i}' for i in range(1, fit_params['nr_preds'])]) - - if is_classification: - if predictor.supports_proba: - all_cat_cols = [col for col in normal_predictions.columns if '__mdb_proba' in col] - all_classes = np.array([col.replace('__mdb_proba_', '') for col in 
all_cat_cols]) - else: - class_keys = sorted(encoded_data.encoders[target].rev_map.keys()) - all_classes = np.array([encoded_data.encoders[target].rev_map[idx] for idx in class_keys]) - - if data_subtype != dtype.tags: - enc = OneHotEncoder(sparse=False, handle_unknown='ignore') - enc.fit(all_classes.reshape(-1, 1)) - runtime_analyzer['label_encoders'] = enc # needed to repr cat labels inside nonconformist - else: - runtime_analyzer['label_encoders'] = None - - adapter = ConformalClassifierAdapter - nc_function = MarginErrFunc() - nc_class = ClassifierNc - icp_class = IcpClassifier - - else: - adapter = ConformalRegressorAdapter - nc_function = BoostedAbsErrorErrFunc() - nc_class = RegressorNc - icp_class = IcpRegressor - - if is_numerical or (is_classification and data_subtype != dtype.tags): - model = adapter(predictor) - - norm_params = {'target': target, 'dtype_dict': dtype_dict, 'predictor': predictor, - 'encoders': encoded_data.encoders, 'is_multi_ts': is_multi_ts, 'stop_after': 1e2} - if confidence_normalizer: - normalizer = Normalizer(fit_params=norm_params) - normalizer.fit(train_data) - normalizer.prediction_cache = normalizer(encoded_data) - else: - normalizer = None - - # instance the ICP - nc = nc_class(model, nc_function, normalizer=normalizer) - icp = icp_class(nc) - - runtime_analyzer['icp']['__default'] = icp - - # setup prediction cache to avoid additional .predict() calls - if is_classification: - if predictor.mixers[predictor.best_index].supports_proba: - icp.nc_function.model.prediction_cache = normal_predictions[all_cat_cols].values - else: - predicted_classes = pd.get_dummies(normal_predictions['prediction']).values # inflate to one-hot enc - icp.nc_function.model.prediction_cache = predicted_classes - - elif is_multi_ts: - # we fit ICPs for time series confidence bounds only at t+1 forecast - icp.nc_function.model.prediction_cache = np.array([p[0] for p in normal_predictions['prediction']]) - else: - icp.nc_function.model.prediction_cache = 
np.array(normal_predictions['prediction']) - - if not is_classification: - runtime_analyzer['df_std_dev'] = {'__default': stats_info.df_std_dev} - - # fit additional ICPs in time series tasks with grouped columns - if ts_cfg.is_timeseries and ts_cfg.group_by: - - # create an ICP for each possible group - group_info = data[ts_cfg.group_by].to_dict('list') - all_group_combinations = list(product(*[set(x) for x in group_info.values()])) - runtime_analyzer['icp']['__mdb_groups'] = all_group_combinations - runtime_analyzer['icp']['__mdb_group_keys'] = [x for x in group_info.keys()] - - for combination in all_group_combinations: - runtime_analyzer['icp'][frozenset(combination)] = deepcopy(icp) - - # calibrate ICP - icp_df = deepcopy(data) - icp_df, y = clean_df(icp_df, target, is_classification, runtime_analyzer.get('label_encoders', None)) - runtime_analyzer['icp']['__default'].index = icp_df.columns - runtime_analyzer['icp']['__default'].calibrate(icp_df.values, y) - - # get confidence estimation for validation dataset - conf, ranges = set_conf_range( - icp_df, icp, dtype_dict[target], - runtime_analyzer, positive_domain=positive_domain, significance=fixed_significance) - if not is_classification: - result_df = pd.DataFrame(index=data.index, columns=['confidence', 'lower', 'upper'], dtype=float) - result_df.loc[icp_df.index, 'lower'] = ranges[:, 0] - result_df.loc[icp_df.index, 'upper'] = ranges[:, 1] - else: - result_df = pd.DataFrame(index=data.index, columns=['confidence'], dtype=float) - - result_df.loc[icp_df.index, 'confidence'] = conf - - # calibrate additional grouped ICPs - if ts_cfg.is_timeseries and ts_cfg.group_by: - icps = runtime_analyzer['icp'] - group_keys = icps['__mdb_group_keys'] - - # add all predictions to DF - icps_df = deepcopy(data) - if is_multi_ts: - icps_df[f'__predicted_{target}'] = [p[0] for p in normal_predictions['prediction']] - else: - icps_df[f'__predicted_{target}'] = normal_predictions['prediction'] - - for group in 
icps['__mdb_groups']: - icp_df = icps_df - if icps[frozenset(group)].nc_function.normalizer is not None: - icp_df[f'__norm_{target}'] = icps[frozenset(group)].nc_function.normalizer.prediction_cache - - # filter irrelevant rows for each group combination - for key, val in zip(group_keys, group): - icp_df = icp_df[icp_df[key] == val] - - # save relevant predictions in the caches, then calibrate the ICP - pred_cache = icp_df.pop(f'__predicted_{target}').values - icps[frozenset(group)].nc_function.model.prediction_cache = pred_cache - icp_df, y = clean_df(icp_df, target, is_classification, runtime_analyzer.get('label_encoders', None)) - if icps[frozenset(group)].nc_function.normalizer is not None: - icps[frozenset(group)].nc_function.normalizer.prediction_cache = icp_df.pop( - f'__norm_{target}').values - - icps[frozenset(group)].index = icp_df.columns # important at inference time - icps[frozenset(group)].calibrate(icp_df.values, y) - - # save training std() for bounds width selection - if not is_classification: - icp_train_df = data - for key, val in zip(group_keys, group): - icp_train_df = icp_train_df[icp_train_df[key] == val] - y_train = icp_train_df[target].values - runtime_analyzer['df_std_dev'][frozenset(group)] = y_train.std() - - # get bounds for relevant rows in validation dataset - conf, group_ranges = set_conf_range( - icp_df, icps[frozenset(group)], - dtype_dict[target], - runtime_analyzer, group=frozenset(group), - positive_domain=positive_domain, significance=fixed_significance) - # save group bounds - if not is_classification: - result_df.loc[icp_df.index, 'lower'] = group_ranges[:, 0] - result_df.loc[icp_df.index, 'upper'] = group_ranges[:, 1] - - result_df.loc[icp_df.index, 'confidence'] = conf - - # consolidate all groups here - if not is_classification: - ranges = result_df.values - predictions['confidence_range'] = ranges - - runtime_analyzer['icp']['__mdb_active'] = True - - # get accuracy metric for validation data - score_dict = 
evaluate_accuracy( - data, - normal_predictions['prediction'], - target, - accuracy_functions - ) - normal_accuracy = np.mean(list(score_dict.values())) - - # compute global feature importance - if not disable_column_importance: - empty_input_accuracy = {} - ignorable_input_cols = [x for x in input_cols if (not ts_cfg.is_timeseries or - (x not in ts_cfg.order_by and - x not in ts_cfg.historical_columns))] - for col in ignorable_input_cols: - partial_data = deepcopy(encoded_data) - partial_data.clear_cache() - for ds in partial_data.encoded_ds_arr: - ds.data_frame[col] = [None] * len(ds.data_frame[col]) - - if not is_classification: - empty_input_preds = predictor(partial_data) - else: - empty_input_preds = predictor(partial_data, predict_proba=True) - - empty_input_accuracy[col] = np.mean(list(evaluate_accuracy( - data, - empty_input_preds['prediction'], - target, - accuracy_functions - ).values())) - - column_importances = {} - acc_increases = [] - for col in ignorable_input_cols: - accuracy_increase = (normal_accuracy - empty_input_accuracy[col]) - acc_increases.append(accuracy_increase) - - # low 0.2 temperature to accentuate differences - acc_increases = t_softmax(torch.Tensor([acc_increases]), t=0.2).tolist()[0] - for col, inc in zip(ignorable_input_cols, acc_increases): - column_importances[col] = 10 * inc # scores go from 0 to 10 in GUI - else: - column_importances = None - - acc_stats = AccStats(dtype_dict=dtype_dict, target=target, buckets=stats_info.buckets) - acc_stats.fit(data, normal_predictions, conf=result_df) - bucket_accuracy, accuracy_histogram, cm, accuracy_samples = acc_stats.get_accuracy_stats( - is_classification=is_classification, is_numerical=is_numerical) - runtime_analyzer['bucket_accuracy'] = bucket_accuracy - - model_analysis = ModelAnalysis( - accuracies=score_dict, - accuracy_histogram=accuracy_histogram, - accuracy_samples=accuracy_samples, - train_sample_size=len(encoded_train_data), - test_sample_size=len(encoded_data), - 
confusion_matrix=cm, - column_importances=column_importances, - histograms=stats_info.histograms, - dtypes=dtype_dict - ) - - return model_analysis, runtime_analyzer diff --git a/lightwood/analysis/nc/LICENSE b/lightwood/analysis/nc/LICENSE new file mode 100644 index 000000000..f305d4eb9 --- /dev/null +++ b/lightwood/analysis/nc/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Henrik Linusson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/lightwood/analysis/nc/__init__.py b/lightwood/analysis/nc/__init__.py index e3208df7b..e69de29bb 100644 --- a/lightwood/analysis/nc/__init__.py +++ b/lightwood/analysis/nc/__init__.py @@ -1 +0,0 @@ -# TODO: update to latest repo version, as pypi 2.1.0 release is outdated! 
diff --git a/lightwood/analysis/nc/base.py b/lightwood/analysis/nc/base.py index 7546cd657..2e472e4d4 100644 --- a/lightwood/analysis/nc/base.py +++ b/lightwood/analysis/nc/base.py @@ -5,6 +5,9 @@ from sklearn.base import BaseEstimator +from lightwood.analysis.nc.util import t_softmax + + class RegressorMixin(object): def __init__(self) -> None: super(RegressorMixin, self).__init__() @@ -109,3 +112,35 @@ def __init__(self, model: object, fit_params: Dict[str, object] = None) -> None: def _underlying_predict(self, x: np.array) -> np.array: return self.model.predict(x) + + +class CachedRegressorAdapter(RegressorAdapter): + def __init__(self, model, fit_params=None): + super(CachedRegressorAdapter, self).__init__(model, fit_params) + self.prediction_cache = None + + def fit(self, x=None, y=None): + """ At this point, the predictor has already been trained, but this + has to be called to setup some things in the nonconformist backend """ + pass + + def predict(self, x=None): + """ Same as in .fit() + :return: np.array (n_test, n_classes) with class probability estimates """ + return self.prediction_cache + + +class CachedClassifierAdapter(ClassifierAdapter): + def __init__(self, model, fit_params=None): + super(CachedClassifierAdapter, self).__init__(model, fit_params) + self.prediction_cache = None + + def fit(self, x=None, y=None): + """ At this point, the predictor has already been trained, but this + has to be called to setup some things in the nonconformist backend """ + pass + + def predict(self, x=None): + """ Same as in .fit() + :return: np.array (n_test, n_classes) with class probability estimates """ + return t_softmax(self.prediction_cache, t=0.5) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py new file mode 100644 index 000000000..41390f81d --- /dev/null +++ b/lightwood/analysis/nc/calibrate.py @@ -0,0 +1,379 @@ +from copy import deepcopy +from itertools import product +from typing import Dict, Tuple +from types import 
SimpleNamespace + +import numpy as np +import pandas as pd +from sklearn.preprocessing import OneHotEncoder + +from lightwood.api.dtype import dtype +from lightwood.helpers.ts import add_tn_conf_bounds + +from lightwood.analysis.base import BaseAnalysisBlock +from lightwood.analysis.nc.norm import Normalizer +from lightwood.analysis.nc.icp import IcpRegressor, IcpClassifier +from lightwood.analysis.nc.base import CachedRegressorAdapter, CachedClassifierAdapter +from lightwood.analysis.nc.nc import BoostedAbsErrorErrFunc, RegressorNc, ClassifierNc, MarginErrFunc +from lightwood.analysis.nc.util import clean_df, set_conf_range, get_numeric_conf_range, \ + get_categorical_conf, get_anomalies + + +class ICP(BaseAnalysisBlock): + """ Confidence estimation block, uses inductive conformal predictors (ICPs) for model agnosticity """ + + def __init__(self, + fixed_significance: float, + positive_domain: bool, + confidence_normalizer: bool + ): + super().__init__() + self.fixed_significance = fixed_significance + self.positive_domain = positive_domain + self.confidence_normalizer = confidence_normalizer + + def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + ns = SimpleNamespace(**kwargs) + + data_type = ns.dtype_dict[ns.target] + output = {'icp': {'__mdb_active': False}} + + fit_params = {'nr_preds': ns.ts_cfg.nr_predictions or 0, 'columns_to_ignore': []} + fit_params['columns_to_ignore'].extend([f'timestep_{i}' for i in range(1, fit_params['nr_preds'])]) + + if ns.is_classification: + if ns.predictor.supports_proba: + all_cat_cols = [col for col in ns.normal_predictions.columns if '__mdb_proba' in col] + all_classes = np.array([col.replace('__mdb_proba_', '') for col in all_cat_cols]) + else: + class_keys = sorted(ns.encoded_val_data.encoders[ns.target].rev_map.keys()) + all_classes = np.array([ns.encoded_val_data.encoders[ns.target].rev_map[idx] for idx in class_keys]) + + if data_type != dtype.tags: + enc = OneHotEncoder(sparse=False, 
handle_unknown='ignore') + enc.fit(all_classes.reshape(-1, 1)) + output['label_encoders'] = enc # needed to repr cat labels inside nonconformist + else: + output['label_encoders'] = None + + adapter = CachedClassifierAdapter + nc_function = MarginErrFunc() + nc_class = ClassifierNc + icp_class = IcpClassifier + + else: + adapter = CachedRegressorAdapter + nc_function = BoostedAbsErrorErrFunc() + nc_class = RegressorNc + icp_class = IcpRegressor + + result_df = pd.DataFrame() + + if ns.is_numerical or (ns.is_classification and data_type != dtype.tags): + model = adapter(ns.predictor) + + norm_params = {'target': ns.target, 'dtype_dict': ns.dtype_dict, 'predictor': ns.predictor, + 'encoders': ns.encoded_val_data.encoders, 'is_multi_ts': ns.is_multi_ts, 'stop_after': 1e2} + if self.confidence_normalizer: + normalizer = Normalizer(fit_params=norm_params) + normalizer.fit(ns.train_data) + normalizer.prediction_cache = normalizer(ns.encoded_val_data) + else: + normalizer = None + + # instance the ICP + nc = nc_class(model, nc_function, normalizer=normalizer) + icp = icp_class(nc) + + output['icp']['__default'] = icp + + # setup prediction cache to avoid additional .predict() calls + if ns.is_classification: + if ns.predictor.mixers[ns.predictor.best_index].supports_proba: + icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values + else: + predicted_classes = pd.get_dummies( + ns.normal_predictions['prediction']).values # inflate to one-hot enc + icp.nc_function.model.prediction_cache = predicted_classes + + elif ns.is_multi_ts: + # we fit ICPs for time series confidence bounds only at t+1 forecast + icp.nc_function.model.prediction_cache = np.array([p[0] for p in ns.normal_predictions['prediction']]) + else: + icp.nc_function.model.prediction_cache = np.array(ns.normal_predictions['prediction']) + + if not ns.is_classification: + output['df_std_dev'] = {'__default': ns.stats_info.df_std_dev} + + # fit additional ICPs in time series tasks with 
grouped columns + if ns.ts_cfg.is_timeseries and ns.ts_cfg.group_by: + + # create an ICP for each possible group + group_info = ns.data[ns.ts_cfg.group_by].to_dict('list') + all_group_combinations = list(product(*[set(x) for x in group_info.values()])) + output['icp']['__mdb_groups'] = all_group_combinations + output['icp']['__mdb_group_keys'] = [x for x in group_info.keys()] + + for combination in all_group_combinations: + output['icp'][frozenset(combination)] = deepcopy(icp) + + # calibrate ICP + icp_df = deepcopy(ns.data) + icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None)) + output['icp']['__default'].index = icp_df.columns + output['icp']['__default'].calibrate(icp_df.values, y) + + # get confidence estimation for validation dataset + conf, ranges = set_conf_range( + icp_df, icp, ns.dtype_dict[ns.target], + output, positive_domain=self.positive_domain, significance=self.fixed_significance) + if not ns.is_classification: + result_df = pd.DataFrame(index=ns.data.index, columns=['confidence', 'lower', 'upper'], dtype=float) + result_df.loc[icp_df.index, 'lower'] = ranges[:, 0] + result_df.loc[icp_df.index, 'upper'] = ranges[:, 1] + else: + result_df = pd.DataFrame(index=ns.data.index, columns=['confidence'], dtype=float) + + result_df.loc[icp_df.index, 'confidence'] = conf + + # calibrate additional grouped ICPs + if ns.ts_cfg.is_timeseries and ns.ts_cfg.group_by: + icps = output['icp'] + group_keys = icps['__mdb_group_keys'] + + # add all predictions to DF + icps_df = deepcopy(ns.data) + if ns.is_multi_ts: + icps_df[f'__predicted_{ns.target}'] = [p[0] for p in ns.normal_predictions['prediction']] + else: + icps_df[f'__predicted_{ns.target}'] = ns.normal_predictions['prediction'] + + for group in icps['__mdb_groups']: + icp_df = icps_df + if icps[frozenset(group)].nc_function.normalizer is not None: + icp_df[f'__norm_{ns.target}'] = icps[frozenset(group)].nc_function.normalizer.prediction_cache + + # filter 
irrelevant rows for each group combination + for key, val in zip(group_keys, group): + icp_df = icp_df[icp_df[key] == val] + + # save relevant predictions in the caches, then calibrate the ICP + pred_cache = icp_df.pop(f'__predicted_{ns.target}').values + icps[frozenset(group)].nc_function.model.prediction_cache = pred_cache + icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None)) + if icps[frozenset(group)].nc_function.normalizer is not None: + icps[frozenset(group)].nc_function.normalizer.prediction_cache = icp_df.pop( + f'__norm_{ns.target}').values + + icps[frozenset(group)].index = icp_df.columns # important at inference time + icps[frozenset(group)].calibrate(icp_df.values, y) + + # save training std() for bounds width selection + if not ns.is_classification: + icp_train_df = ns.data + for key, val in zip(group_keys, group): + icp_train_df = icp_train_df[icp_train_df[key] == val] + y_train = icp_train_df[ns.target].values + output['df_std_dev'][frozenset(group)] = y_train.std() + + # get bounds for relevant rows in validation dataset + conf, group_ranges = set_conf_range( + icp_df, icps[frozenset(group)], + ns.dtype_dict[ns.target], + output, group=frozenset(group), + positive_domain=self.positive_domain, significance=self.fixed_significance) + # save group bounds + if not ns.is_classification: + result_df.loc[icp_df.index, 'lower'] = group_ranges[:, 0] + result_df.loc[icp_df.index, 'upper'] = group_ranges[:, 1] + + result_df.loc[icp_df.index, 'confidence'] = conf + + # consolidate all groups here + output['icp']['__mdb_active'] = True + + output['result_df'] = result_df + + info = {**info, **output} + return info + + def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object], + **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]: + ns = SimpleNamespace(**kwargs) + + if ns.analysis['icp']['__mdb_active']: + icp_X = deepcopy(ns.data) + + # replace observed data w/predictions + preds = 
ns.predictions['prediction'] + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1: + preds = [p[0] for p in preds] + + for col in [f'timestep_{i}' for i in range(1, ns.tss.nr_predictions)]: + if col in icp_X.columns: + icp_X.pop(col) # erase ignorable columns + + icp_X[ns.target_name] = preds + + is_categorical = ns.target_dtype in (dtype.binary, dtype.categorical, dtype.array) + is_numerical = ns.target_dtype in [dtype.integer, dtype.float, + dtype.quantity] or ns.target_dtype == dtype.array + is_anomaly_task = is_numerical and ns.tss.is_timeseries and ns.anomaly_detection + + if (is_numerical or is_categorical) and ns.analysis['icp'].get('__mdb_active', False): + + # reorder DF index + index = ns.analysis['icp']['__default'].index.values + index = np.append(index, ns.target_name) if ns.target_name not in index else index + icp_X = icp_X.reindex(columns=index) # important, else bounds can be invalid + + # only one normalizer, even if it's a grouped time series task + normalizer = ns.analysis['icp']['__default'].nc_function.normalizer + if normalizer: + normalizer.prediction_cache = normalizer(ns.encoded_data) + icp_X['__mdb_selfaware_scores'] = normalizer.prediction_cache + + # get ICP predictions + result_cols = ['lower', 'upper', 'significance'] if is_numerical else ['significance'] + result = pd.DataFrame(index=icp_X.index, columns=result_cols) + + # base ICP + X = deepcopy(icp_X) + # Calling `values` multiple times increased runtime of this function; referenced var is faster + icp_values = X.values + + # get all possible ranges + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1 and is_numerical: + + # bounds in time series are only given for the first forecast + ns.analysis['icp']['__default'].nc_function.model.prediction_cache = \ + [p[0] for p in ns.predictions['prediction']] + all_confs = ns.analysis['icp']['__default'].predict(icp_values) + + elif is_numerical: + ns.analysis['icp']['__default'].nc_function.model.prediction_cache = 
ns.predictions['prediction'] + all_confs = ns.analysis['icp']['__default'].predict(icp_values) + + # categorical + else: + predicted_proba = True if any(['__mdb_proba' in col for col in ns.predictions.columns]) else False + if predicted_proba: + all_cat_cols = [col for col in ns.predictions.columns if '__mdb_proba' in col] + class_dists = ns.predictions[all_cat_cols].values + for icol, cat_col in enumerate(all_cat_cols): + row_insights.loc[X.index, cat_col] = class_dists[:, icol] + else: + class_dists = pd.get_dummies(ns.predictions['prediction']).values + + ns.analysis['icp']['__default'].nc_function.model.prediction_cache = class_dists + + conf_candidates = list(range(20)) + list(range(20, 100, 10)) + all_ranges = np.array( + [ns.analysis['icp']['__default'].predict(icp_values, significance=s / 100) + for s in conf_candidates]) + all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) + + # convert (B, 2, 99) into (B, 2) given width or error rate constraints + if is_numerical: + significances = ns.fixed_confidence + if significances is not None: + confs = all_confs[:, :, int(100 * (1 - significances)) - 1] + else: + error_rate = ns.anomaly_error_rate if is_anomaly_task else None + significances, confs = get_numeric_conf_range(all_confs, + df_std_dev=ns.analysis['df_std_dev'], + positive_domain=self.positive_domain, + error_rate=error_rate) + result.loc[X.index, 'lower'] = confs[:, 0] + result.loc[X.index, 'upper'] = confs[:, 1] + else: + conf_candidates = list(range(20)) + list(range(20, 100, 10)) + significances = get_categorical_conf(all_confs, conf_candidates) + + result.loc[X.index, 'significance'] = significances + + # grouped time series, we replace bounds in rows that have a trained ICP + if ns.analysis['icp'].get('__mdb_groups', False): + icps = ns.analysis['icp'] + group_keys = icps['__mdb_group_keys'] + + for group in icps['__mdb_groups']: + icp = icps[frozenset(group)] + + # check ICP has calibration scores + if icp.cal_scores[0].shape[0] > 0: + + 
# filter rows by group + X = deepcopy(icp_X) + for key, val in zip(group_keys, group): + X = X[X[key] == val] + + if X.size > 0: + # set ICP caches + icp.nc_function.model.prediction_cache = X.pop(ns.target_name).values + if icp.nc_function.normalizer: + icp.nc_function.normalizer.prediction_cache = X.pop('__mdb_selfaware_scores').values + + # predict and get confidence level given width or error rate constraints + if is_numerical: + all_confs = icp.predict(X.values) + error_rate = ns.anomaly_error_rate if is_anomaly_task else None + significances, confs = get_numeric_conf_range(all_confs, + df_std_dev=ns.analysis['df_std_dev'], + positive_domain=self.positive_domain, + group=frozenset(group), + error_rate=error_rate) + + # only replace where grouped ICP is more informative (i.e. tighter) + default_icp_widths = result.loc[X.index, 'upper'] - result.loc[X.index, 'lower'] + grouped_widths = np.subtract(confs[:, 1], confs[:, 0]) + insert_index = (default_icp_widths > grouped_widths)[lambda x: x.isin([True])].index + conf_index = (default_icp_widths.reset_index(drop=True) > + grouped_widths)[lambda x: x.isin([True])].index + + result.loc[insert_index, 'lower'] = confs[conf_index, 0] + result.loc[insert_index, 'upper'] = confs[conf_index, 1] + result.loc[insert_index, 'significance'] = significances[conf_index] + + else: + conf_candidates = list(range(20)) + list(range(20, 100, 10)) + all_ranges = np.array( + [icp.predict(X.values, significance=s / 100) + for s in conf_candidates]) + all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) + significances = get_categorical_conf(all_confs, conf_candidates) + result.loc[X.index, 'significance'] = significances + + row_insights['confidence'] = result['significance'].astype(float).tolist() + + if is_numerical: + row_insights['lower'] = result['lower'].astype(float) + row_insights['upper'] = result['upper'].astype(float) + + # anomaly detection + if is_anomaly_task: + anomalies = get_anomalies(row_insights, + 
ns.data[ns.target_name], + cooldown=ns.anomaly_cooldown) + row_insights['anomaly'] = anomalies + + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1 and is_numerical: + row_insights = add_tn_conf_bounds(row_insights, ns.tss) + + # Make sure the target and real values are of an appropriate type + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1: + # Array output that are not of type originally are odd and I'm not sure how to handle them + # Or if they even need handling yet + pass + elif ns.target_dtype in (dtype.integer): + row_insights['prediction'] = row_insights['prediction'].clip(-pow(2, 62), pow(2, 62)).astype(int) + row_insights['upper'] = row_insights['upper'].clip(-pow(2, 62), pow(2, 62)).astype(int) + row_insights['lower'] = row_insights['lower'].clip(-pow(2, 62), pow(2, 62)).astype(int) + elif ns.target_dtype in (dtype.float, dtype.quantity): + row_insights['prediction'] = row_insights['prediction'].astype(float) + row_insights['upper'] = row_insights['upper'].astype(float) + row_insights['lower'] = row_insights['lower'].astype(float) + elif ns.target_dtype in (dtype.short_text, dtype.rich_text, dtype.binary, dtype.categorical): + row_insights['prediction'] = row_insights['prediction'].astype(str) + + return row_insights, global_insights diff --git a/lightwood/analysis/nc/metrics.py b/lightwood/analysis/nc/metrics.py index 65e5f06c7..803fea873 100644 --- a/lightwood/analysis/nc/metrics.py +++ b/lightwood/analysis/nc/metrics.py @@ -152,8 +152,7 @@ def class_one_c(prediction, y, significance): only a single class label) of a conformal classification model. """ prediction = prediction > significance - n_singletons = np.sum(1 for _ in filter(lambda x: np.sum(x) == 1, - prediction)) + n_singletons = np.sum(1 for _ in filter(lambda x: np.sum(x) == 1, prediction)) return n_singletons / y.size @@ -162,8 +161,7 @@ def class_empty(prediction, y, significance): only a single class label) of a conformal classification model. 
""" prediction = prediction > significance - n_empty = np.sum(1 for _ in filter(lambda x: np.sum(x) == 0, - prediction)) + n_empty = np.sum(1 for _ in filter(lambda x: np.sum(x) == 0, prediction)) return n_empty / y.size diff --git a/lightwood/analysis/nc/norm.py b/lightwood/analysis/nc/norm.py index e1be97ccf..ecb23e0b6 100644 --- a/lightwood/analysis/nc/norm.py +++ b/lightwood/analysis/nc/norm.py @@ -1,4 +1,4 @@ -from typing import Union, List +from typing import Union import torch import numpy as np @@ -29,9 +29,8 @@ def __init__(self, fit_params: dict): self.bounds = (0.5, 1.5) self.error_fn = mean_absolute_error - def fit(self, data: List[EncodedDs]) -> None: + def fit(self, data: EncodedDs) -> None: try: - data = ConcatedEncodedDs(data) preds = self.base_predictor(data, predict_proba=True) truths = data.data_frame[self.target] labels = self.get_labels(preds, truths.values, data.encoders[self.target]) @@ -65,7 +64,7 @@ def score(self, data) -> np.ndarray: return scores def get_labels(self, preds: pd.DataFrame, truths: np.ndarray, target_enc) -> np.ndarray: - if self.target_dtype in [dtype.integer, dtype.float]: + if self.target_dtype in [dtype.integer, dtype.float, dtype.quantity]: if not self.multi_ts_task: preds = preds.values.squeeze() else: diff --git a/lightwood/analysis/nc/util.py b/lightwood/analysis/nc/util.py index f7061247e..75902bbd9 100644 --- a/lightwood/analysis/nc/util.py +++ b/lightwood/analysis/nc/util.py @@ -1,10 +1,17 @@ +import torch import numpy as np +from torch.nn.functional import softmax from lightwood.api.dtype import dtype +def t_softmax(x, t=1.0, axis=1): + """ Softmax with temperature scaling """ + return softmax(torch.Tensor(x) / t, dim=axis).numpy() + + def clean_df(df, target, is_classification, label_encoders): """ Returns cleaned DF for nonconformist calibration """ - # @TODO: reevaluate whether this can be streamlined inside custom nonconf + # @TODO: reevaluate whether this can be streamlined enc = label_encoders y = 
df.pop(target).values @@ -14,7 +21,7 @@ def clean_df(df, target, is_classification, label_encoders): cats = enc.categories_[0].tolist() # the last element is "__mdb_unknown_cat" y = np.array([cats.index(i) if i in cats else len(cats) - 1 for i in y]) - y = y.astype(int) + y = y.clip(-pow(2, 63), pow(2, 63)).astype(int) else: y = y.astype(float) @@ -27,7 +34,7 @@ def set_conf_range( significance: desired confidence level. can be preset 0 < x <= 0.99 """ # numerical - if target_type in (dtype.integer, dtype.float, dtype.array, dtype.tsarray): + if target_type in (dtype.integer, dtype.float, dtype.array, dtype.tsarray, dtype.quantity): # ICP gets all possible bounds (shape: (B, 2, 99)) all_ranges = icp.predict(X.values) @@ -64,7 +71,7 @@ def set_conf_range( return 0.005, np.zeros((X.shape[0], 2)) -def get_numerical_conf_range( +def get_numeric_conf_range( all_confs, df_std_dev=None, positive_domain=False, std_tol=1, group='__default', error_rate=None): """ Gets prediction bounds for numerical targets, based on ICP estimation and width tolerance error_rate: pre-determined error rate for the ICP, used in anomaly detection tasks to adjust the diff --git a/lightwood/analysis/nc/wrappers.py b/lightwood/analysis/nc/wrappers.py deleted file mode 100644 index 2abb8942c..000000000 --- a/lightwood/analysis/nc/wrappers.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch -from torch.nn.functional import softmax -from lightwood.analysis.nc.base import RegressorAdapter, ClassifierAdapter - - -def t_softmax(x, t=1.0, axis=1): - """ Softmax with temperature scaling """ - # @TODO: move this, not a wrapper - return softmax(torch.Tensor(x) / t, dim=axis).numpy() - - -def clear_icp_state(icp): - """ We clear last_x and last_y to minimize file size. Model has to be cleared because it cannot be pickled. 
""" - icp.model.model = None - icp.model.last_x = None - icp.model.last_y = None - if icp.normalizer is not None: - icp.normalizer.model = None - - -class ConformalRegressorAdapter(RegressorAdapter): - def __init__(self, model, fit_params=None): - super(ConformalRegressorAdapter, self).__init__(model, fit_params) - self.prediction_cache = None - - def fit(self, x=None, y=None): - """ At this point, the predictor has already been trained, but this - has to be called to setup some things in the nonconformist backend """ - pass - - def predict(self, x=None): - """ Same as in .fit() - :return: np.array (n_test, n_classes) with class probability estimates """ - return self.prediction_cache - - -class ConformalClassifierAdapter(ClassifierAdapter): - def __init__(self, model, fit_params=None): - super(ConformalClassifierAdapter, self).__init__(model, fit_params) - self.prediction_cache = None - - def fit(self, x=None, y=None): - """ At this point, the predictor has already been trained, but this - has to be called to setup some things in the nonconformist backend """ - pass - - def predict(self, x=None): - """ Same as in .fit() - :return: np.array (n_test, n_classes) with class probability estimates """ - return t_softmax(self.prediction_cache, t=0.5) diff --git a/lightwood/api/__init__.py b/lightwood/api/__init__.py index f250bf9b7..1dc0f177e 100644 --- a/lightwood/api/__init__.py +++ b/lightwood/api/__init__.py @@ -11,7 +11,6 @@ DataAnalysis, ) from lightwood.api.predictor import PredictorInterface -from lightwood.api.encode import encode from lightwood.api.high_level import ( analyze_dataset, code_from_problem, @@ -39,7 +38,6 @@ "ModelAnalysis", "DataAnalysis", "PredictorInterface", - "encode", "dtype", "predictor_from_state", ] diff --git a/lightwood/api/encode.py b/lightwood/api/encode.py deleted file mode 100644 index caa5860c1..000000000 --- a/lightwood/api/encode.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import List -import pandas as pd -from 
lightwood.encoder.base import BaseEncoder -from lightwood.data.encoded_ds import EncodedDs - - -def encode(encoders: List[BaseEncoder], subsets: List[pd.DataFrame], target: str) -> List[EncodedDs]: - """ - Given a list of Lightwood encoders, and data subsets, applies the encoders onto each subset. - - :param encoders: A list of lightwood encoders, in the order of each of the column types. - :param folds: A list of data subsets, each being a separate dataframe with all the columns applied per encoder. - :param target: The name of the column that is the target for prediction. - - :returns: An encoded dataset for each encoder in the list - """ - if isinstance(subsets, pd.DataFrame): - subsets = [subsets] - - encoded_ds_arr: List[EncodedDs] = [] - for subset in subsets: - encoded_ds_arr.append(EncodedDs(encoders, subset, target)) - return encoded_ds_arr diff --git a/lightwood/api/high_level.py b/lightwood/api/high_level.py index ec6e02319..3d1d623a4 100644 --- a/lightwood/api/high_level.py +++ b/lightwood/api/high_level.py @@ -1,5 +1,6 @@ import os from types import ModuleType +from typing import Union import dill import pandas as pd from lightwood.api.types import DataAnalysis, JsonAI, ProblemDefinition @@ -14,9 +15,10 @@ import string import gc import time +from lightwood.helpers.log import log -def predictor_from_problem(df: pd.DataFrame, problem_definition: ProblemDefinition) -> PredictorInterface: +def predictor_from_problem(df: pd.DataFrame, problem_definition: Union[ProblemDefinition, dict]) -> PredictorInterface: """ Creates a ready-to-train ``Predictor`` object from some raw data and a ``ProblemDefinition``. Do not use this if you want to edit the JsonAI first. Usually you'd want to next train this predictor by calling the ``learn`` method on the same dataframe used to create it. 
@@ -28,11 +30,14 @@ def predictor_from_problem(df: pd.DataFrame, problem_definition: ProblemDefiniti if not isinstance(problem_definition, ProblemDefinition): problem_definition = ProblemDefinition.from_dict(problem_definition) + log.info(f'Dropping features: {problem_definition.ignore_features}') + df = df.drop(columns=problem_definition.ignore_features) + predictor_class_str = code_from_problem(df, problem_definition) return predictor_from_code(predictor_class_str) -def json_ai_from_problem(df: pd.DataFrame, problem_definition: ProblemDefinition) -> JsonAI: +def json_ai_from_problem(df: pd.DataFrame, problem_definition: Union[ProblemDefinition, dict]) -> JsonAI: """ Creates a JsonAI from your raw data and problem definition. Usually you would use this when you want to subsequently edit the JsonAI, the easiest way to do this is to unload it to a dictionary via `to_dict`, modify it, and then create a new object from it using `lightwood.JsonAI.from_dict`. It's usually better to generate the JsonAI using this function rather than writing it from scratch. 
@@ -41,10 +46,12 @@ def json_ai_from_problem(df: pd.DataFrame, problem_definition: ProblemDefinition :returns: A ``JsonAI`` object generated based on your data and problem specifications """ # noqa - if not isinstance(problem_definition, ProblemDefinition): problem_definition = ProblemDefinition.from_dict(problem_definition) + log.info(f'Dropping features: {problem_definition.ignore_features}') + df = df.drop(columns=problem_definition.ignore_features) + type_information = lightwood.data.infer_types(df, problem_definition.pct_invalid) statistical_analysis = lightwood.data.statistical_analysis( df, type_information.dtypes, type_information.identifiers, problem_definition) @@ -99,13 +106,18 @@ def analyze_dataset(df: pd.DataFrame) -> DataAnalysis: ) -def code_from_problem(df: pd.DataFrame, problem_definition: ProblemDefinition) -> str: +def code_from_problem(df: pd.DataFrame, problem_definition: Union[ProblemDefinition, dict]) -> str: """ :param df: The raw data :param problem_definition: The manual specifications for your predictive problem :returns: The text code generated based on your data and problem specifications """ + if not isinstance(problem_definition, ProblemDefinition): + problem_definition = ProblemDefinition.from_dict(problem_definition) + + log.info(f'Dropping features: {problem_definition.ignore_features}') + df = df.drop(columns=problem_definition.ignore_features) json_ai = json_ai_from_problem(df, problem_definition) predictor_code = code_from_json_ai(json_ai) return predictor_code diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 06a5342e7..d40c165e6 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1,11 +1,6 @@ -# TODO: We need a better way to specify trainable_encoders # TODO: lookup_encoder is awkward; similar to dtype, can we make a file with encoder_lookup? People may be interested # in seeing where these come from and it's not clear that you need to look here. 
-# TODO: What does `target_class_distribution` and `positive_domain` do? -# TODO: generate_json_ai is really large; can we abstract it into smaller functions to make it more readable? # TODO: add_implicit_values unit test ensures NO changes for a fully specified file. -# TODO: Please fix spelling on parallel_preped_encoders - from typing import Dict from lightwood.helpers.templating import call, inline_dict, align import black @@ -19,10 +14,47 @@ Output, ProblemDefinition, ) +import inspect + + +IMPORT_EXTERNAL_DIRS = """ +for import_dir in [os.path.expanduser('~/lightwood_modules'), '/etc/lightwood_modules']: + if os.path.exists(import_dir) and os.access(import_dir, os.R_OK): + for file_name in list(os.walk(import_dir))[0][2]: + if file_name[-3:] != '.py': + continue + mod_name = file_name[:-3] + loader = importlib.machinery.SourceFileLoader(mod_name, + os.path.join(import_dir, file_name)) + module = ModuleType(loader.name) + loader.exec_module(module) + sys.modules[mod_name] = module + exec(f'import {mod_name}') +""" - -trainable_encoders = ('PretrainedLangEncoder', 'CategoricalAutoEncoder', 'TimeSeriesEncoder', 'ArrayEncoder') -ts_encoders = ('TimeSeriesEncoder', 'TsNumericEncoder') +IMPORTS = """ +import lightwood +from lightwood.analysis import * +from lightwood.api import * +from lightwood.data import * +from lightwood.encoder import * +from lightwood.ensemble import * +from lightwood.helpers.device import * +from lightwood.helpers.general import * +from lightwood.helpers.log import * +from lightwood.helpers.numeric import * +from lightwood.helpers.parallelism import * +from lightwood.helpers.seed import * +from lightwood.helpers.text import * +from lightwood.helpers.torch import * +from lightwood.mixer import * +import pandas as pd +from typing import Dict, List +import os +from types import ModuleType +import importlib.machinery +import sys +""" def lookup_encoder( @@ -31,6 +63,7 @@ def lookup_encoder( is_target: bool, problem_defintion: ProblemDefinition, 
is_target_predicting_encoder: bool, + statistical_analysis: StatisticalAnalysis ): """ Assign a default encoder for a given column based on its data type, and whether it is a target. Encoders intake raw (but cleaned) data and return an feature representation. This function assigns, per data type, what the featurizer should be. This function runs on each column within the dataset available for model building to assign how it should be featurized. @@ -45,12 +78,15 @@ def lookup_encoder( :param problem_definition: The ``ProblemDefinition`` criteria; this populates specifics on how models and encoders may be trained. :param is_target_predicting_encoder: """ # noqa + tss = problem_defintion.timeseries_settings encoder_lookup = { dtype.integer: 'Integer.NumericEncoder', dtype.float: 'Float.NumericEncoder', dtype.binary: 'Binary.BinaryEncoder', - dtype.categorical: 'Categorical.CategoricalAutoEncoder', + dtype.categorical: 'Categorical.CategoricalAutoEncoder' + if statistical_analysis is None or len(statistical_analysis.histograms[col_name]) > 100 + else 'Categorical.OneHotEncoder', dtype.tags: 'Tags.MultiHotEncoder', dtype.date: 'Date.DatetimeEncoder', dtype.datetime: 'Datetime.DatetimeEncoder', @@ -116,11 +152,10 @@ def lookup_encoder( if encoder_dict['module'] == "Rich_Text.PretrainedLangEncoder" and not is_target: encoder_dict['args']['output_type'] = "$dtype_dict[$target]" - for encoder_name in trainable_encoders: - if encoder_name == encoder_dict['module'].split(".")[1]: - encoder_dict['args'][ - "stop_after" - ] = "$problem_definition.seconds_per_encoder" + if eval(encoder_dict['module'].split(".")[1]).is_trainable_encoder: + encoder_dict['args'][ + "stop_after" + ] = "$problem_definition.seconds_per_encoder" if is_target_predicting_encoder: encoder_dict['args']['embed_mode'] = 'False' @@ -140,7 +175,9 @@ def generate_json_ai( :param problem_definition: Specifies details of the model training/building procedure, as defined by ``ProblemDefinition`` :returns: JSON-AI 
object with fully populated details of the ML pipeline - """ # noqa + """ # noqaexec + exec(IMPORTS, globals()) + exec(IMPORT_EXTERNAL_DIRS, globals()) target = problem_definition.target input_cols = [] for col_name, col_dtype in type_information.dtypes.items(): @@ -153,6 +190,7 @@ def generate_json_ai( tss = problem_definition.timeseries_settings is_target_predicting_encoder = False + is_ts = problem_definition.timeseries_settings.is_timeseries # Single text column classification if ( len(input_cols) == 1 @@ -224,6 +262,7 @@ def generate_json_ai( 'module': 'BestOf', 'args': { 'accuracy_functions': '$accuracy_functions', + 'ts_analysis': 'self.ts_analysis' if is_ts else None } } )} @@ -234,7 +273,7 @@ def generate_json_ai( list(outputs.values())[0].data_dtype = dtype.tsarray list(outputs.values())[0].encoder = lookup_encoder( - type_information.dtypes[target], target, True, problem_definition, False + type_information.dtypes[target], target, True, problem_definition, False, statistical_analysis ) features: Dict[str, Feature] = {} @@ -242,17 +281,16 @@ def generate_json_ai( col_dtype = type_information.dtypes[col_name] dependency = [] encoder = lookup_encoder( - col_dtype, col_name, False, problem_definition, is_target_predicting_encoder + col_dtype, col_name, False, problem_definition, is_target_predicting_encoder, statistical_analysis ) - for encoder_name in ts_encoders: - if tss.is_timeseries and encoder_name == encoder['module'].split(".")[1]: - if tss.group_by is not None: - for group in tss.group_by: - dependency.append(group) + if tss.is_timeseries and eval(encoder['module'].split(".")[1]).is_timeseries_encoder: + if tss.group_by is not None: + for group in tss.group_by: + dependency.append(group) - if tss.use_previous_target: - dependency.append(f"__mdb_ts_previous_{target}") + if tss.use_previous_target: + dependency.append(f"__mdb_ts_previous_{target}") if len(dependency) > 0: feature = Feature(encoder=encoder, dependency=dependency) @@ -262,7 +300,7 @@ 
def generate_json_ai( # Decide on the accuracy functions to use output_dtype = list(outputs.values())[0].data_dtype - if output_dtype in [dtype.integer, dtype.float, dtype.date, dtype.datetime]: + if output_dtype in [dtype.integer, dtype.float, dtype.date, dtype.datetime, dtype.quantity]: accuracy_functions = ['r2_score'] elif output_dtype in [dtype.categorical, dtype.tags, dtype.binary]: accuracy_functions = ['balanced_accuracy_score'] @@ -271,6 +309,11 @@ def generate_json_ai( else: raise Exception(f'Please specify a custom accuracy function for output type {output_dtype}') + # special dispatch for t+1 time series forecasters + if is_ts: + if list(outputs.values())[0].data_dtype in [dtype.integer, dtype.float]: + accuracy_functions = ['evaluate_array_accuracy'] + if problem_definition.time_aim is None and ( problem_definition.seconds_per_mixer is None or problem_definition.seconds_per_encoder is None): problem_definition.time_aim = 1000 + np.log( @@ -282,8 +325,8 @@ def generate_json_ai( for x in type_information.dtypes.values()]) * 200 if problem_definition.time_aim is not None: - nr_trainable_encoders = len([x for x in features.values() if x.encoder['module'].split('.')[1] - in trainable_encoders]) + nr_trainable_encoders = len([x for x in features.values() if + eval(x.encoder['module'].split('.')[1]).is_trainable_encoder]) nr_mixers = len(list(outputs.values())[0].mixers) encoder_time_budget_pct = max(3.3 / 5, 1.5 + np.log(nr_trainable_encoders + 1) / 5) @@ -302,7 +345,6 @@ def generate_json_ai( explainer=None, features=features, outputs=outputs, - imports=None, problem_definition=problem_definition, identifiers=type_information.identifiers, timeseries_transformer=None, @@ -311,6 +353,59 @@ def generate_json_ai( ) +def merge_implicit_values(field, implicit_value): + exec(IMPORTS, globals()) + exec(IMPORT_EXTERNAL_DIRS, globals()) + module = eval(field['module']) + if inspect.isclass(module): + args = 
list(inspect.signature(module.__init__).parameters.keys())[1:] + else: + args = module.__code__.co_varnames + + for arg in args: + if 'args' not in field: + field['args'] = implicit_value['args'] + else: + if arg not in field['args']: + if arg in implicit_value['args']: + field['args'][arg] = implicit_value['args'][arg] + return field + + +def populate_implicit_field(json_ai: JsonAI, field_name: str, implicit_value: dict, is_timeseries: bool) -> None: + """ + Populate the implicit field of the JsonAI, either by filling it in entirely if missing, or by introspecting the class or function and assigning default values to the args in it's signature that are in the implicit default but haven't been populated by the user + + :params: json_ai: ``JsonAI`` object that describes the ML pipeline that may not have every detail fully specified. + :params: field_name: Name of the field the implicit field in ``JsonAI`` + :params: implicit_value: The dictionary containing implicit values for the module and arg in the field + :params: is_timeseries: Whether or not this is a timeseries problem + + :returns: nothing, this method mutates the respective field of the ``JsonAI`` object it receives + """ # noqa + # These imports might be slow, in which case the only solution is to line this code + field = json_ai.__getattribute__(field_name) + if field is None: + # This if is to only populated timeseries-specific implicit fields for implicit problems + if is_timeseries or field_name not in ('timeseries_analyzer', 'timeseries_transformer'): + field = implicit_value + + # If the user specified one or more subfields in a field that's a list + # Populate them with implicit arguments form the implicit values from that subfield + elif isinstance(field, list) and isinstance(implicit_value, list): + for i in range(len(field)): + sub_field_implicit = [x for x in implicit_value if x['module'] == field[i]['module']] + if len(sub_field_implicit) == 1: + field[i] = merge_implicit_values(field[i], 
sub_field_implicit[0]) + for sub_field_implicit in implicit_value: + if len([x for x in field if x['module'] == sub_field_implicit['module']]) == 0: + field.append(sub_field_implicit) + # If the user specified the field, add implicit arguments which we didn't specify + else: + field = merge_implicit_values(field, implicit_value) + json_ai.__setattr__(field_name, field) + + def add_implicit_values(json_ai: JsonAI) -> JsonAI: """ To enable brevity in writing, auto-generate the "unspecified/missing" details required in the ML pipeline. @@ -322,36 +417,6 @@ def add_implicit_values(json_ai: JsonAI) -> JsonAI: problem_definition = json_ai.problem_definition tss = problem_definition.timeseries_settings - imports = [ - 'from lightwood.mixer import Neural', 'from lightwood.mixer import LightGBM', - 'from lightwood.mixer import LightGBMArray', 'from lightwood.mixer import SkTime', - 'from lightwood.mixer import Unit', 'from lightwood.mixer import Regression', - 'from lightwood.ensemble import BestOf', 'from lightwood.data import cleaner', - 'from lightwood.data import transform_timeseries, timeseries_analyzer', 'from lightwood.data import splitter', - 'from lightwood.analysis import model_analyzer, explain', - 'from sklearn.metrics import r2_score, balanced_accuracy_score, accuracy_score', 'import pandas as pd', - 'from lightwood.helpers.seed import seed', 'from lightwood.helpers.log import log', 'import lightwood', - 'from lightwood.api import *', 'from lightwood.mixer import BaseMixer', - 'from lightwood.encoder import BaseEncoder, __ts_encoders__', - 'from lightwood.encoder import Array, Binary, Categorical, Date, Datetime, TimeSeries, Float, Image, Integer, Quantity, Rich_Text, Short_Text, Tags', # noqa - 'from lightwood.ensemble import BaseEnsemble', 'from typing import Dict, List', - 'from lightwood.helpers.parallelism import mut_method_call', - 'from lightwood.data.encoded_ds import ConcatedEncodedDs', 'from lightwood import ProblemDefinition'] - - if json_ai.imports 
is None: - json_ai.imports = imports - else: - json_ai.imports.extend(imports) - - for feature in [list(json_ai.outputs.values())[0], *json_ai.features.values()]: - encoder_import = feature.encoder['module'] - if "." in encoder_import: - continue - imports.append(f"from lightwood.encoder import {encoder_import}") - - if tss.use_previous_target: - imports.append('from lightwood.encoder import ArrayEncoder') - # Add implicit arguments # @TODO: Consider removing once we have a proper editor in studio mixers = json_ai.outputs[json_ai.problem_definition.target].mixers @@ -389,7 +454,7 @@ def add_implicit_values(json_ai: JsonAI) -> JsonAI: ensemble = json_ai.outputs[json_ai.problem_definition.target].ensemble ensemble['args']['target'] = ensemble['args'].get('target', '$target') - ensemble['args']['data'] = ensemble['args'].get('data', 'test_data') + ensemble['args']['data'] = ensemble['args'].get('data', 'encoded_test_data') ensemble['args']['mixers'] = ensemble['args'].get('mixers', '$mixers') for name in json_ai.features: @@ -400,95 +465,105 @@ def add_implicit_values(json_ai: JsonAI) -> JsonAI: json_ai.features[name].encoder['module'].split(".")[0].lower() ) - # Add implicit phases - # @TODO: Consider removing once we have a proper editor in studio - if json_ai.cleaner is None: - json_ai.cleaner = { - "module": "cleaner", - "args": { - "pct_invalid": "$problem_definition.pct_invalid", - "ignore_features": "$problem_definition.ignore_features", - "identifiers": "$identifiers", - "data": "data", - "dtype_dict": "$dtype_dict", - "target": "$target", - "mode": "$mode", - "timeseries_settings": "$problem_definition.timeseries_settings", - "anomaly_detection": "$problem_definition.anomaly_detection", - }, - } - - if json_ai.splitter is None: - json_ai.splitter = { - 'module': 'splitter', - 'args': { - 'tss': '$problem_definition.timeseries_settings', - 'data': 'data', - 'k': 'nsubsets' - } - } - if json_ai.analyzer is None: - json_ai.analyzer = { - "module": 
"model_analyzer", - "args": { - "stats_info": "$statistical_analysis", - "ts_cfg": "$problem_definition.timeseries_settings", - "accuracy_functions": "$accuracy_functions", - "predictor": "$ensemble", - "data": "test_data", - "train_data": "train_data", - "target": "$target", - "disable_column_importance": "False", - "dtype_dict": "$dtype_dict", - "fixed_significance": None, - "confidence_normalizer": False, - "positive_domain": "$statistical_analysis.positive_domain", - }, - } - - if json_ai.explainer is None: - json_ai.explainer = { - "module": "explain", - "args": { - "timeseries_settings": "$problem_definition.timeseries_settings", - "positive_domain": "$statistical_analysis.positive_domain", - "fixed_confidence": "$problem_definition.fixed_confidence", - "anomaly_detection": "$problem_definition.anomaly_detection", - "anomaly_error_rate": "$problem_definition.anomaly_error_rate", - "anomaly_cooldown": "$problem_definition.anomaly_cooldown", - "data": "data", - "encoded_data": "encoded_data", - "predictions": "df", - "analysis": "$runtime_analyzer", - "ts_analysis": "$ts_analysis" if tss.is_timeseries else None, - "target_name": "$target", - "target_dtype": "$dtype_dict[self.target]", - }, + # Add "hidden" fields + hidden_fields = [('cleaner', { + "module": "cleaner", + "args": { + "pct_invalid": "$problem_definition.pct_invalid", + "identifiers": "$identifiers", + "data": "data", + "dtype_dict": "$dtype_dict", + "target": "$target", + "mode": "$mode", + "timeseries_settings": "$problem_definition.timeseries_settings", + "anomaly_detection": "$problem_definition.anomaly_detection", + }, + }), ('splitter', { + 'module': 'splitter', + 'args': { + 'tss': '$problem_definition.timeseries_settings', + 'data': 'data', + 'seed': 1, + 'target': '$target', + 'dtype_dict': '$dtype_dict', + 'pct_train': 80, + 'pct_dev': 10, + 'pct_test': 10 } - - if tss.is_timeseries: - if json_ai.timeseries_transformer is None: - json_ai.timeseries_transformer = { - "module": 
"transform_timeseries", - "args": { - "timeseries_settings": "$problem_definition.timeseries_settings", - "data": "data", - "dtype_dict": "$dtype_dict", - "target": "$target", - "mode": "$mode", - }, - } - - if json_ai.timeseries_analyzer is None: - json_ai.timeseries_analyzer = { - "module": "timeseries_analyzer", - "args": { - "timeseries_settings": "$problem_definition.timeseries_settings", - "data": "data", - "dtype_dict": "$dtype_dict", - "target": "$target", - }, - } + }), ('analyzer', { + "module": "model_analyzer", + "args": { + "stats_info": "$statistical_analysis", + "ts_cfg": "$problem_definition.timeseries_settings", + "accuracy_functions": "$accuracy_functions", + "predictor": "$ensemble", + "data": "encoded_test_data", + "train_data": "encoded_train_data", + "target": "$target", + "dtype_dict": "$dtype_dict", + "analysis_blocks": "$analysis_blocks" + }, + }), ('explainer', { + "module": "explain", + "args": { + "timeseries_settings": "$problem_definition.timeseries_settings", + "positive_domain": "$statistical_analysis.positive_domain", + "fixed_confidence": "$problem_definition.fixed_confidence", + "anomaly_detection": "$problem_definition.anomaly_detection", + "anomaly_error_rate": "$problem_definition.anomaly_error_rate", + "anomaly_cooldown": "$problem_definition.anomaly_cooldown", + "data": "data", + "encoded_data": "encoded_data", + "predictions": "df", + "analysis": "$runtime_analyzer", + "ts_analysis": "$ts_analysis" if tss.is_timeseries else None, + "target_name": "$target", + "target_dtype": "$dtype_dict[self.target]", + "explainer_blocks": "$analysis_blocks" + }, + }), ('analysis_blocks', [ + { + 'module': 'ICP', + 'args': { + "fixed_significance": None, + "confidence_normalizer": False, + "positive_domain": "$statistical_analysis.positive_domain", + + }, + }, + { + 'module': 'AccStats', + 'args': { + 'deps': ['ICP'] + }, + }, + { + 'module': 'GlobalFeatureImportance', + 'args': { + "disable_column_importance": "False", + }, + }, + ]), 
('timeseries_transformer', { + "module": "transform_timeseries", + "args": { + "timeseries_settings": "$problem_definition.timeseries_settings", + "data": "data", + "dtype_dict": "$dtype_dict", + "target": "$target", + "mode": "$mode", + }, + }), ('timeseries_analyzer', { + "module": "timeseries_analyzer", + "args": { + "timeseries_settings": "$problem_definition.timeseries_settings", + "data": "data", + "dtype_dict": "$dtype_dict", + "target": "$target", + }, + })] + + for field_name, implicit_value in hidden_fields: + populate_implicit_field(json_ai, field_name, implicit_value, tss.is_timeseries) return json_ai @@ -511,10 +586,9 @@ def code_from_json_ai(json_ai: JsonAI) -> str: } for col_name, feature in json_ai.features.items(): - if col_name not in json_ai.problem_definition.ignore_features: - encoder_dict[col_name] = call(feature.encoder) - dependency_dict[col_name] = feature.dependency - dtype_dict[col_name] = f"""'{feature.data_dtype}'""" + encoder_dict[col_name] = call(feature.encoder) + dependency_dict[col_name] = feature.dependency + dtype_dict[col_name] = f"""'{feature.data_dtype}'""" # @TODO: Move into json-ai creation function (I think? Maybe? 
Let's discuss) tss = json_ai.problem_definition.timeseries_settings @@ -526,15 +600,15 @@ def code_from_json_ai(json_ai: JsonAI) -> str: False, json_ai.problem_definition, False, + None )) dependency_dict[col_name] = [] dtype_dict[col_name] = f"""'{list(json_ai.outputs.values())[0].data_dtype}'""" json_ai.features[col_name] = Feature(encoder=encoder_dict[col_name]) - ignored_cols = json_ai.problem_definition.ignore_features input_cols = [x.replace("'", "\\'").replace('"', '\\"') for x in json_ai.features] - input_cols = ','.join([f"""'{name}'""" for name in input_cols if name not in ignored_cols]) + input_cols = ','.join([f"""'{name}'""" for name in input_cols]) ts_transform_code = "" ts_analyze_code = "" @@ -550,7 +624,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str: if json_ai.timeseries_analyzer is not None: ts_encoder_code = """ -if type(encoder) in __ts_encoders__: +if encoder.is_timeseries_encoder: kwargs['ts_analysis'] = self.ts_analysis """ @@ -565,7 +639,6 @@ def code_from_json_ai(json_ai: JsonAI) -> str: dataprep_body = f""" # The type of each column -self.problem_definition = ProblemDefinition.from_dict({json_ai.problem_definition.to_dict()}) self.accuracy_functions = {json_ai.accuracy_functions} self.identifiers = {json_ai.identifiers} self.dtype_dict = {inline_dict(dtype_dict)} @@ -579,35 +652,38 @@ def code_from_json_ai(json_ai: JsonAI) -> str: # self.input_cols = [{input_cols}] +self.analysis_blocks = [{', '.join([call(block) for block in json_ai.analysis_blocks])}] + log.info('Cleaning the data') data = {call(json_ai.cleaner)} {ts_transform_code} {ts_analyze_code} -nsubsets = {json_ai.problem_definition.nsubsets} -log.info(f'Splitting the data into {{nsubsets}} subsets') -subsets = {call(json_ai.splitter)} +data = {call(json_ai.splitter)} log.info('Preparing the encoders') -encoder_preping_dict = {{}} -enc_preping_data = pd.concat(subsets[0:nsubsets-1]) +encoder_prepping_dict = {{}} +concatenated_train_dev = pd.concat([data['train'], 
data['dev']]) for col_name, encoder in self.encoders.items(): - if not encoder.is_nn_encoder: - encoder_preping_dict[col_name] = [encoder, enc_preping_data[col_name], 'prepare'] - log.info(f'Encoder preping dict length of: {{len(encoder_preping_dict)}}') + if not encoder.is_trainable_encoder: + encoder_prepping_dict[col_name] = [encoder, concatenated_train_dev[col_name], 'prepare'] + log.info(f'Encoder prepping dict length of: {{len(encoder_prepping_dict)}}') -parallel_preped_encoders = mut_method_call(encoder_preping_dict) -for col_name, encoder in parallel_preped_encoders.items(): +parallel_prepped_encoders = mut_method_call(encoder_prepping_dict) +for col_name, encoder in parallel_prepped_encoders.items(): self.encoders[col_name] = encoder -if self.target not in parallel_preped_encoders: - self.encoders[self.target].prepare(enc_preping_data[self.target]) +if self.target not in parallel_prepped_encoders: + if self.encoders[self.target].is_trainable_encoder: + self.encoders[self.target].prepare(data['train'][self.target], data['dev'][self.target]) + else: + self.encoders[self.target].prepare(pd.concat([data['train'], data['dev']])[self.target]) for col_name, encoder in self.encoders.items(): - if encoder.is_nn_encoder: - priming_data = pd.concat(subsets[0:nsubsets-1]) + if encoder.is_trainable_encoder: + priming_data = pd.concat([data['train'], data['dev']]) kwargs = {{}} if self.dependencies[col_name]: kwargs['dependency_data'] = {{}} @@ -620,9 +696,9 @@ def code_from_json_ai(json_ai: JsonAI) -> str: # This assumes target encoders are also prepared in parallel, might not be true if hasattr(encoder, 'uses_target'): - kwargs['encoded_target_values'] = parallel_preped_encoders[self.target].encode(priming_data[self.target]) + kwargs['encoded_target_values'] = parallel_prepped_encoders[self.target].encode(priming_data[self.target]) - encoder.prepare(priming_data[col_name], **kwargs) + encoder.prepare(data['train'][col_name], data['dev'][col_name], **kwargs) 
{align(ts_target_code, 1)} """ @@ -631,16 +707,16 @@ def code_from_json_ai(json_ai: JsonAI) -> str: learn_body = f""" log.info('Featurizing the data') -encoded_ds_arr = lightwood.encode(self.encoders, subsets, self.target) -train_data = encoded_ds_arr[0:int(nsubsets*0.9)] -test_data = encoded_ds_arr[int(nsubsets*0.9):] +encoded_train_data = EncodedDs(self.encoders, data['train'], self.target) +encoded_dev_data = EncodedDs(self.encoders, data['dev'], self.target) +encoded_test_data = EncodedDs(self.encoders, data['test'], self.target) log.info('Training the mixers') self.mixers = [{', '.join([call(x) for x in list(json_ai.outputs.values())[0].mixers])}] trained_mixers = [] for mixer in self.mixers: try: - mixer.fit(train_data) + mixer.fit(encoded_train_data, encoded_dev_data) trained_mixers.append(mixer) except Exception as e: log.warning(f'Exception: {{e}} when training mixer: {{mixer}}') @@ -661,41 +737,44 @@ def code_from_json_ai(json_ai: JsonAI) -> str: # important to train with. for mixer in self.mixers: if {json_ai.problem_definition.fit_on_validation}: - mixer.partial_fit(test_data, train_data) + mixer.partial_fit(encoded_test_data, ConcatedEncodedDs([encoded_train_data, encoded_dev_data])) """ learn_body = align(learn_body, 2) predict_common_body = f""" +log.info(f'Dropping features: {{self.problem_definition.ignore_features}}') +data = data.drop(columns=self.problem_definition.ignore_features) +for col in self.input_cols: + if col not in data.columns: + data[col] = [None] * len(data) self.mode = 'predict' log.info('Cleaning the data') data = {call(json_ai.cleaner)} {ts_transform_code} -encoded_ds = lightwood.encode(self.encoders, data, self.target)[0] +encoded_ds = EncodedDs(self.encoders, data, self.target) encoded_data = encoded_ds.get_encoded_data(include_target=False) """ predict_common_body = align(predict_common_body, 2) predict_body = f""" df = self.ensemble(encoded_ds) -insights = {call(json_ai.explainer)} +insights, global_insights = 
{call(json_ai.explainer)} return insights """ predict_body = align(predict_body, 2) predict_proba_body = f""" df = self.ensemble(encoded_ds, predict_proba=True) -insights = {call(json_ai.explainer)} +insights, global_insights = {call(json_ai.explainer)} return insights """ predict_proba_body = align(predict_proba_body, 2) - imports = "\n".join(json_ai.imports) predictor_code = f""" -{imports} -from lightwood.api import PredictorInterface - +{IMPORTS} +{IMPORT_EXTERNAL_DIRS} class Predictor(PredictorInterface): target: str @@ -710,6 +789,9 @@ def __init__(self): self.mode = 'innactive' def learn(self, data: pd.DataFrame) -> None: + self.problem_definition = ProblemDefinition.from_dict({json_ai.problem_definition.to_dict()}) + log.info(f'Dropping features: {{self.problem_definition.ignore_features}}') + data = data.drop(columns=self.problem_definition.ignore_features) {dataprep_body} {learn_body} diff --git a/lightwood/api/types.py b/lightwood/api/types.py index f2b63bce2..a21ac5f77 100644 --- a/lightwood/api/types.py +++ b/lightwood/api/types.py @@ -10,7 +10,6 @@ # TODO: Problem definition missing a few terms # TODO: Model Analysis # TODO: Analyzer - from typing import Dict, List, Optional, Union from dataclasses import dataclass from lightwood.helpers.log import log @@ -106,8 +105,8 @@ class Output: :param data_dtype: The type of information within the target column (ex.: numerical, categorical, etc.). :param encoder: the methodology for encoding the target feature (a Lightwood Encoder). There can only be one \ encoder for the output target. - :param models: The list of ML algorithms that are trained for the target distribution. - :param ensemble: For a panel of ML algorithms, the approach of selecting the best model, and the metrics used in \ + :param mixers: The list of ML algorithms that are trained for the target distribution. + :param ensemble: For a panel of ML algorithms, the approach of selecting the best mixer, and the metrics used in \ that evaluation. 
""" @@ -216,6 +215,7 @@ class TimeseriesSettings: historical_columns: List[str] = None target_type: str = ( "" # @TODO: is the current setter (outside of initialization) a sane option? + # @TODO: George: No, I don't think it is, we need to pass this some other way ) allow_incomplete_history: bool = False @@ -318,7 +318,6 @@ class ProblemDefinition: """ target: str - nsubsets: int pct_invalid: float unbias_target: bool seconds_per_mixer: Union[int, None] @@ -347,8 +346,7 @@ def from_dict(obj: Dict): :returns: A populated ``ProblemDefinition`` object. """ target = obj['target'] - nsubsets = obj.get('nsubsets', 30) - pct_invalid = obj.get('pct_invalid', 1) + pct_invalid = obj.get('pct_invalid', 2) unbias_target = obj.get('unbias_target', True) seconds_per_mixer = obj.get('seconds_per_mixer', None) seconds_per_encoder = obj.get('seconds_per_encoder', None) @@ -366,7 +364,6 @@ def from_dict(obj: Dict): seed_nr = obj.get('seed_nr', 420) problem_definition = ProblemDefinition( target=target, - nsubsets=nsubsets, pct_invalid=pct_invalid, unbias_target=unbias_target, seconds_per_mixer=seconds_per_mixer, @@ -430,10 +427,10 @@ class JsonAI: :param splitter: The Splitter object is the method in which the input data is split into training/validation/testing data. :param analyzer: The Analyzer object is used to evaluate how well a model performed on the predictive task. :param explainer: The Explainer object deploys explainability tools of interest on a model to indicate how well a model generalizes its predictions. - :param imports: A list of custom packages, indicated through a str import statement, that a user can call. + :param analysis_blocks: The blocks that get used in both analysis and inference inside the analyzer and explainer blocks. :param timeseries_transformer: :param timeseries_analyzer: - :param accuracy_functions: A list of performance metrics used to evaluate the best models. 
+ :param accuracy_functions: A list of performance metrics used to evaluate the best mixers. """ # noqa features: Dict[str, Feature] @@ -444,11 +441,10 @@ class JsonAI: splitter: Optional[object] = None analyzer: Optional[object] = None explainer: Optional[object] = None - imports: Optional[List[str]] = None + analysis_blocks: Optional[List[object]] = None timeseries_transformer: Optional[object] = None timeseries_analyzer: Optional[object] = None accuracy_functions: Optional[List[str]] = None - phases: Optional[Dict[str, object]] = None @staticmethod def from_dict(obj: Dict): @@ -463,11 +459,10 @@ def from_dict(obj: Dict): splitter = obj.get("splitter", None) analyzer = obj.get("analyzer", None) explainer = obj.get("explainer", None) - imports = obj.get("imports", None) + analysis_blocks = obj.get("analysis_blocks", None) timeseries_transformer = obj.get("timeseries_transformer", None) timeseries_analyzer = obj.get("timeseries_analyzer", None) accuracy_functions = obj.get("accuracy_functions", None) - phases = obj.get("phases", None) json_ai = JsonAI( features=features, @@ -478,11 +473,10 @@ def from_dict(obj: Dict): splitter=splitter, analyzer=analyzer, explainer=explainer, - imports=imports, + analysis_blocks=analysis_blocks, timeseries_transformer=timeseries_transformer, timeseries_analyzer=timeseries_analyzer, accuracy_functions=accuracy_functions, - phases=phases, ) return json_ai diff --git a/lightwood/data/__init__.py b/lightwood/data/__init__.py index 24e3d37bc..1fe88866c 100644 --- a/lightwood/data/__init__.py +++ b/lightwood/data/__init__.py @@ -4,5 +4,7 @@ from lightwood.data.splitter import splitter from lightwood.data.timeseries_transform import transform_timeseries from lightwood.data.timeseries_analyzer import timeseries_analyzer +from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs -__all__ = ['infer_types', 'statistical_analysis', 'cleaner', 'splitter', 'transform_timeseries', 'timeseries_analyzer'] +__all__ = ['infer_types', 
'statistical_analysis', 'cleaner', 'splitter', 'transform_timeseries', 'timeseries_analyzer', + 'EncodedDs', 'ConcatedEncodedDs'] diff --git a/lightwood/data/cleaner.py b/lightwood/data/cleaner.py index e525ab8df..749dca72f 100644 --- a/lightwood/data/cleaner.py +++ b/lightwood/data/cleaner.py @@ -1,17 +1,130 @@ -from copy import deepcopy -from lightwood.api.types import TimeseriesSettings import re -from typing import Dict, List +from copy import deepcopy + +import pandas as pd +import datetime +from dateutil.parser import parse as parse_dt + from lightwood.api.dtype import dtype +from lightwood.helpers import text from lightwood.helpers.log import log -from dateutil.parser import parse as parse_dt -import datetime -from lightwood.helpers.text import clean_float -import pandas as pd +from lightwood.api.types import TimeseriesSettings from lightwood.helpers.numeric import can_be_nan_numeric +import numpy as np +from typing import Dict, List, Optional, Tuple, Callable, Union + + +VALUES_FOR_NAN_AND_NONE_IN_PANDAS = [np.nan, 'nan', 'NaN', 'Nan', 'None'] + + +def cleaner( + data: pd.DataFrame, + dtype_dict: Dict[str, str], + pct_invalid: float, + identifiers: Dict[str, str], + target: str, + mode: str, + timeseries_settings: TimeseriesSettings, + anomaly_detection: bool, + custom_cleaning_functions: Dict[str, str] = {} +) -> pd.DataFrame: + """ + The cleaner is a function which takes in the raw data, plus additional information about it's types and about the problem. 
Based on this it generates a "clean" representation of the data, where each column has an ideal standaridzed type and all malformed or otherwise missing or invalid elements are turned into ``None`` + + :param data: The raw data + :param dtype_dict: Type information for each column + :param pct_invalid: How much of each column can be invalid + :param identifiers: A dict containing all identifier typed columns + :param target: The target columns + :param mode: Can be "predict" or "train" + :param timeseries_settings: Timeseries related settings, only relevant for timeseries predictors, otherwise can be the default object + :param anomaly_detection: Are we detecting anomalies with this predictor? + + :returns: The cleaned data + """ # noqa + + data = _remove_columns(data, identifiers, target, mode, timeseries_settings, + anomaly_detection, dtype_dict) + + for col in _get_columns_to_clean(data, dtype_dict, mode, target): + # Get and apply a cleaning function for each data type + # If you want to customize the cleaner, it's likely you can to modify ``get_cleaning_func`` + data[col] = data[col].apply(get_cleaning_func(dtype_dict[col], custom_cleaning_functions)) + data[col] = data[col].replace(to_replace=VALUES_FOR_NAN_AND_NONE_IN_PANDAS, value=None) + # If a column has too many None values, raise an Excpetion + # Figure out how to reintroduce later, maybe a custom flag, `crash for too much invalid data`? 
+ # _check_if_invalid(data[col], pct_invalid, col) + return data + + +def _check_if_invalid(new_data: pd.Series, pct_invalid: float, col_name: str) -> bool: + """ Checks how many invalid data points there are """ + + chk_invalid = ( + 100 + * (len(new_data) - len([x for x in new_data if x is not None])) + / len(new_data) + ) + + if chk_invalid > pct_invalid: + err = f'Too many ({chk_invalid}%) invalid values in column {col_name}nam' + log.error(err) + raise Exception(err) + + +def get_cleaning_func(data_dtype: dtype, custom_cleaning_functions: Dict[str, str]) -> Callable: + """ + For the provided data type, provide the appropriate cleaning function. Below are the defaults, users can either override this function OR impose a custom block. + + :param data_dtype: The data-type (inferred from a column) as prescribed from ``api.dtype`` + + :returns: The appropriate function that will pre-process (clean) data of specified dtype. + """ # noqa + if data_dtype in custom_cleaning_functions: + clean_func = eval(custom_cleaning_functions[data_dtype]) + + elif data_dtype in (dtype.date, dtype.datetime): + clean_func = _standardize_datetime + + elif data_dtype in (dtype.float, dtype.tsarray): + clean_func = _clean_float + + elif data_dtype in (dtype.integer): + clean_func = _clean_int + + elif data_dtype in (dtype.array): + clean_func = _standardize_array + + elif data_dtype in (dtype.tags): + clean_func = _tags_to_tuples + + elif data_dtype in (dtype.quantity): + clean_func = _clean_quantity + + elif data_dtype in ( + dtype.short_text, + dtype.rich_text, + dtype.categorical, + dtype.binary, + ): + clean_func = _clean_text + + else: + raise ValueError(f"{data_dtype} is not supported. Check lightwood.api.dtype") + + return clean_func + -def _to_datetime(element): +# ------------------------- # +# Temporal Cleaning +# ------------------------- # + + +def _standardize_datetime(element: object) -> Optional[float]: + """ + Parses an expected date-time element. 
Intakes an element that can in theory be anything. + """ try: date = parse_dt(str(element)) except Exception: @@ -20,115 +133,147 @@ def _to_datetime(element): except Exception: return None - return date - - -def _standardize_date(element): - date = _to_datetime(element) - if date is None: - return None return date.timestamp() -def _standardize_datetime(element): - date = _to_datetime(element) - if date is None: - return None - return date.timestamp() +# ------------------------- # +# Tags/Sequences +# ------------------------- # +# TODO Make it split on something other than commas +def _tags_to_tuples(tags_str: str) -> Tuple[str]: + """ + Converts comma-separated values into a tuple to preserve a sequence/array. -def _tags_to_tuples(tags_str): + Ex: + >> x = 'apples, oranges, bananas' + >> _tags_to_tuples(x) + >> ('apples', 'oranges', 'bananas') + """ try: - return tuple([x.strip() for x in tags_str.split(',')]) + return tuple([x.strip() for x in tags_str.split(",")]) except Exception: return tuple() -def _clean_float_or_none(element): - try: - calened_float = clean_float(element) - if can_be_nan_numeric(calened_float): - return None - return calened_float - except Exception: - return None +def _standardize_array(element: object) -> Optional[Union[List[float], float]]: + """ + Given an array of numbers in the form ``[1, 2, 3, 4]``, converts into a numerical sequence. 
+ :param element: An array-like element in a sequence + :returns: standardized array OR scalar number IF edge case -def _standardize_array(element): + Ex of edge case: + >> element = [1] + >> _standardize_array(element) + >> 1 + """ try: element = str(element) - element = element.rstrip(']').lstrip('[') - element = element.rstrip(' ').lstrip(' ') - element = element.replace(', ', ' ').replace(',', ' ') + element = element.rstrip("]").lstrip("[") + element = element.rstrip(" ").lstrip(" ") + element = element.replace(", ", " ").replace(",", " ") # Handles cases where arrays are numbers - if ' ' not in element: - element = _clean_float_or_none(element) + if " " not in element: + element = _clean_float(element) else: - element = [float(x) for x in element.split(' ')] + element = [float(x) for x in element.split(" ")] except Exception: pass return element -def _clean_value(element: object, data_dtype: str): - if data_dtype in (dtype.date): - element = _standardize_date(element) +# ------------------------- # +# Integers/Floats/Quantities +# ------------------------- # - if data_dtype in (dtype.datetime): - element = _standardize_datetime(element) +def _clean_float(element: object) -> Optional[float]: + """ + Given an element, converts it into a numeric format. If element is NaN, or inf, then returns None. 
+ """ + try: + cleaned_float = text.clean_float(element) + if can_be_nan_numeric(cleaned_float): + return None + return cleaned_float + except Exception: + return None - if data_dtype in (dtype.float): - element = float(_clean_float_or_none(element)) - if data_dtype in (dtype.integer): - element = int(_clean_float_or_none(element)) - if data_dtype in (dtype.array): - element = _standardize_array(element) +def _clean_int(element: object) -> Optional[int]: + element = _clean_float(element) + if element is not None: + element = int(element) + return element - if data_dtype in (dtype.tags): - element = _tags_to_tuples(element) - if data_dtype in (dtype.quantity): - element = float(re.sub("[^0-9.,]", '', element).replace(',', '.')) +def _clean_quantity(element: object) -> Optional[float]: + element = float(re.sub("[^0-9.,]", "", str(element)).replace(",", ".")) + return _clean_float(element) - if data_dtype in (dtype.short_text, dtype.rich_text, dtype.categorical, dtype.binary): - element = str(element) - return element +def _clean_text(element: object) -> str: + return str(element) -def clean_empty_targets(df: pd.DataFrame, target: str) -> pd.DataFrame: +def _rm_rows_w_empty_targets(df: pd.DataFrame, target: str) -> pd.DataFrame: + """ + Drop any rows that have targets as unknown. Targets are necessary to train. + + :param df: The input dataframe including the target value + :param target: the column name that is the output target variable + + :returns: Data with any target smissing + """ + # Compare length before/after len_before = len(df) + + # Use Pandas ```dropna``` to omit any rows with missing values for targets; these cannot be trained df = df.dropna(subset=[target]) + + # Compare length with after len_after = len(df) nr_removed = len_before - len_after + if nr_removed != 0: log.warning( - f'Removed {nr_removed} rows due to the target value missing. 
Training with rows without a target value makes no sense, please avoid this!') # noqa + f"Removed {nr_removed} rows because target was missing. Training on these rows is not possible." + ) # noqa return df -def cleaner( - data: pd.DataFrame, dtype_dict: Dict[str, str], - pct_invalid: float, ignore_features: List[str], - identifiers: Dict[str, str], - target: str, mode: str, timeseries_settings: TimeseriesSettings, anomaly_detection: bool) -> pd.DataFrame: - # Drop columns we don't want to use - data = deepcopy(data) - to_drop = [*ignore_features, [x for x in identifiers.keys() if x != target]] - exceptions = ['__mdb_make_predictions'] - for col in to_drop: - try: - data = data.drop(columns=[col]) - except Exception: - pass +def _remove_columns(data: pd.DataFrame, identifiers: Dict[str, object], target: str, + mode: str, timeseries_settings: TimeseriesSettings, anomaly_detection: bool, + dtype_dict: Dict[str, dtype]) -> pd.DataFrame: + """ + Drop columns we don't want to use in order to train or predict - if mode == 'train': - data = clean_empty_targets(data, target) - if mode == 'predict': - if target in data.columns and not timeseries_settings.use_previous_target and not anomaly_detection: + :param data: The raw data + :param dtype_dict: Type information for each column + :param identifiers: A dict containing all identifier typed columns + :param target: The target columns + :param mode: Can be "predict" or "train" + :param timeseries_settings: Timeseries related settings, only relevant for timeseries predictors, otherwise can be the default object + :param anomaly_detection: Are we detecting anomalies with this predictor? 
+ + :returns: A (new) dataframe without the dropped columns + """ # noqa + data = deepcopy(data) + to_drop = [*[x for x in identifiers.keys() if x != target], + *[x for x in data.columns if x in dtype_dict and dtype_dict[x] == dtype.invalid]] + exceptions = ["__mdb_make_predictions"] + data = data.drop(columns=to_drop) + + if mode == "train": + data = _rm_rows_w_empty_targets(data, target) + if mode == "predict": + if ( + target in data.columns + and not timeseries_settings.use_previous_target + and not anomaly_detection + ): data = data.drop(columns=[target]) # Drop extra columns @@ -136,33 +281,24 @@ def cleaner( if name not in dtype_dict and name not in exceptions: data = data.drop(columns=[name]) - # Standardize content - for name, data_dtype in dtype_dict.items(): - if mode == 'predict': + return data + + +def _get_columns_to_clean(data: pd.DataFrame, dtype_dict: Dict[str, dtype], mode: str, target: str) -> List[str]: + """ + :param data: The raw data + :param dtype_dict: Type information for each column + :param target: The target columns + :param mode: Can be "predict" or "train" + + :returns: A list of columns that we want to clean + """ # noqa + + cleanable_columns = [] + for name, _ in dtype_dict.items(): + if mode == "predict": if name == target: continue - if name in to_drop: - continue - if name not in data.columns: - if '__mdb_ts_previous' not in name: - data[name] = [None] * len(data) - continue - - new_data = [] - for element in data[name]: - try: - new_data.append(_clean_value(element, data_dtype)) - except Exception as e: - new_data.append(None) - log.warning( - f'Unable to parse elemnt: {element} or type {data_dtype} from column {name}. 
Excetpion: {e}') - - pct_invalid = 100 * (len(new_data) - len([x for x in new_data if x is not None])) / len(new_data) - - if pct_invalid > pct_invalid: - err = f'Too many ({pct_invalid}%) invalid values in column {name} of type {data_dtype}' - log.error(err) - raise Exception(err) - - data[name] = new_data - return data + if name in data.columns: + cleanable_columns.append(name) + return cleanable_columns diff --git a/lightwood/data/splitter.py b/lightwood/data/splitter.py index 3e70d9160..cd83c3247 100644 --- a/lightwood/data/splitter.py +++ b/lightwood/data/splitter.py @@ -1,48 +1,131 @@ +from lightwood.api.dtype import dtype import pandas as pd import numpy as np -from typing import List +from typing import List, Dict from itertools import product - from lightwood.api.types import TimeseriesSettings +from lightwood.helpers.log import log -def splitter(data: pd.DataFrame, k: int, tss: TimeseriesSettings) -> List[pd.DataFrame]: - """ - Splits a dataframe into k equally-sized subsets. +def splitter( + data: pd.DataFrame, + tss: TimeseriesSettings, + dtype_dict: Dict[str, str], + seed: int, + pct_train: int, + pct_dev: int, + pct_test: int, + target: str +) -> Dict[str, pd.DataFrame]: """ + Splits a dataset into stratified training/test. First shuffles the data within the dataframe (via ``df.sample``). 
+ + :param data: Input dataset to be split + :param tss: time-series specific details for splitting + :param dtype_dict: Dictionary with the data type of all columns + :param seed: Random state for pandas data-frame shuffling + :param pct_train: training fraction of data; must be less than 1 + :param pct_dev: dev fraction of data; must be less than 1 + :param pct_test: testing fraction of data; must be less than 1 + :param target: Name of the target column; if specified, data will be stratified on this column + + :returns: A dictionary containing the keys train, test and dev with their respective data frames, as well as the "stratified_on" key indicating which columns the data was stratified on (None if it wasn't stratified on anything) + """ # noqa + if pct_train + pct_dev + pct_test != 100: + raise Exception('The train, dev and test percentage of the data needs to sum up to 100') + + gcd = np.gcd(100, np.gcd(pct_test, np.gcd(pct_train, pct_dev))) + nr_subsets = int(100 / gcd) + + # Shuffle the data + np.random.seed(seed) if not tss.is_timeseries: - # shuffle - data = data.sample(frac=1).reset_index(drop=True) + data = data.sample(frac=1, random_state=seed).reset_index(drop=True) - # split - subsets = np.array_split(data, k) + stratify_on = [] + if target is not None: + if dtype_dict[target] in (dtype.categorical, dtype.binary): + stratify_on += [target] + if tss.is_timeseries and isinstance(tss.group_by, list): + stratify_on += tss.group_by + if stratify_on: + subsets = stratify(data, nr_subsets, stratify_on) + subsets = randomize_uneven_stratification(data, subsets, nr_subsets, tss) else: - if not tss.group_by: - subsets = np.array_split(data, k) - else: - gcols = tss.group_by - subsets = grouped_ts_splitter(data, k, gcols) + subsets = np.array_split(data, nr_subsets) - return subsets + train = pd.concat(subsets[0:int(pct_train / gcd)]) + dev = pd.concat(subsets[int(pct_train / gcd):int(pct_train / gcd + pct_dev / gcd)]) + test = pd.concat(subsets[int(pct_train 
/ gcd + pct_dev / gcd):]) + return {"train": train, "test": test, "dev": dev, "stratified_on": stratify_on} -def grouped_ts_splitter(data: pd.DataFrame, k: int, gcols: List[str]): - """ - Splitter for grouped time series tasks, where there is a set of `gcols` columns by which data is grouped. - Each group yields a different time series, and the splitter generates `k` subsets from `data`, - with equally-sized sub-series for each group. + +def stratify(data: pd.DataFrame, nr_subset: int, stratify_on: List[str], random_alloc=False) -> List[pd.DataFrame]: """ - all_group_combinations = list(product(*[data[gcol].unique() for gcol in gcols])) - subsets = [pd.DataFrame() for _ in range(k)] + Stratified data splitter. + + The `stratify_on` columns yield a cartesian product by which every different subset will be stratified + independently from the others, and recombined at the end. + + For grouped time series tasks, each group yields a different time series. That is, the splitter generates + `nr_subsets` subsets from `data`, with equally-sized sub-series for each group. + + :param data: Data to be split + :param nr_subset: Number of subsets to create + :param stratify_on: Columns to group-by on + :param random_alloc: Whether to allocate subsets randomly + + :returns A list of equally-sized data subsets that can be concatenated by the full data. This preserves the group-by columns. + """ # noqa + # TODO: Make stratification work for regression via histogram bins?? 
+ all_group_combinations = list(product(*[data[col].unique() for col in stratify_on])) + + subsets = [pd.DataFrame() for _ in range(nr_subset)] for group in all_group_combinations: subframe = data - for idx, gcol in enumerate(gcols): - subframe = subframe[subframe[gcol] == group[idx]] + for idx, col in enumerate(stratify_on): + subframe = subframe[subframe[col] == group[idx]] + + subset = np.array_split(subframe, nr_subset) - subset = np.array_split(subframe, k) + # Allocate to subsets randomly + if random_alloc: + already_visited = [] + for n in range(nr_subset): + i = np.random.randint(nr_subset) + while i in already_visited: + i = np.random.randint(nr_subset) + already_visited.append(i) + subsets[n] = pd.concat([subsets[n], subset[i]]) + else: + for n in range(nr_subset): + subsets[n] = pd.concat([subsets[n], subset[n]]) + + return subsets - for i in range(k): - subsets[i] = pd.concat([subsets[i], subset[i]]) +def randomize_uneven_stratification(data: pd.DataFrame, subsets: List[pd.DataFrame], nr_subsets: int, + tss: TimeseriesSettings, len_threshold: int = 2): + """ + Helper function reverts stratified data back to a normal split if the size difference between splits is larger + than a certain threshold. + + :param data: Raw data + :param subsets: Stratified data + :param nr_subsets: Number of subsets + :param tss: TimeseriesSettings + :param len_threshold: size difference between subsets to revert the stratification process + + :return: Inplace-modified subsets if threshold was passed. Else, subsets are returned unmodified. 
+ """ + if not tss.is_timeseries: + max_len = np.max([len(subset) for subset in subsets]) + for subset in subsets: + if len(subset) < max_len - len_threshold: + subset_lengths = [len(subset) for subset in subsets] + log.warning(f'Cannot stratify, got subsets of length: {subset_lengths} | Splitting without stratification') # noqa + subsets = np.array_split(data, nr_subsets) + break return subsets diff --git a/lightwood/data/statistical_analysis.py b/lightwood/data/statistical_analysis.py index f405b1f30..e46d3d6bf 100644 --- a/lightwood/data/statistical_analysis.py +++ b/lightwood/data/statistical_analysis.py @@ -10,16 +10,16 @@ from lightwood.helpers.log import log from lightwood.api.dtype import dtype from scipy.stats import entropy -from lightwood.data.cleaner import _clean_float_or_none +from lightwood.data.cleaner import _clean_float def get_datetime_histogram(data: pd.Series, bins: int) -> Dict[str, list]: """Generates the histogram for date and datetime types """ if isinstance(data[0], float) or isinstance(data[0], int): - data = [_clean_float_or_none(x) for x in data] + data = [_clean_float(x) for x in data] else: - data = [_clean_float_or_none(parse_dt(str(x)).timestamp()) for x in data] + data = [_clean_float(parse_dt(str(x)).timestamp()) for x in data] Y, X = np.histogram(data, bins=min(bins, len(set(data))), range=(min(data), max(data)), density=False) @@ -44,7 +44,7 @@ def get_numeric_histogram(data: pd.Series, data_dtype: dtype, bins: int) -> Dict new_data.extend(arr) data = new_data - data = [_clean_float_or_none(x) for x in data] + data = [_clean_float(x) for x in data] Y, X = np.histogram(data, bins=min(bins, len(set(data))), range=(min(data), max(data)), density=False) @@ -80,7 +80,7 @@ def statistical_analysis(data: pd.DataFrame, seed_nr: int = 420) -> StatisticalAnalysis: seed(seed_nr) log.info('Starting statistical analysis') - df = cleaner(data, dtypes, problem_definition.pct_invalid, problem_definition.ignore_features, + df = cleaner(data, 
dtypes, problem_definition.pct_invalid, identifiers, problem_definition.target, 'train', problem_definition.timeseries_settings, problem_definition.anomaly_detection) @@ -91,7 +91,7 @@ def statistical_analysis(data: pd.DataFrame, target = problem_definition.target positive_domain = False # get train std, used in analysis - if dtypes[target] in [dtype.float, dtype.integer, dtype.tsarray]: + if dtypes[target] in [dtype.float, dtype.integer, dtype.tsarray, dtype.quantity]: df_std = df[target].astype(float).std() if min(df[target]) >= 0: positive_domain = True @@ -122,7 +122,7 @@ def statistical_analysis(data: pd.DataFrame, 'y': list(hist.values()) } buckets[col] = histograms[col]['x'] - elif dtypes[col] in (dtype.integer, dtype.float, dtype.array, dtype.tsarray): + elif dtypes[col] in (dtype.integer, dtype.float, dtype.array, dtype.tsarray, dtype.quantity): histograms[col] = get_numeric_histogram(filter_nan_and_none(df[col]), dtypes[col], 50) buckets[col] = histograms[col]['x'] elif dtypes[col] in (dtype.date, dtype.datetime): @@ -155,7 +155,8 @@ def statistical_analysis(data: pd.DataFrame, if dtypes[col] in (dtype.rich_text, dtype.short_text): words_per_sentence = [] for item in df[col]: - words_per_sentence.append(len(item.split(' '))) + if item is not None: + words_per_sentence.append(len(item.split(' '))) avg_words_per_sentence[col] = int(np.mean(words_per_sentence)) else: avg_words_per_sentence[col] = None diff --git a/lightwood/data/timeseries_analyzer.py b/lightwood/data/timeseries_analyzer.py index c6ee00671..85d41d0fb 100644 --- a/lightwood/data/timeseries_analyzer.py +++ b/lightwood/data/timeseries_analyzer.py @@ -1,7 +1,10 @@ -from typing import Dict +from typing import Dict, Tuple, List + +import numpy as np import pandas as pd from lightwood.api.types import TimeseriesSettings +from lightwood.api.dtype import dtype from lightwood.encoder.time_series.helpers.common import get_group_matches, generate_target_group_normalizers @@ -19,6 +22,11 @@ def 
timeseries_analyzer(data: pd.DataFrame, dtype_dict: Dict[str, str], # @TODO: maybe normalizers should fit using only the training subsets?? new_data = generate_target_group_normalizers(info) + if dtype_dict[target] in (dtype.integer, dtype.float, dtype.tsarray): + naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(info, new_data['group_combinations']) + else: + naive_forecast_residuals, scale_factor = {}, {} + deltas = get_delta(data[timeseries_settings.order_by], info, new_data['group_combinations'], @@ -27,7 +35,10 @@ def timeseries_analyzer(data: pd.DataFrame, dtype_dict: Dict[str, str], return {'target_normalizers': new_data['target_normalizers'], 'deltas': deltas, 'tss': timeseries_settings, - 'group_combinations': new_data['group_combinations']} + 'group_combinations': new_data['group_combinations'], + 'ts_naive_residuals': naive_forecast_residuals, + 'ts_naive_mae': scale_factor + } def get_delta(df: pd.DataFrame, ts_info: dict, group_combinations: list, order_cols: list): @@ -43,6 +54,7 @@ def get_delta(df: pd.DataFrame, ts_info: dict, group_combinations: list, order_c deltas["__default"][col] = delta if ts_info.get('group_info', False): + original_data = ts_info['data'] for group in group_combinations: if group != "__default": deltas[group] = {} @@ -56,5 +68,38 @@ def get_delta(df: pd.DataFrame, ts_info: dict, group_combinations: list, order_c lambda x: x.iloc[1] - x.iloc[0]) delta = rolling_diff.value_counts(ascending=False).keys()[0] deltas[group][col] = delta + ts_info['data'] = original_data return deltas + + +def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, float]: + """ + Computes forecasting residuals for the naive method (forecasts for time `t` is the value observed at `t-1`). + Useful for computing MASE forecasting error. + + Note: method assumes predictions are all for the same group combination. For a dataframe that contains multiple + series, use `get_grouped_naive_resiudals`. 
+ + :param target_data: observed time series targets + :param m: season length. the naive forecasts will be the m-th previously seen value for each series + + :returns: (list of naive residuals, average residual value) + """ + residuals = target_data.rolling(window=m + 1).apply(lambda x: abs(x.iloc[m] - x.iloc[0]))[m:].values.flatten() + scale_factor = np.average(residuals) + return residuals.tolist(), scale_factor + + +def get_grouped_naive_residuals(info: Dict, group_combinations: List) -> Tuple[Dict, Dict]: + """ + Wraps `get_naive_residuals` for a dataframe with grouped time series. + """ + group_residuals = {} + group_scale_factors = {} + for group in group_combinations: + idxs, subset = get_group_matches(info, group) + residuals, scale_factor = get_naive_residuals(pd.DataFrame(subset)) # @TODO: pass m once we handle seasonality + group_residuals[group] = residuals + group_scale_factors[group] = scale_factor + return group_residuals, group_scale_factors diff --git a/lightwood/encoder/__init__.py b/lightwood/encoder/__init__.py index 40bbf1d95..a22cf8302 100644 --- a/lightwood/encoder/__init__.py +++ b/lightwood/encoder/__init__.py @@ -27,7 +27,6 @@ AmplitudeTsEncoder = None -__ts_encoders__ = [TsNumericEncoder, TimeSeriesEncoder, ArrayEncoder] __all__ = ['BaseEncoder', 'DatetimeEncoder', 'Img2VecEncoder', 'NumericEncoder', 'TsNumericEncoder', 'TsArrayNumericEncoder', 'ShortTextEncoder', 'VocabularyEncoder', 'TextRnnEncoder', 'OneHotEncoder', 'CategoricalAutoEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'MultiHotEncoder', diff --git a/lightwood/encoder/array/array.py b/lightwood/encoder/array/array.py index 9f18ec28b..fc692139f 100644 --- a/lightwood/encoder/array/array.py +++ b/lightwood/encoder/array/array.py @@ -8,6 +8,8 @@ class ArrayEncoder(BaseEncoder): + is_trainable_encoder: bool = True + def __init__(self, stop_after: int, window: int = None, is_target: bool = False, original_type: dtype = None): """ Fits a normalizer for a time series previous 
historical data. @@ -22,9 +24,9 @@ def __init__(self, stop_after: int, window: int = None, is_target: bool = False, else: self.output_size = None - def prepare(self, priming_data): - if isinstance(priming_data, pd.Series): - priming_data = priming_data.values + def prepare(self, train_priming_data, dev_priming_data): + priming_data = pd.concat([train_priming_data, dev_priming_data]) + priming_data = priming_data.values if self.output_size is None: self.output_size = np.max([len(x) for x in priming_data if x is not None]) diff --git a/lightwood/encoder/base.py b/lightwood/encoder/base.py index d1db1d8c2..e0104f384 100644 --- a/lightwood/encoder/base.py +++ b/lightwood/encoder/base.py @@ -7,11 +7,13 @@ class BaseEncoder: is_target: bool prepared: bool + is_timeseries_encoder: bool = False + is_trainable_encoder: bool = False + def __init__(self, is_target=False) -> None: self.is_target = is_target self._prepared = False self.uses_subsets = False - self.is_nn_encoder = False self.dependencies = [] self.output_size = None @@ -25,7 +27,7 @@ def encode(self, column_data) -> torch.Tensor: def decode(self, encoded_data) -> List[object]: raise NotImplementedError - # Should work for all troch-based encoders, but custom behavior may have to be implemented for very weird models + # Should work for all torch-based encoders, but custom behavior may have to be implemented for weird models def to(self, device, available_devices): # Find all nn.Module type objects and convert them # @TODO: Make this work recursively diff --git a/lightwood/encoder/categorical/autoencoder.py b/lightwood/encoder/categorical/autoencoder.py index 7f2200f36..0ddb8bcf1 100644 --- a/lightwood/encoder/categorical/autoencoder.py +++ b/lightwood/encoder/categorical/autoencoder.py @@ -1,5 +1,4 @@ import random -from typing import Union import numpy as np import torch from torch.utils.data import DataLoader @@ -9,11 +8,13 @@ from lightwood.encoder.base import BaseEncoder from lightwood.helpers.log import log 
from lightwood.mixer.helpers.default_net import DefaultNet +import pandas as pd class CategoricalAutoEncoder(BaseEncoder): - def __init__(self, stop_after: int = 3600, is_target: bool = False, max_encoded_length: int = 100, - use_autoencoder: Union[bool, None] = None): + is_trainable_encoder: bool = True + + def __init__(self, stop_after: int = 3600, is_target: bool = False, max_encoded_length: int = 100): super().__init__(is_target) self._prepared = False self.name = 'Categorical Autoencoder' @@ -22,18 +23,10 @@ def __init__(self, stop_after: int = 3600, is_target: bool = False, max_encoded_ self.decoder = None self.onehot_encoder = OneHotEncoder(is_target=self.is_target) self.desired_error = 0.01 - self.use_autoencoder = use_autoencoder self.stop_after = stop_after # @TODO stop using instead of ONEHOT !!!@! - self.is_nn_encoder = True self.output_size = None - if self.is_target: - self.max_encoded_length = None - else: - self.max_encoded_length = max_encoded_length - - def _train_callback(self, error, real_buff, predicted_buff): - log.info(f'{self.name} reached a loss of {error} while training !') + self.max_encoded_length = max_encoded_length def _encoder_targets(self, data): oh_encoded_categories = self.onehot_encoder.encode(data) @@ -43,7 +36,8 @@ def _encoder_targets(self, data): labels = targets_c.to(self.net.device) return labels - def prepare(self, priming_data): + def prepare(self, train_priming_data, dev_priming_data): + priming_data = pd.concat([train_priming_data, dev_priming_data]) random.seed(len(priming_data)) if self._prepared: @@ -52,73 +46,62 @@ def prepare(self, priming_data): self.onehot_encoder.prepare(priming_data) input_len = self.onehot_encoder._lang.n_words - if self.use_autoencoder is None: - self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length - if self.use_autoencoder: - if self.is_target: - log.warning('You are trying to use an autoencoder for the target value! 
\ - This is very likely a bad idea') - log.info('Preparing a categorical autoencoder, this might take a while') + if self.is_target: + log.warning('You are trying to use an autoencoder for the target value! \ + This is very likely a bad idea') + log.info('Preparing a categorical autoencoder, this might take a while') - embeddings_layer_len = self.max_encoded_length + embeddings_layer_len = self.max_encoded_length - self.net = DefaultNet(shape=[ - input_len, embeddings_layer_len, input_len]) + self.net = DefaultNet(shape=[input_len, embeddings_layer_len, input_len]) - criterion = torch.nn.CrossEntropyLoss() - optimizer = Ranger(self.net.parameters()) + criterion = torch.nn.CrossEntropyLoss() + optimizer = Ranger(self.net.parameters()) - gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion, - device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode, - output_encoder=self._encoder_targets) + gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion, + device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode, + output_encoder=self._encoder_targets) - batch_size = min(200, int(len(priming_data) / 50)) + batch_size = min(200, int(len(priming_data) / 50)) - priming_data_str = [str(x) for x in priming_data] - train_data_loader = DataLoader( - list(zip(priming_data_str, priming_data_str)), - batch_size=batch_size, shuffle=True) + priming_data_str = [str(x) for x in priming_data] + train_data_loader = DataLoader( + list(zip(priming_data_str, priming_data_str)), + batch_size=batch_size, shuffle=True) - test_data_loader = None + test_data_loader = None - best_model, error, training_time = gym.fit(train_data_loader, - test_data_loader, - desired_error=self.desired_error, - max_time=self.stop_after, - callback=self._train_callback, - eval_every_x_epochs=1, - max_unimproving_models=5) + best_model, _, _ = gym.fit(train_data_loader, + test_data_loader, + 
desired_error=self.desired_error, + max_time=self.stop_after, + eval_every_x_epochs=1, + max_unimproving_models=5) - self.net = best_model.to(self.net.device) + self.net = best_model.to(self.net.device) - modules = [module for module in self.net.modules() if type( - module) != torch.nn.Sequential and type(module) != DefaultNet] - self.encoder = torch.nn.Sequential(*modules[0:2]).eval() - self.decoder = torch.nn.Sequential(*modules[2:3]).eval() - log.info('Categorical autoencoder ready') + modules = [module for module in self.net.modules() if type( + module) != torch.nn.Sequential and type(module) != DefaultNet] + self.encoder = torch.nn.Sequential(*modules[0:2]).eval() + self.decoder = torch.nn.Sequential(*modules[2:3]).eval() + log.info('Categorical autoencoder ready') self.output_size = self.onehot_encoder._lang.n_words - if self.use_autoencoder: - self.output_size = self.max_encoded_length + self.output_size = self.max_encoded_length self._prepared = True def encode(self, column_data): oh_encoded_tensor = self.onehot_encoder.encode(column_data) - if not self.use_autoencoder: - return oh_encoded_tensor - else: - with torch.no_grad(): - oh_encoded_tensor = oh_encoded_tensor.to(self.net.device) - embeddings = self.encoder(oh_encoded_tensor) - return embeddings.to('cpu') + + with torch.no_grad(): + oh_encoded_tensor = oh_encoded_tensor.to(self.net.device) + embeddings = self.encoder(oh_encoded_tensor) + return embeddings.to('cpu') def decode(self, encoded_data): - if not self.use_autoencoder: - return self.onehot_encoder.decode(encoded_data) - else: - with torch.no_grad(): - encoded_data = encoded_data.to(self.net.device) - oh_encoded_tensor = self.decoder(encoded_data) - oh_encoded_tensor = oh_encoded_tensor.to('cpu') - return self.onehot_encoder.decode(oh_encoded_tensor) + with torch.no_grad(): + encoded_data = encoded_data.to(self.net.device) + oh_encoded_tensor = self.decoder(encoded_data) + oh_encoded_tensor = oh_encoded_tensor.to('cpu') + return 
self.onehot_encoder.decode(oh_encoded_tensor) diff --git a/lightwood/encoder/categorical/gym.py b/lightwood/encoder/categorical/gym.py index 648baff45..8e90eb5b4 100644 --- a/lightwood/encoder/categorical/gym.py +++ b/lightwood/encoder/categorical/gym.py @@ -24,7 +24,7 @@ def __init__(self, model, optimizer, scheduler, loss_criterion, device, self.best_model = None - def fit(self, train_data_loader, test_data_loader, desired_error, max_time, callback, + def fit(self, train_data_loader, test_data_loader, desired_error, max_time, callback=None, eval_every_x_epochs=1, max_unimproving_models=10, custom_train_func=None, custom_test_func=None): started = time.time() epoch = 0 @@ -126,7 +126,7 @@ def fit(self, train_data_loader, test_data_loader, desired_error, max_time, call delta_mean = np.mean(test_error_delta_buff[-max_unimproving_models:]) if delta_mean <= 0: keep_training = False - - callback(test_error, real_buff, predicted_buff) + if callback is not None: + callback(test_error, real_buff, predicted_buff) return self.best_model, lowest_test_error, int(time.time() - started) diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 15a7d5ff4..fc30cd122 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -9,6 +9,7 @@ class TsNumericEncoder(NumericEncoder): """ Variant of vanilla numerical encoder, supports dynamic mean re-scaling """ + is_timeseries_encoder: bool = True def __init__(self, is_target: bool = False, positive_domain: bool = False, grouped_by=None): super(TsNumericEncoder, self).__init__(is_target=is_target, positive_domain=positive_domain) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index 7754f0eb1..be9ea87a3 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -74,6 +74,8 @@ class PretrainedLangEncoder(BaseEncoder): + is_trainable_encoder: bool = True + """ Pretrained 
language models. Option to train on a target encoding of choice. @@ -126,7 +128,6 @@ def __init__( self._pretrained_model_name = "distilbert-base-uncased" self.device, _ = get_devices() - self.is_nn_encoder = True self.stop_after = stop_after self.embed_mode = embed_mode @@ -139,7 +140,7 @@ def __init__( else: log.info("Embedding mode off. Logits are output of encode()") - def prepare(self, priming_data: pd.Series, encoded_target_values: torch.Tensor): + def prepare(self, train_priming_data: pd.Series, dev_priming_data: pd.Series, encoded_target_values: torch.Tensor): """ Prepare the encoder by training on the target. @@ -147,6 +148,8 @@ def prepare(self, priming_data: pd.Series, encoded_target_values: torch.Tensor): Automatically assumes this. """ os.environ['TOKENIZERS_PARALLELISM'] = 'true' + priming_data = pd.concat([train_priming_data, dev_priming_data]) + priming_data = priming_data.values if self._prepared: raise Exception("Encoder is already prepared.") diff --git a/lightwood/encoder/text/short.py b/lightwood/encoder/text/short.py index b809d3195..66e319408 100644 --- a/lightwood/encoder/text/short.py +++ b/lightwood/encoder/text/short.py @@ -4,6 +4,7 @@ from lightwood.encoder.categorical import CategoricalAutoEncoder from lightwood.helpers.text import tokenize_text from lightwood.helpers.torch import concat_vectors_and_pad, average_vectors +import pandas as pd class ShortTextEncoder(BaseEncoder): @@ -35,7 +36,6 @@ def __init__(self, is_target=False, mode=None): # Defined in self.prepare() self._combine_fn = None self.max_words_per_sent = None - self.is_nn_encoder = True self.cae = CategoricalAutoEncoder(is_target=is_target, max_encoded_length=100) self._prepared = False @@ -59,7 +59,7 @@ def prepare(self, priming_data): for tok in tokens: unique_tokens.add(tok) - self.cae.prepare(unique_tokens) + self.cae.prepare(pd.Series(list(unique_tokens)), pd.Series([])) if self._mode == 'concat': self.max_words_per_sent = max_words_per_sent @@ -87,10 +87,7 @@ def 
encode(self, column_data: List[str]) -> torch.Tensor: def decode(self, vectors): if self._mode == 'concat': - if self.cae.use_autoencoder: - vec_size = self.cae.max_encoded_length - else: - vec_size = len(self.cae.onehot_encoder._lang.index2word) + vec_size = self.cae.max_encoded_length output = [] for vec in vectors: diff --git a/lightwood/encoder/time_series/helpers/common.py b/lightwood/encoder/time_series/helpers/common.py index c6653d7c7..6e11192d2 100644 --- a/lightwood/encoder/time_series/helpers/common.py +++ b/lightwood/encoder/time_series/helpers/common.py @@ -92,7 +92,7 @@ def get_group_matches(data, combination): if isinstance(data['data'], np.ndarray) and len(data['data'].shape) < 2: data['data'] = np.expand_dims(data['data'], axis=1) - if not combination: + if combination == '__default': idxs = range(len(data['data'])) return [idxs, np.array(data['data'])[idxs, :]] # return all data else: diff --git a/lightwood/encoder/time_series/rnn.py b/lightwood/encoder/time_series/rnn.py index 586f8aba2..c2619810e 100644 --- a/lightwood/encoder/time_series/rnn.py +++ b/lightwood/encoder/time_series/rnn.py @@ -21,6 +21,8 @@ class TimeSeriesEncoder(BaseEncoder): + is_timeseries_encoder: bool = True + is_trainable_encoder: bool = True def __init__(self, stop_after: int, is_target=False, original_type: str = None, target: str = None, grouped_by: List[str] = [], encoder_type='rnn'): @@ -47,7 +49,6 @@ def __init__(self, stop_after: int, is_target=False, original_type: str = None, self._group_combinations = None self.original_type = original_type self.stop_after = stop_after - self.is_nn_encoder = True if encoder_type.lower() == 'rnn': self.encoder_class = EncoderRNNNumerical elif encoder_type.lower() == 'transformer': @@ -145,7 +146,7 @@ def _get_batch(self, source, start, end): end = min(end, len(source)) return source[start:end] - def prepare(self, priming_data, dependency_data={}, ts_analysis=None, + def prepare(self, train_priming_data: pd.Series, dev_priming_data: 
pd.Series, dependency_data={}, ts_analysis=None, feedback_hoop_function=log.info, batch_size=256): """ :param priming_data: a list of (self._n_dims)-dimensional time series [[dim1_data], ...] @@ -154,6 +155,9 @@ def prepare(self, priming_data, dependency_data={}, ts_analysis=None, :param feedback_hoop_function: method to use if you want to get feedback on the training process :param batch_size """ + priming_data = pd.concat([train_priming_data, dev_priming_data]) + priming_data = list(priming_data.values) + if self._prepared: raise Exception('You can only call "prepare" once for a given encoder.') else: diff --git a/lightwood/ensemble/base.py b/lightwood/ensemble/base.py index 8f6dba32e..7a0c67fcf 100644 --- a/lightwood/ensemble/base.py +++ b/lightwood/ensemble/base.py @@ -5,12 +5,12 @@ class BaseEnsemble: - data: List[EncodedDs] + data: EncodedDs mixers: List[BaseMixer] best_index: int supports_proba: bool - def __init__(self, target, mixers: List[BaseMixer], data: List[EncodedDs]) -> None: + def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs) -> None: self.data = data self.mixers = mixers self.best_index = 0 diff --git a/lightwood/ensemble/best_of.py b/lightwood/ensemble/best_of.py index 92a9e5b19..be9551cf1 100644 --- a/lightwood/ensemble/best_of.py +++ b/lightwood/ensemble/best_of.py @@ -1,42 +1,52 @@ -from typing import List +from typing import List, Optional import numpy as np import pandas as pd from lightwood.helpers.log import log +from lightwood.helpers.numeric import can_be_nan_numeric from lightwood.mixer.base import BaseMixer from lightwood.ensemble.base import BaseEnsemble -from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs +from lightwood.data.encoded_ds import EncodedDs from lightwood.helpers.general import evaluate_accuracy class BestOf(BaseEnsemble): - best_index: int + indexes_by_accuracy: List[float] - def __init__(self, target, mixers: List[BaseMixer], data: List[EncodedDs], accuracy_functions) -> None: + def 
__init__(self, target, mixers: List[BaseMixer], data: EncodedDs, accuracy_functions, + ts_analysis: Optional[dict] = None) -> None: super().__init__(target, mixers, data) - # @TODO: Need some shared accuracy functionality to determine mixer selection here - self.maximize = True - best_score = -pow(2, 32) if self.maximize else pow(2, 32) - ds = ConcatedEncodedDs(data) - for idx, mixer in enumerate(mixers): + + score_list = [] + for _, mixer in enumerate(mixers): score_dict = evaluate_accuracy( - ds.data_frame, - mixer(ds)['prediction'], + data.data_frame, + mixer(data)['prediction'], target, - accuracy_functions + accuracy_functions, + ts_analysis=ts_analysis ) avg_score = np.mean(list(score_dict.values())) - log.info(f'Mixer {type(mixer).__name__} obtained a best-of evaluation score of {round(avg_score,4)}') - if self.improves(avg_score, best_score, accuracy_functions): - best_score = avg_score - self.best_index = idx + log.info(f'Mixer: {type(mixer).__name__} got accuracy: {avg_score}') - self.supports_proba = self.mixers[self.best_index].supports_proba - log.info(f'Picked best mixer: {type(self.mixers[self.best_index]).__name__}') + if can_be_nan_numeric(avg_score): + avg_score = -pow(2, 63) + log.warning(f'Change the accuracy of mixer {type(mixer).__name__} to valid value: {avg_score}') - def __call__(self, ds: EncodedDs, predict_proba: bool = False) -> pd.DataFrame: - return self.mixers[self.best_index](ds, predict_proba=predict_proba) + score_list.append(avg_score) - def improves(self, new, old, functions): - return new > old if self.maximize else new < old + self.indexes_by_accuracy = list(reversed(np.array(score_list).argsort())) + self.supports_proba = self.mixers[self.indexes_by_accuracy[0]].supports_proba + log.info(f'Picked best mixer: {type(self.mixers[self.indexes_by_accuracy[0]]).__name__}') + + def __call__(self, ds: EncodedDs, predict_proba: bool = False) -> pd.DataFrame: + for mixer_index in self.indexes_by_accuracy: + try: + return 
self.mixers[mixer_index](ds, predict_proba=predict_proba) + except Exception as e: + if self.mixers[mixer_index].stable: + raise(e) + else: + log.warning(f'Unstable mixer {type(self.mixers[mixer_index]).__name__} failed with exception: {e}.\ + Trying next best') diff --git a/lightwood/helpers/factory.py b/lightwood/helpers/factory.py deleted file mode 100644 index 603ee6c0d..000000000 --- a/lightwood/helpers/factory.py +++ /dev/null @@ -1,17 +0,0 @@ -from functools import partial - -# Factories are syntactic sugar but can introduce bugs, we should *only* use them for the public APIs - - -class Factory: - self.generate_class = None - - def __init__(self, **kwargs): - pass - - def generate(self) -> self.generate_class: - return self.generate_class(**kwargs) - - -def gen_factory_func(func: Callable, **kwargs) -> Callable: - return partial(func, **kwargs) diff --git a/lightwood/helpers/general.py b/lightwood/helpers/general.py index eada084ad..b6bd67902 100644 --- a/lightwood/helpers/general.py +++ b/lightwood/helpers/general.py @@ -1,29 +1,37 @@ -import math import importlib -from typing import List, Union, Dict +from typing import List, Union, Dict, Optional import numpy as np import pandas as pd -from sklearn.metrics import r2_score, f1_score +from sklearn.metrics import r2_score, f1_score, mean_absolute_error +from lightwood.encoder.time_series.helpers.common import get_group_matches + + +# ------------------------- # +# Accuracy metrics +# ------------------------- # def evaluate_accuracy(data: pd.DataFrame, predictions: pd.Series, target: str, - accuracy_functions: List[str]) -> Dict[str, float]: + accuracy_functions: List[str], + ts_analysis: Optional[dict] = {}) -> Dict[str, float]: score_dict = {} for accuracy_function_str in accuracy_functions: if accuracy_function_str == 'evaluate_array_accuracy': - nr_predictions = len(predictions.iloc[0]) + nr_predictions = 1 if not isinstance(predictions.iloc[0], list) else len(predictions.iloc[0]) cols = [target] + 
[f'{target}_timestep_{i}' for i in range(1, nr_predictions)] true_values = data[cols].values.tolist() - accuracy_function = evaluate_array_accuracy + score_dict[accuracy_function_str] = evaluate_array_accuracy(list(true_values), + list(predictions), + data, + ts_analysis=ts_analysis) else: true_values = data[target].tolist() accuracy_function = getattr(importlib.import_module('sklearn.metrics'), accuracy_function_str) - - score_dict[accuracy_function_str] = accuracy_function(list(true_values), list(predictions)) + score_dict[accuracy_function_str] = accuracy_function(list(true_values), list(predictions)) return score_dict @@ -48,20 +56,100 @@ def evaluate_multilabel_accuracy(true_values, predictions, **kwargs): def evaluate_array_accuracy( + true_values: List[List[Union[int, float]]], + predictions: List[List[Union[int, float]]], + data: pd.DataFrame, + **kwargs +) -> float: + """ + Evaluate accuracy in numerical time series forecasting tasks. + Defaults to mean absolute scaled error (MASE) if in-sample residuals are available. + If this is not the case, R2 score is computed instead. + + Scores are computed for each timestep (as determined by `timeseries_settings.nr_predictions`), + and the final accuracy is the reciprocal of the average score through all timesteps. 
+ """ + + ts_analysis = kwargs.get('ts_analysis', {}) + naive_errors = ts_analysis.get('ts_naive_mae', {}) + + if not naive_errors: + # use mean R2 method if naive errors are not available + return evaluate_array_r2_accuracy(true_values, predictions, ts_analysis=ts_analysis) + + mases = [] + true_values = np.array(true_values) + predictions = np.array(predictions) + wrapped_data = {'data': data.reset_index(drop=True), + 'group_info': {gcol: data[gcol].tolist() + for gcol in ts_analysis['tss'].group_by} if ts_analysis['tss'].group_by else {} + } + for group in ts_analysis['group_combinations']: + g_idxs, _ = get_group_matches(wrapped_data, group) + + # only evaluate populated groups + if g_idxs: + trues = true_values[g_idxs] + preds = predictions[g_idxs] + + if ts_analysis['tss'].nr_predictions == 1: + preds = np.expand_dims(preds, axis=1) + + # only evaluate accuracy for rows with complete historical context + if len(trues) > ts_analysis['tss'].window: + trues = trues[ts_analysis['tss'].window:] + preds = preds[ts_analysis['tss'].window:] + + # add MASE score for each group (__default only considered if the task is non-grouped) + if len(ts_analysis['group_combinations']) == 1 or group != '__default': + mases.append(mase(trues, preds, ts_analysis['ts_naive_mae'][group], ts_analysis['tss'].nr_predictions)) + + return 1 / max(np.average(mases), 1e-4) # reciprocal to respect "larger -> better" convention + + +def evaluate_array_r2_accuracy( true_values: List[List[Union[int, float]]], predictions: List[List[Union[int, float]]], **kwargs ) -> float: - # @TODO: ideally MASE here + """ + Default time series forecasting accuracy method. + Returns mean R2 score over all timesteps in the forecasting horizon. 
+ """ base_acc_fn = kwargs.get('base_acc_fn', lambda t, p: max(0, r2_score(t, p))) - aggregate = 0 - for i in range(len(predictions)): - try: - valid_horizon = [math.isnan(x) for x in true_values[i]].index(True) - except ValueError: - valid_horizon = len(true_values[i]) + aggregate = 0.0 + + fh = 1 if not isinstance(predictions[0], list) else len(predictions[0]) + if fh == 1: + predictions = [[p] for p in predictions] + + # only evaluate accuracy for rows with complete historical context + if kwargs.get('ts_analysis', {}).get('tss', False): + true_values = true_values[kwargs['ts_analysis']['tss'].window:] + predictions = predictions[kwargs['ts_analysis']['tss'].window:] + + for i in range(fh): + aggregate += base_acc_fn([t[i] for t in true_values], [p[i] for p in predictions]) + + return aggregate / fh + + +# ------------------------- # +# Helpers +# ------------------------- # +def mase(trues, preds, scale_error, fh): + """ + Computes mean absolute scaled error. + The scale corrective factor is the mean in-sample residual from the naive forecasting method. 
+ """ + if scale_error == 0: + scale_error = 1 # cover (rare) case where series is constant - aggregate += base_acc_fn(true_values[i][:valid_horizon], predictions[i][:valid_horizon]) + agg = 0.0 + for i in range(fh): + true = [t[i] for t in trues] + pred = [p[i] for p in preds] + agg += mean_absolute_error(true, pred) - return aggregate / len(predictions) + return (agg / fh) / scale_error diff --git a/lightwood/helpers/templating.py b/lightwood/helpers/templating.py index 000de4d69..5c8ecd542 100644 --- a/lightwood/helpers/templating.py +++ b/lightwood/helpers/templating.py @@ -29,7 +29,7 @@ def is_allowed(v): return True # Allowed variable names - if v in ['df', 'nsubsets', 'data', 'encoded_data', 'train_data', 'encoded_train_data', 'test_data']: + if v in ['df', 'data', 'encoded_data', 'train_data', 'encoded_train_data', 'test_data']: return True try: diff --git a/lightwood/helpers/text.py b/lightwood/helpers/text.py index 169010443..5ea4ae489 100644 --- a/lightwood/helpers/text.py +++ b/lightwood/helpers/text.py @@ -230,7 +230,7 @@ def get_identifier_description(data, column_name, data_dtype): # Detect foreign key if data_dtype == dtype.integer: if _is_foreign_key_name(column_name): - return 'Foregin key' + return 'Foreign key' if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary): if unquie_pct > 0.98: diff --git a/lightwood/mixer/base.py b/lightwood/mixer/base.py index 2f2fcfdbf..8cf829a42 100644 --- a/lightwood/mixer/base.py +++ b/lightwood/mixer/base.py @@ -1,4 +1,3 @@ -from typing import List import pandas as pd from lightwood.data.encoded_ds import EncodedDs @@ -11,11 +10,11 @@ def __init__(self, stop_after: int): self.stop_after = stop_after self.supports_proba = None - def fit(self, data: List[EncodedDs]) -> None: + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: raise NotImplementedError() def __call__(self, ds: EncodedDs, predict_proba: bool = False) -> pd.DataFrame: raise NotImplementedError() - def 
partial_fit(self, train_data: List[EncodedDs], test_data: List[EncodedDs]) -> None: + def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: pass diff --git a/lightwood/mixer/lightgbm.py b/lightwood/mixer/lightgbm.py index f425b07e0..0bd139393 100644 --- a/lightwood/mixer/lightgbm.py +++ b/lightwood/mixer/lightgbm.py @@ -1,5 +1,5 @@ import pandas as pd -from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs +from lightwood.data.encoded_ds import EncodedDs from lightwood.api import dtype from typing import Dict, List, Set import numpy as np @@ -11,6 +11,8 @@ from lightwood.helpers.log import log from sklearn.preprocessing import OrdinalEncoder from lightwood.mixer.base import BaseMixer +from lightwood.helpers.device import get_devices + optuna.logging.set_verbosity(optuna.logging.CRITICAL) @@ -22,7 +24,11 @@ def check_gpu_support(): train_data = lightgbm.Dataset(data, label=label) params = {'num_iterations': 1, 'device': 'gpu'} lightgbm.train(params, train_set=train_data) - return True + device, nr_devices = get_devices() + if nr_devices > 0 and str(device) != 'cpu': + return True + else: + return False except Exception: return False @@ -91,35 +97,33 @@ def _to_dataset(self, data, output_dtype): label_data = [x if x in self.label_set else '__mdb_unknown_cat' for x in label_data] label_data = self.ordinal_encoder.transform(np.array(label_data).reshape(-1, 1)).flatten() elif output_dtype == dtype.integer: - label_data = label_data.astype(int) - elif output_dtype == dtype.float: + label_data = label_data.clip(-pow(2, 63), pow(2, 63)).astype(int) + elif output_dtype in (dtype.float, dtype.quantity): label_data = label_data.astype(float) data[subset_name]['label_data'] = label_data return data - def fit(self, ds_arr: List[EncodedDs]) -> None: + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: log.info('Started fitting LGBM model') - train_ds_arr = ds_arr[0:int(len(ds_arr) * 0.9)] - dev_ds_arr = ds_arr[int(len(ds_arr) * 
0.9):] data = { - 'train': {'ds': ConcatedEncodedDs(train_ds_arr), 'data': None, 'label_data': {}}, - 'dev': {'ds': ConcatedEncodedDs(dev_ds_arr), 'data': None, 'label_data': {}} + 'train': {'ds': train_data, 'data': None, 'label_data': {}}, + 'dev': {'ds': dev_data, 'data': None, 'label_data': {}} } self.fit_data_len = len(data['train']['ds']) - self.positive_domain = getattr(train_ds_arr[0].encoders.get(self.target, None), 'positive_domain', False) + self.positive_domain = getattr(train_data.encoders.get(self.target, None), 'positive_domain', False) output_dtype = self.dtype_dict[self.target] data = self._to_dataset(data, output_dtype) - if output_dtype not in (dtype.categorical, dtype.integer, dtype.float, dtype.binary): + if output_dtype not in (dtype.categorical, dtype.integer, dtype.float, dtype.binary, dtype.quantity): log.error(f'Lightgbm mixer not supported for type: {output_dtype}') raise Exception(f'Lightgbm mixer not supported for type: {output_dtype}') else: - objective = 'regression' if output_dtype in (dtype.integer, dtype.float) else 'multiclass' - metric = 'l2' if output_dtype in (dtype.integer, dtype.float) else 'multi_logloss' + objective = 'regression' if output_dtype in (dtype.integer, dtype.float, dtype.quantity) else 'multiclass' + metric = 'l2' if output_dtype in (dtype.integer, dtype.float, dtype.quantity) else 'multi_logloss' self.params = { 'objective': objective, @@ -182,15 +186,14 @@ def fit(self, ds_arr: List[EncodedDs]) -> None: log.info(f'Lightgbm model contains {self.model.num_trees()} weak estimators') if self.fit_on_dev: - self.partial_fit(dev_ds_arr, train_ds_arr) + self.partial_fit(dev_data, train_data) - def partial_fit(self, train_data: List[EncodedDs], dev_data: List[EncodedDs]) -> None: - ds = ConcatedEncodedDs(train_data) - pct_of_original = len(ds) / self.fit_data_len + def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + pct_of_original = len(train_data) / self.fit_data_len iterations = max(1, 
int(self.num_iterations * pct_of_original) / 2) - data = {'retrain': {'ds': ds, 'data': None, 'label_data': {}}, 'dev': { - 'ds': ConcatedEncodedDs(dev_data), 'data': None, 'label_data': {}}} + data = {'retrain': {'ds': train_data, 'data': None, 'label_data': {}}, 'dev': { + 'ds': dev_data, 'data': None, 'label_data': {}}} output_dtype = self.dtype_dict[self.target] data = self._to_dataset(data, output_dtype) diff --git a/lightwood/mixer/lightgbm_array.py b/lightwood/mixer/lightgbm_array.py index 005c0e071..a2bb5cc6f 100644 --- a/lightwood/mixer/lightgbm_array.py +++ b/lightwood/mixer/lightgbm_array.py @@ -31,23 +31,23 @@ def __init__( self.supports_proba = False self.stable = True - def fit(self, ds_arr: List[EncodedDs]) -> None: + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: log.info('Started fitting LGBM models for array prediction') for timestep in range(self.n_ts_predictions): if timestep > 0: - for idx in range(len(ds_arr)): - ds_arr[idx].data_frame[self.target] = ds_arr[idx].data_frame[f'{self.target}_timestep_{timestep}'] - self.models[timestep].fit(ds_arr) # @TODO: this call could be parallelized + train_data.data_frame[self.target] = train_data.data_frame[f'{self.target}_timestep_{timestep}'] + dev_data.data_frame[self.target] = dev_data.data_frame[f'{self.target}_timestep_{timestep}'] - def partial_fit(self, train_data: List[EncodedDs], dev_data: List[EncodedDs]) -> None: + self.models[timestep].fit(train_data, dev_data) # @TODO: this call could be parallelized + + def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: log.info('Updating array of LGBM models...') for timestep in range(self.n_ts_predictions): if timestep > 0: - for data in train_data, dev_data: - for idx in range(len(data)): - data[idx].data_frame[self.target] = data[idx].data_frame[f'{self.target}_timestep_{timestep}'] + train_data.data_frame[self.target] = train_data.data_frame[f'{self.target}_timestep_{timestep}'] + 
dev_data.data_frame[self.target] = dev_data.data_frame[f'{self.target}_timestep_{timestep}'] self.models[timestep].partial_fit(train_data, dev_data) # @TODO: this call could be parallelized diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index f6a41197e..3c60dff1c 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -17,7 +17,7 @@ from lightwood.helpers.log import log from lightwood.api.types import TimeseriesSettings from lightwood.helpers.torch import LightwoodAutocast -from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs +from lightwood.data.encoded_ds import EncodedDs from lightwood.mixer.helpers.transform_corss_entropy_loss import TransformCrossEntropyLoss from lightwood.mixer.base import BaseMixer from lightwood.mixer.helpers.ar_net import ArNet @@ -50,8 +50,8 @@ def __init__( self.search_hyperparameters = search_hyperparameters self.stable = True - def _final_tuning(self, data_arr): - if self.dtype_dict[self.target] in (dtype.integer, dtype.float): + def _final_tuning(self, data): + if self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.quantity): self.model = self.model.eval() with torch.no_grad(): acc_dict = {} @@ -59,17 +59,16 @@ def _final_tuning(self, data_arr): self.target_encoder.decode_log = decode_log decoded_predictions = [] decoded_real_values = [] - for data in data_arr: - for X, Y in data: - X = X.to(self.model.device) - Y = Y.to(self.model.device) - Yh = self.model(X) + for X, Y in data: + X = X.to(self.model.device) + Y = Y.to(self.model.device) + Yh = self.model(X) - Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh - Y = torch.unsqueeze(Y, 0) if len(Y.shape) < 2 else Y + Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh + Y = torch.unsqueeze(Y, 0) if len(Y.shape) < 2 else Y - decoded_predictions.extend(self.target_encoder.decode(Yh)) - decoded_real_values.extend(self.target_encoder.decode(Y)) + decoded_predictions.extend(self.target_encoder.decode(Yh)) + 
decoded_real_values.extend(self.target_encoder.decode(Y)) acc_dict[decode_log] = r2_score(decoded_real_values, decoded_predictions) @@ -80,10 +79,10 @@ def _select_criterion(self) -> torch.nn.Module: criterion = TransformCrossEntropyLoss(weight=self.target_encoder.index_weights.to(self.model.device)) elif self.dtype_dict[self.target] in (dtype.tags): criterion = nn.BCEWithLogitsLoss() - elif (self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.tsarray) + elif (self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.tsarray, dtype.quantity) and self.timeseries_settings.is_timeseries): criterion = nn.L1Loss() - elif self.dtype_dict[self.target] in (dtype.integer, dtype.float): + elif self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.quantity): criterion = MSELoss() else: criterion = MSELoss() @@ -183,7 +182,6 @@ def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, r running_losses.append(loss.item()) train_error = np.mean(running_losses) - epoch_error = self._error(dev_dl, criterion) running_errors.append(epoch_error) log.debug(f'Loss @ epoch {epoch}: {epoch_error}') @@ -223,39 +221,34 @@ def _error(self, dev_dl, criterion) -> float: running_losses.append(criterion(Yh, Y).item()) return np.mean(running_losses) - def _init_net(self, ds_arr: List[EncodedDs]): - net_kwargs = {'input_size': len(ds_arr[0][0][0]), - 'output_size': len(ds_arr[0][0][1]), + def _init_net(self, ds: EncodedDs): + net_kwargs = {'input_size': len(ds[0][0]), + 'output_size': len(ds[0][1]), 'num_hidden': self.num_hidden, 'dropout': 0} if self.net_class == ArNet: - net_kwargs['encoder_span'] = ds_arr[0].encoder_spans + net_kwargs['encoder_span'] = ds.encoder_spans net_kwargs['target_name'] = self.target self.model = self.net_class(**net_kwargs) # @TODO: Compare partial fitting fully on and fully off on the benchmarks! 
# @TODO: Writeup on the methodology for partial fitting - def fit(self, ds_arr: List[EncodedDs]) -> None: + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: # ConcatedEncodedDs - train_ds_arr = ds_arr[0:int(len(ds_arr) * 0.9)] - dev_ds_arr = ds_arr[int(len(ds_arr) * 0.9):] - - con_train_ds = ConcatedEncodedDs(train_ds_arr) - con_test_ds = ConcatedEncodedDs(dev_ds_arr) - self.batch_size = min(200, int(len(con_train_ds) / 10)) + self.batch_size = min(200, int(len(train_data) / 10)) self.batch_size = max(40, self.batch_size) - dev_dl = DataLoader(con_test_ds, batch_size=self.batch_size, shuffle=False) - train_dl = DataLoader(con_train_ds, batch_size=self.batch_size, shuffle=False) + dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=False) + train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=False) self.lr = 1e-4 self.num_hidden = 1 # Find learning rate # keep the weights - self._init_net(ds_arr) + self._init_net(train_data) self.lr, self.model = self._find_lr(train_dl) # Keep on training @@ -263,24 +256,19 @@ def fit(self, ds_arr: List[EncodedDs]) -> None: criterion = self._select_criterion() scaler = GradScaler() - train_dl = DataLoader(ConcatedEncodedDs(train_ds_arr), batch_size=200, shuffle=True) - self.model, epoch_to_best_model, err = self._max_fit( train_dl, dev_dl, criterion, optimizer, scaler, self.stop_after, return_model_after=20000) self.epochs_to_best += epoch_to_best_model - if len(con_test_ds) > 0: - if self.fit_on_dev: - self.partial_fit(dev_ds_arr, train_ds_arr) - self._final_tuning(dev_ds_arr) + if self.fit_on_dev: + self.partial_fit(dev_data, train_data) + self._final_tuning(dev_data) - def partial_fit(self, train_data: List[EncodedDs], dev_data: List[EncodedDs]) -> None: + def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: # Based this on how long the initial training loop took, at a low learning rate as to not mock anything up tooo badly # noqa - train_ds = 
ConcatedEncodedDs(train_data) - dev_ds = ConcatedEncodedDs(dev_data + train_data) - train_dl = DataLoader(train_ds, batch_size=self.batch_size, shuffle=True) - dev_dl = DataLoader(dev_ds, batch_size=self.batch_size, shuffle=True) + train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=True) + dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=True) optimizer = self._select_optimizer() criterion = self._select_criterion() scaler = GradScaler() diff --git a/lightwood/mixer/regression.py b/lightwood/mixer/regression.py index 42b9669d6..33ab2f802 100644 --- a/lightwood/mixer/regression.py +++ b/lightwood/mixer/regression.py @@ -1,5 +1,3 @@ -from typing import List - import torch import pandas as pd from scipy.special import softmax @@ -25,13 +23,13 @@ def __init__(self, stop_after: int, target_encoder: BaseEncoder, dtype_dict: dic self.label_map = {} self.stable = False - def fit(self, ds_arr: List[EncodedDs]) -> None: - if self.target_dtype not in (dtype.float, dtype.integer): + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + if self.target_dtype not in (dtype.float, dtype.integer, dtype.quantity): raise Exception(f'Unspported {self.target_dtype} type for regression') log.info('Fitting Linear Regression model') X = [] Y = [] - for x, y in ConcatedEncodedDs(ds_arr): + for x, y in ConcatedEncodedDs([train_data, dev_data]): X.append(x.tolist()) Y.append(y.tolist()) @@ -41,8 +39,8 @@ def fit(self, ds_arr: List[EncodedDs]) -> None: self.model = LinearRegression().fit(X, Y) log.info(f'Regression based correlation of: {self.model.score(X, Y)}') - def partial_fit(self, train_data: List[EncodedDs], dev_data: List[EncodedDs]) -> None: - self.fit(train_data + dev_data) + def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + self.fit(train_data, dev_data) def __call__(self, ds: EncodedDs, predict_proba: bool = False) -> pd.DataFrame: X = [] diff --git a/lightwood/mixer/sktime.py b/lightwood/mixer/sktime.py 
index d9f9a09ac..6620a512f 100644 --- a/lightwood/mixer/sktime.py +++ b/lightwood/mixer/sktime.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from typing import Dict, List, Union +from typing import Dict, Union from sktime.forecasting.arima import AutoARIMA from lightwood.api import dtype @@ -32,10 +32,10 @@ def __init__( self.supports_proba = False self.stable = True - def fit(self, ds_arr: List[EncodedDs]) -> None: + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: log.info('Started fitting sktime forecaster for array prediction') - all_subsets = ConcatedEncodedDs(ds_arr) + all_subsets = ConcatedEncodedDs([train_data, dev_data]) df = all_subsets.data_frame.sort_values(by=f'__mdb_original_{self.ts_analysis["tss"].order_by[0]}') data = {'data': df[self.target], 'group_info': {gcol: df[gcol].tolist() @@ -91,12 +91,14 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs], predict_proba: bool series_idxs, series_data = get_group_matches(data, group) if series_data.size > 0: + forecaster = self.models[group] if self.models[group].is_fitted else self.models['__default'] + series = pd.Series(series_data.squeeze(), index=series_idxs) series = series.sort_index(ascending=True) series = series.reset_index(drop=True) for idx, _ in enumerate(series.iteritems()): - ydf['prediction'].iloc[series_idxs[idx]] = self.models[group].predict( + ydf['prediction'].iloc[series_idxs[idx]] = forecaster.predict( np.arange(idx, # +cutoff idx + self.n_ts_predictions)).tolist() # +cutoff diff --git a/lightwood/mixer/unit.py b/lightwood/mixer/unit.py index e74cb9daf..7e682e3cf 100644 --- a/lightwood/mixer/unit.py +++ b/lightwood/mixer/unit.py @@ -21,12 +21,10 @@ def __init__(self, stop_after: int, target_encoder: BaseEncoder): self.supports_proba = False self.stable = True - def fit(self, ds_arr: List[EncodedDs]) -> None: + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: log.info("Unit Mixer just borrows from encoder") - def partial_fit( 
- self, train_data: List[EncodedDs], dev_data: List[EncodedDs] - ) -> None: + def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: pass def __call__(self, ds: EncodedDs, predict_proba: bool = False) -> pd.DataFrame: diff --git a/requirements.txt b/requirements.txt index 7bd3d735a..77a934fca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ NLTK >= 3, != 3.6 python-dateutil <2.8.1,>=2.1 numpy >= 1.16.2 -pandas >= 0.25.1 +pandas >= 1.1.5 schema >= 0.6.8 torchvision >= 0.10.0 torch >= 1.9.0 @@ -13,7 +13,7 @@ scipy >= 1.5.4 psutil >= 5.7.0 setuptools >= 21.2.1 wheel >= 0.32.2 -scikit-learn +scikit-learn <= 0.24.2 pillow < 7 langdetect >= 1.0.0 dataclasses_json >= 0.5.4 @@ -22,4 +22,4 @@ dill == 0.3.4 sktime >= 0.5.0 torch_optimizer == 0.1.0 pmdarima >= 1.8.0 -black >= 21.9b0 \ No newline at end of file +black >= 21.9b0 diff --git a/tests/integration/advanced/test_custom_modules.py b/tests/integration/advanced/test_custom_modules.py new file mode 100644 index 000000000..a7b1d805e --- /dev/null +++ b/tests/integration/advanced/test_custom_modules.py @@ -0,0 +1,106 @@ +from lightwood.api.high_level import json_ai_from_problem, code_from_json_ai, predictor_from_code +from lightwood.api.types import JsonAI, ProblemDefinition +import unittest +from mindsdb_datasources import FileDS +import os +import shutil + + +test_err_message = 'This ! Is ! A ! Testing ! Error !' 
+mdir = os.path.expanduser('~/lightwood_modules') + + +def create_custom_module(mpath, mcode): + try: + shutil.rmtree(mpath) + except Exception: + pass + + try: + os.mkdir(mdir) + except Exception: + pass + + with open(mpath, 'w') as fp: + fp.write(mcode) + + +class TestBasic(unittest.TestCase): + def test_0_add_throwing_cleaner(self): + module_code = """ +import pandas as pd + +def throwing_cleaner(data: pd.DataFrame, err_msg: str): + assert isinstance(data, pd.DataFrame) + raise Exception(err_msg) +""" + create_custom_module(os.path.join(mdir, 'custom_cleaners.py'), module_code) + + # Create base json ai + df = FileDS('tests/data/hdi.csv').df.iloc[0:400] + json_ai = json_ai_from_problem(df, ProblemDefinition.from_dict({'target': 'Development Index', 'time_aim': 20})) + + # modify it + json_ai_dump = json_ai.to_dict() + json_ai_dump['cleaner'] = { + 'module': 'custom_cleaners.throwing_cleaner', + 'args': { + 'err_msg': f'"{test_err_message}"' + } + } + + json_ai = JsonAI.from_dict(json_ai_dump) + + # create a predictor from it + code = code_from_json_ai(json_ai) + predictor = predictor_from_code(code) + try: + predictor.learn(df) + except Exception as e: + assert str(e) == test_err_message + return + + raise Exception('Predictor did not contain modified function!') + + def test_1_add_analyzer_block(self): + + mname = 'custom_analyzers' + cname = 'ExampleAnalysis' + module_code = f""" +from lightwood.analysis.base import BaseAnalysisBlock + +class {cname}(BaseAnalysisBlock): + def __init__(self): + super().__init__(deps=None) + + def analyze(self, info, **kwargs): + info['test'] = 'test' + return info + + def explain(self, row_insights, global_insights, **kwargs): + row_insights['test'] = 'test' + return row_insights, global_insights +""" + create_custom_module(os.path.join(mdir, f'{mname}.py'), module_code) + + # Create base json ai + df = FileDS('tests/data/hdi.csv').df.iloc[0:400] + json_ai = json_ai_from_problem(df, ProblemDefinition.from_dict({'target': 
'Development Index', 'time_aim': 20})) + + # modify it + json_ai_dump = json_ai.to_dict() + json_ai_dump['analysis_blocks'] = [{ + 'module': f'{mname}.{cname}', + 'args': {} + }] + + json_ai = JsonAI.from_dict(json_ai_dump) + + # create a predictor from it + code = code_from_json_ai(json_ai) + predictor = predictor_from_code(code) + predictor.learn(df) + row_insights = predictor.predict(df) + + assert predictor.runtime_analyzer['test'] == 'test' + assert row_insights['test'].iloc[0] == 'test' diff --git a/tests/integration/advanced/test_timeseries.py b/tests/integration/advanced/test_timeseries.py index 1139fdbb8..01cde679b 100644 --- a/tests/integration/advanced/test_timeseries.py +++ b/tests/integration/advanced/test_timeseries.py @@ -56,7 +56,6 @@ def test_0_time_series_grouped_regression(self): pred = predictor_from_problem(train, ProblemDefinition.from_dict({'target': target, 'time_aim': 30, - 'nsubsets': 10, 'anomaly_detection': True, 'timeseries_settings': { 'use_previous_target': True, @@ -94,7 +93,6 @@ def test_1_time_series_regression(self): window = 5 pred = predictor_from_problem(data, ProblemDefinition.from_dict({'target': target, - 'nsubsets': 10, 'anomaly_detection': False, 'timeseries_settings': { 'use_previous_target': False, @@ -129,7 +127,6 @@ def test_2_time_series_classification(self): predictor = predictor_from_problem(df, ProblemDefinition.from_dict({'target': target, 'time_aim': 30, - 'nsubsets': 5, 'anomaly_detection': False, 'timeseries_settings': { 'order_by': ['T'], diff --git a/tests/integration/basic/test_airline.py b/tests/integration/basic/test_airline.py index a5f03cd84..46fc87097 100644 --- a/tests/integration/basic/test_airline.py +++ b/tests/integration/basic/test_airline.py @@ -6,6 +6,7 @@ class TestBasic(unittest.TestCase): + # Interesting: has coordinates as inputs def test_0_predict_file_flow(self): from lightwood.api.high_level import predictor_from_problem diff --git a/tests/integration/basic/test_boston_housing.py 
b/tests/integration/basic/test_boston_housing.py index 803a70130..b561aeb54 100644 --- a/tests/integration/basic/test_boston_housing.py +++ b/tests/integration/basic/test_boston_housing.py @@ -1,3 +1,4 @@ +from lightwood.api.dtype import dtype import unittest import pandas as pd from sklearn.metrics import r2_score @@ -16,11 +17,17 @@ def test_0_predict_file_flow(self): df = df.rename(columns={df.columns[3]: f'{{{df.columns[3]}\"'}) target = 'MEDV' + # Make this a quantity + df[target] = [f'{x}$' for x in df[target]] + predictor = predictor_from_problem(df, ProblemDefinition.from_dict({'target': target, 'time_aim': 200})) predictor.learn(df) + + assert predictor.model_analysis.dtypes[target] == dtype.quantity + predictions = predictor.predict(df) # sanity checks - self.assertTrue(r2_score(df[target], predictions['prediction']) > 0.8) + self.assertTrue(r2_score([float(x.rstrip('$')) for x in df[target]], predictions['prediction']) > 0.8) self.assertTrue(all([0 <= p <= 1 for p in predictions['confidence']])) self.assertTrue(all([p['lower'] <= p['prediction'] <= p['upper'] for _, p in predictions.iterrows()])) diff --git a/tests/integration/basic/test_categorical.py b/tests/integration/basic/test_categorical.py index d2884979d..59a60d48c 100644 --- a/tests/integration/basic/test_categorical.py +++ b/tests/integration/basic/test_categorical.py @@ -35,12 +35,12 @@ def setup_predictor(self, df, target): return predictor def test_0_binary(self): - df = pd.read_csv('tests/data/adult.csv')[:300] + df = pd.read_csv('tests/data/adult.csv')[:100] target = 'income' predictor = self.setup_predictor(df, target) predictions = predictor.predict(df) - - self.assertTrue(balanced_accuracy_score(df[target], predictions['prediction']) > 0.7) + acc = balanced_accuracy_score(df[target], predictions['prediction']) + self.assertTrue(acc > 0.5) self.assertTrue(all([0 <= p <= 1 for p in predictions['confidence']])) def test_1_categorical(self): diff --git 
a/tests/integration/basic/test_weird_target_dist.py b/tests/integration/basic/test_weird_target_dist.py index 9415d7984..34f24bd50 100644 --- a/tests/integration/basic/test_weird_target_dist.py +++ b/tests/integration/basic/test_weird_target_dist.py @@ -1,6 +1,7 @@ import unittest import pandas as pd from lightwood.api.types import ProblemDefinition +from lightwood import dtype class TestBasic(unittest.TestCase): @@ -28,4 +29,5 @@ def test_0_unkown_cateogires_in_test(self): predictor = predictor_from_problem(df, ProblemDefinition.from_dict( {'target': target, 'time_aim': 60, 'unbias_target': True})) predictor.learn(df) + assert predictor.model_analysis.dtypes['target'] == dtype.categorical predictor.predict(df) diff --git a/tests/unit_tests/data/test_transform_ts.py b/tests/unit_tests/data/test_transform_ts.py index e69de29bb..6177792a1 100644 --- a/tests/unit_tests/data/test_transform_ts.py +++ b/tests/unit_tests/data/test_transform_ts.py @@ -0,0 +1,66 @@ +import unittest + +import numpy as np +import pandas as pd + +from lightwood.data.timeseries_analyzer import get_naive_residuals +from lightwood.helpers.general import mase, evaluate_array_r2_accuracy + + +class TestTransformTS(unittest.TestCase): + def test_mase(self): + true = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]]) + + # edge case: perfect forecast + for scale_error in [1e0, 1e2, 1e4]: + self.assertTrue(mase(true, true, scale_error, fh=5) == 0) + + # check naive forecast is exactly one + naive_residual = np.average(abs(true[:, 1:] - true[:, :-1])) + self.assertTrue(mase(true[:, 1:], true[:, :-1], naive_residual, fh=4) == 1) + + # edge case: constant series + true = np.array([[2.0, 2.0, 2.0, 2.0, 2.0]]) + pred = np.array([[4.0, 4.0, 4.0, 4.0, 4.0]]) + self.assertTrue(mase(true, pred, 0.0, fh=5) == 2.0) + + # test multiple instance handling (i.e. 
two 5-step-ahead forecasts) + true = [[10, 20, 30, 40, 50], [60, 70, 80, 90, 100]] + pred = [[15, 25, 35, 45, 55], [65, 75, 85, 95, 105]] + self.assertTrue(mase(true, pred, scale_error=5, fh=5) == 1) + self.assertTrue(mase(true, pred, scale_error=1, fh=5) == 5) + self.assertTrue(mase(true, pred, scale_error=10, fh=5) == 0.5) + + def test_get_residuals(self): + data_len = 10 + + target = [i for i in range(data_len)] + all_residuals, mean = get_naive_residuals(pd.DataFrame(target)) + self.assertEqual(all_residuals, [1.0 for _ in range(data_len - 1)]) + self.assertEqual(mean, 1) + + target = [0 for _ in range(data_len)] + all_residuals, mean = get_naive_residuals(pd.DataFrame(target)) + self.assertEqual(all_residuals, [0.0 for _ in range(data_len - 1)]) + self.assertEqual(mean, 0) + + target = [1, 4, 2, 5, 3] + all_residuals, mean = get_naive_residuals(pd.DataFrame(target)) + self.assertEqual(all_residuals, [3.0, 2.0, 3.0, 2.0]) + self.assertEqual(mean, 2.5) + + def test_evaluate_array_r2_accuracy(self): + true = [[10, 20, 30, 40, 50], [60, 70, 80, 90, 100]] + self.assertTrue(evaluate_array_r2_accuracy(true, true) == 1.0) + + pred = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]] + self.assertTrue(evaluate_array_r2_accuracy(true, pred) == 0.0) + + pred = [[i + 1 for i in instance] for instance in true] + self.assertGreaterEqual(evaluate_array_r2_accuracy(true, pred), 0.99) + + pred = [[i - 1 for i in instance] for instance in true] + self.assertGreaterEqual(evaluate_array_r2_accuracy(true, pred), 0.99) + + pred = [[-i for i in instance] for instance in true] + self.assertTrue(evaluate_array_r2_accuracy(true, pred) == 0.0) diff --git a/tests/unit_tests/encoder/categorical/test_autoencoder.py b/tests/unit_tests/encoder/categorical/test_autoencoder.py index 7a1cbc18a..ed95fea2b 100644 --- a/tests/unit_tests/encoder/categorical/test_autoencoder.py +++ b/tests/unit_tests/encoder/categorical/test_autoencoder.py @@ -4,7 +4,7 @@ import random import logging from sklearn.metrics import 
accuracy_score - +import pandas as pd from lightwood.helpers.log import log @@ -34,7 +34,7 @@ def test_autoencoder(self): enc = CategoricalAutoEncoder(stop_after=20) enc.desired_error = 3 - enc.prepare(priming_data) + enc.prepare(pd.Series(priming_data), pd.Series(priming_data)) encoded_data = enc.encode(test_data) decoded_data = enc.decode(encoded_data) diff --git a/tests/unit_tests/encoder/text/test_pretrained.py b/tests/unit_tests/encoder/text/test_pretrained.py index f2568f2bd..1a28ea3a6 100644 --- a/tests/unit_tests/encoder/text/test_pretrained.py +++ b/tests/unit_tests/encoder/text/test_pretrained.py @@ -5,6 +5,7 @@ from lightwood.encoder.numeric import NumericEncoder from lightwood.encoder.text import PretrainedLangEncoder from lightwood.api.dtype import dtype +import pandas as pd class TestPretrainedLangEncoder(unittest.TestCase): @@ -32,7 +33,7 @@ def test_encode_and_decode(self): enc = PretrainedLangEncoder(stop_after=10) - enc.prepare(priming_data, + enc.prepare(pd.Series(priming_data), pd.Series(priming_data), encoded_target_values={'targets': [ {'output_type': dtype.float, 'encoded_output': encoded_data_1}, ]}) diff --git a/tests/unit_tests/encoder/text/test_rnn.py b/tests/unit_tests/encoder/text/test_rnn.py index dd55a9638..763cda226 100644 --- a/tests/unit_tests/encoder/text/test_rnn.py +++ b/tests/unit_tests/encoder/text/test_rnn.py @@ -1,5 +1,6 @@ import unittest from lightwood.encoder.text import RnnEncoder +import pandas as pd class TestRnnEncoder(unittest.TestCase): @@ -14,7 +15,7 @@ def test_encode_and_decode(self): ] encoder = RnnEncoder(encoded_vector_size=10, train_iters=7500) - encoder.prepare(sentences) + encoder.prepare(pd.Series(sentences), pd.Series(sentences)) encoder.encode(sentences) # test de decoder diff --git a/tests/unit_tests/encoder/text/test_short.py b/tests/unit_tests/encoder/text/test_short.py index 9f95855c9..83ee806ff 100644 --- a/tests/unit_tests/encoder/text/test_short.py +++ b/tests/unit_tests/encoder/text/test_short.py 
@@ -89,7 +89,6 @@ def test_smallvocab_target_auto_mode(self): enc = ShortTextEncoder(is_target=True) enc.prepare(priming_data) - assert not enc.cae.use_autoencoder assert enc.is_target is True # _combine is expected to be 'concat' when is_target is True @@ -113,7 +112,6 @@ def test_non_smallvocab_target_auto_mode(self): enc = ShortTextEncoder(is_target=True) enc.prepare(priming_data) - assert not enc.cae.use_autoencoder assert enc.is_target is True # _combine is expected to be 'concat' when is_target is True @@ -137,7 +135,6 @@ def test_smallvocab_non_target_auto_mode(self): enc = ShortTextEncoder(is_target=False) enc.prepare(priming_data) - assert not enc.cae.use_autoencoder assert enc.is_target is False # _combine is expected to be 'mean' when is_target is False @@ -157,7 +154,6 @@ def test_non_smallvocab_non_target_auto_mode(self): enc = ShortTextEncoder(is_target=False) enc.prepare(priming_data) - assert enc.cae.use_autoencoder assert enc.is_target is False # _combine is expected to be 'mean' when is_target is False @@ -177,7 +173,6 @@ def test_smallvocab_non_target_manual_mode(self): enc = ShortTextEncoder(is_target=False, mode='concat') enc.prepare(priming_data) - assert not enc.cae.use_autoencoder assert enc.is_target is False assert enc._mode == 'concat' @@ -199,7 +194,6 @@ def test_non_smallvocab_non_target_manual_mode(self): enc = ShortTextEncoder(is_target=False, mode='concat') enc.prepare(priming_data) - assert enc.cae.use_autoencoder assert enc.is_target is False assert enc._mode == 'concat' diff --git a/tests/unit_tests/encoder/time_series/test_timeseries_rnn.py b/tests/unit_tests/encoder/time_series/test_timeseries_rnn.py index 12dc17791..4383f9767 100644 --- a/tests/unit_tests/encoder/time_series/test_timeseries_rnn.py +++ b/tests/unit_tests/encoder/time_series/test_timeseries_rnn.py @@ -3,6 +3,7 @@ import torch from lightwood.encoder.time_series import TimeSeriesEncoder from lightwood.encoder.time_series.helpers.common import MinMaxNormalizer, 
CatNormalizer +import pandas as pd class TestRnnEncoder(unittest.TestCase): @@ -48,7 +49,8 @@ def test_overfit(self): batch_size = 1 encoder = TimeSeriesEncoder(stop_after=10) - encoder.prepare(data, feedback_hoop_function=lambda x: print(x), batch_size=batch_size) + encoder.prepare(pd.Series(data), pd.Series(data), + feedback_hoop_function=lambda x: print(x), batch_size=batch_size) encoded = encoder.encode(data) decoded = encoder.decode(encoded, steps=timesteps).tolist() diff --git a/tests/unit_tests/encoder/time_series/test_transformer.py b/tests/unit_tests/encoder/time_series/test_transformer.py index 2f6507f74..50d701927 100644 --- a/tests/unit_tests/encoder/time_series/test_transformer.py +++ b/tests/unit_tests/encoder/time_series/test_transformer.py @@ -4,6 +4,7 @@ import unittest from lightwood.encoder.time_series import TimeSeriesEncoder from lightwood.encoder.time_series.helpers.transformer_helpers import TransformerEncoder, len_to_mask, get_chunk +import pandas as pd class TestTransformerEncoder(unittest.TestCase): @@ -61,7 +62,7 @@ def test_overfit(self): encoder = TimeSeriesEncoder(stop_after=10) encoder.encoder_class = TransformerEncoder encoder._transformer_hidden_size = 32 - encoder.prepare(data, feedback_hoop_function=print) + encoder.prepare(pd.Series(data), pd.Series(data), feedback_hoop_function=print) correct_answer = torch.tensor(example)[:, 1:]