diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml
new file mode 100644
index 000000000..0f0769377
--- /dev/null
+++ b/.github/workflows/analysis.yml
@@ -0,0 +1,29 @@
+name: Static Code Analysis Of Codebase
+
+on: [push, pull_request]
+
+jobs:
+  static_analysis:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code for analysis
+        uses: actions/checkout@v2
+
+      - name: Install Python for analysis
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+
+      - name: Install dependencies for analysis
+        run: |
+          python -m pip install --upgrade pip
+          pip install prospector
+
+      - name: Run static code analysis using Prospector tool
+        run: prospector --output-format json > static_analysis.json
+
+      - name: Deploy Static Analysis Result to GitHub Artifacts
+        uses: actions/upload-artifact@v2
+        with:
+          name: static_analysis_result
+          path: static_analysis.json
\ No newline at end of file
diff --git a/.github/workflows/ligthtwood.yml b/.github/workflows/ligthtwood.yml
index 77914e128..c059eb005 100644
--- a/.github/workflows/ligthtwood.yml
+++ b/.github/workflows/ligthtwood.yml
@@ -2,14 +2,14 @@ name: Integration and Unit Tests Lightwood
 
 on:
   push:
+    branches: [ staging ]
   pull_request:
-    branches:
-      - stable
-      - staging
+    branches: [ staging ]
+
 
 jobs:
   test:
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-latest
     strategy:
       matrix:
         os: [ubuntu-latest]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..73b28aafb
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/jendrikseipp/vulture
+    rev: 'v2.3' # or any later Vulture version
+    hooks:
+      - id: vulture
\ No newline at end of file
diff --git a/lightwood/__about__.py b/lightwood/__about__.py
deleted file mode 100755
index ce1b7dff2..000000000
--- a/lightwood/__about__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-__title__ = 'lightwood'
-__package_name__ = 'lightwood'
-__version__ = '1.5.0'
-__description__ = "Lightwood is a toolkit for automatic machine learning model building"
-__email__ = "community@mindsdb.com"
-__author__ = 'MindsDB Inc'
-__github__ = 'https://github.com/mindsdb/lightwood'
-__pypi__ = 'https://pypi.org/project/lightwood'
-__license__ = 'GPL-3.0'
-__copyright__ = 'Copyright 2019- mindsdb'
diff --git a/lightwood/analysis/base.py b/lightwood/analysis/base.py
deleted file mode 100644
index 869236bae..000000000
--- a/lightwood/analysis/base.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from typing import Tuple, Dict, Optional
-
-import pandas as pd
-from lightwood.helpers.log import log
-
-
-class BaseAnalysisBlock:
-    """Class to be inherited by any analysis/explainer block."""
-    def __init__(self,
-                 deps: Optional[Tuple] = ()
-                 ):
-
-        self.dependencies = deps  # can be parallelized when there are no dependencies @TODO enforce
-
-    def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
-        """
-        This method should be called once during the analysis phase, or not called at all.
-        It computes any information that the block may either output to the model analysis object,
-        or use at inference time when `.explain()` is called (in this case, make sure all needed
-        objects are added to the runtime analyzer so that `.explain()` can access them).
-
-        :param info: Dictionary where any new information or objects are added. The next analysis block will use
-        the output of the previous block as a starting point.
-        :param kwargs: Dictionary with named variables from either the core analysis or the rest of the prediction
-        pipeline.
- """ - log.info(f"{self.__class__.__name__}.analyze() has not been implemented, no modifications will be done to the model analysis.") # noqa - return info - - def explain(self, - row_insights: pd.DataFrame, - global_insights: Dict[str, object], **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]: - """ - This method should be called once during the explaining phase at inference time, or not called at all. - Additional explanations can be at an instance level (row-wise) or global. - For the former, return a data frame with any new insights. For the latter, a dictionary is required. - - :param row_insights: dataframe with previously computed row-level explanations. - :param global_insights: dict() with any explanations that concern all predicted instances or the model itself. - - :returns: - - row_insights: modified input dataframe with any new row insights added here. - - global_insights: dict() with any explanations that concern all predicted instances or the model itself. - """ - log.info(f"{self.__class__.__name__}.explain() has not been implemented, no modifications will be done to the data insights.") # noqa - return row_insights, global_insights diff --git a/lightwood/analysis/explain.py b/lightwood/analysis/explain.py deleted file mode 100644 index 1ba3f9aa2..000000000 --- a/lightwood/analysis/explain.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Optional, List, Dict -import torch -import pandas as pd - -from lightwood.helpers.log import log -from lightwood.api.types import TimeseriesSettings -from lightwood.helpers.ts import get_inferred_timestamps -from lightwood.analysis.base import BaseAnalysisBlock - - -def explain(data: pd.DataFrame, - encoded_data: torch.Tensor, - predictions: pd.DataFrame, - timeseries_settings: TimeseriesSettings, - analysis: Dict, - target_name: str, - target_dtype: str, - - positive_domain: bool, # @TODO: pass inside a {} with params for each block to avoid signature overload - fixed_confidence: float, - anomaly_detection: bool, - - # forces specific confidence level in ICP - anomaly_error_rate: float, - - # ignores anomaly detection for N steps after an - # initial anomaly triggers the cooldown period; - # implicitly assumes series are regularly spaced - anomaly_cooldown: int, - - explainer_blocks: Optional[List[BaseAnalysisBlock]] = [], - ts_analysis: Optional[Dict] = {} - ): - """ - This procedure runs at the end of every normal `.predict()` call. Its goal is to generate prediction insights, - potentially using information generated at the model analysis stage (e.g. confidence estimation). - - As in `analysis()`, any user-specified analysis blocks (see class `BaseAnalysisBlock`) are also called here. - - :return: - row_insights: a DataFrame containing predictions and all generated insights at a row-level. 
- """ - - # ------------------------- # - # Setup base insights - # ------------------------- # - data = data.reset_index(drop=True) - - row_insights = pd.DataFrame() - global_insights = {} - row_insights['prediction'] = predictions['prediction'] - - if target_name in data.columns: - row_insights['truth'] = data[target_name] - else: - row_insights['truth'] = [None] * len(predictions['prediction']) - - if timeseries_settings.is_timeseries: - if timeseries_settings.group_by: - for col in timeseries_settings.group_by: - row_insights[f'group_{col}'] = data[col] - - for col in timeseries_settings.order_by: - row_insights[f'order_{col}'] = data[col] - - for col in timeseries_settings.order_by: - row_insights[f'order_{col}'] = get_inferred_timestamps( - row_insights, col, ts_analysis['deltas'], timeseries_settings) - - kwargs = { - 'data': data, - 'encoded_data': encoded_data, - 'predictions': predictions, - 'analysis': analysis, - 'target_name': target_name, - 'target_dtype': target_dtype, - 'tss': timeseries_settings, - 'positive_domain': positive_domain, - 'fixed_confidence': fixed_confidence, - 'anomaly_detection': anomaly_detection, - 'anomaly_error_rate': anomaly_error_rate, - 'anomaly_cooldown': anomaly_cooldown - } - - # ------------------------- # - # Call explanation blocks - # ------------------------- # - for block in explainer_blocks: - log.info("The block %s is now running its explain() method", block.__class__.__name__) - row_insights, global_insights = block.explain(row_insights, global_insights, **kwargs) - - return row_insights, global_insights diff --git a/lightwood/analysis/helpers/__init__.py b/lightwood/analysis/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/analysis/helpers/acc_stats.py b/lightwood/analysis/helpers/acc_stats.py deleted file mode 100644 index a05597d8b..000000000 --- a/lightwood/analysis/helpers/acc_stats.py +++ /dev/null @@ -1,186 +0,0 @@ -import random -from types import SimpleNamespace -from typing import Dict, Optional - -import numpy as np -from sklearn.metrics import confusion_matrix - -from lightwood.api.dtype import dtype -from lightwood.analysis.base import BaseAnalysisBlock -from lightwood.helpers.general import evaluate_accuracy - - -class AccStats(BaseAnalysisBlock): - """ Computes accuracy stats and a confusion matrix for the validation dataset """ - - def __init__(self, deps=('ICP',)): - super().__init__(deps=deps) # @TODO: enforce that this actually prevents early execution somehow - - def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: - ns = SimpleNamespace(**kwargs) - - # @TODO: maybe pass ts_analysis to trigger group-wise MASE instead of R2 mean, though it wouldn't be 0-1 bounded - info['score_dict'] = evaluate_accuracy(ns.data, ns.normal_predictions['prediction'], - ns.target, ns.accuracy_functions) - info['normal_accuracy'] = np.mean(list(info['score_dict'].values())) - - self.fit(ns, info['result_df']) - info['val_overall_acc'], info['acc_histogram'], info['cm'], info['acc_samples'] = self.get_accuracy_stats() - return info - - def fit(self, ns: SimpleNamespace, conf=Optional[np.ndarray]): - self.col_stats = ns.dtype_dict - self.target = ns.target - self.input_cols = list(ns.dtype_dict.keys()) - self.buckets = ns.stats_info.buckets if ns.stats_info.buckets else {} - - self.normal_predictions_bucketized = [] - self.real_values_bucketized = [] - self.numerical_samples_arr = [] - - column_indexes = {} - for i, col in enumerate(self.input_cols): - column_indexes[col] = i 
- - real_present_inputs_arr = [] - for _, row in ns.data.iterrows(): - present_inputs = [1] * len(self.input_cols) - for i, col in enumerate(self.input_cols): - if str(row[col]) in ('None', 'nan', '', 'Nan', 'NAN', 'NaN'): - present_inputs[i] = 0 - real_present_inputs_arr.append(present_inputs) - - for n in range(len(ns.normal_predictions)): - row = ns.data.iloc[n] - real_value = row[self.target] - predicted_value = ns.normal_predictions.iloc[n]['prediction'] - - if isinstance(predicted_value, list): - # T+N time series, for now we compare the T+1 prediction only @TODO: generalize - predicted_value = predicted_value[0] - - predicted_value = predicted_value \ - if self.col_stats[self.target] not in [dtype.integer, dtype.float, dtype.quantity] \ - else float(predicted_value) - - real_value = real_value \ - if self.col_stats[self.target] not in [dtype.integer, dtype.float, dtype.quantity] \ - else float(real_value) - - if self.buckets: - bucket = self.buckets[self.target] - predicted_value_b = get_value_bucket(predicted_value, bucket, self.col_stats[self.target]) - real_value_b = get_value_bucket(real_value, bucket, self.col_stats[self.target]) - else: - predicted_value_b = predicted_value - real_value_b = real_value - - if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float, dtype.quantity]: - predicted_range = conf.iloc[n][['lower', 'upper']].tolist() - else: - predicted_range = (predicted_value_b, predicted_value_b) - - self.real_values_bucketized.append(real_value_b) - self.normal_predictions_bucketized.append(predicted_value_b) - if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float, dtype.quantity]: - self.numerical_samples_arr.append((real_value, predicted_range)) - - def get_accuracy_stats(self, is_classification=None, is_numerical=None): - bucket_accuracy = {} - bucket_acc_counts = {} - for i, bucket in enumerate(self.normal_predictions_bucketized): - if bucket not in bucket_acc_counts: - bucket_acc_counts[bucket] = [] - - if len(self.numerical_samples_arr) != 0: - bucket_acc_counts[bucket].append(self.numerical_samples_arr[i][1][0] < - self.numerical_samples_arr[i][0] < self.numerical_samples_arr[i][1][1]) # noqa - else: - bucket_acc_counts[bucket].append(1 if bucket == self.real_values_bucketized[i] else 0) - - for bucket in bucket_acc_counts: - bucket_accuracy[bucket] = sum(bucket_acc_counts[bucket]) / len(bucket_acc_counts[bucket]) - - accuracy_count = [] - for counts in list(bucket_acc_counts.values()): - accuracy_count += counts - - overall_accuracy = sum(accuracy_count) / len(accuracy_count) - - for bucket in range(len(self.buckets)): - if bucket not in bucket_accuracy: - if bucket in self.real_values_bucketized: - # If it was never predicted, but it did exist as a real value, then assume 0% confidence when it does get predicted # noqa - bucket_accuracy[bucket] = 0 - - for bucket in range(len(self.buckets)): - if bucket not in bucket_accuracy: - # If it wasn't seen either in the real values or in the predicted values, assume average confidence (maybe should be 0 instead ?) 
# noqa - bucket_accuracy[bucket] = overall_accuracy - - accuracy_histogram = { - 'buckets': list(bucket_accuracy.keys()), - 'accuracies': list(bucket_accuracy.values()), - 'is_classification': is_classification, - 'is_numerical': is_numerical - } - - labels = list(set([*self.real_values_bucketized, *self.normal_predictions_bucketized])) - matrix = confusion_matrix(self.real_values_bucketized, self.normal_predictions_bucketized, labels=labels) - matrix = [[int(y) if str(y) != 'nan' else 0 for y in x] for x in matrix] - - target_bucket = self.buckets[self.target] - bucket_values = [target_bucket[i] if i < len(target_bucket) else None for i in labels] - - cm = { - 'matrix': matrix, - 'predicted': bucket_values, - 'real': bucket_values - } - - accuracy_samples = None - if len(self.numerical_samples_arr) > 0: - nr_samples = min(400, len(self.numerical_samples_arr)) - sampled_numerical_samples_arr = random.sample(self.numerical_samples_arr, nr_samples) - accuracy_samples = { - 'y': [x[0] for x in sampled_numerical_samples_arr], - 'x': [x[1] for x in sampled_numerical_samples_arr] - } - - return overall_accuracy, accuracy_histogram, cm, accuracy_samples - - -def get_value_bucket(value, buckets, target_dtype): - """ - :return: The bucket in the `histogram` in which our `value` falls - """ - if buckets is None: - return None - - if target_dtype in (dtype.binary, dtype.categorical): - if value in buckets: - bucket = buckets.index(value) - else: - bucket = len(buckets) # for null values - - elif target_dtype in (dtype.integer, dtype.float, dtype.quantity): - bucket = closest(buckets, value) - else: - bucket = len(buckets) # for null values - - return bucket - - -def closest(arr, value): - """ - :return: The index of the member of `arr` which is closest to `value` - """ - if value is None: - return -1 - - for i, ele in enumerate(arr): - value = float(str(value).replace(',', '.')) - if ele > value: - return i - 1 - - return len(arr) - 1 diff --git a/lightwood/analysis/nc/LICENSE b/lightwood/analysis/nc/LICENSE deleted file mode 100644 index f305d4eb9..000000000 --- a/lightwood/analysis/nc/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Henrik Linusson - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
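A minimal sketch of the confusion-matrix step performed by the `AccStats.get_accuracy_stats()` method removed above; the bucketized values below are invented for illustration, and only the `sklearn.metrics.confusion_matrix` call pattern is taken from the deleted code:

```python
# Illustrative sketch of the confusion-matrix computation used by the removed AccStats block.
# The bucket indices here are made up; in AccStats they come from get_value_bucket().
from sklearn.metrics import confusion_matrix

real_values_bucketized = [0, 1, 1, 2, 2, 2]
normal_predictions_bucketized = [0, 1, 2, 2, 1, 2]

# Labels are the union of observed real and predicted buckets, as in get_accuracy_stats().
labels = sorted(set(real_values_bucketized) | set(normal_predictions_bucketized))

matrix = confusion_matrix(real_values_bucketized, normal_predictions_bucketized, labels=labels)
print(matrix)  # rows correspond to real buckets, columns to predicted buckets
```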
diff --git a/lightwood/analysis/nc/__init__.py b/lightwood/analysis/nc/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/analysis/nc/base.py b/lightwood/analysis/nc/base.py deleted file mode 100644 index 2e472e4d4..000000000 --- a/lightwood/analysis/nc/base.py +++ /dev/null @@ -1,146 +0,0 @@ -# Original author: Henrik Linusson (github.com/donlnz) -import abc -from typing import Dict -import numpy as np -from sklearn.base import BaseEstimator - - -from lightwood.analysis.nc.util import t_softmax - - -class RegressorMixin(object): - def __init__(self) -> None: - super(RegressorMixin, self).__init__() - - @classmethod - def get_problem_type(cls): - return 'regression' - - -class ClassifierMixin(object): - def __init__(self) -> None: - super(ClassifierMixin, self).__init__() - - @classmethod - def get_problem_type(cls) -> str: - return 'classification' - - -class BaseModelAdapter(BaseEstimator): - __metaclass__ = abc.ABCMeta - - def __init__(self, model: object, fit_params: Dict[str, object] = None) -> None: - super(BaseModelAdapter, self).__init__() - - self.model = model - self.last_x, self.last_y = None, None - self.clean = False - self.fit_params = {} if fit_params is None else fit_params - - def fit(self, x: np.array, y: np.array) -> None: - """Fits the model. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of examples for fitting the model. - - y : numpy array of shape [n_samples] - Outputs of examples for fitting the model. - - Returns - ------- - None - """ - - self.model.fit(x, y, **self.fit_params) - self.clean = False - - def predict(self, x: np.array) -> np.array: - """Returns the prediction made by the underlying model. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of test examples. - - Returns - ------- - y : numpy array of shape [n_samples] - Predicted outputs of test examples. - """ - if ( - not self.clean or - self.last_x is None or - self.last_y is None or - not np.array_equal(self.last_x, x) - ): - self.last_x = x - self.last_y = self._underlying_predict(x) - self.clean = True - - return self.last_y.copy() - - @abc.abstractmethod - def _underlying_predict(self, x: np.array) -> np.array: - """Produces a prediction using the encapsulated model. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of test examples. - - Returns - ------- - y : numpy array of shape [n_samples] - Predicted outputs of test examples. 
- """ - pass - - -class ClassifierAdapter(BaseModelAdapter): - def __init__(self, model: object, fit_params: Dict[str, object] = None) -> None: - super(ClassifierAdapter, self).__init__(model, fit_params) - - def _underlying_predict(self, x: np.array) -> np.array: - return self.model.predict_proba(x) - - -class RegressorAdapter(BaseModelAdapter): - def __init__(self, model: object, fit_params: Dict[str, object] = None) -> None: - super(RegressorAdapter, self).__init__(model, fit_params) - - def _underlying_predict(self, x: np.array) -> np.array: - return self.model.predict(x) - - -class CachedRegressorAdapter(RegressorAdapter): - def __init__(self, model, fit_params=None): - super(CachedRegressorAdapter, self).__init__(model, fit_params) - self.prediction_cache = None - - def fit(self, x=None, y=None): - """ At this point, the predictor has already been trained, but this - has to be called to setup some things in the nonconformist backend """ - pass - - def predict(self, x=None): - """ Same as in .fit() - :return: np.array (n_test, n_classes) with class probability estimates """ - return self.prediction_cache - - -class CachedClassifierAdapter(ClassifierAdapter): - def __init__(self, model, fit_params=None): - super(CachedClassifierAdapter, self).__init__(model, fit_params) - self.prediction_cache = None - - def fit(self, x=None, y=None): - """ At this point, the predictor has already been trained, but this - has to be called to setup some things in the nonconformist backend """ - pass - - def predict(self, x=None): - """ Same as in .fit() - :return: np.array (n_test, n_classes) with class probability estimates """ - return t_softmax(self.prediction_cache, t=0.5) diff --git a/lightwood/analysis/nc/icp.py b/lightwood/analysis/nc/icp.py deleted file mode 100644 index a4792a1f9..000000000 --- a/lightwood/analysis/nc/icp.py +++ /dev/null @@ -1,345 +0,0 @@ -""" -Inductive conformal predictors. -""" -# Original author: Henrik Linusson (github.com/donlnz) -from collections import defaultdict -from functools import partial -from typing import Optional, Union -import numpy as np -from sklearn.base import BaseEstimator -from lightwood.analysis.nc.base import RegressorMixin, ClassifierMixin -from types import FunctionType - - -# ----------------------------------------------------------------------------- -# Base inductive conformal predictor -# ----------------------------------------------------------------------------- -class BaseIcp(BaseEstimator): - """Base class for inductive conformal predictors. - """ - - def __init__(self, nc_function: FunctionType, condition: Union[bool, FunctionType] = None): - self.cal_x, self.cal_y = None, None - self.nc_function = nc_function - - # Check if condition-parameter is the default function (i.e., - # lambda x: 0). This is so we can safely clone the object without - # the clone accidentally having self.conditional = True. - def default_condition(x): - return 0 - is_default = (callable(condition) and - (condition.__code__.co_code == - default_condition.__code__.co_code)) - - if is_default: - self.condition = condition - self.conditional = False - elif callable(condition): - self.condition = condition - self.conditional = True - else: - self.condition = lambda x: 0 - self.conditional = False - - def fit(self, x: np.array, y: np.array) -> None: - """Fit underlying nonconformity scorer. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of examples for fitting the nonconformity scorer. 
- - y : numpy array of shape [n_samples] - Outputs of examples for fitting the nonconformity scorer. - - Returns - ------- - None - """ - # TODO: incremental? - self.nc_function.fit(x, y) - - def calibrate(self, x, y, increment=False): - """Calibrate conformal predictor based on underlying nonconformity - scorer. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of examples for calibrating the conformal predictor. - - y : numpy array of shape [n_samples, n_features] - Outputs of examples for calibrating the conformal predictor. - - increment : boolean - If ``True``, performs an incremental recalibration of the conformal - predictor. The supplied ``x`` and ``y`` are added to the set of - previously existing calibration examples, and the conformal - predictor is then calibrated on both the old and new calibration - examples. - - Returns - ------- - None - """ - self._calibrate_hook(x, y, increment) - self._update_calibration_set(x, y, increment) - - if self.conditional: - category_map = np.array([self.condition((x[i, :], y[i])) - for i in range(y.size)]) - self.categories = np.unique(category_map) - self.cal_scores = defaultdict(partial(np.ndarray, 0)) - - for cond in self.categories: - idx = category_map == cond - cal_scores = self.nc_function.score(self.cal_x[idx, :], - self.cal_y[idx]) - self.cal_scores[cond] = np.sort(cal_scores)[::-1] - else: - self.categories = np.array([0]) - cal_scores = self.nc_function.score(self.cal_x, self.cal_y) - self.cal_scores = {0: np.sort(cal_scores)[::-1]} - - def _update_calibration_set(self, x: np.array, y: np.array, increment: bool) -> None: - if increment and self.cal_x is not None and self.cal_y is not None: - self.cal_x = np.vstack([self.cal_x, x]) - self.cal_y = np.hstack([self.cal_y, y]) - else: - self.cal_x, self.cal_y = x, y - - def _calibrate_hook(self, x: np.array, y: np.array, increment: bool) -> None: - pass - - -# ----------------------------------------------------------------------------- -# Inductive conformal classifier -# ----------------------------------------------------------------------------- -class IcpClassifier(BaseIcp, ClassifierMixin): - """Inductive conformal classifier. - - Parameters - ---------- - nc_function : BaseScorer - Nonconformity scorer object used to calculate nonconformity of - calibration examples and test patterns. Should implement ``fit(x, y)`` - and ``calc_nc(x, y)``. - - smoothing : boolean - Decides whether to use stochastic smoothing of p-values. - - Attributes - ---------- - cal_x : numpy array of shape [n_cal_examples, n_features] - Inputs of calibration set. - - cal_y : numpy array of shape [n_cal_examples] - Outputs of calibration set. - - nc_function : BaseScorer - Nonconformity scorer object used to calculate nonconformity scores. - - classes : numpy array of shape [n_classes] - List of class labels, with indices corresponding to output columns - of IcpClassifier.predict() - - See also - -------- - IcpRegressor - - References - ---------- - .. [1] Papadopoulos, H., & Haralambous, H. (2011). Reliable prediction - intervals with regression neural networks. Neural Networks, 24(8), - 842-851. 
- """ - - def __init__(self, nc_function: FunctionType, condition: Union[bool, FunctionType] = None, - smoothing: bool = True) -> None: - super(IcpClassifier, self).__init__(nc_function, condition) - self.classes = None - self.smoothing = smoothing - - def _calibrate_hook(self, x: np.array, y: np.array, increment: bool = False) -> None: - self._update_classes(y, increment) - - def _update_classes(self, y: np.array, increment: bool) -> None: - if self.classes is None or not increment: - self.classes = np.unique(y) - else: - self.classes = np.unique(np.hstack([self.classes, y])) - - def predict(self, x: np.array, significance: Optional[float] = None) -> np.array: - """Predict the output values for a set of input patterns. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of patters for which to predict output values. - - significance : float or None - Significance level (maximum allowed error rate) of predictions. - Should be a float between 0 and 1. If ``None``, then the p-values - are output rather than the predictions. - - Returns - ------- - p : numpy array of shape [n_samples, n_classes] - If significance is ``None``, then p contains the p-values for each - sample-class pair; if significance is a float between 0 and 1, then - p is a boolean array denoting which labels are included in the - prediction sets. - """ - # TODO: if x == self.last_x ... - n_test_objects = x.shape[0] - p = np.zeros((n_test_objects, self.classes.size)) - - for i, c in enumerate(self.classes): - test_class = np.zeros(x.shape[0], dtype=self.classes.dtype) - test_class.fill(c) - - # TODO: maybe calculate p-values using cython or similar - # TODO: interpolated p-values - - # TODO: nc_function.calc_nc should take X * {y1, y2, ... ,yn} - test_nc_scores = self.nc_function.score(x, test_class) - for j, nc in enumerate(test_nc_scores): - cal_scores = self.cal_scores[self.condition((x[j, :], c))][::-1] - n_cal = cal_scores.size - - n_eq = 0 - n_gt = 0 - for cal_score in cal_scores: - if cal_score == nc: - n_eq += 1 - elif nc < cal_score: - n_gt += 1 - - if self.smoothing: - p[j, i] = (n_gt + n_eq * np.random.uniform(0, 1, 1)) / (n_cal + 1) - else: - p[j, i] = (n_gt + n_eq) / (n_cal + 1) - - if significance is not None: - return p > significance - else: - return p - - def predict_conf(self, x): - """Predict the output values for a set of input patterns, using - the confidence-and-credibility output scheme. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of patters for which to predict output values. - - Returns - ------- - p : numpy array of shape [n_samples, 3] - p contains three columns: the first column contains the most - likely class for each test pattern; the second column contains - the confidence in the predicted class label, and the third column - contains the credibility of the prediction. - """ - p = self.predict(x, significance=None) - label = p.argmax(axis=1) - credibility = p.max(axis=1) - for i, idx in enumerate(label): - p[i, idx] = -np.inf - confidence = 1 - p.max(axis=1) - - return np.array([label, confidence, credibility]).T - - -# ----------------------------------------------------------------------------- -# Inductive conformal regressor -# ----------------------------------------------------------------------------- -class IcpRegressor(BaseIcp, RegressorMixin): - """Inductive conformal regressor. 
- - Parameters - ---------- - nc_function : BaseScorer - Nonconformity scorer object used to calculate nonconformity of - calibration examples and test patterns. Should implement ``fit(x, y)``, - ``calc_nc(x, y)`` and ``predict(x, nc_scores, significance)``. - - Attributes - ---------- - cal_x : numpy array of shape [n_cal_examples, n_features] - Inputs of calibration set. - - cal_y : numpy array of shape [n_cal_examples] - Outputs of calibration set. - - nc_function : BaseScorer - Nonconformity scorer object used to calculate nonconformity scores. - - See also - -------- - IcpClassifier - - References - ---------- - .. [1] Papadopoulos, H., Proedrou, K., Vovk, V., & Gammerman, A. (2002). - Inductive confidence machines for regression. In Machine Learning: ECML - 2002 (pp. 345-356). Springer Berlin Heidelberg. - - .. [2] Papadopoulos, H., & Haralambous, H. (2011). Reliable prediction - intervals with regression neural networks. Neural Networks, 24(8), - 842-851. - """ - - def __init__(self, nc_function: FunctionType, condition: bool = None) -> None: - super(IcpRegressor, self).__init__(nc_function, condition) - - def predict(self, x: np.array, significance: bool = None) -> np.array: - """Predict the output values for a set of input patterns. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of patters for which to predict output values. - - significance : float - Significance level (maximum allowed error rate) of predictions. - Should be a float between 0 and 1. If ``None``, then intervals for - all significance levels (0.01, 0.02, ..., 0.99) are output in a - 3d-matrix. - - Returns - ------- - p : numpy array of shape [n_samples, 2] or [n_samples, 2, 99} - If significance is ``None``, then p contains the interval (minimum - and maximum boundaries) for each test pattern, and each significance - level (0.01, 0.02, ..., 0.99). If significance is a float between - 0 and 1, then p contains the prediction intervals (minimum and - maximum boundaries) for the set of test patterns at the chosen - significance level. - """ - # TODO: interpolated p-values - - n_significance = (99 if significance is None - else np.array(significance).size) - - if n_significance > 1: - prediction = np.zeros((x.shape[0], 2, n_significance)) - else: - prediction = np.zeros((x.shape[0], 2)) - - condition_map = np.array([self.condition((x[i, :], None)) - for i in range(x.shape[0])]) - - for condition in self.categories: - idx = condition_map == condition - if np.sum(idx) > 0: - p = self.nc_function.predict(x[idx, :], - self.cal_scores[condition], - significance) - if n_significance > 1: - prediction[idx, :, :] = p - else: - prediction[idx, :] = p - - return prediction diff --git a/lightwood/analysis/nc/metrics.py b/lightwood/analysis/nc/metrics.py deleted file mode 100644 index 803fea873..000000000 --- a/lightwood/analysis/nc/metrics.py +++ /dev/null @@ -1,171 +0,0 @@ -# Original author: Henrik Linusson (github.com/donlnz) - -import numpy as np - - -# ----------------------------------------------------------------------------- -# Validity measures -# ----------------------------------------------------------------------------- -def reg_n_correct(prediction, y, significance=None): - """Calculates the number of correct predictions made by a conformal - regression model. 
- """ - if significance is not None: - idx = int(significance * 100 - 1) - prediction = prediction[:, :, idx] - - low = y >= prediction[:, 0] - high = y <= prediction[:, 1] - correct = low * high - - return y[correct].size - - -def reg_mean_errors(prediction, y, significance): - """Calculates the average error rate of a conformal regression model. - """ - return 1 - reg_n_correct(prediction, y, significance) / y.size - - -def class_n_correct(prediction, y, significance): - """Calculates the number of correct predictions made by a conformal - classification model. - """ - labels, y = np.unique(y, return_inverse=True) - prediction = prediction > significance - correct = np.zeros((y.size,), dtype=bool) - for i, y_ in enumerate(y): - correct[i] = prediction[i, int(y_)] - return np.sum(correct) - - -def class_mean_errors(prediction, y, significance=None): - """Calculates the average error rate of a conformal classification model. - """ - return 1 - (class_n_correct(prediction, y, significance) / y.size) - - -def class_one_err(prediction, y, significance=None): - """Calculates the error rate of conformal classifier predictions containing - only a single output label. - """ - labels, y = np.unique(y, return_inverse=True) - prediction = prediction > significance - idx = np.arange(0, y.size, 1) - idx = filter(lambda x: np.sum(prediction[x, :]) == 1, idx) - errors = filter(lambda x: not prediction[x, int(y[x])], idx) - - if len(idx) > 0: - return np.size(errors) / np.size(idx) - else: - return 0 - - -def class_mean_errors_one_class(prediction, y, significance, c=0): - """Calculates the average error rate of a conformal classification model, - considering only test examples belonging to class ``c``. Use - ``functools.partial`` in order to test other classes. - """ - labels, y = np.unique(y, return_inverse=True) - prediction = prediction > significance - idx = np.arange(0, y.size, 1)[y == c] - errs = np.sum(1 for _ in filter(lambda x: not prediction[x, c], idx)) - - if idx.size > 0: - return errs / idx.size - else: - return 0 - - -def class_one_err_one_class(prediction, y, significance, c=0): - """Calculates the error rate of conformal classifier predictions containing - only a single output label. Considers only test examples belonging to - class ``c``. Use ``functools.partial`` in order to test other classes. 
- """ - labels, y = np.unique(y, return_inverse=True) - prediction = prediction > significance - idx = np.arange(0, y.size, 1) - idx = filter(lambda x: prediction[x, c], idx) - idx = filter(lambda x: np.sum(prediction[x, :]) == 1, idx) - errors = filter(lambda x: int(y[x]) != c, idx) - - if len(idx) > 0: - return np.size(errors) / np.size(idx) - else: - return 0 - - -# ----------------------------------------------------------------------------- -# Efficiency measures -# ----------------------------------------------------------------------------- -def _reg_interval_size(prediction, y, significance): - idx = int(significance * 100 - 1) - prediction = prediction[:, :, idx] - - return prediction[:, 1] - prediction[:, 0] - - -def reg_min_size(prediction, y, significance): - return np.min(_reg_interval_size(prediction, y, significance)) - - -def reg_q1_size(prediction, y, significance): - return np.percentile(_reg_interval_size(prediction, y, significance), 25) - - -def reg_median_size(prediction, y, significance): - return np.median(_reg_interval_size(prediction, y, significance)) - - -def reg_q3_size(prediction, y, significance): - return np.percentile(_reg_interval_size(prediction, y, significance), 75) - - -def reg_max_size(prediction, y, significance): - return np.max(_reg_interval_size(prediction, y, significance)) - - -def reg_mean_size(prediction, y, significance): - """Calculates the average prediction interval size of a conformal - regression model. - """ - return np.mean(_reg_interval_size(prediction, y, significance)) - - -def class_avg_c(prediction, y, significance): - """Calculates the average number of classes per prediction of a conformal - classification model. - """ - prediction = prediction > significance - return np.sum(prediction) / prediction.shape[0] - - -def class_mean_p_val(prediction, y, significance): - """Calculates the mean of the p-values output by a conformal classification - model. - """ - return np.mean(prediction) - - -def class_one_c(prediction, y, significance): - """Calculates the rate of singleton predictions (prediction sets containing - only a single class label) of a conformal classification model. - """ - prediction = prediction > significance - n_singletons = np.sum(1 for _ in filter(lambda x: np.sum(x) == 1, prediction)) - return n_singletons / y.size - - -def class_empty(prediction, y, significance): - """Calculates the rate of singleton predictions (prediction sets containing - only a single class label) of a conformal classification model. - """ - prediction = prediction > significance - n_empty = np.sum(1 for _ in filter(lambda x: np.sum(x) == 0, prediction)) - return n_empty / y.size - - -def n_test(prediction, y, significance): - """Provides the number of test patters used in the evaluation. - """ - return y.size diff --git a/lightwood/analysis/nc/nc.py b/lightwood/analysis/nc/nc.py deleted file mode 100644 index 9cd27e422..000000000 --- a/lightwood/analysis/nc/nc.py +++ /dev/null @@ -1,498 +0,0 @@ -""" -Nonconformity functions. -""" - -# Original author: Henrik Linusson (github.com/donlnz) - -import abc -import numpy as np -import sklearn.base -from scipy.interpolate import interp1d -from copy import deepcopy - - -# ----------------------------------------------------------------------------- -# Error functions -# ----------------------------------------------------------------------------- -class ClassificationErrFunc(object): - """Base class for classification model error functions. 
- """ # noqa - - __metaclass__ = abc.ABCMeta - - def __init__(self): - super(ClassificationErrFunc, self).__init__() - - @abc.abstractmethod - def apply(self, prediction, y): - """Apply the nonconformity function. - - Parameters - ---------- - prediction : numpy array of shape [n_samples, n_classes] - Class probability estimates for each sample. - - y : numpy array of shape [n_samples] - True output labels of each sample. - - Returns - ------- - nc : numpy array of shape [n_samples] - Nonconformity scores of the samples. - """ # noqa - pass - - -class RegressionErrFunc(object): - """Base class for regression model error functions. - """ # noqa - - __metaclass__ = abc.ABCMeta - - def __init__(self): - super(RegressionErrFunc, self).__init__() - - @abc.abstractmethod - def apply(self, prediction, y): # , norm=None, beta=0): - """Apply the nonconformity function. - - Parameters - ---------- - prediction : numpy array of shape [n_samples, n_classes] - Class probability estimates for each sample. - - y : numpy array of shape [n_samples] - True output labels of each sample. - - Returns - ------- - nc : numpy array of shape [n_samples] - Nonconformity scores of the samples. - """ # noqa - pass - - @abc.abstractmethod - def apply_inverse(self, nc, significance): # , norm=None, beta=0): - """Apply the inverse of the nonconformity function (i.e., - calculate prediction interval). - - Parameters - ---------- - nc : numpy array of shape [n_calibration_samples] - Nonconformity scores obtained for conformal predictor. - - significance : float - Significance level (0, 1). - - Returns - ------- - interval : numpy array of shape [n_samples, 2] - Minimum and maximum interval boundaries for each prediction. - """ # noqa - pass - - -class InverseProbabilityErrFunc(ClassificationErrFunc): - """Calculates the probability of not predicting the correct class. - - For each correct output in ``y``, nonconformity is defined as - - .. math:: - 1 - hat{P}(y_i | x) , . - """ # noqa - - def __init__(self): - super(InverseProbabilityErrFunc, self).__init__() - - def apply(self, prediction, y): - prob = np.zeros(y.size, dtype=np.float32) - for i, y_ in enumerate(y): - if y_ >= prediction.shape[1]: - prob[i] = 0 - else: - prob[i] = prediction[i, int(y_)] - return 1 - prob - - -class MarginErrFunc(ClassificationErrFunc): - """ - Calculates the margin error. - - For each correct output in ``y``, nonconformity is defined as - - .. math:: - 0.5 - frac{hat{P}(y_i | x) - max_{y , != , y_i} hat{P}(y | x)}{2} - """ # noqa - - def __init__(self): - super(MarginErrFunc, self).__init__() - - def apply(self, prediction, y): - prob = np.zeros(y.size, dtype=np.float32) - for i, y_ in enumerate(y): - if y_ >= prediction.shape[1]: - prob[i] = 0 - else: - prob[i] = prediction[i, int(y_)] - prediction[i, int(y_)] = -np.inf - return 0.5 - ((prob - prediction.max(axis=1)) / 2) - - -class AbsErrorErrFunc(RegressionErrFunc): - """Calculates absolute error nonconformity for regression problems. - - For each correct output in ``y``, nonconformity is defined as - - .. 
math:: - | y_i - hat{y}_i | - """ # noqa - - def __init__(self): - super(AbsErrorErrFunc, self).__init__() - - def apply(self, prediction, y): - return np.abs(prediction - y) - - def apply_inverse(self, nc, significance): - nc = np.sort(nc)[::-1] - border = int(np.floor(significance * (nc.size + 1))) - 1 - # TODO: should probably warn against too few calibration examples - border = min(max(border, 0), nc.size - 1) - return np.vstack([nc[border], nc[border]]) - - -class BoostedAbsErrorErrFunc(RegressionErrFunc): - """ Calculates absolute error nonconformity for regression problems. Applies linear interpolation - for nonconformity scores when we have less than 100 samples in the validation dataset. - """ # noqa - - def __init__(self): - super(BoostedAbsErrorErrFunc, self).__init__() - - def apply(self, prediction, y): - return np.abs(prediction - y) - - def apply_inverse(self, nc, significance): - nc = np.sort(nc)[::-1] - border = int(np.floor(significance * (nc.size + 1))) - 1 - if 1 < nc.size < 100: - x = np.arange(nc.shape[0]) - interp = interp1d(x, nc) - nc = interp(np.linspace(0, nc.size - 1, 100)) - border = min(max(border, 0), nc.size - 1) - return np.vstack([nc[border], nc[border]]) - - -class SignErrorErrFunc(RegressionErrFunc): - """Calculates signed error nonconformity for regression problems. - - For each correct output in ``y``, nonconformity is defined as - - .. math:: - y_i - hat{y}_i - - References - ---------- - .. [1] Linusson, Henrik, Ulf Johansson, and Tuve Lofstrom. - Signed-error conformal regression. Pacific-Asia Conference on Knowledge - Discovery and Data Mining. Springer International Publishing, 2014. - """ # noqa - - def __init__(self): - super(SignErrorErrFunc, self).__init__() - - def apply(self, prediction, y): - return (prediction - y) - - def apply_inverse(self, nc, significance): - nc = np.sort(nc)[::-1] - upper = int(np.floor((significance / 2) * (nc.size + 1))) - lower = int(np.floor((1 - significance / 2) * (nc.size + 1))) - # TODO: should probably warn against too few calibration examples - upper = min(max(upper, 0), nc.size - 1) - lower = max(min(lower, nc.size - 1), 0) - return np.vstack([-nc[lower], nc[upper]]) - - -# ----------------------------------------------------------------------------- -# Base nonconformity scorer -# ----------------------------------------------------------------------------- -class BaseScorer(sklearn.base.BaseEstimator): - __metaclass__ = abc.ABCMeta - - def __init__(self): - super(BaseScorer, self).__init__() - - @abc.abstractmethod - def fit(self, x, y): - pass - - @abc.abstractmethod - def score(self, x, y=None): - pass - - -class RegressorNormalizer(BaseScorer): - def __init__(self, base_model, normalizer_model, err_func): - super(RegressorNormalizer, self).__init__() - self.base_model = base_model - self.normalizer_model = normalizer_model - self.err_func = err_func - - def fit(self, x, y): - residual_prediction = self.base_model.predict(x) - residual_error = np.abs(self.err_func.apply(residual_prediction, y)) - residual_error += 0.00001 # Add small term to avoid log(0) - log_err = np.log(residual_error) - self.normalizer_model.fit(x, log_err) - - def score(self, x, y=None): - norm = np.exp(self.normalizer_model.predict(x)) - return norm - - -class BaseModelNc(BaseScorer): - """Base class for nonconformity scorers based on an underlying model. - - Parameters - ---------- - model : ClassifierAdapter or RegressorAdapter - Underlying classification model used for calculating nonconformity - scores. 
- - err_func : ClassificationErrFunc or RegressionErrFunc - Error function object. - - normalizer : BaseScorer - Normalization model. - - beta : float - Normalization smoothing parameter. As the beta-value increases, - the normalized nonconformity function approaches a non-normalized - equivalent. - """ # noqa - - def __init__(self, model, err_func, normalizer=None, beta=0): - super(BaseModelNc, self).__init__() - self.err_func = err_func - self.model = model - self.normalizer = normalizer - self.beta = beta - - # If we use sklearn.base.clone (e.g., during cross-validation), - # object references get jumbled, so we need to make sure that the - # normalizer has a reference to the proper model adapter, if applicable. - if (self.normalizer is not None and - hasattr(self.normalizer, 'base_model')): - self.normalizer.base_model = self.model - - self.last_x, self.last_y = None, None - self.last_prediction = None - self.clean = False - - def fit(self, x, y): - """Fits the underlying model of the nonconformity scorer. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of examples for fitting the underlying model. - - y : numpy array of shape [n_samples] - Outputs of examples for fitting the underlying model. - - Returns - ------- - None - """ # noqa - self.model.fit(x, y) - if self.normalizer is not None: - self.normalizer.fit(x, y) - self.clean = False - - def score(self, x, y=None): - """Calculates the nonconformity score of a set of samples. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of examples for which to calculate a nonconformity score. - - y : numpy array of shape [n_samples] - Outputs of examples for which to calculate a nonconformity score. - - Returns - ------- - nc : numpy array of shape [n_samples] - Nonconformity scores of samples. - """ # noqa - prediction = self.model.predict(x) - - err = self.err_func.apply(prediction, y) - if self.normalizer is not None: - try: - norm = self.normalizer.score(x) + self.beta - err = err / norm - except Exception: - pass - - return err - - def __deepcopy__(self, memo={}): - cls = self.__class__ - result = cls.__new__(cls) - memo[id(self)] = result - for k, v in self.__dict__.items(): - if k not in ['model', 'normalizer']: # model should not be copied - setattr(result, k, deepcopy(v, memo)) - else: - setattr(result, k, v) - return result - - -# ----------------------------------------------------------------------------- -# Classification nonconformity scorers -# ----------------------------------------------------------------------------- -class ClassifierNc(BaseModelNc): - """Nonconformity scorer using an underlying class probability estimating - model. - - Parameters - ---------- - model : ClassifierAdapter - Underlying classification model used for calculating nonconformity - scores. - - err_func : ClassificationErrFunc - Error function object. - - normalizer : BaseScorer - Normalization model. - - beta : float - Normalization smoothing parameter. As the beta-value increases, - the normalized nonconformity function approaches a non-normalized - equivalent. - - Attributes - ---------- - model : ClassifierAdapter - Underlying model object. - - err_func : ClassificationErrFunc - Scorer function used to calculate nonconformity scores. 
- """ # noqa - - def __init__(self, - model, - err_func=MarginErrFunc(), - normalizer=None, - beta=0): - super(ClassifierNc, self).__init__(model, - err_func, - normalizer, - beta) - - -# ----------------------------------------------------------------------------- -# Regression nonconformity scorers -# ----------------------------------------------------------------------------- -class RegressorNc(BaseModelNc): - """Nonconformity scorer using an underlying regression model. - - Parameters - ---------- - model : RegressorAdapter - Underlying regression model used for calculating nonconformity scores. - - err_func : RegressionErrFunc - Error function object. - - normalizer : BaseScorer - Normalization model. - - beta : float - Normalization smoothing parameter. As the beta-value increases, - the normalized nonconformity function approaches a non-normalized - equivalent. - - Attributes - ---------- - model : RegressorAdapter - Underlying model object. - - err_func : RegressionErrFunc - Scorer function used to calculate nonconformity scores. - """ # noqa - - def __init__(self, - model, - err_func=AbsErrorErrFunc(), - normalizer=None, - beta=0): - super(RegressorNc, self).__init__(model, - err_func, - normalizer, - beta) - - def predict(self, x, nc, significance=None): - """Constructs prediction intervals for a set of test examples. - - Predicts the output of each test pattern using the underlying model, - and applies the (partial) inverse nonconformity function to each - prediction, resulting in a prediction interval for each test pattern. - - Parameters - ---------- - x : numpy array of shape [n_samples, n_features] - Inputs of patters for which to predict output values. - - significance : float - Significance level (maximum allowed error rate) of predictions. - Should be a float between 0 and 1. If ``None``, then intervals for - all significance levels (0.01, 0.02, ..., 0.99) are output in a - 3d-matrix. - - Returns - ------- - p : numpy array of shape [n_samples, 2] or [n_samples, 2, 99] - If significance is ``None``, then p contains the interval (minimum - and maximum boundaries) for each test pattern, and each significance - level (0.01, 0.02, ..., 0.99). If significance is a float between - 0 and 1, then p contains the prediction intervals (minimum and - maximum boundaries) for the set of test patterns at the chosen - significance level. 
- """ # noqa - n_test = x.shape[0] - prediction = self.model.predict(x) - - norm = 1 - if self.normalizer is not None: - try: - norm = self.normalizer.score(x) + self.beta - except Exception: - pass - - if significance: - err_dist = self.err_func.apply_inverse(nc, significance) - err_dist = np.hstack([err_dist] * n_test) - err_dist *= norm - - intervals = np.zeros((x.shape[0], 2)) - intervals[:, 0] = prediction - err_dist[0, :] - intervals[:, 1] = prediction + err_dist[1, :] - - return intervals - else: - significance = np.arange(0.01, 1.0, 0.01) - intervals = np.zeros((x.shape[0], 2, significance.size)) - - for i, s in enumerate(significance): - err_dist = self.err_func.apply_inverse(nc, s) - err_dist = np.hstack([err_dist] * n_test) - err_dist *= norm - - intervals[:, 0, i] = prediction - err_dist[0, :] - intervals[:, 1, i] = prediction + err_dist[0, :] - - return intervals diff --git a/lightwood/analysis/nc/norm.py b/lightwood/analysis/nc/norm.py deleted file mode 100644 index 07aa88797..000000000 --- a/lightwood/analysis/nc/norm.py +++ /dev/null @@ -1,129 +0,0 @@ -from typing import Union - -import torch -import numpy as np -import pandas as pd -from scipy.stats import entropy -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_absolute_error - -from lightwood.api.dtype import dtype -from lightwood.mixer import BaseMixer -from lightwood.api.types import PredictionArguments -from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs - - -class Normalizer(BaseMixer): - """ - Companion class to the confidence estimation analysis block. A normalizer is a secondary machine learning model - tasked with learning to estimate the "difficulty" that the main predictor will have with any problem instance. - - The idea is that this model should emit higher scores for tougher predictions. All scores will be passed as a - normalizing factor to the conformal prediction framework, thus: - - widening bounds at the same confidence level if a prediction is harder - - tightening bounds at the same confidence level if a predictions is easier - - Reference: - Papadopoulos, H., Gammerman, A., & Vovk, V. (2008). Normalized nonconformity measures for regression Conformal Prediction. 
- - """ # noqa - def __init__(self, fit_params: dict): - super(Normalizer, self).__init__(stop_after=fit_params['stop_after']) - - self.input_cols = list(fit_params['dtype_dict'].keys()) - self.base_predictor = fit_params['predictor'] - self.encoders = fit_params['encoders'] - self.target = fit_params['target'] - self.target_dtype = fit_params['dtype_dict'][fit_params['target']] - self.multi_ts_task = fit_params['is_multi_ts'] - - self.model = Ridge() # @TODO: enable underlying model selection from JsonAI - self.prepared = False - self.prediction_cache = None - self.bounds = (0.5, 1.5) - self.error_fn = mean_absolute_error - - def fit(self, data: EncodedDs) -> None: - try: - data = ConcatedEncodedDs(data) - preds = self.base_predictor(data, args=PredictionArguments.from_dict({'predict_proba': True})) - truths = data.data_frame[self.target] - labels = self.get_labels(preds, truths.values, data.encoders[self.target]) - enc_data = data.get_encoded_data(include_target=False).numpy() - self.model.fit(enc_data, labels) - self.prepared = True - except Exception: - pass - - def __call__(self, ds: Union[ConcatedEncodedDs, torch.Tensor], args: PredictionArguments) -> np.ndarray: - if isinstance(ds, ConcatedEncodedDs): - ds = ds.get_encoded_data(include_target=False) - - if self.prepared: - raw = self.model.predict(ds.numpy()) - scores = np.clip(raw, 0.1, 1e4) # set limit deviations (@TODO: benchmark stability) - # smoothed = clipped / clipped.mean() - else: - scores = np.ones(ds.shape[0]) - - return scores - - def score(self, data) -> np.ndarray: - if not self.prepared: - scores = np.ones(data.shape[0]) # by default, normalizing factor is 1 for all predictions - elif self.prediction_cache is not None: - scores = self.prediction_cache - else: - scores = self.model.predict(data) - - return scores - - def get_labels(self, preds: pd.DataFrame, truths: np.ndarray, target_enc) -> np.ndarray: - if self.target_dtype in [dtype.integer, dtype.float, dtype.quantity]: - if not self.multi_ts_task: - preds = preds.values.squeeze() - else: - preds = [p[0] for p in preds.values.squeeze()] - preds = preds.astype(float) - labels = self.compute_numerical_labels(preds, truths, self.bounds) - - elif self.target_dtype in [dtype.binary, dtype.categorical]: - if self.base_predictor.supports_proba: - prob_cols = [col for col in preds.columns if '__mdb_proba' in col] - col_names = [col.replace('__mdb_proba_', '') for col in prob_cols] - preds = preds[prob_cols] - else: - prob_cols = col_names = target_enc.map.keys() - ohe_preds = pd.get_dummies(preds['prediction'], columns=col_names) - for col in col_names: - if col not in ohe_preds.columns: - ohe_preds[col] = np.zeros(ohe_preds.shape[0]) - preds = ohe_preds - - # reorder preds to ensure classes are in same order as in target_enc - preds.columns = col_names - new_order = [v for k, v in sorted(target_enc.rev_map.items(), key=lambda x: x[0])] - preds = preds.reindex(columns=new_order) - preds = preds.values.squeeze() - preds = preds if prob_cols else target_enc.encode(preds).tolist() - truths = target_enc.encode(truths).numpy() - - labels = self.compute_categorical_labels(preds, truths) - - else: - raise(Exception(f"dtype {self.target_dtype} not supported for confidence normalizer")) - - return labels - - @staticmethod - def compute_numerical_labels(preds: np.ndarray, truths: np.ndarray, bounds: list) -> np.ndarray: - diffs = np.log(abs(preds - truths)) - diffs = diffs / np.max(diffs) if np.max(diffs) > 0 else diffs - labels = np.clip(bounds[0] + diffs, bounds[0], bounds[1]) - 
return labels - - @staticmethod - def compute_categorical_labels(preds: np.ndarray, truths: np.ndarray) -> np.ndarray: - preds = np.clip(preds, 0.001, 0.999) # avoid inf - labels = entropy(truths, preds, axis=1) - return labels diff --git a/lightwood/analysis/nc/util.py b/lightwood/analysis/nc/util.py deleted file mode 100644 index 24c72a918..000000000 --- a/lightwood/analysis/nc/util.py +++ /dev/null @@ -1,220 +0,0 @@ -from typing import Union, Optional - -import torch -import numpy as np -import pandas as pd -from torch.nn.functional import softmax - -from lightwood.api.dtype import dtype - - -def t_softmax(x, t=1.0, axis=1): - """ Softmax with temperature scaling """ - return softmax(torch.Tensor(x) / t, dim=axis).numpy() - - -def clean_df(df, target, is_classification, label_encoders): - """ Returns cleaned DF for nonconformist calibration """ - # @TODO: reevaluate whether this can be streamlined - enc = label_encoders - - y = df.pop(target).values - - if is_classification: - if enc and isinstance(enc.categories_[0][0], str): - cats = enc.categories_[0].tolist() - # the last element is "__mdb_unknown_cat" - y = np.array([cats.index(i) if i in cats else len(cats) - 1 for i in y]) - y = y.clip(-pow(2, 63), pow(2, 63)).astype(int) - else: - y = y.astype(float) - - return df, y - - -def set_conf_range( - X: pd.DataFrame, - icp, - target_type: dtype, - analysis_info: dict, - positive_domain: bool = False, - std_tol: int = 1, - group: str = '__default', - significance: Optional[float] = None -): - """ - Automatically sets confidence level for numerical and categorical tasks. - - :param X: Validation data. - :param icp: Inductive conformal predictor that sets the confidence level. Either IcpClassifier or IcpRegressor. - :param target_type: dtype of the target column. - :param analysis_info: - :param positive_domain: Flag that indicates whether target is expected to be a positive number. - :param std_tol: Tolerance for automatic confidence level selection; bigger tolerance means higher confidence, in general. - :param group: For tasks with multiple different target groups (where each may have a different std_dev), indicates what group is being considered. - :param significance: Desired confidence level. Can be preset (0 < x <= 0.99) - - :return: set confidence plus predictions regions (for numerical tasks) or pvalues (for categorical tasks). 
- """ # noqa - # numerical - if target_type in (dtype.integer, dtype.float, dtype.array, dtype.tsarray, dtype.quantity): - - # ICP gets all possible bounds (shape: (B, 2, 99)) - all_ranges = icp.predict(X.values) - - # iterate over confidence levels until spread >= a multiplier of the dataset stddev - if significance is not None: - conf = int(100 * (1 - significance)) - return significance, all_ranges[:, :, conf] - else: - for tol in [std_tol, std_tol + 1, std_tol + 2]: - for significance in range(99): - ranges = all_ranges[:, :, significance] - spread = np.mean(ranges[:, 1] - ranges[:, 0]) - tolerance = analysis_info['df_std_dev'][group] * tol - - if spread <= tolerance: - confidence = (99 - significance) / 100 - if positive_domain: - ranges[ranges < 0] = 0 - return confidence, ranges - else: - ranges = all_ranges[:, :, 0] - if positive_domain: - ranges[ranges < 0] = 0 - return 0.9901, ranges - - # categorical - elif target_type in (dtype.binary, dtype.categorical): - pvals = icp.predict(X.values) # p-values at which each class is included in the predicted set - conf = np.subtract(1, pvals.min(axis=1)) - return conf, pvals - - # default - return 0.005, np.zeros((X.shape[0], 2)) - - -def get_numeric_conf_range( - all_confs: np.ndarray, - df_std_dev: dict = {}, - positive_domain: bool = False, - std_tol: int = 1, - group: Optional[str] = '__default', - error_rate: float = None -): - """ - Gets prediction bounds for numerical targets, based on ICP estimation and width tolerance. - - :param all_confs: All possible bounds depending on confidence level. - :param df_std_dev: Observed train standard deviation for each group target. - :param positive_domain: Flag that indicates whether target is expected to be a positive number. - :param std_tol: Tolerance for automatic confidence level selection; bigger tolerance means higher confidence, in general. - :param group: For tasks with multiple different target groups (where each may have a different std_dev), indicates what group is being considered. - :param error_rate: Pre-determined error rate for the ICP, 0-1 bounded. Can be specified to bypass automatic confidence/bound detection, or to adjust the threshold sensitivity in anomaly detection tasks. - - :return: array with confidence for each data instance, along with lower and upper bounds for each prediction. 
- """ # noqa - if not isinstance(error_rate, float): - error_rate = None - - if error_rate is None: - significances = [] - conf_ranges = [] - std_dev = df_std_dev[group] - tolerance = std_dev * std_tol - - for sample_idx in range(all_confs.shape[0]): - sample = all_confs[sample_idx, :, :] - for idx in range(sample.shape[1]): - significance = (99 - idx) / 100 - diff = sample[1, idx] - sample[0, idx] - if diff <= tolerance: - conf_range = list(sample[:, idx]) - significances.append(significance) - conf_ranges.append(conf_range) - break - else: - significances.append(0.9991) # default: confident that value falls inside big bounds - bounds = sample[:, 0] - sigma = (bounds[1] - bounds[0]) / 4 - conf_range = [bounds[0] - sigma, bounds[1] + sigma] - conf_ranges.append(conf_range) - - conf_ranges = np.array(conf_ranges) - else: - # fixed error rate - error_rate = max(0.01, min(1.0, error_rate)) - conf = 1 - error_rate - conf_idx = int(100 * error_rate) - 1 - conf_ranges = all_confs[:, :, conf_idx] - significances = [conf for _ in range(conf_ranges.shape[0])] - - if positive_domain: - conf_ranges[conf_ranges < 0] = 0 - return np.array(significances), conf_ranges - - -def get_categorical_conf(all_confs: np.ndarray, conf_candidates: list): - """ - Gets ICP confidence estimation for categorical targets. - Prediction set is always unitary and includes only the predicted label. - - :param all_confs: all possible label sets depending on confidence level - :param conf_candidates: includes preset confidence levels to check - - :return: confidence for each data instance - """ - significances = [] - for sample_idx in range(all_confs.shape[0]): - sample = all_confs[sample_idx, :, :] - for idx in range(sample.shape[1]): - conf = (99 - conf_candidates[idx]) / 100 - if np.sum(sample[:, idx]) == 1: - significances.append(conf) - break - else: - significances.append(0.005) # default: not confident label is the predicted one - return significances - - -def get_anomalies(insights: pd.DataFrame, observed_series: Union[pd.Series, list], cooldown: int = 1): - """ - Simple procedure for unsupervised anomaly detection in time series forecasting. - Uses ICP analysis block output so that any true value falling outside of the lower and upper bounds is tagged as anomalous. - - :param insights: dataframe with row insights used during the `.explain()` phase of all analysis blocks. - :param observed_series: true values from the predicted time series. If empty, no anomalies are flagged. - :param cooldown: minimum amount of observations (assuming regular sampling frequency) that need to pass between two consecutive anomalies. - - :return: list of boolean flags, indicating anomalous behavior for each predicted value. 
- """ # noqa - anomalies = [] - counter = 0 - - # cast to float (safe, we only call this method if series is numerical) - try: - observed_series = [float(value) for value in observed_series] - except (TypeError, ValueError): - return [None for _ in observed_series] - - lower_bounds = insights['lower'].tolist() - upper_bounds = insights['upper'].tolist() - - for (l, u), t in zip(zip(lower_bounds, upper_bounds), observed_series): - if t is not None: - anomaly = not (l <= t <= u) - - if anomaly and (counter == 0 or counter >= cooldown): - anomalies.append(anomaly) # new anomaly event triggers, reset counter - counter = 1 - elif anomaly and counter < cooldown: - anomalies.append(False) # overwrite as not anomalous if still in cooldown - counter += 1 - else: - anomalies.append(anomaly) - counter = 0 - else: - anomalies.append(None) - counter += 1 - - return anomalies diff --git a/lightwood/api/__init__.py b/lightwood/api/__init__.py deleted file mode 100644 index 49d1ffa85..000000000 --- a/lightwood/api/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -from lightwood.api.dtype import dtype -from lightwood.api.types import ( - JsonAI, - Output, - Feature, - TypeInformation, - StatisticalAnalysis, - ProblemDefinition, - TimeseriesSettings, - ModelAnalysis, - DataAnalysis, - PredictionArguments, -) -from lightwood.api.predictor import PredictorInterface -from lightwood.api.high_level import ( - analyze_dataset, - code_from_problem, - predictor_from_problem, - predictor_from_code, - code_from_json_ai, - json_ai_from_problem, - predictor_from_state, -) - -__all__ = [ - "analyze_dataset", - "code_from_problem", - "predictor_from_problem", - "predictor_from_code", - "code_from_json_ai", - "json_ai_from_problem", - "JsonAI", - "Output", - "Feature", - "TypeInformation", - "StatisticalAnalysis", - "ProblemDefinition", - "TimeseriesSettings", - "ModelAnalysis", - "DataAnalysis", - "PredictionArguments", - "PredictorInterface", - "dtype", - "predictor_from_state", -] diff --git a/lightwood/api/dtype.py b/lightwood/api/dtype.py deleted file mode 100644 index cedb79da3..000000000 --- a/lightwood/api/dtype.py +++ /dev/null @@ -1,45 +0,0 @@ -class dtype: - """ - Definitions of all data types currently supported. Dtypes currently supported include: - - - **Numerical**: Data that should be represented in the form of a number. Currently ``integer``, ``float``, and ``quantity`` are supported. - - **Categorical**: Data that represents a class or label and is discrete. Currently ``binary``, ``categorical``, and ``tags`` are supported. - - **Date/Time**: Time-series data that is temporal/sequential. Currently ``date``, and ``datetime`` are supported. - - **Text**: Data that can be considered as language information. Currently ``short_text``, and ``rich_text`` are supported. Short text has a small vocabulary (~ 100 words) and is generally a limited number of characters. Rich text is anything with greater complexity. - - **Complex**: Data types that require custom techniques. Currently ``audio``, ``video`` and ``image`` are available, but highly experimental. - - **Array**: Data in the form of a sequence where order must be preserved. Currently ``array`` is the supported type. - - **Miscellaneous**: Miscellaneous data descriptors include ``empty``, an explicitly unknown value versus ``invalid``, a data type not currently supported. - - Custom data types may be implemented here as a flag for subsequent treatment and processing. 
You are welcome to include your own definitions, so long as they do not override the existing type names (alternatively, if you do, please edit subsequent parts of the preprocessing pipeline to correctly indicate how you want to deal with these data types). - """ # noqa - - # Numerical type data - integer = "integer" - float = "float" - quantity = "quantity" - - # Categorical type data - binary = "binary" - categorical = "categorical" - tags = "tags" - - # Dates and Times (time-series) - date = "date" - datetime = "datetime" - - # Text - short_text = "short_text" - rich_text = "rich_text" - - # Complex Data types - image = "image" - audio = "audio" - video = "video" - - # Series/Sequences - array = "array" - tsarray = 'tsarray' - - # Misc (Unk/NaNs) - empty = "empty" - invalid = "invalid" diff --git a/lightwood/api/predictor.py b/lightwood/api/predictor.py deleted file mode 100644 index 2fbe08968..000000000 --- a/lightwood/api/predictor.py +++ /dev/null @@ -1,145 +0,0 @@ -import dill -from typing import Dict - -import pandas as pd -from lightwood.api.types import ModelAnalysis - - -# Interface that must be respected by predictor objects generated from JSON ML and/or compatible with Mindsdb -class PredictorInterface: - """ - Abstraction of a Lightwood predictor. The ``PredictorInterface`` encompasses how Lightwood interacts with the full ML pipeline. Internally, - - The ``PredictorInterface`` class must have several expected functions: - - - ``analyze_data``: Peform a statistical analysis on the unprocessed data; this helps inform downstream encoders and mixers on how to treat the data types. - - ``preprocess``: Apply cleaning functions to each of the columns within the dataset to prepare them for featurization - - ``split``: Split the input dataset into a train/dev/test set according to your splitter function - - ``prepare``: Create and, if necessary, train your encoders to create feature representations from each column of your data. - - ``featurize``: For input, pre-processed data, create feature vectors - - ``fit``: Train your mixer models to yield predictions from featurized data - - ``analyze_ensemble``: Evaluate the quality of fit for your mixer models - - ``adjust``: Incorporate new data to update pre-existing model(s). - - For simplification, we offer an end-to-end approach that allows you to input raw data and follow every step of the process until you reach a trained predictor with the ``learn`` function: - - - ``learn``: An end-to-end technique specifying how to pre-process, featurize, and train the model(s) of interest. The expected input is raw, untrained data. No explicit output is provided, but the Predictor object will "host" the trained model thus. - - You can also use the predictor to now estimate new data: - - - ``predict``: Deploys the chosen best model, and evaluates the given data to provide target estimates. - - ``save``: Saves the Predictor object for further use. - - The ``PredictorInterface`` is created via J{ai}son's custom code creation. A problem inherits from this class with pre-populated routines to fill out expected results, given the nature of each problem type. - """ # noqa - - model_analysis: ModelAnalysis = None - - def __init__(self): - pass - - def analyze_data(self, data: pd.DataFrame) -> None: - """ - Performs a statistical analysis on the data to identify distributions, imbalanced classes, and other nuances within the data. - - :param data: Data used in training the model(s). 
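
Because every dtype member above is a plain string flag, the column-to-type mappings passed around elsewhere in the codebase (e.g. as dtype_dict) are ordinary dictionaries of such strings; the column names below are hypothetical.

dtype_dict = {
    "age": "integer",           # dtype.integer
    "income": "float",          # dtype.float
    "signup_date": "datetime",  # dtype.datetime
    "churned": "binary",        # dtype.binary
}
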
- """ # noqa - pass - - def preprocess(self, data: pd.DataFrame) -> pd.DataFrame: - """ - Cleans the unprocessed dataset provided. - - :param data: (Unprocessed) Data used in training the model(s). - :returns: The cleaned data frame - """ # noqa - pass - - def split(self, data: pd.DataFrame) -> Dict[str, pd.DataFrame]: - """ - Categorizes the data into a training/testing split; if data is a classification problem, will stratify the data. - - :param data: Pre-processed data, but generically any dataset to split into train/dev/test. - :returns: Dictionary containing training/testing fraction - """ # noqa - pass - - def prepare(self, data: Dict[str, pd.DataFrame]) -> None: - """ - Prepares the encoders for each column of data. - - :param data: Pre-processed data that has been split into train/test. Explicitly uses "train" and/or "dev" in preparation of encoders. - - :returns: Nothing; prepares the encoders for learned representations. - """ # noqa - - def featurize(self, split_data: Dict[str, pd.DataFrame]): - """ - Provides an encoded representation for each dataset in ``split_data``. Requires `self.encoders` to be prepared. - - :param split_data: Pre-processed data from the dataset, split into train/test (or any other keys relevant) - - :returns: For each dataset provided in ``split_data``, the encoded representations of the data. - """ # noqa - pass - - def fit(self, enc_data: Dict[str, pd.DataFrame]) -> None: - """ - Fits "mixer" models to train predictors on the featurized data. Instantiates a set of trained mixers and an ensemble of them. - - :param enc_data: Pre-processed and featurized data, split into the relevant train/test splits. Keys expected are "train", "dev", and "test" - """ # noqa - pass - - def analyze_ensemble(self, enc_data: Dict[str, pd.DataFrame]) -> None: - """ - Evaluate the quality of mixers within an ensemble of models. - - :param enc_data: Pre-processed and featurized data, split into the relevant train/test splits. - """ - pass - - def learn(self, data: pd.DataFrame) -> None: - """ - Trains the attribute model starting from raw data. Raw data is pre-processed and cleaned accordingly. As data is assigned a particular type (ex: numerical, categorical, etc.), the respective feature encoder will convert it into a representation useable for training ML models. Of all ML models requested, these models are compiled and fit on the training data. - - This step amalgates ``preprocess`` -> ``featurize`` -> ``fit`` with the necessary splitting + analyze_data that occurs. - - :param data: (Unprocessed) Data used in training the model(s). - - :returns: Nothing; instantiates with best fit model from ensemble. - """ # noqa - pass - - def adjust(self, new_data: Dict[str, pd.DataFrame]) -> None: - """ - Adjusts a previously trained model on new data. Adopts the same process as ``learn`` but with the exception that the `adjust` function expects the best model to have been already trained. - - .. warning:: This is experimental and subject to change. - :param new_data: New data used to adjust a previously trained model. Keys must reference "old" and "new" referencing to the old and new datasets. In some situations, the old data is still required to train a model (i.e. Regression) to ensure the new data doesn't entirely override it. - - :returns: Nothing; adjusts best-fit model - """ # noqa - pass - - def predict(self, data: pd.DataFrame, args: Dict[str, object] = {}) -> pd.DataFrame: - """ - Intakes raw data to provide predicted values for your trained model. 
- - :param data: Data (n_samples, n_columns) that the model(s) will evaluate on and provide the target prediction. - :param args: parameters needed to update the predictor ``PredictionArguments`` object, which holds any parameters relevant for prediction. - - :returns: A dataframe of predictions of the same length of input. - """ # noqa - pass - - def save(self, file_path: str) -> None: - """ - With a provided file path, saves the Predictor instance for later use. - - :param file_path: Location to store your Predictor Instance. - - :returns: Saves Predictor instance. - """ - with open(file_path, "wb") as fp: - dill.dump(self, fp) diff --git a/lightwood/api/types.py b/lightwood/api/types.py deleted file mode 100644 index b6139d073..000000000 --- a/lightwood/api/types.py +++ /dev/null @@ -1,599 +0,0 @@ -# TODO: type hint the returns -# TODO: df_std_dev is not clear in behavior; this would imply all std. of each column but that is not true, it should be renamed df_std_target_dev # noqa - -from typing import Dict, List, Optional, Union -import sys - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - -from dataclasses import dataclass -from lightwood.helpers.log import log -from dataclasses_json import dataclass_json -from dataclasses_json.core import _asdict, Json -import json - - -# See: https://www.python.org/dev/peps/pep-0589/ for how this works -# Not very intuitive but very powerful abstraction, might be useful in other places (@TODO) -class Module(TypedDict): - """ - Modules are the blocks of code that end up being called from the JSON AI, representing either object instantiations or function calls. - - :param module: Name of the module (function or class name) - :param args: Argument to pass to the function or constructor - """ # noqa - module: str - args: Dict[str, str] - - -@dataclass -class Feature: - """ - Within a dataframe, each column is considered its own "feature" (unless ignored etc.). \ - The following expects each feature to have descriptions of the following: - - :param encoder: the methodology for encoding a feature (a Lightwood Encoder) - :param data_dtype: The type of information within this column (ex.: numerical, categorical, etc.) - :param dependency: Any custom attributes for this feature that may require non-standard processing. This highly\ - depends on the encoder (ex: Pretrained text may be fine-tuned on the target; time-series requires prior time-steps). - """ - - encoder: Module - data_dtype: str = None - dependency: List[str] = None - - @staticmethod - def from_dict(obj: Dict): - """ - Create ``Feature`` objects from the a dictionary representation. - - :param obj: A dictionary representation of a column feature's attributes. Must include keys *encoder*, \ - *data_dtype*, and *dependency*. - - :Example: - - >>> my_dict = {"feature_A": {"encoder": MyEncoder, "data_dtype": "categorical", "dependency": None}} - >>> print(Feature.from_dict(my_dict["feature_A"])) - >>> Feature(encoder=None, data_dtype='categorical', dependency=None) - - :returns: A Feature object with loaded information. - """ - encoder = obj["encoder"] - data_dtype = obj.get("data_dtype", None) - dependency = obj.get("dependency", None) - - feature = Feature(encoder=encoder, data_dtype=data_dtype, dependency=dependency) - - return feature - - @staticmethod - def from_json(data: str): - """ - Create ``Feature`` objects from JSON representation. This method calls on :ref: `from_dict` after loading the \ - json config. 
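
Since PredictorInterface.save() above serializes the whole object with dill, loading it back is simply the mirror operation (file path hypothetical).

import dill

with open("predictor.pkl", "rb") as fp:
    predictor = dill.load(fp)
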
- - :param data: A JSON representation of the feature. - - :returns: Loaded information into the Feature representation. - """ - return Feature.from_dict(json.loads(data)) - - def to_dict(self, encode_json=False) -> Dict[str, Json]: - """ - Converts a Feature to a dictionary representation. - - :returns: A python dictionary with strings indicating the three key elements and their respective values of \ - the Feature class. - """ - as_dict = _asdict(self, encode_json=encode_json) - for k in list(as_dict.keys()): - if as_dict[k] is None: - del as_dict[k] - return as_dict - - def to_json(self) -> Dict[str, Json]: - """ - Converts a Feature into a JSON object. Calls ``to_dict`` under the hood. - - :returns: Json config syntax for the three key elements and their respective values of the Feature class. - """ - return json.dumps(self.to_dict(), indent=4) - - -@dataclass_json -@dataclass -class Output: - """ - A representation for the output feature. This is specifically used on the target column of your dataset. \ - Four attributes are expected as seen below. - - Note, currently supervised tasks are supported, hence categorical, numerical, and time-series are the expected \ - outputs types. Complex features such as text generation are not currently available by default. - - :param data_dtype: The type of information within the target column (ex.: numerical, categorical, etc.). - :param encoder: the methodology for encoding the target feature (a Lightwood Encoder). There can only be one \ - encoder for the output target. - :param mixers: The list of ML algorithms that are trained for the target distribution. - :param ensemble: For a panel of ML algorithms, the approach of selecting the best mixer, and the metrics used in \ - that evaluation. - """ - - data_dtype: str - encoder: str = None - mixers: List[str] = None - ensemble: str = None - - -@dataclass_json -@dataclass -class TypeInformation: - """ - For a dataset, provides information on columns types, how they're used, and any other potential identifiers. - - TypeInformation is generated within ``data.infer_types``, where small samples of each column are evaluated in a custom framework to understand what kind of data type the model is. The user may override data types, but it is recommended to do so within a JSON-AI config file. - - :param dtypes: For each column's name, the associated data type inferred. - :param additional_info: Any possible sub-categories or additional descriptive information. - :param identifiers: Columns within the dataset highly suspected of being identifiers or IDs. These do not contain informatic value, therefore will be ignored in subsequent training/analysis procedures unless manually indicated. - """ # noqa - - dtypes: Dict[str, str] - additional_info: Dict[str, object] - identifiers: Dict[str, str] - - def __init__(self): - self.dtypes = dict() - self.additional_info = dict() - self.identifiers = dict() - - -@dataclass_json -@dataclass -class StatisticalAnalysis: - """ - The Statistical Analysis data class allows users to consider key descriptors of their data using simple \ - techniques such as histograms, mean and standard deviation, word count, missing values, and any detected bias\ - in the information. 
- - :param nr_rows: Number of rows (samples) in the dataset - :param df_std_dev: The standard deviation of the target of the dataset - :param train_observed_classes: - :param target_class_distribution: - :param histograms: - :param buckets: - :param missing: - :param distinct: - :param bias: - :param avg_words_per_sentence: - :param positive_domain: - """ - - nr_rows: int - df_std_dev: Optional[float] - train_observed_classes: object # Union[None, List[str]] - target_class_distribution: object # Dict[str, float] - histograms: object # Dict[str, Dict[str, List[object]]] - buckets: object # Dict[str, Dict[str, List[object]]] - missing: object - distinct: object - bias: object - avg_words_per_sentence: object - positive_domain: bool - - -@dataclass_json -@dataclass -class DataAnalysis: - """ - Data Analysis wraps :class: `.StatisticalAnalysis` and :class: `.TypeInformation` together. Further details can be seen in their respective documentation references. - """ # noqa - - statistical_analysis: StatisticalAnalysis - type_information: TypeInformation - - -@dataclass -class TimeseriesSettings: - """ - For time-series specific problems, more specific treatment of the data is necessary. The following attributes \ - enable time-series tasks to be carried out properly. - - :param is_timeseries: Whether the input data should be treated as time series; if true, this flag is checked in \ - subsequent internal steps to ensure processing is appropriate for time-series data. - :param order_by: A list of columns by which the data should be ordered. - :param group_by: Optional list of columns by which the data should be grouped. Each different combination of values\ - for these columns will yield a different series. - :param window: The temporal horizon (number of rows) that a model intakes to "look back" into when making a\ - prediction, after the rows are ordered by order_by columns and split into groups if applicable. - :param nr_predictions: The number of points in the future that predictions should be made for, defaults to 1. Once \ - trained, the model will be able to predict up to this many points into the future. - :param historical_columns: The temporal dynamics of these columns will be used as additional context to train the \ - time series predictor. Note that a non-historical column shall still be used to forecast, but without \ - considering their change through time. - :param target_type: Automatically inferred dtype of the target (e.g. `dtype.integer`, `dtype.float`). - :param use_previous_target: Use the previous values of the target column to generate predictions. Defaults to True. - """ - - is_timeseries: bool - order_by: List[str] = None - window: int = None - group_by: List[str] = None - use_previous_target: bool = True - nr_predictions: int = None - historical_columns: List[str] = None - target_type: str = ( - "" # @TODO: is the current setter (outside of initialization) a sane option? - # @TODO: George: No, I don't think it is, we need to pass this some other way - ) - allow_incomplete_history: bool = False - - @staticmethod - def from_dict(obj: Dict): - """ - Creates a TimeseriesSettings object from python dictionary specifications. - - :param: obj: A python dictionary with the necessary representation for time-series. The only mandatory columns are ``order_by`` and ``window``. - - :returns: A populated ``TimeseriesSettings`` object. 
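
A hypothetical call to TimeseriesSettings.from_dict(); per the docstring above, only order_by and window are mandatory, and the column names here are invented.

from lightwood.api.types import TimeseriesSettings  # module removed by this diff; shown for illustration

ts_settings = TimeseriesSettings.from_dict({
    "order_by": ["saledate"],  # mandatory
    "window": 8,               # mandatory: look-back horizon in rows
    "group_by": ["store_id"],  # optional: one series per store
    "nr_predictions": 4,       # optional: forecast four steps ahead
})
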
- """ # noqa - if len(obj) > 0: - for mandatory_setting in ["order_by", "window"]: - if mandatory_setting not in obj: - err = f"Missing mandatory timeseries setting: {mandatory_setting}" - log.error(err) - raise Exception(err) - - timeseries_settings = TimeseriesSettings( - is_timeseries=True, - order_by=obj["order_by"], - window=obj["window"], - use_previous_target=obj.get("use_previous_target", True), - historical_columns=[], - nr_predictions=obj.get("nr_predictions", 1), - allow_incomplete_history=obj.get('allow_incomplete_history', False) - ) - for setting in obj: - timeseries_settings.__setattr__(setting, obj[setting]) - - else: - timeseries_settings = TimeseriesSettings(is_timeseries=False) - - return timeseries_settings - - @staticmethod - def from_json(data: str): - """ - Creates a TimeseriesSettings object from JSON specifications via python dictionary. - - :param: data: JSON-config file with necessary Time-series specifications - - :returns: A populated ``TimeseriesSettings`` object. - """ - return TimeseriesSettings.from_dict(json.loads(data)) - - def to_dict(self, encode_json=False) -> Dict[str, Json]: - """ - Creates a dictionary from ``TimeseriesSettings`` object - - :returns: A python dictionary containing the ``TimeSeriesSettings`` specifications. - """ - return _asdict(self, encode_json=encode_json) - - def to_json(self) -> Dict[str, Json]: - """ - Creates JSON config from TimeseriesSettings object - :returns: The JSON config syntax containing the ``TimeSeriesSettings`` specifications. - """ - return json.dumps(self.to_dict()) - - -@dataclass -class ProblemDefinition: - """ - The ``ProblemDefinition`` object indicates details on how the models that predict the target are prepared. \ - The only required specification from a user is the ``target``, which indicates the column within the input \ - data that the user is trying to predict. Within the ``ProblemDefinition``, the user can specify aspects \ - about how long the feature-engineering preparation may take, and nuances about training the models. - - :param target: The name of the target column; this is the column that will be used as the goal of the prediction. - :param pct_invalid: Number of data points maximally tolerated as invalid/missing/unknown. \ - If the data cleaning process exceeds this number, no subsequent steps will be taken. - :param unbias_target: all classes are automatically weighted inverse to how often they occur - :param seconds_per_mixer: Number of seconds maximum to spend PER mixer trained in the list of possible mixers. - :param seconds_per_encoder: Number of seconds maximum to spend when training an encoder that requires data to \ - learn a representation. - :param time_aim: Time budget (in seconds) to train all needed components for the predictive tasks, including \ - encoders and models. - :param target_weights: indicates to the accuracy functions how much to weight every target class. - :param positive_domain: For numerical taks, force predictor output to be positive (integer or float). - :param timeseries_settings: TimeseriesSettings object for time-series tasks, refer to its documentation for \ - available settings. - :param anomaly_detection: Whether to conduct unsupervised anomaly detection; currently supported only for time-\ - series. - :param ignore_features: The names of the columns the user wishes to ignore in the ML pipeline. Any column name \ - found in this list will be automatically removed from subsequent steps in the ML pipeline. 
- :param fit_on_all: Whether to fit the model on the held-out validation data. Validation data is strictly \ - used to evaluate how well a model is doing and is NEVER trained. However, in cases where users anticipate new \ - incoming data over time, the user may train the model further using the entire dataset. - :param strict_mode: crash if an `unstable` block (mixer, encoder, etc.) fails to run. - :param seed_nr: custom seed to use when generating a predictor from this problem definition. - """ - - target: str - pct_invalid: float - unbias_target: bool - seconds_per_mixer: Union[int, None] - seconds_per_encoder: Union[int, None] - time_aim: Union[int, None] - target_weights: Union[List[float], None] - positive_domain: bool - timeseries_settings: TimeseriesSettings - anomaly_detection: bool - ignore_features: List[str] - fit_on_all: bool - strict_mode: bool - seed_nr: int - - @staticmethod - def from_dict(obj: Dict): - """ - Creates a ProblemDefinition object from a python dictionary with necessary specifications. - - :param obj: A python dictionary with the necessary features for the ``ProblemDefinition`` class. - Only requires ``target`` to be specified. - - :returns: A populated ``ProblemDefinition`` object. - """ - target = obj['target'] - pct_invalid = obj.get('pct_invalid', 2) - unbias_target = obj.get('unbias_target', True) - seconds_per_mixer = obj.get('seconds_per_mixer', None) - seconds_per_encoder = obj.get('seconds_per_encoder', None) - time_aim = obj.get('time_aim', None) - target_weights = obj.get('target_weights', None) - positive_domain = obj.get('positive_domain', False) - timeseries_settings = TimeseriesSettings.from_dict(obj.get('timeseries_settings', {})) - anomaly_detection = obj.get('anomaly_detection', True) - ignore_features = obj.get('ignore_features', []) - fit_on_all = obj.get('fit_on_all', True) - strict_mode = obj.get('strict_mode', True) - seed_nr = obj.get('seed_nr', 420) - problem_definition = ProblemDefinition( - target=target, - pct_invalid=pct_invalid, - unbias_target=unbias_target, - seconds_per_mixer=seconds_per_mixer, - seconds_per_encoder=seconds_per_encoder, - time_aim=time_aim, - target_weights=target_weights, - positive_domain=positive_domain, - timeseries_settings=timeseries_settings, - anomaly_detection=anomaly_detection, - ignore_features=ignore_features, - fit_on_all=fit_on_all, - strict_mode=strict_mode, - seed_nr=seed_nr - ) - - return problem_definition - - @staticmethod - def from_json(data: str): - """ - Creates a ProblemDefinition Object from JSON config file. - - :param data: - - :returns: A populated ProblemDefinition object. - """ - return ProblemDefinition.from_dict(json.loads(data)) - - def to_dict(self, encode_json=False) -> Dict[str, Json]: - """ - Creates a python dictionary from the ProblemDefinition object - - :returns: A python dictionary - """ - return _asdict(self, encode_json=encode_json) - - def to_json(self) -> Dict[str, Json]: - """ - Creates a JSON config from the ProblemDefinition object - - :returns: TODO - """ - return json.dumps(self.to_dict()) - - -@dataclass -class JsonAI: - """ - The JsonAI Class allows users to construct flexible JSON config to specify their ML pipeline. JSON-AI follows a \ - recipe of how to pre-process data, construct features, and train on the target column. To do so, the following \ - specifications are required internally. 
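
A minimal, hypothetical ProblemDefinition: as from_dict() above shows, only the target key is required and every other field falls back to the documented defaults.

from lightwood.api.types import ProblemDefinition  # module removed by this diff; shown for illustration

pdef = ProblemDefinition.from_dict({
    "target": "churned",  # the only mandatory key
    "time_aim": 3600,     # optional: total training budget, in seconds
})
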
- - :param features: The corresponding``Feature`` object for each of the column names of the dataset - :param outputs: The column name of the target and its ``Output`` object - :param problem_definition: The ``ProblemDefinition`` criteria. - :param identifiers: A dictionary of column names and respective data types that are likely identifiers/IDs within the data. Through the default cleaning process, these are ignored. - :param cleaner: The Cleaner object represents the pre-processing step on a dataframe. The user can specify custom subroutines, if they choose, on how to handle preprocessing. Alternatively, "None" suggests Lightwood's default approach in ``data.cleaner``. - :param splitter: The Splitter object is the method in which the input data is split into training/validation/testing data. - :param analyzer: The Analyzer object is used to evaluate how well a model performed on the predictive task. - :param explainer: The Explainer object deploys explainability tools of interest on a model to indicate how well a model generalizes its predictions. - :param analysis_blocks: The blocks that get used in both analysis and inference inside the analyzer and explainer blocks. - :param timeseries_transformer: Procedure used to transform any timeseries task dataframe into the format that lightwood expects for the rest of the pipeline. - :param timeseries_analyzer: Procedure that extracts key insights from any timeseries in the data (e.g. measurement frequency, target distribution, etc). - :param accuracy_functions: A list of performance metrics used to evaluate the best mixers. - """ # noqa - - features: Dict[str, Feature] - outputs: Dict[str, Output] - problem_definition: ProblemDefinition - identifiers: Dict[str, str] - cleaner: Optional[Module] = None - splitter: Optional[Module] = None - analyzer: Optional[Module] = None - explainer: Optional[Module] = None - analysis_blocks: Optional[List[Module]] = None - timeseries_transformer: Optional[Module] = None - timeseries_analyzer: Optional[Module] = None - accuracy_functions: Optional[List[str]] = None - - @staticmethod - def from_dict(obj: Dict): - """ - Creates a JSON-AI object from dictionary specifications of the JSON-config. 
- """ - features = {k: Feature.from_dict(v) for k, v in obj["features"].items()} - outputs = {k: Output.from_dict(v) for k, v in obj["outputs"].items()} - problem_definition = ProblemDefinition.from_dict(obj["problem_definition"]) - identifiers = obj["identifiers"] - cleaner = obj.get("cleaner", None) - splitter = obj.get("splitter", None) - analyzer = obj.get("analyzer", None) - explainer = obj.get("explainer", None) - analysis_blocks = obj.get("analysis_blocks", None) - timeseries_transformer = obj.get("timeseries_transformer", None) - timeseries_analyzer = obj.get("timeseries_analyzer", None) - accuracy_functions = obj.get("accuracy_functions", None) - - json_ai = JsonAI( - features=features, - outputs=outputs, - problem_definition=problem_definition, - identifiers=identifiers, - cleaner=cleaner, - splitter=splitter, - analyzer=analyzer, - explainer=explainer, - analysis_blocks=analysis_blocks, - timeseries_transformer=timeseries_transformer, - timeseries_analyzer=timeseries_analyzer, - accuracy_functions=accuracy_functions, - ) - - return json_ai - - @staticmethod - def from_json(data: str): - """ Creates a JSON-AI object from JSON config""" - return JsonAI.from_dict(json.loads(data)) - - def to_dict(self, encode_json=False) -> Dict[str, Json]: - """ - Creates a python dictionary with necessary modules within the ML pipeline specified from the JSON-AI object. - - :returns: A python dictionary that has the necessary components of the ML pipeline for a given dataset. - """ - as_dict = _asdict(self, encode_json=encode_json) - for k in list(as_dict.keys()): - if k == "features": - feature_dict = {} - for name in self.features: - feature_dict[name] = self.features[name].to_dict() - as_dict[k] = feature_dict - if as_dict[k] is None: - del as_dict[k] - return as_dict - - def to_json(self) -> Dict[str, Json]: - """ - Creates JSON config to represent the necessary modules within the ML pipeline specified from the JSON-AI object. - - :returns: A JSON config that has the necessary components of the ML pipeline for a given dataset. - """ - return json.dumps(self.to_dict(), indent=4) - - -@dataclass_json -@dataclass -class ModelAnalysis: - """ - The ``ModelAnalysis`` class stores useful information to describe a model and understand its predictive performance on a validation dataset. - For each trained ML algorithm, we store: - - :param accuracies: Dictionary with obtained values for each accuracy function (specified in JsonAI) - :param accuracy_histogram: Dictionary with histograms of reported accuracy by target value. - :param accuracy_samples: Dictionary with sampled pairs of observed target values and respective predictions. - :param train_sample_size: Size of the training set (data that parameters are updated on) - :param test_sample_size: Size of the testing set (explicitly held out) - :param column_importances: Dictionary with the importance of each column for the model, as estimated by an approach that closely follows a leave-one-covariate-out strategy. - :param confusion_matrix: A confusion matrix for the validation dataset. - :param histograms: Histogram for each dataset feature. - :param dtypes: Inferred data types for each dataset feature. 
- - """ # noqa - - accuracies: Dict[str, float] - accuracy_histogram: Dict[str, list] - accuracy_samples: Dict[str, list] - train_sample_size: int - test_sample_size: int - column_importances: Dict[str, float] - confusion_matrix: object - histograms: object - dtypes: object - - -@dataclass -class PredictionArguments: - """ - This class contains all possible arguments that can be passed to a Lightwood predictor at inference time. - On each predict call, all arguments included in a parameter dictionary will update the respective fields - in the `PredictionArguments` instance that the predictor will have. - - :param predict_proba: triggers (where supported) predictions in raw probability output form. I.e. for classifiers, - instead of returning only the predicted class, the output additionally includes the assigned probability for - each class. - :param all_mixers: forces an ensemble to return predictions emitted by all its internal mixers. - :param fixed_confidence: For analyzer module, specifies a fixed `alpha` confidence for the model calibration so \ - that predictions, in average, are correct `alpha` percent of the time. - :param anomaly_error_rate: Error rate for unsupervised anomaly detection. Bounded between 0.01 and 0.99 \ - (respectively implies wider and tighter bounds, all other parameters being equal). - :param anomaly_cooldown: Sets the minimum amount of timesteps between consecutive firings of the the anomaly \ - detector. - """ # noqa - - predict_proba: bool = False - all_mixers: bool = False - fixed_confidence: Union[int, float, None] = None - anomaly_error_rate: Union[float, None] = None - anomaly_cooldown: int = 1 - - @staticmethod - def from_dict(obj: Dict): - """ - Creates a ``PredictionArguments`` object from a python dictionary with necessary specifications. - - :param obj: A python dictionary with the necessary features for the ``PredictionArguments`` class. - - :returns: A populated ``PredictionArguments`` object. - """ - - # maybe this should be stateful instead, and save the latest used value for each field? 
- predict_proba = obj.get('predict_proba', PredictionArguments.predict_proba) - all_mixers = obj.get('all_mixers', PredictionArguments.all_mixers) - fixed_confidence = obj.get('fixed_confidence', PredictionArguments.fixed_confidence) - anomaly_error_rate = obj.get('anomaly_error_rate', PredictionArguments.anomaly_error_rate) - anomaly_cooldown = obj.get('anomaly_cooldown', PredictionArguments.anomaly_cooldown) - - pred_args = PredictionArguments( - predict_proba=predict_proba, - all_mixers=all_mixers, - fixed_confidence=fixed_confidence, - anomaly_error_rate=anomaly_error_rate, - anomaly_cooldown=anomaly_cooldown, - ) - - return pred_args - - def to_dict(self, encode_json=False) -> Dict[str, Json]: - """ - Creates a python dictionary from the ``PredictionArguments`` object - - :returns: A python dictionary - """ - return _asdict(self, encode_json=encode_json) diff --git a/lightwood/data/__init__.py b/lightwood/data/__init__.py deleted file mode 100644 index 1fe88866c..000000000 --- a/lightwood/data/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from lightwood.data.infer_types import infer_types -from lightwood.data.statistical_analysis import statistical_analysis -from lightwood.data.cleaner import cleaner -from lightwood.data.splitter import splitter -from lightwood.data.timeseries_transform import transform_timeseries -from lightwood.data.timeseries_analyzer import timeseries_analyzer -from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs - -__all__ = ['infer_types', 'statistical_analysis', 'cleaner', 'splitter', 'transform_timeseries', 'timeseries_analyzer', - 'EncodedDs', 'ConcatedEncodedDs'] diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py deleted file mode 100644 index 5ad6b691e..000000000 --- a/lightwood/data/encoded_ds.py +++ /dev/null @@ -1,193 +0,0 @@ -import inspect -from typing import List, Tuple -import torch -import numpy as np -import pandas as pd -from torch.utils.data import Dataset -from lightwood.encoder.base import BaseEncoder - - -class EncodedDs(Dataset): - def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target: str) -> None: - """ - Create a Lightwood datasource from a data frame and some encoders. This class inherits from `torch.utils.data.Dataset`. - - Note: normal behavior is to cache encoded representations to avoid duplicated computations. If you want an option to disable, this please open an issue. - - :param encoders: list of Lightwood encoders used to encode the data per each column. - :param data_frame: original dataframe. - :param target: name of the target column to predict. - """ # noqa - self.data_frame = data_frame - self.encoders = encoders - self.target = target - self.cache_encoded = True - self.cache = [None] * len(self.data_frame) - self.encoder_spans = {} - self.input_length = 0 - - # save encoder span, has to use same iterator as in __getitem__ for correct indeces - for col in self.data_frame: - if col != self.target and self.encoders.get(col, False): - self.encoder_spans[col] = (self.input_length, - self.input_length + self.encoders[col].output_size) - self.input_length += self.encoders[col].output_size - - def __len__(self): - """ - The length of an `EncodedDs` datasource equals the amount of rows of the original dataframe. 
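
EncodedDs implements the standard torch.utils.data.Dataset contract (__len__ plus a __getitem__ that yields an (X, y) tensor pair), so it can feed a DataLoader directly; a generic stand-in of that contract, not the real class, looks roughly like this.

import torch
from torch.utils.data import DataLoader, Dataset


class ToyEncodedDs(Dataset):
    """Stand-in illustrating the (X, y) contract; not the real EncodedDs."""

    def __init__(self, n_rows: int = 16, n_features: int = 4):
        self.X = torch.randn(n_rows, n_features)  # would be concatenated encoder outputs
        self.y = torch.randn(n_rows, 1)           # would be the encoded target

    def __len__(self) -> int:
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


for X_batch, y_batch in DataLoader(ToyEncodedDs(), batch_size=4):
    pass  # a mixer would consume these batches
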
- - :return: length of the `EncodedDs` - """ - return int(self.data_frame.shape[0]) - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """ - The getter yields a tuple (X, y), where: - - `X `is a concatenation of all encoded representations of the row - - `y` is the encoded target - - :param idx: index of the row to access. - - :return: tuple (X, y) with encoded data. - - """ # noqa - if self.cache_encoded: - if self.cache[idx] is not None: - return self.cache[idx] - - X = torch.FloatTensor() - Y = torch.FloatTensor() - for col in self.data_frame: - if self.encoders.get(col, None): - kwargs = {} - if 'dependency_data' in inspect.signature(self.encoders[col].encode).parameters: - kwargs['dependency_data'] = {dep: [self.data_frame.iloc[idx][dep]] - for dep in self.encoders[col].dependencies} - if hasattr(self.encoders[col], 'data_window'): - cols = [self.target] + [f'{self.target}_timestep_{i}' - for i in range(1, self.encoders[col].data_window)] - else: - cols = [col] - - data = self.data_frame[cols].iloc[idx].tolist() - encoded_tensor = self.encoders[col].encode(data, **kwargs)[0] - if col != self.target: - X = torch.cat([X, encoded_tensor]) - else: - Y = encoded_tensor - - if self.cache_encoded: - self.cache[idx] = (X, Y) - - return X, Y - - def get_column_original_data(self, column_name: str) -> pd.Series: - """ - Gets the original data for any given column of the `EncodedDs`. - - :param column_name: name of the column. - :return: A `pd.Series` with the original data stored in the `column_name` column. - """ - return self.data_frame[column_name] - - def get_encoded_column_data(self, column_name: str) -> torch.Tensor: - """ - Gets the encoded data for any given column of the `EncodedDs`. - - :param column_name: name of the column. - :return: A `torch.Tensor` with the encoded data of the `column_name` column. - """ - kwargs = {} - if 'dependency_data' in inspect.signature(self.encoders[column_name].encode).parameters: - deps = [dep for dep in self.encoders[column_name].dependencies if dep in self.data_frame.columns] - kwargs['dependency_data'] = {dep: self.data_frame[dep].tolist() for dep in deps} - encoded_data = self.encoders[column_name].encode(self.data_frame[column_name], **kwargs) - - if not isinstance(encoded_data, torch.Tensor): - raise Exception( - f'The encoder: {self.encoders[column_name]} for column: {column_name} does not return a Tensor !') - return encoded_data - - def get_encoded_data(self, include_target=True) -> torch.Tensor: - """ - Gets all encoded data. - - :param include_target: whether to include the target column in the output or not. - :return: A `torch.Tensor` with the encoded dataframe. - """ - encoded_dfs = [] - for col in self.data_frame.columns: - if (include_target or col != self.target) and self.encoders.get(col, False): - encoded_dfs.append(self.get_encoded_column_data(col)) - - return torch.cat(encoded_dfs, 1) - - def clear_cache(self): - """ - Clears the `EncodedDs` cache. - """ - self.cache = [None] * len(self.data_frame) - - -class ConcatedEncodedDs(EncodedDs): - """ - `ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity. - """ # noqa - def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None: - # @TODO: missing super() call here? 
- self.encoded_ds_arr = encoded_ds_arr - self.encoded_ds_lenghts = [len(x) for x in self.encoded_ds_arr] - self.encoders = self.encoded_ds_arr[0].encoders - self.encoder_spans = self.encoded_ds_arr[0].encoder_spans - self.target = self.encoded_ds_arr[0].target - - def __len__(self): - """ - See `lightwood.data.encoded_ds.EncodedDs.__len__()`. - """ - # @TODO: behavior here is not intuitive - return max(0, np.sum(self.encoded_ds_lenghts) - 2) - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """ - See `lightwood.data.encoded_ds.EncodedDs.__getitem__()`. - """ - for ds_idx, length in enumerate(self.encoded_ds_lenghts): - if idx - length < 0: - return self.encoded_ds_arr[ds_idx][idx] - else: - idx -= length - raise StopIteration() - - @property - def data_frame(self) -> pd.DataFrame: - """ - Property that concatenates all underlying `EncodedDs`'s dataframes and returns them. - - Note: be careful to not modify a `ConcatedEncodedDs`, as you can see in the source, it will not have an effect. - - :return: Dataframe with all original data. - """ # noqa - return pd.concat([x.data_frame for x in self.encoded_ds_arr]) - - def get_column_original_data(self, column_name: str) -> pd.Series: - """ - See `lightwood.data.encoded_ds.EncodedDs.get_column_original_data()`. - """ - encoded_df_arr = [x.get_column_original_data(column_name) for x in self.encoded_ds_arr] - return pd.concat(encoded_df_arr) - - def get_encoded_column_data(self, column_name: str) -> torch.Tensor: - """ - See `lightwood.data.encoded_ds.EncodedDs.get_encoded_column_data()`. - """ - encoded_df_arr = [x.get_encoded_column_data(column_name) for x in self.encoded_ds_arr] - return torch.cat(encoded_df_arr, 0) - - def clear_cache(self): - """ - See `lightwood.data.encoded_ds.EncodedDs.clear_cache()`. - """ - for ds in self.encoded_ds_arr: - ds.clear_cache() diff --git a/lightwood/data/splitter.py b/lightwood/data/splitter.py deleted file mode 100644 index e517a90f7..000000000 --- a/lightwood/data/splitter.py +++ /dev/null @@ -1,145 +0,0 @@ -from typing import List, Dict -from itertools import product - -import numpy as np -import pandas as pd - -from lightwood.helpers.log import log -from lightwood.api.dtype import dtype -from lightwood.api.types import TimeseriesSettings - - -def splitter( - data: pd.DataFrame, - tss: TimeseriesSettings, - dtype_dict: Dict[str, str], - seed: int, - pct_train: float, - pct_dev: float, - pct_test: float, - target: str -) -> Dict[str, pd.DataFrame]: - """ - Splits data into training, dev and testing datasets. - - The proportion of data for each split must be specified (JSON-AI sets defaults to 80/10/10). First, rows in the dataset are shuffled randomly. Then a simple split is done. If a target value is provided and is of data type categorical/binary, then the splits will be stratified to maintain the representative populations of each class. 
- - :param data: Input dataset to be split - :param tss: time-series specific details for splitting - :param dtype_dict: Dictionary with the data type of all columns - :param seed: Random state for pandas data-frame shuffling - :param pct_train: training fraction of data; must be less than 1 - :param pct_dev: dev fraction of data; must be less than 1 - :param pct_test: testing fraction of data; must be less than 1 - :param target: Name of the target column; if specified, data will be stratified on this column - - :returns: A dictionary containing the keys train, test and dev with their respective data frames, as well as the "stratified_on" key indicating which columns the data was stratified on (None if it wasn't stratified on anything) - """ # noqa - pct_sum = pct_train + pct_dev + pct_test - if not (np.isclose(pct_sum, 1, atol=0.001) and np.less(pct_sum, 1 + 1e-5)): - raise Exception(f'The train, dev and test percentage of the data needs to sum up to 1 (got {pct_sum})') - - # Shuffle the data - np.random.seed(seed) - if not tss.is_timeseries: - data = data.sample(frac=1, random_state=seed).reset_index(drop=True) - - # Check if stratification should be done - stratify_on = [] - if target is not None: - if dtype_dict[target] in (dtype.categorical, dtype.binary) and not tss.is_timeseries: - stratify_on = [target] - if tss.is_timeseries and isinstance(tss.group_by, list): - stratify_on = tss.group_by - - # Split the data - if stratify_on: - reshuffle = not tss.is_timeseries - train, dev, test = stratify(data, pct_train, pct_dev, pct_test, stratify_on, seed, reshuffle) - else: - train, dev, test = simple_split(data, pct_train, pct_dev, pct_test) - - return {"train": train, "test": test, "dev": dev, "stratified_on": stratify_on} - - -def simple_split(data: pd.DataFrame, - pct_train: float, - pct_dev: float, - pct_test: float) -> List[pd.DataFrame]: - """ - Simple split method to separate data into training, dev and testing datasets. - - :param data: Input dataset to be split - :param pct_train: training fraction of data; must be less than 1 - :param pct_dev: dev fraction of data; must be less than 1 - :param pct_test: testing fraction of data; must be less than 1 - - :returns Train, dev, and test dataframes - """ - train_cutoff = round(data.shape[0] * pct_train) - dev_cutoff = round(data.shape[0] * pct_dev) + train_cutoff - test_cutoff = round(data.shape[0] * pct_test) + dev_cutoff - - train = data[:train_cutoff] - dev = data[train_cutoff:dev_cutoff] - test = data[dev_cutoff:test_cutoff] - - return [train, dev, test] - - -def stratify(data: pd.DataFrame, - pct_train: float, - pct_dev: float, - pct_test: float, - stratify_on: List[str], - seed: int, - reshuffle: bool) -> List[pd.DataFrame]: - """ - Stratified data splitter. - - The `stratify_on` columns yield a cartesian product by which every different subset will be stratified - independently from the others, and recombined at the end in fractions specified by `pcts`. - - For grouped time series tasks, stratification is done based on the group-by columns. 
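
A hypothetical call to splitter() above with the default 80/10/10 proportions mentioned in its docstring; the dataframe, dtype strings and target are made up, and the imports point at the modules removed by this diff purely for illustration.

import pandas as pd
from lightwood.api.types import TimeseriesSettings
from lightwood.data.splitter import splitter

df = pd.DataFrame({"age": [25, 40, 31, 58] * 25, "churned": ["yes", "no"] * 50})
splits = splitter(
    data=df,
    tss=TimeseriesSettings(is_timeseries=False),  # plain tabular task, no time series handling
    dtype_dict={"age": "integer", "churned": "binary"},
    seed=1,
    pct_train=0.8,
    pct_dev=0.1,
    pct_test=0.1,
    target="churned",  # binary target -> stratified split
)
train_df, dev_df, test_df = splits["train"], splits["dev"], splits["test"]
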
- - :param data: dataframe with data to be split - :param pct_train: fraction of data to use for training split - :param pct_dev: fraction of data to use for dev split (used internally by mixers) - :param pct_test: fraction of data to use for test split (used post-training for analysis) - :param stratify_on: Columns to consider when stratifying - :param seed: Random state for pandas data-frame shuffling - :param reshuffle: specify if reshuffling should be done post-split - - :returns Stratified train, dev, test dataframes - """ # noqa - - train_st = pd.DataFrame(columns=data.columns) - dev_st = pd.DataFrame(columns=data.columns) - test_st = pd.DataFrame(columns=data.columns) - - all_group_combinations = list(product(*[data[col].unique() for col in stratify_on])) - for group in all_group_combinations: - df = data - for idx, col in enumerate(stratify_on): - df = df[df[col] == group[idx]] - - train_cutoff = round(df.shape[0] * pct_train) - dev_cutoff = round(df.shape[0] * pct_dev) + train_cutoff - test_cutoff = round(df.shape[0] * pct_test) + dev_cutoff - - train_st = train_st.append(df[:train_cutoff]) - dev_st = dev_st.append(df[train_cutoff:dev_cutoff]) - test_st = test_st.append(df[dev_cutoff:test_cutoff]) - - if reshuffle: - train_st, dev_st, test_st = [df.sample(frac=1, random_state=seed).reset_index(drop=True) - for df in [train_st, dev_st, test_st]] - - # check that stratified lengths conform to expected percentages - if not np.isclose(len(train_st) / len(data), pct_train, atol=0.01) or \ - not np.isclose(len(dev_st) / len(data), pct_dev, atol=0.01) or \ - not np.isclose(len(test_st) / len(data), pct_test, atol=0.01): - log.info("Could not stratify; reverting to simple split") - train_st, dev_st, test_st = simple_split(data, pct_train, pct_dev, pct_test) - - return [train_st, dev_st, test_st] diff --git a/lightwood/data/statistical_analysis.py b/lightwood/data/statistical_analysis.py deleted file mode 100644 index e46d3d6bf..000000000 --- a/lightwood/data/statistical_analysis.py +++ /dev/null @@ -1,177 +0,0 @@ -from typing import Dict -import pandas as pd -import numpy as np -import datetime -from dateutil.parser import parse as parse_dt -from lightwood.api import StatisticalAnalysis, ProblemDefinition -from lightwood.helpers.numeric import filter_nan_and_none -from lightwood.helpers.seed import seed -from lightwood.data.cleaner import cleaner -from lightwood.helpers.log import log -from lightwood.api.dtype import dtype -from scipy.stats import entropy -from lightwood.data.cleaner import _clean_float - - -def get_datetime_histogram(data: pd.Series, bins: int) -> Dict[str, list]: - """Generates the histogram for date and datetime types - """ - if isinstance(data[0], float) or isinstance(data[0], int): - data = [_clean_float(x) for x in data] - else: - data = [_clean_float(parse_dt(str(x)).timestamp()) for x in data] - - Y, X = np.histogram(data, bins=min(bins, len(set(data))), - range=(min(data), max(data)), density=False) - - X = X[:-1].tolist() - Y = Y.tolist() - - X = [str(datetime.datetime.fromtimestamp(x)) for x in X] - return { - 'x': X, - 'y': Y - } - - -def get_numeric_histogram(data: pd.Series, data_dtype: dtype, bins: int) -> Dict[str, list]: - """Generate the histogram for integer and float typed data - """ - # Handle arrays that are actual arrays and not things that become arrays later - if isinstance(data[0], list): - new_data = [] - for arr in data: - new_data.extend(arr) - data = new_data - - data = [_clean_float(x) for x in data] - - Y, X = np.histogram(data, 
bins=min(bins, len(set(data))), - range=(min(data), max(data)), density=False) - if data_dtype == dtype.integer: - Y, X = np.histogram(data, bins=[int(round(x)) for x in X], density=False) - - X = X[:-1].tolist() - Y = Y.tolist() - - return { - 'x': X, - 'y': Y - } - - -def compute_entropy_biased_buckets(histogram): - S, biased_buckets = None, None - if histogram is not None or len(histogram['x']) == 0: - hist_x = histogram['x'] - hist_y = histogram['y'] - nr_values = sum(hist_y) - S = entropy([x / nr_values for x in hist_y], base=max(2, len(hist_y))) - if S < 0.25: - pick_nr = -max(1, int(len(hist_y) / 10)) - biased_buckets = [hist_x[i] for i in np.array(hist_y).argsort()[pick_nr:]] - return S, biased_buckets - - -def statistical_analysis(data: pd.DataFrame, - dtypes: Dict[str, str], - identifiers: Dict[str, object], - problem_definition: ProblemDefinition, - seed_nr: int = 420) -> StatisticalAnalysis: - seed(seed_nr) - log.info('Starting statistical analysis') - df = cleaner(data, dtypes, problem_definition.pct_invalid, - identifiers, problem_definition.target, 'train', problem_definition.timeseries_settings, - problem_definition.anomaly_detection) - - missing = {col: len([x for x in df[col] if x is None]) / len(df[col]) for col in df.columns} - distinct = {col: len(set([str(x) for x in df[col]])) / len(df[col]) for col in df.columns} - - nr_rows = len(df) - target = problem_definition.target - positive_domain = False - # get train std, used in analysis - if dtypes[target] in [dtype.float, dtype.integer, dtype.tsarray, dtype.quantity]: - df_std = df[target].astype(float).std() - if min(df[target]) >= 0: - positive_domain = True - elif dtypes[target] in [dtype.array]: - try: - all_vals = [] - for x in df[target]: - all_vals += x - if min(all_vals) >= 0: - positive_domain = True - df_std = pd.Series(all_vals).astype(float).std() - except Exception as e: - log.warning(e) - df_std = 1.0 - else: - df_std = 1.0 - - histograms = {} - buckets = {} - # Get histograms for each column - for col in df.columns: - histograms[col] = None - buckets[col] = None - if dtypes[col] in (dtype.categorical, dtype.binary, dtype.tags): - hist = dict(df[col].value_counts()) - histograms[col] = { - 'x': list([str(x) for x in hist.keys()]), - 'y': list(hist.values()) - } - buckets[col] = histograms[col]['x'] - elif dtypes[col] in (dtype.integer, dtype.float, dtype.array, dtype.tsarray, dtype.quantity): - histograms[col] = get_numeric_histogram(filter_nan_and_none(df[col]), dtypes[col], 50) - buckets[col] = histograms[col]['x'] - elif dtypes[col] in (dtype.date, dtype.datetime): - histograms[col] = get_datetime_histogram(filter_nan_and_none(df[col]), 50) - else: - histograms[col] = {'x': ['Unknown'], 'y': [len(df[col])]} - buckets[col] = [] - - # get observed classes, used in analysis - target_class_distribution = None - if dtypes[target] in (dtype.categorical, dtype.binary): - target_class_distribution = dict(df[target].value_counts().apply(lambda x: x / len(df[target]))) - train_observed_classes = list(target_class_distribution.keys()) - elif dtypes[target] == dtype.tags: - train_observed_classes = None # @TODO: pending call to tags logic -> get all possible tags - else: - train_observed_classes = None - - bias = {} - for col in df.columns: - S, biased_buckets = compute_entropy_biased_buckets(histograms[col]) - bias[col] = { - 'entropy': S, - 'description': """Under the assumption of uniformly distributed data (i.e., same probability for Head or Tails on a coin flip) mindsdb tries to detect potential divergences 
from such case, and it calls this "potential bias". Thus by our data having any potential bias mindsdb means any divergence from all categories having the same probability of being selected.""", # noqa - 'biased_buckets': biased_buckets - } - - avg_words_per_sentence = {} - for col in df.columns: - if dtypes[col] in (dtype.rich_text, dtype.short_text): - words_per_sentence = [] - for item in df[col]: - if item is not None: - words_per_sentence.append(len(item.split(' '))) - avg_words_per_sentence[col] = int(np.mean(words_per_sentence)) - else: - avg_words_per_sentence[col] = None - - log.info('Finished statistical analysis') - return StatisticalAnalysis( - nr_rows=nr_rows, - df_std_dev=df_std, - train_observed_classes=train_observed_classes, - target_class_distribution=target_class_distribution, - positive_domain=positive_domain, - histograms=histograms, - buckets=buckets, - missing=missing, - distinct=distinct, - bias=bias, - avg_words_per_sentence=avg_words_per_sentence - ) diff --git a/lightwood/data/timeseries_analyzer.py b/lightwood/data/timeseries_analyzer.py deleted file mode 100644 index 85ebb30af..000000000 --- a/lightwood/data/timeseries_analyzer.py +++ /dev/null @@ -1,132 +0,0 @@ -from typing import Dict, Tuple, List - -import numpy as np -import pandas as pd - -from lightwood.api.types import TimeseriesSettings -from lightwood.api.dtype import dtype -from lightwood.encoder.time_series.helpers.common import get_group_matches, generate_target_group_normalizers - - -def timeseries_analyzer(data: pd.DataFrame, dtype_dict: Dict[str, str], - timeseries_settings: TimeseriesSettings, target: str) -> Dict: - """ - This module analyzes (pre-processed) time series data and stores a few useful insights used in the rest of Lightwood's pipeline. - - :param data: dataframe with time series dataset. - :param dtype_dict: dictionary with inferred types for every column. - :param timeseries_settings: A `TimeseriesSettings` object. For more details, check `lightwood.types.TimeseriesSettings`. - :param target: name of the target column. - - The following things are extracted from each time series inside the dataset: - - group_combinations: all observed combinations of values for the set of `group_by` columns. The length of this list determines how many time series are in the data. - - deltas: inferred sampling interval - - ts_naive_residuals: Residuals obtained from the data by a naive forecaster that repeats the last-seen value. - - ts_naive_mae: Mean residual value obtained from the data by a naive forecaster that repeats the last-seen value. - - target_normalizers: objects that may normalize the data within any given time series for effective learning. See `lightwood.encoder.time_series.helpers.common` for available choices. - - :return: Dictionary with the aforementioned insights and the `TimeseriesSettings` object for future references. - """ # noqa - info = { - 'original_type': dtype_dict[target], - 'data': data[target].values - } - if timeseries_settings.group_by is not None: - info['group_info'] = {gcol: data[gcol].tolist() for gcol in timeseries_settings.group_by} # group col values - else: - info['group_info'] = {} - - # @TODO: maybe normalizers should fit using only the training subsets?? 
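# A minimal standalone sketch of the two per-series statistics described in the
# docstring above (inferred sampling interval and naive-forecast error scale),
# using a toy regularly-sampled series; it mirrors `get_delta` and
# `get_naive_residuals`, both defined further below.
import pandas as pd

toy_target = pd.Series([10.0, 12.0, 11.0, 13.0, 14.0])  # toy target values
toy_stamps = pd.Series([0, 60, 120, 180, 240])           # toy order_by column, in seconds

delta = toy_stamps.diff().value_counts().idxmax()        # most frequent interval -> 60.0
naive_residuals = toy_target.diff().abs().dropna()       # |y_t - y_{t-1}|, the m=1 naive residuals
scale_factor = naive_residuals.mean()                    # analogue of `ts_naive_mae` -> 1.5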
- new_data = generate_target_group_normalizers(info) - - if dtype_dict[target] in (dtype.integer, dtype.float, dtype.tsarray): - naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(info, new_data['group_combinations']) - else: - naive_forecast_residuals, scale_factor = {}, {} - - deltas = get_delta(data[timeseries_settings.order_by], - info, - new_data['group_combinations'], - timeseries_settings.order_by) - - return {'target_normalizers': new_data['target_normalizers'], - 'deltas': deltas, - 'tss': timeseries_settings, - 'group_combinations': new_data['group_combinations'], - 'ts_naive_residuals': naive_forecast_residuals, - 'ts_naive_mae': scale_factor - } - - -def get_delta(df: pd.DataFrame, ts_info: dict, group_combinations: list, order_cols: list) -> Dict[str, Dict]: - """ - Infer the sampling interval of each time series, by picking the most popular time interval observed in the training data. - - :param df: Dataframe with time series data. - :param ts_info: Dictionary used internally by `timeseries_analyzer`. Contains group-wise series information, among other things. - :param group_combinations: all tuples with distinct values for `TimeseriesSettings.group_by` columns, defining all available time series. - :param order_cols: all columns specified in `TimeseriesSettings.order_by`. - - :return: - Dictionary with group combination tuples as keys. Values are dictionaries with the inferred delta for each series, for each `order_col`. - """ # noqa - deltas = {"__default": {}} - - # get default delta for all data - for col in order_cols: - series = pd.Series([x[-1] for x in df[col]]) - rolling_diff = series.rolling(window=2).apply(lambda x: x.iloc[1] - x.iloc[0]) - delta = rolling_diff.value_counts(ascending=False).keys()[0] # pick most popular - deltas["__default"][col] = delta - - # get group-wise deltas (if applicable) - if ts_info.get('group_info', False): - original_data = ts_info['data'] - for group in group_combinations: - if group != "__default": - deltas[group] = {} - for col in order_cols: - ts_info['data'] = pd.Series([x[-1] for x in df[col]]) - _, subset = get_group_matches(ts_info, group) - if subset.size > 1: - rolling_diff = pd.Series( - subset.squeeze()).rolling( - window=2).apply( - lambda x: x.iloc[1] - x.iloc[0]) - delta = rolling_diff.value_counts(ascending=False).keys()[0] - deltas[group][col] = delta - ts_info['data'] = original_data - - return deltas - - -def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, float]: - """ - Computes forecasting residuals for the naive method (forecasts for time `t` is the value observed at `t-1`). - Useful for computing MASE forecasting error. - - Note: method assumes predictions are all for the same group combination. For a dataframe that contains multiple - series, use `get_grouped_naive_resiudals`. - - :param target_data: observed time series targets - :param m: season length. the naive forecasts will be the m-th previously seen value for each series - - :return: (list of naive residuals, average residual value) - """ # noqa - residuals = target_data.rolling(window=m + 1).apply(lambda x: abs(x.iloc[m] - x.iloc[0]))[m:].values.flatten() - scale_factor = np.average(residuals) - return residuals.tolist(), scale_factor - - -def get_grouped_naive_residuals(info: Dict, group_combinations: List) -> Tuple[Dict, Dict]: - """ - Wraps `get_naive_residuals` for a dataframe with multiple co-existing time series. 
- """ # noqa - group_residuals = {} - group_scale_factors = {} - for group in group_combinations: - idxs, subset = get_group_matches(info, group) - residuals, scale_factor = get_naive_residuals(pd.DataFrame(subset)) # @TODO: pass m once we handle seasonality - group_residuals[group] = residuals - group_scale_factors[group] = scale_factor - return group_residuals, group_scale_factors diff --git a/lightwood/data/timeseries_transform.py b/lightwood/data/timeseries_transform.py deleted file mode 100644 index 451c3b0e2..000000000 --- a/lightwood/data/timeseries_transform.py +++ /dev/null @@ -1,346 +0,0 @@ -import copy -import datetime -import dateutil -import numpy as np -import pandas as pd -import multiprocessing as mp -from lightwood.helpers.parallelism import get_nr_procs -from functools import partial -from typing import Dict -from lightwood.api.types import TimeseriesSettings -from lightwood.helpers.log import log -from lightwood.api import dtype - - -def transform_timeseries( - data: pd.DataFrame, dtype_dict: Dict[str, str], - timeseries_settings: TimeseriesSettings, target: str, mode: str) -> pd.DataFrame: - """ - Block that transforms the dataframe of a time series task to a convenient format for use in posterior phases like model training. - - The main transformations performed by this block are: - - Type casting (e.g. to numerical for `order_by` columns). - - Windowing functions for historical context based on `TimeseriesSettings.window` parameter. - - Explicitly add target columns according to the `TimeseriesSettings.nr_predictions` parameter. - - Flag all rows that are "predictable" based on all `TimeseriesSettings`. - - Plus, handle all logic for the streaming use case (where forecasts are only emitted for the last observed data point). - - :param data: Dataframe with data to transform. - :param dtype_dict: Dictionary with the types of each column. - :param timeseries_settings: A `TimeseriesSettings` object. - :param target: The name of the target column to forecast. - :param mode: Either "train" or "predict", depending on what phase is calling this procedure. - - :return: A dataframe with all the transformations applied. - """ # noqa - - tss = timeseries_settings - original_df = copy.deepcopy(data) - gb_arr = tss.group_by if tss.group_by is not None else [] - ob_arr = tss.order_by - window = tss.window - - if '__mdb_make_predictions' in original_df.columns: - index = original_df[original_df['__mdb_make_predictions'].map( - {'True': True, 'False': False, True: True, False: False}).isin([True])] - infer_mode = index.shape[0] == 0 # condition to trigger: __mdb_make_predictions is set to False everywhere - # @TODO: dont drop and use instead of original_index? 
- original_df = original_df.reset_index(drop=True) if infer_mode else original_df - else: - infer_mode = False - - original_index_list = [] - idx = 0 - for row in original_df.itertuples(): - if _make_pred(row) or infer_mode: - original_index_list.append(idx) - idx += 1 - else: - original_index_list.append(None) - - original_df['original_index'] = original_index_list - - secondary_type_dict = {} - for col in ob_arr: - if dtype_dict[col] in (dtype.date, dtype.integer, dtype.float): - secondary_type_dict[col] = dtype_dict[col] - - # Convert order_by columns to numbers (note, rows are references to mutable rows in `original_df`) - for _, row in original_df.iterrows(): - for col in ob_arr: - # @TODO: Remove if the TS encoder can handle `None` - if row[col] is None or pd.isna(row[col]): - row[col] = 0.0 - else: - if dtype_dict[col] == dtype.date: - try: - row[col] = dateutil.parser.parse( - row[col], - **{} - ) - except (TypeError, ValueError): - pass - - if isinstance(row[col], datetime.datetime): - row[col] = row[col].timestamp() - - try: - row[col] = float(row[col]) - except ValueError: - raise ValueError(f'Failed to order based on column: "{col}" due to faulty value: {row[col]}') - - for oby in tss.order_by: - original_df[f'__mdb_original_{oby}'] = original_df[oby] - - group_lengths = [] - if len(gb_arr) > 0: - df_arr = [] - for _, df in original_df.groupby(gb_arr): - df_arr.append(df.sort_values(by=ob_arr)) - group_lengths.append(len(df)) - else: - df_arr = [original_df] - group_lengths.append(len(original_df)) - - n_groups = len(df_arr) - last_index = original_df['original_index'].max() - for i, subdf in enumerate(df_arr): - if '__mdb_make_predictions' in subdf.columns and mode == 'predict': - if infer_mode: - df_arr[i] = _ts_infer_next_row(subdf, ob_arr, last_index) - last_index += 1 - - if len(original_df) > 500: - # @TODO: restore possibility to override this with args - nr_procs = get_nr_procs(original_df) - log.info(f'Using {nr_procs} processes to reshape.') - pool = mp.Pool(processes=nr_procs) - # Make type `object` so that dataframe cells can be python lists - df_arr = pool.map(partial(_ts_to_obj, historical_columns=ob_arr + tss.historical_columns), df_arr) - df_arr = pool.map(partial(_ts_order_col_to_cell_lists, - order_cols=ob_arr + tss.historical_columns), df_arr) - df_arr = pool.map( - partial( - _ts_add_previous_rows, order_cols=ob_arr + tss.historical_columns, window=window), - df_arr) - - df_arr = pool.map(partial(_ts_add_future_target, target=target, nr_predictions=tss.nr_predictions, - data_dtype=tss.target_type, mode=mode), - df_arr) - - if tss.use_previous_target: - df_arr = pool.map( - partial(_ts_add_previous_target, target=target, window=tss.window), - df_arr) - pool.close() - pool.join() - else: - for i in range(n_groups): - df_arr[i] = _ts_to_obj(df_arr[i], historical_columns=ob_arr + tss.historical_columns) - df_arr[i] = _ts_order_col_to_cell_lists(df_arr[i], order_cols=ob_arr + tss.historical_columns) - df_arr[i] = _ts_add_previous_rows(df_arr[i], - order_cols=ob_arr + tss.historical_columns, window=window) - df_arr[i] = _ts_add_future_target(df_arr[i], target=target, nr_predictions=tss.nr_predictions, - data_dtype=tss.target_type, mode=mode) - if tss.use_previous_target: - df_arr[i] = _ts_add_previous_target(df_arr[i], target=target, window=tss.window) - - combined_df = pd.concat(df_arr) - - if '__mdb_make_predictions' in combined_df.columns: - combined_df = pd.DataFrame(combined_df[combined_df['__mdb_make_predictions'].astype(bool).isin([True])]) - del 
combined_df['__mdb_make_predictions'] - - if not infer_mode and any([i < tss.window for i in group_lengths]): - if tss.allow_incomplete_history: - log.warning("Forecasting with incomplete historical context, predictions might be subpar") - else: - raise Exception(f'Not enough historical context to make a timeseries prediction. Please provide a number of rows greater or equal to the window size. If you can\'t get enough rows, consider lowering your window size. If you want to force timeseries predictions lacking historical context please set the `allow_incomplete_history` timeseries setting to `True`, but this might lead to subpar predictions.') # noqa - - df_gb_map = None - if n_groups > 1: - df_gb_list = list(combined_df.groupby(tss.group_by)) - df_gb_map = {} - for gb, df in df_gb_list: - df_gb_map['_' + '_'.join(gb)] = df - - timeseries_row_mapping = {} - idx = 0 - - if df_gb_map is None: - for _, row in combined_df.iterrows(): - if not infer_mode: - timeseries_row_mapping[idx] = int( - row['original_index']) if row['original_index'] is not None and not np.isnan( - row['original_index']) else None - else: - timeseries_row_mapping[idx] = idx - idx += 1 - else: - for gb in df_gb_map: - for _, row in df_gb_map[gb].iterrows(): - if not infer_mode: - timeseries_row_mapping[idx] = int( - row['original_index']) if row['original_index'] is not None and not np.isnan( - row['original_index']) else None - else: - timeseries_row_mapping[idx] = idx - - idx += 1 - - del combined_df['original_index'] - - # return combined_df, secondary_type_dict, timeseries_row_mapping, df_gb_map - return combined_df - - -def _ts_infer_next_row(df: pd.DataFrame, ob: str, last_index: int) -> pd.DataFrame: - """ - Adds an inferred next row for streaming mode purposes. - - :param df: dataframe from which next row is inferred. - :param ob: `order_by` column. - :param last_index: index number of the latest row in `df`. - - :return: Modified `df` with the inferred row appended to it. - """ - last_row = df.iloc[[-1]].copy() - if df.shape[0] > 1: - butlast_row = df.iloc[[-2]] - delta = (last_row[ob].values - butlast_row[ob].values).flatten()[0] - else: - delta = 1 - last_row.original_index = None - last_row.index = [last_index + 1] - last_row['__mdb_make_predictions'] = True - last_row['__mdb_ts_inferred'] = True - last_row[ob] += delta - return df.append(last_row) - - -def _make_pred(row) -> bool: - """ - Indicates whether a prediction should be made for `row` or not. - """ - return not hasattr(row, '__mdb_make_predictions') or row.make_predictions - - -def _ts_to_obj(df: pd.DataFrame, historical_columns: list) -> pd.DataFrame: - """ - Casts all historical columns in a dataframe to `object` type. - - :param df: Input dataframe - :param historical_columns: Historical columns to type cast - - :return: Dataframe with `object`-typed historical columns - """ - for hist_col in historical_columns: - df.loc[:, hist_col] = df[hist_col].astype(object) - return df - - -def _ts_order_col_to_cell_lists(df: pd.DataFrame, order_cols: list) -> pd.DataFrame: - """ - Casts all data in `order_by` columns into cells. - - :param df: Input dataframe - :param order_cols: `order_by` columns - - :return: Dataframe with all `order_cols` modified so that their values are cells, e.g. 
`1` -> `[1]` - """ - for order_col in order_cols: - for ii in range(len(df)): - label = df.index.values[ii] - df.at[label, order_col] = [df.at[label, order_col]] - return df - - -def _ts_add_previous_rows(df: pd.DataFrame, order_cols: list, window: int) -> pd.DataFrame: - """ - Adds previous rows (as determined by `TimeseriesSettings.window`) into the cells of all `order_by` columns. - - :param df: Input dataframe. - :param order_cols: `order_by` columns. - :param window: value of `TimeseriesSettings.window` parameter. - - :return: Dataframe with all `order_cols` modified so that their values are now arrays of historical context. - """ # noqa - for order_col in order_cols: - for i in range(len(df)): - previous_indexes = [*range(max(0, i - window), i)] - - for prev_i in reversed(previous_indexes): - df.iloc[i][order_col].append( - df.iloc[prev_i][order_col][-1] - ) - - # Zero pad - # @TODO: Remove since RNN encoder can do without (???) - df.iloc[i][order_col].extend( - [0] * (1 + window - len(df.iloc[i][order_col])) - ) - df.iloc[i][order_col].reverse() - return df - - -def _ts_add_previous_target(df: pd.DataFrame, target: str, window: int) -> pd.DataFrame: - """ - Adds previous rows (as determined by `TimeseriesSettings.window`) into the cells of the target column. - - :param df: Input dataframe. - :param target: target column name. - :param window: value of `TimeseriesSettings.window` parameter. - - :return: Dataframe with new `__mdb_ts_previous_{target}` column that contains historical target context. - """ # noqa - if target not in df: - return df - previous_target_values = list(df[target]) - del previous_target_values[-1] - previous_target_values = [None] + previous_target_values - - previous_target_values_arr = [] - for i in range(len(previous_target_values)): - prev_vals = previous_target_values[max(i - window, 0):i + 1] - arr = [None] * (window - len(prev_vals) + 1) - arr.extend(prev_vals) - previous_target_values_arr.append(arr) - - df[f'__mdb_ts_previous_{target}'] = previous_target_values_arr - return df - - -def _ts_add_future_target(df, target, nr_predictions, data_dtype, mode): - """ - Adds as many columns to the input dataframe as the forecasting horizon asks for (as determined by `TimeseriesSettings.nr_predictions`). - - :param df: Input dataframe. - :param target: target column name. - :param nr_predictions: value of `TimeseriesSettings.nr_predictions` parameter. - :param data_dtype: dictionary with types of all input columns - :param mode: either "train" or "predict". `Train` will drop rows with incomplet target info. `Predict` has no effect, for now. - - :return: Dataframe with new `{target}_timestep_{i}'` columns that contains target labels at timestep `i` of a total `TimeseriesSettings.nr_predictions`. - """ # noqa - if target not in df: - return df - if data_dtype in (dtype.integer, dtype.float, dtype.array, dtype.tsarray): - df[target] = df[target].astype(float) - - for timestep_index in range(1, nr_predictions): - next_target_value_arr = list(df[target]) - for del_index in range(0, min(timestep_index, len(next_target_value_arr))): - del next_target_value_arr[0] - next_target_value_arr.append(None) - col_name = f'{target}_timestep_{timestep_index}' - df[col_name] = next_target_value_arr - df[col_name] = df[col_name].fillna(value=np.nan) - - # drop rows with incomplete target info. 
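# A minimal sketch of the horizon columns described in the docstring above, with a
# toy target: `{target}_timestep_{i}` is conceptually the target column shifted i
# steps into the future, so trailing rows end up with NaN and, in train mode, are
# flagged as not predictable by the block right below.
import pandas as pd

toy = pd.DataFrame({'sales': [10.0, 20.0, 30.0, 40.0]})
nr_predictions = 3
for i in range(1, nr_predictions):
    toy[f'sales_timestep_{i}'] = toy['sales'].shift(-i)
# sales_timestep_1 -> [20.0, 30.0, 40.0, NaN]
# sales_timestep_2 -> [30.0, 40.0, NaN, NaN]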
- if mode == 'train': - for col in [f'{target}_timestep_{i}' for i in range(1, nr_predictions)]: - if '__mdb_make_predictions' not in df.columns: - df['__mdb_make_predictions'] = True - df.loc[df[col].isna(), ['__mdb_make_predictions']] = False - - return df diff --git a/lightwood/encoder/array/__init__.py b/lightwood/encoder/array/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/encoder/array/array.py b/lightwood/encoder/array/array.py deleted file mode 100644 index fbde15b1a..000000000 --- a/lightwood/encoder/array/array.py +++ /dev/null @@ -1,75 +0,0 @@ -from typing import Union -import torch -import pandas as pd -import numpy as np -from lightwood.encoder.base import BaseEncoder -from lightwood.api import dtype -from lightwood.encoder.time_series.helpers.common import MinMaxNormalizer, CatNormalizer - - -class ArrayEncoder(BaseEncoder): - """ - Fits a normalizer for array data. To encode, `ArrayEncoder` returns a normalized window of previous data. - It can be used for generic arrays, as well as for handling historical target values in time series tasks. - - :param stop_after: time budget in seconds. - :param window: expected length of array data. - """ - - is_trainable_encoder: bool = True - - def __init__(self, stop_after: int, window: int = None, is_target: bool = False, original_type: dtype = None): - super().__init__(is_target) - self.stop_after = stop_after - self.original_type = original_type - self._normalizer = None - if window is not None: - self.output_size = window + 1 - else: - self.output_size = None - - def prepare(self, train_priming_data, dev_priming_data): - priming_data = pd.concat([train_priming_data, dev_priming_data]) - priming_data = priming_data.values - - if self.output_size is None: - self.output_size = np.max([len(x) for x in priming_data if x is not None]) - for i in range(len(priming_data)): - if priming_data[i] is None: - priming_data[i] = [0] * self.output_size - - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - if self.original_type in (dtype.categorical, dtype.binary): - self._normalizer = CatNormalizer(encoder_class='ordinal') - else: - self._normalizer = MinMaxNormalizer() - - if isinstance(priming_data, pd.Series): - priming_data = priming_data.values - - self._normalizer.prepare(priming_data) - self.output_size *= self._normalizer.output_size - self.is_prepared = True - - def encode(self, column_data: Union[list, np.ndarray, torch.Tensor]) -> torch.Tensor: - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - if isinstance(column_data, pd.Series): - column_data = column_data.values - - for i in range(len(column_data)): - if column_data[i] is None: - column_data[i] = [0] * self.output_size - - data = torch.cat([self._normalizer.encode(column_data)], dim=-1) - data[torch.isnan(data)] = 0.0 - data[torch.isinf(data)] = 0.0 - - return data - - def decode(self, data) -> torch.tensor: - decoded = data.tolist() - return decoded diff --git a/lightwood/encoder/audio/__init__.py b/lightwood/encoder/audio/__init__.py deleted file mode 100644 index 5793fecc8..000000000 --- a/lightwood/encoder/audio/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# This encoder is optional since it's underlying dependency (librosa) needs system dependencies -try: - from lightwood.encoder.audio.mfcc import MFCCEncoder -except Exception: - MFCCEncoder = None - -__all__ = ['MFCCEncoder'] diff --git a/lightwood/encoder/base.py 
b/lightwood/encoder/base.py deleted file mode 100644 index 4a2b80394..000000000 --- a/lightwood/encoder/base.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import List -import torch - - -class BaseEncoder: - """ - Base class for all encoders. - - An encoder should return encoded representations of any columnar data. - The procedure for this is defined inside the `encode()` method. - - If this encoder is expected to handle an output column, then it also needs to implement the respective `decode()` method that handles the inverse transformation from encoded representations to the final prediction in the original column space. - - For encoders that learn representations (as opposed to rule-based), the `prepare()` method will handle all learning logic. - - The `to()` method is used to move PyTorch-based encoders to and from a GPU. - - :param is_target: Whether the data to encode is the target, as per the problem definition. - :param is_timeseries_encoder: Whether encoder represents sequential/time-series data. Lightwood must provide specific treatment for this kind of encoder - :param is_trainable_encoder: Whether the encoder must return learned representations. Lightwood checks whether this flag is present in order to pass data to the feature representation via the ``prepare`` statement. - - Class Attributes: - - is_prepared: Internal flag to signal that the `prepare()` method has been successfully executed. - - is_nn_encoder: Whether the encoder is neural network-based. - - dependencies: list of additional columns that the encoder might need to encode. - - output_size: length of each encoding tensor for a single data point. - - """ # noqa - is_target: bool - is_prepared: bool - - is_timeseries_encoder: bool = False - is_trainable_encoder: bool = False - - def __init__(self, is_target=False) -> None: - self.is_target = is_target - self.is_prepared = False - self.dependencies = [] - self.output_size = None - - # Not all encoders need to be prepared - def prepare(self, priming_data) -> None: - self.is_prepared = True - - def encode(self, column_data) -> torch.Tensor: - raise NotImplementedError - - def decode(self, encoded_data) -> List[object]: - raise NotImplementedError - - # Should work for all torch-based encoders, but custom behavior may have to be implemented for weird models - def to(self, device, available_devices): - # Find all nn.Module type objects and convert them - # @TODO: Make this work recursively - for v in vars(self): - attr = getattr(self, v) - if isinstance(attr, torch.nn.Module): - attr.to(device) - return self diff --git a/lightwood/encoder/categorical/__init__.py b/lightwood/encoder/categorical/__init__.py deleted file mode 100644 index 82e613ddd..000000000 --- a/lightwood/encoder/categorical/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from lightwood.encoder.categorical.onehot import OneHotEncoder -from lightwood.encoder.categorical.multihot import MultiHotEncoder -from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder - -__all__ = ['OneHotEncoder', 'MultiHotEncoder', 'CategoricalAutoEncoder'] diff --git a/lightwood/encoder/categorical/autoencoder.py b/lightwood/encoder/categorical/autoencoder.py deleted file mode 100644 index 5109c4a78..000000000 --- a/lightwood/encoder/categorical/autoencoder.py +++ /dev/null @@ -1,107 +0,0 @@ -import random -import numpy as np -import torch -from torch.utils.data import DataLoader -from lightwood.mixer.helpers.ranger import Ranger -from lightwood.encoder.categorical.onehot import OneHotEncoder -from 
lightwood.encoder.categorical.gym import Gym -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log -from lightwood.mixer.helpers.default_net import DefaultNet -import pandas as pd - - -class CategoricalAutoEncoder(BaseEncoder): - is_trainable_encoder: bool = True - - def __init__(self, stop_after: int = 3600, is_target: bool = False, max_encoded_length: int = 100): - super().__init__(is_target) - self.is_prepared = False - self.name = 'Categorical Autoencoder' - self.net = None - self.encoder = None - self.decoder = None - self.onehot_encoder = OneHotEncoder(is_target=self.is_target) - self.desired_error = 0.01 - self.stop_after = stop_after - # @TODO stop using instead of ONEHOT !!!@! - self.output_size = None - self.max_encoded_length = max_encoded_length - - def _encoder_targets(self, data): - oh_encoded_categories = self.onehot_encoder.encode(data) - target = oh_encoded_categories.cpu().numpy() - target_indexes = np.where(target > 0)[1] - targets_c = torch.LongTensor(target_indexes) - labels = targets_c.to(self.net.device) - return labels - - def prepare(self, train_priming_data, dev_priming_data): - priming_data = pd.concat([train_priming_data, dev_priming_data]) - random.seed(len(priming_data)) - - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - self.onehot_encoder.prepare(priming_data) - - input_len = self.onehot_encoder._lang.n_words - - if self.is_target: - log.warning('You are trying to use an autoencoder for the target value! \ - This is very likely a bad idea') - log.info('Preparing a categorical autoencoder, this might take a while') - - embeddings_layer_len = self.max_encoded_length - - self.net = DefaultNet(shape=[input_len, embeddings_layer_len, input_len]) - - criterion = torch.nn.CrossEntropyLoss() - optimizer = Ranger(self.net.parameters()) - - gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion, - device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode, - output_encoder=self._encoder_targets) - - batch_size = min(200, int(len(priming_data) / 50)) - - priming_data_str = [str(x) for x in priming_data] - train_data_loader = DataLoader( - list(zip(priming_data_str, priming_data_str)), - batch_size=batch_size, shuffle=True) - - test_data_loader = None - - best_model, _, _ = gym.fit(train_data_loader, - test_data_loader, - desired_error=self.desired_error, - max_time=self.stop_after, - eval_every_x_epochs=1, - max_unimproving_models=5) - - self.net = best_model.to(self.net.device) - - modules = [module for module in self.net.modules() if type( - module) != torch.nn.Sequential and type(module) != DefaultNet] - self.encoder = torch.nn.Sequential(*modules[0:2]).eval() - self.decoder = torch.nn.Sequential(*modules[2:3]).eval() - log.info('Categorical autoencoder ready') - - self.output_size = self.onehot_encoder._lang.n_words - self.output_size = self.max_encoded_length - self.is_prepared = True - - def encode(self, column_data): - oh_encoded_tensor = self.onehot_encoder.encode(column_data) - - with torch.no_grad(): - oh_encoded_tensor = oh_encoded_tensor.to(self.net.device) - embeddings = self.encoder(oh_encoded_tensor) - return embeddings.to('cpu') - - def decode(self, encoded_data): - with torch.no_grad(): - encoded_data = encoded_data.to(self.net.device) - oh_encoded_tensor = self.decoder(encoded_data) - oh_encoded_tensor = oh_encoded_tensor.to('cpu') - return self.onehot_encoder.decode(oh_encoded_tensor) diff --git 
a/lightwood/encoder/categorical/binary.py b/lightwood/encoder/categorical/binary.py deleted file mode 100644 index 1c708179f..000000000 --- a/lightwood/encoder/categorical/binary.py +++ /dev/null @@ -1,71 +0,0 @@ -import torch -import numpy as np -from scipy.special import softmax -from lightwood.encoder.base import BaseEncoder - - -# Exists mainly for datasets with loads of binary flags where OHE can be too slow to fit -class BinaryEncoder(BaseEncoder): - - def __init__(self, is_target=False, target_class_distribution=None): - super().__init__(is_target) - self.map = {} - self.rev_map = {} - self.output_size = 2 - if self.is_target: - self.target_class_distribution = target_class_distribution - self.index_weights = None - - def prepare(self, priming_data): - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - for x in priming_data: - x = str(x) - if x not in self.map: - self.map[x] = len(self.map) - self.rev_map[len(self.rev_map)] = x - - if len(self.map) == 2: - break - - if self.is_target: - self.index_weights = [None, None] - for word in self.map: - if self.target_class_distribution is not None: - self.index_weights[self.map[word]] = 1 / self.target_class_distribution[word] - else: - self.index_weights[self.map[word]] = 1 - - self.index_weights = torch.Tensor(self.index_weights) - - self.is_prepared = True - - def encode(self, column_data): - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - ret = [] - - for word in column_data: - index = self.map.get(word, None) - ret.append([0, 0]) - if index is not None: - ret[-1][index] = 1 - - return torch.Tensor(ret) - - def decode(self, encoded_data, return_raw=False): - encoded_data_list = encoded_data.tolist() - ret = [] - probs = [] - - for vector in encoded_data_list: - ret.append(self.rev_map[np.argmax(vector)]) - - if return_raw: - probs.append(softmax(vector).tolist()) - - if return_raw: - return ret, probs, self.rev_map - else: - return ret diff --git a/lightwood/encoder/categorical/gym.py b/lightwood/encoder/categorical/gym.py deleted file mode 100644 index 8e90eb5b4..000000000 --- a/lightwood/encoder/categorical/gym.py +++ /dev/null @@ -1,132 +0,0 @@ -import copy -import time -import torch - -import numpy as np -from lightwood.helpers.torch import LightwoodAutocast - - -class Gym: - - def __init__(self, model, optimizer, scheduler, loss_criterion, device, - name=None, input_encoder=None, output_encoder=None): - """ - Create an environment for training a pytorch machine learning model - """ - self.device = device - self.model = model - self.optimizer = optimizer - self.scheduler = scheduler - self.loss_criterion = loss_criterion - self.name = name - self.input_encoder = input_encoder - self.output_encoder = output_encoder - - self.best_model = None - - def fit(self, train_data_loader, test_data_loader, desired_error, max_time, callback=None, - eval_every_x_epochs=1, max_unimproving_models=10, custom_train_func=None, custom_test_func=None): - started = time.time() - epoch = 0 - lowest_test_error = None - last_test_error = None - test_error_delta_buff = [] - - keep_training = True - - while keep_training: - epoch += 1 - running_loss = 0.0 - error = 0 - self.model = self.model.train() - for i, data in enumerate(train_data_loader, 0): - if custom_train_func is None: - input, real = data - - with LightwoodAutocast(): - if self.input_encoder is not None: - input = self.input_encoder(input) - if self.output_encoder is not None: - real 
= self.output_encoder(real) - - input = input.to(self.device) - real = real.to(self.device) - - predicted = self.model(input) - loss = self.loss_criterion(predicted, real) - loss.backward() - self.optimizer.step() - - if self.scheduler is not None: - self.scheduler.step() - - self.optimizer.zero_grad() - else: - loss = custom_train_func(self.model, data, self) - - running_loss += loss.item() - error = running_loss / (i + 1) - - if epoch % eval_every_x_epochs == 0: - if test_data_loader is not None: - test_running_loss = 0.0 - test_error = 0 - self.model = self.model.eval() - real_buff = [] - predicted_buff = [] - - for i, data in enumerate(test_data_loader, 0): - if custom_test_func is None: - input, real = data - - if self.input_encoder is not None: - input = self.input_encoder(input) - if self.output_encoder is not None: - real = self.output_encoder(real) - - input = input.to(self.device) - real = real.to(self.device) - - with torch.no_grad(): - predicted = self.model(input) - - real_buff.append(real.tolist()) - predicted_buff.append(predicted.tolist()) - - loss = self.loss_criterion(predicted, real) - else: - with torch.no_grad(): - loss = custom_test_func(self.model, data, self) - - test_running_loss += loss.item() - test_error = test_running_loss / (i + 1) - else: - test_error = error - real_buff = None - predicted_buff = None - - if lowest_test_error is None or test_error < lowest_test_error: - lowest_test_error = test_error - self.best_model = copy.deepcopy(self.model).to('cpu') - - if last_test_error is None: - test_error_delta_buff.append(0) - else: - test_error_delta_buff.append(last_test_error - test_error) - - last_test_error = test_error - - if (time.time() - started) > max_time: - keep_training = False - - if lowest_test_error < desired_error: - keep_training = False - - if len(test_error_delta_buff) >= max_unimproving_models: - delta_mean = np.mean(test_error_delta_buff[-max_unimproving_models:]) - if delta_mean <= 0: - keep_training = False - if callback is not None: - callback(test_error, real_buff, predicted_buff) - - return self.best_model, lowest_test_error, int(time.time() - started) diff --git a/lightwood/encoder/categorical/multihot.py b/lightwood/encoder/categorical/multihot.py deleted file mode 100644 index d04239947..000000000 --- a/lightwood/encoder/categorical/multihot.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch -import numpy as np -from lightwood.encoder import BaseEncoder -from sklearn.preprocessing import MultiLabelBinarizer - - -class MultiHotEncoder(BaseEncoder): - def __init__(self, is_target: bool = False): - super().__init__(is_target) - self._binarizer = MultiLabelBinarizer() - self._seen = set() - self.output_size = None - - @staticmethod - def _clean_col_data(column_data): - print(column_data) - column_data = [(arr if arr is not None else []) for arr in column_data] - column_data = [[str(x) for x in arr] for arr in column_data] - return column_data - - def prepare(self, priming_data, max_dimensions=100): - priming_data = self._clean_col_data(priming_data) - self._binarizer.fit(priming_data + [('None')]) - for arr in priming_data: - for x in arr: - self._seen.add(x) - self.is_prepared = True - self.output_size = len(self.encode(priming_data[0:1])[0]) - - def encode(self, column_data): - column_data = self._clean_col_data(column_data) - data_array = self._binarizer.transform(column_data) - return torch.Tensor(data_array) - - def decode(self, vectors): - # It these are logits output by the neural network, we need to treshold them to binary vectors - 
vectors = np.where(vectors > 0, 1, 0) - words_tuples = self._binarizer.inverse_transform(vectors) - return [list(w) for w in words_tuples] diff --git a/lightwood/encoder/categorical/onehot.py b/lightwood/encoder/categorical/onehot.py deleted file mode 100644 index 7e5bfadcc..000000000 --- a/lightwood/encoder/categorical/onehot.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch -import numpy as np -from scipy.special import softmax -from lightwood.encoder.text.helpers.rnn_helpers import Lang -from lightwood.helpers.log import log -from lightwood.encoder.base import BaseEncoder - -UNCOMMON_WORD = '__mdb_unknown_cat' -UNCOMMON_TOKEN = 0 - - -class OneHotEncoder(BaseEncoder): - - def __init__(self, is_target=False, target_class_distribution=None, handle_unknown='unknown_token'): - super().__init__(is_target) - self._lang = None - self.rev_map = {} - - if handle_unknown not in {"unknown_token", "return_zeros"}: - raise ValueError(f"handle_unknown should be either 'unknown_token' or 'return_zeros', got {handle_unknown}") - else: - self.handle_unknown = handle_unknown - - if self.is_target: - self.target_class_distribution = target_class_distribution - self.index_weights = None - - def prepare(self, priming_data, max_dimensions=20000): - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - self._lang = Lang('default') - if self.handle_unknown == "return_zeros": - priming_data = [x for x in priming_data if x is not None] - self._lang.index2word = {} - self._lang.word2index = {} - self._lang.n_words = 0 - else: # self.handle_unknown == "unknown_token" - priming_data = [x if x is not None else UNCOMMON_WORD for x in priming_data] - self._lang.index2word = {UNCOMMON_TOKEN: UNCOMMON_WORD} - self._lang.word2index = {UNCOMMON_WORD: UNCOMMON_TOKEN} - self._lang.word2count[UNCOMMON_WORD] = 0 - self._lang.n_words = 1 - - for category in priming_data: - if category is not None: - self._lang.addWord(str(category)) - - while self._lang.n_words > max_dimensions: - if self.handle_unknown == "return_zeros": - necessary_words = [] - else: # self.handle_unknown == "unknown_token" - necessary_words = [UNCOMMON_WORD] - least_occuring_words = self._lang.getLeastOccurring(n=len(necessary_words) + 1) - - word_to_remove = None - for word in least_occuring_words: - if word not in necessary_words: - word_to_remove = word - break - - self._lang.removeWord(word_to_remove) - - if self.is_target: - self.index_weights = [None] * self._lang.n_words - if self.target_class_distribution is not None: - self.index_weights[0] = np.mean(list(self.target_class_distribution.values())) - else: - self.index_weights[0] = 1 - for word in set(priming_data): - if self.target_class_distribution is not None: - self.index_weights[self._lang.word2index[str(word)]] = 1 / self.target_class_distribution[word] - else: - self.index_weights[self._lang.word2index[str(word)]] = 1 - self.index_weights = torch.Tensor(self.index_weights) - - self.output_size = self._lang.n_words - self.rev_map = self._lang.index2word - self.is_prepared = True - - def encode(self, column_data): - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - ret = [] - v_len = self._lang.n_words - - for word in column_data: - encoded_word = [0] * v_len - if word is not None: - word = str(word) - if self.handle_unknown == "return_zeros": - if word in self._lang.word2index: - index = self._lang.word2index[word] - encoded_word[index] = 1 - else: - # Encoding an unknown value will 
result in a vector of zeros - log.warning('Trying to encode a value never seen before, returning vector of zeros') - else: # self.handle_unknown == "unknown_token" - index = self._lang.word2index[word] if word in self._lang.word2index else UNCOMMON_TOKEN - encoded_word[index] = 1 - - ret.append(encoded_word) - - return torch.Tensor(ret) - - def decode(self, encoded_data, return_raw=False): - encoded_data_list = encoded_data.tolist() - ret = [] - probs = [] - - for vector in encoded_data_list: - # Logits and onehots are not the same in definition - # But this explicitly operates on logits; it will take care of - # the one hot (so you can pass something in the softmax logit space) - # But will not affect something that is already OHE. - - all_zeros = not np.any(vector) - if self.handle_unknown == "return_zeros" and all_zeros: - ret.append(UNCOMMON_WORD) - else: # self.handle_unknown == "unknown_token" - ohe_index = np.argmax(vector) - ret.append(self._lang.index2word[ohe_index]) - - if return_raw: - probs.append(softmax(vector).tolist()) - - if return_raw: - return ret, probs, self.rev_map - else: - return ret diff --git a/lightwood/encoder/datetime/__init__.py b/lightwood/encoder/datetime/__init__.py deleted file mode 100644 index 177c8b00f..000000000 --- a/lightwood/encoder/datetime/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from lightwood.encoder.datetime.datetime import DatetimeEncoder -from lightwood.encoder.datetime.datetime_sin_normalizer import DatetimeNormalizerEncoder - -__all__ = ['DatetimeEncoder', 'DatetimeNormalizerEncoder'] diff --git a/lightwood/encoder/datetime/datetime.py b/lightwood/encoder/datetime/datetime.py deleted file mode 100644 index 001fc03a8..000000000 --- a/lightwood/encoder/datetime/datetime.py +++ /dev/null @@ -1,84 +0,0 @@ -import datetime -import calendar -from typing import Optional -import torch -from lightwood.encoder.base import BaseEncoder - - -class DatetimeEncoder(BaseEncoder): - def __init__(self, is_target: bool = False): - super().__init__(is_target) - self.fields = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second'] - self.constants = {'year': 3000.0, 'month': 12.0, 'weekday': 7.0, - 'hour': 24.0, 'minute': 60.0, 'second': 60.0} - self.output_size = 7 - - def prepare(self, priming_data): - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - self.is_prepared = True - - def encode(self, data): - """ - :param data: # @TODO: receive a consistent data type here; currently either list of lists or pd.Series w/lists - :return: encoded data - """ - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - ret = [self.encode_one(unix_timestamp) for unix_timestamp in data] - - return torch.Tensor(ret) - - def encode_one(self, unix_timestamp: Optional[float]): - """ - Encodes a list of unix_timestamps, or a list of tensors with unix_timestamps - :param data: list of unix_timestamps (unix_timestamp resolution is seconds) - :return: a list of vectors - """ - if unix_timestamp is None: - vector = [0] * len(self.fields) - else: - c = self.constants - date = datetime.datetime.fromtimestamp(unix_timestamp) - day_constant = calendar.monthrange(date.year, date.month)[1] - vector = [date.year / c['year'], date.month / c['month'], date.day / day_constant, - date.weekday() / c['weekday'], date.hour / c['hour'], - date.minute / c['minute'], date.second / c['second']] - return vector - - def decode(self, encoded_data, return_as_datetime=False): - ret = [] - 
if len(encoded_data.shape) > 2 and encoded_data.shape[0] == 1: - encoded_data = encoded_data.squeeze(0) - - for vector in encoded_data.tolist(): - ret.append(self.decode_one(vector, return_as_datetime=return_as_datetime)) - - return ret - - def decode_one(self, vector, return_as_datetime=False): - if sum(vector) == 0: - decoded = None - - else: - c = self.constants - - year = max(0, round(vector[0] * c['year'])) - month = max(1, min(12, round(vector[1] * c['month']))) - day_constant = calendar.monthrange(year, month)[-1] - day = max(1, min(round(vector[2] * day_constant), day_constant)) - hour = max(0, min(23, round(vector[4] * c['hour']))) - minute = max(0, min(59, round(vector[5] * c['minute']))) - second = max(0, min(59, round(vector[6] * c['second']))) - - dt = datetime.datetime(year=year, month=month, day=day, hour=hour, - minute=minute, second=second) - - if return_as_datetime is True: - decoded = dt - else: - decoded = round(dt.timestamp()) - - return decoded diff --git a/lightwood/encoder/datetime/datetime_sin_normalizer.py b/lightwood/encoder/datetime/datetime_sin_normalizer.py deleted file mode 100644 index 4b9c3473a..000000000 --- a/lightwood/encoder/datetime/datetime_sin_normalizer.py +++ /dev/null @@ -1,109 +0,0 @@ -import datetime -import calendar -import numpy as np -import pandas as pd # @TODO: remove? -import torch -from lightwood.encoder.base import BaseEncoder -from collections.abc import Iterable - - -class DatetimeNormalizerEncoder(BaseEncoder): - def __init__(self, is_target: bool = False, sinusoidal: bool = False): - super().__init__(is_target) - self.sinusoidal = sinusoidal - self.fields = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second'] - self.constants = {'year': 3000.0, 'month': 12.0, 'weekday': 7.0, - 'hour': 24.0, 'minute': 60.0, 'second': 60.0} - if self.sinusoidal: - self.output_size = 2 - else: - self.output_size = 7 - - def prepare(self, priming_data): - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - self.is_prepared = True - - def encode(self, data): - """ - :param data: # @TODO: receive a consistent data type here; currently either list of lists or pd.Series w/lists - :return: encoded data - """ - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - if isinstance(data, pd.Series): - data = data.values - if not isinstance(data[0], Iterable): - data = [data] - - ret = [self.encode_one(row) for row in data] - - return torch.Tensor(ret) - - def encode_one(self, data): - """ - Encodes a list of unix_timestamps, or a list of tensors with unix_timestamps - :param data: list of unix_timestamps (unix_timestamp resolution is seconds) - :return: a list of vectors - """ - ret = [] - for unix_timestamp in data: - if unix_timestamp is None: - if self.sinusoidal: - vector = [0, 1] * len(self.fields) - else: - vector = [0] * len(self.fields) - else: - c = self.constants - if isinstance(unix_timestamp, torch.Tensor): - unix_timestamp = unix_timestamp.item() - date = datetime.datetime.fromtimestamp(unix_timestamp) - day_constant = calendar.monthrange(date.year, date.month)[1] - vector = [date.year / c['year'], date.month / c['month'], date.day / day_constant, - date.weekday() / c['weekday'], date.hour / c['hour'], - date.minute / c['minute'], date.second / c['second']] - if self.sinusoidal: - vector = np.array([(np.sin(n), np.cos(n)) for n in vector]).flatten() - - ret.append(vector) - - return ret - - def decode(self, encoded_data, 
return_as_datetime=False): - ret = [] - if len(encoded_data.shape) > 2 and encoded_data.shape[0] == 1: - encoded_data = encoded_data.squeeze(0) - - for vector in encoded_data.tolist(): - ret.append(self.decode_one(vector, return_as_datetime=return_as_datetime)) - - return ret - - def decode_one(self, vector, return_as_datetime=False): - if sum(vector) == 0: - decoded = None - - else: - if self.sinusoidal: - vector = list(map(lambda x: np.arcsin(x), vector))[::2] - c = self.constants - - year = max(0, round(vector[0] * c['year'])) - month = max(1, min(12, round(vector[1] * c['month']))) - day_constant = calendar.monthrange(year, month)[-1] - day = max(1, min(round(vector[2] * day_constant), day_constant)) - hour = max(0, min(23, round(vector[4] * c['hour']))) - minute = max(0, min(59, round(vector[5] * c['minute']))) - second = max(0, min(59, round(vector[6] * c['second']))) - - dt = datetime.datetime(year=year, month=month, day=day, hour=hour, - minute=minute, second=second) - - if return_as_datetime is True: - decoded = dt - else: - decoded = round(dt.timestamp()) - - return decoded diff --git a/lightwood/encoder/identity/__init__.py b/lightwood/encoder/identity/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/encoder/identity/identity.py b/lightwood/encoder/identity/identity.py deleted file mode 100644 index 20d787fd9..000000000 --- a/lightwood/encoder/identity/identity.py +++ /dev/null @@ -1,43 +0,0 @@ -import numpy as np -import pandas as pd -from typing import List -import torch -from lightwood.encoder.base import BaseEncoder - - -class IdentityEncoder(BaseEncoder): - """ - The identity encoder which does not change the data during encoding. - - Due to the restrictions of torch.Tensor, the encoding function only accepts types: - - - (nested) lists of numbers - - np.ndarray of numbers, or - - torch.Tensor - - Nonetypes are automatically converted into nans. This cannot be decoded. - If self.handle_nan is True, then all nans are converted to zeros and not decoded back. - - The decode function takes in a torch.Tensor and converts it to a list of numbers. 
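    Example (a minimal usage sketch, with the default handle_nan=True):

        >>> import torch
        >>> from lightwood.encoder.identity.identity import IdentityEncoder
        >>> enc = IdentityEncoder()
        >>> enc.encode([1.0, None, 3.0])
        tensor([1., 0., 3.])
        >>> enc.decode(torch.Tensor([1.0, 0.0, 3.0]))
        [1.0, 0.0, 3.0]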
- """ # noqa - - def __init__(self, is_target=False, handle_nan=True) -> None: - super().__init__(is_target) - self.handle_nan = handle_nan - - # Not all encoders need to be prepared - def prepare(self, priming_data: pd.Series) -> None: - self._prepared = True - - def encode(self, column_data: object) -> torch.Tensor: - if isinstance(column_data, torch.Tensor): - res = column_data - else: - res = np.array(column_data, dtype=float) # convert None to nan - res = torch.Tensor(res) - if self.handle_nan: - res = torch.nan_to_num(res) - return res - - def decode(self, encoded_data: torch.Tensor) -> List[object]: - return encoded_data.tolist() diff --git a/lightwood/encoder/image/__init__.py b/lightwood/encoder/image/__init__.py deleted file mode 100644 index 9fe5fd448..000000000 --- a/lightwood/encoder/image/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from lightwood.encoder.image.img_2_vec import Img2VecEncoder - - -__all__ = ['Img2VecEncoder'] diff --git a/lightwood/encoder/image/helpers/__init__.py b/lightwood/encoder/image/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/encoder/numeric/__init__.py b/lightwood/encoder/numeric/__init__.py deleted file mode 100644 index 999df6876..000000000 --- a/lightwood/encoder/numeric/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from lightwood.encoder.numeric.numeric import NumericEncoder -from lightwood.encoder.numeric.ts_numeric import TsNumericEncoder - -__all__ = ['NumericEncoder', 'TsNumericEncoder'] - diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py deleted file mode 100644 index 1d8ee311a..000000000 --- a/lightwood/encoder/numeric/numeric.py +++ /dev/null @@ -1,130 +0,0 @@ -import math -import torch -import numpy as np -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log - - -class NumericEncoder(BaseEncoder): - - def __init__(self, data_type=None, is_target: bool = False, positive_domain: bool = False): - super().__init__(is_target) - self._type = data_type - self._abs_mean = None - self.positive_domain = positive_domain - self.decode_log = False - self.output_size = 4 if not self.is_target else 3 - - def prepare(self, priming_data): - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - value_type = 'int' - for number in priming_data: - try: - number = float(number) - except Exception: - continue - - if np.isnan(number): - err = 'Lightwood does not support working with NaN values !' 
- log.warning(err) - continue - - if int(number) != number: - value_type = 'float' - - self._type = value_type if self._type is None else self._type - non_null_priming_data = [float(str(x).replace(',', '.')) for x in priming_data if x is not None] - self._abs_mean = np.mean(np.abs(non_null_priming_data)) - self.is_prepared = True - - def encode(self, data): - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - ret = [] - for real in data: - try: - real = float(real) - except Exception: - try: - real = float(real.replace(',', '.')) - except Exception: - real = None - if self.is_target: - vector = [0] * 3 - if real is not None and self._abs_mean > 0: - vector[0] = 1 if real < 0 and not self.positive_domain else 0 - vector[1] = math.log(abs(real)) if abs(real) > 0 else -20 - vector[2] = real / self._abs_mean - else: - log.debug(f'Can\'t encode target value: {real}') - - else: - vector = [0] * 4 - try: - if real is None: - vector[0] = 0 - else: - vector[0] = 1 - vector[1] = math.log(abs(real)) if abs(real) > 0 else -20 - vector[2] = 1 if real < 0 and not self.positive_domain else 0 - vector[3] = real / self._abs_mean - except Exception as e: - vector = [0] * 4 - log.error(f'Can\'t encode input value: {real}, exception: {e}') - - ret.append(vector) - - return torch.Tensor(ret) - - def decode(self, encoded_values, decode_log=None) -> list: - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - if decode_log is None: - decode_log = self.decode_log - - ret = [] - if isinstance(encoded_values, torch.Tensor): - encoded_values = encoded_values.tolist() - - for vector in encoded_values: - if self.is_target: - if np.isnan( - vector[0]) or vector[0] == float('inf') or np.isnan( - vector[1]) or vector[1] == float('inf') or np.isnan( - vector[2]) or vector[2] == float('inf'): - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - else: - if decode_log: - sign = -1 if vector[0] > 0.5 else 1 - try: - real_value = math.exp(vector[1]) * sign - except OverflowError: - real_value = pow(10, 63) * sign - else: - real_value = vector[2] * self._abs_mean - - if self.positive_domain: - real_value = abs(real_value) - - if self._type == 'int': - real_value = int(real_value) - - else: - if vector[0] < 0.5: - ret.append(None) - continue - - real_value = vector[3] * self._abs_mean - - if self._type == 'int': - real_value = round(real_value) - - if isinstance(real_value, torch.Tensor): - real_value = real_value.item() - ret.append(real_value) - return ret diff --git a/lightwood/encoder/numeric/ts_array_numeric.py b/lightwood/encoder/numeric/ts_array_numeric.py deleted file mode 100644 index 49a457975..000000000 --- a/lightwood/encoder/numeric/ts_array_numeric.py +++ /dev/null @@ -1,64 +0,0 @@ -import torch -import torch.nn.functional as F -from lightwood.encoder import BaseEncoder -from lightwood.encoder.numeric import TsNumericEncoder - - -class TsArrayNumericEncoder(BaseEncoder): - """ - Variant of vanilla numerical encoder, supports dynamic mean re-scaling - """ - - def __init__(self, timesteps: int, is_target: bool = False, positive_domain: bool = False, grouped_by=None): - super(TsArrayNumericEncoder, self).__init__(is_target=is_target) - # time series normalization params - self.normalizers = None - self.group_combinations = None - self.dependencies = grouped_by - self.data_window = timesteps - self.positive_domain = positive_domain - self.sub_encoder = 
TsNumericEncoder(is_target=is_target, positive_domain=positive_domain, grouped_by=grouped_by) - self.output_size = self.data_window * self.sub_encoder.output_size - - def prepare(self, priming_data): - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - self.sub_encoder.prepare(priming_data) - self.is_prepared = True - - def encode(self, data, dependency_data={}): - """ - :param dependency_data: dict with grouped_by column info, to retrieve the correct normalizer for each datum - :return: tensor with shape (batch, NxK) where N: self.data_window and K: sub-encoder # of output features - """ # noqa - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - if not dependency_data: - dependency_data = {'__default': [None] * len(data)} - - ret = [] - for data_point in data: - ret.append(self.sub_encoder.encode([data_point], dependency_data=dependency_data)) - - ret = torch.hstack(ret) - padding_size = self.output_size - ret.shape[-1] - - if padding_size > 0: - ret = F.pad(ret, (0, padding_size)) - - return ret - - def decode(self, encoded_values, dependency_data=None, return_all=False): - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - encoded_values = encoded_values.reshape(encoded_values.shape[0], - self.data_window, - self.sub_encoder.output_size) - - ret = [] - for encoded_timestep in torch.split(encoded_values, 1, dim=1): - ret.extend(self.sub_encoder.decode(encoded_timestep.squeeze(1), dependency_data=dependency_data)) - - return ret diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py deleted file mode 100644 index 01420c699..000000000 --- a/lightwood/encoder/numeric/ts_numeric.py +++ /dev/null @@ -1,125 +0,0 @@ -import math -import torch -import numpy as np -from lightwood.encoder.numeric import NumericEncoder -from lightwood.helpers.log import log - - -class TsNumericEncoder(NumericEncoder): - """ - Variant of vanilla numerical encoder, supports dynamic mean re-scaling - """ - is_timeseries_encoder: bool = True - - def __init__(self, is_target: bool = False, positive_domain: bool = False, grouped_by=None): - super(TsNumericEncoder, self).__init__(is_target=is_target, positive_domain=positive_domain) - # time series normalization params - self.normalizers = None - self.group_combinations = None - self.dependencies = grouped_by - self.output_size = 2 if is_target else 3 - - def encode(self, data, dependency_data={}): - """ - :param dependency_data: dict with grouped_by column info, to retrieve the correct normalizer for each datum - """ # noqa - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - if not dependency_data: - dependency_data = {'__default': [None] * len(data)} - - ret = [] - for real, group in zip(data, list(zip(*dependency_data.values()))): - try: - real = float(real) - except Exception: - try: - real = float(real.replace(',', '.')) - except Exception: - real = None - if self.is_target: - vector = [0] * 2 - if group is not None and self.normalizers is not None: - try: - mean = self.normalizers[frozenset(group)].abs_mean - except KeyError: - # novel group-by, we use default normalizer mean - mean = self.normalizers['__default'].abs_mean - else: - mean = self._abs_mean - - if real is not None: - vector[0] = 1 if real < 0 and not self.positive_domain else 0 - vector[1] = real / mean if mean != 0 else real - 
else: - raise Exception(f'Can\'t encode target value: {real}') - - else: - vector = [0] * 3 - try: - if real is not None: - vector[0] = 1 - vector[1] = 1 if real < 0 and not self.positive_domain else 0 - vector[2] = real / self._abs_mean - except Exception as e: - log.error(f'Can\'t encode input value: {real}, exception: {e}') - - ret.append(vector) - - return torch.Tensor(ret) - - def decode(self, encoded_values, decode_log=None, dependency_data=None): - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - if decode_log is None: - decode_log = self.decode_log - - ret = [] - if not dependency_data: - dependency_data = {'__default': [None] * len(encoded_values)} - if isinstance(encoded_values, torch.Tensor): - encoded_values = encoded_values.tolist() - - for vector, group in zip(encoded_values, list(zip(*dependency_data.values()))): - if self.is_target: - if np.isnan(vector[0]) or vector[0] == float('inf') or np.isnan(vector[1]) or vector[1] == float('inf'): - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - else: - if decode_log: - sign = -1 if vector[0] > 0.5 else 1 - try: - real_value = math.exp(vector[1]) * sign - except OverflowError: - real_value = pow(10, 63) * sign - else: - if group is not None and self.normalizers is not None: - try: - mean = self.normalizers[frozenset(group)].abs_mean - except KeyError: - # decode new group with default normalizer - mean = self.normalizers['__default'].abs_mean - else: - mean = self._abs_mean - - real_value = vector[1] * mean if mean != 0 else vector[1] - - if self.positive_domain: - real_value = abs(real_value) - - if self._type == 'int': - real_value = int(round(real_value, 0)) - - else: - if vector[0] < 0.5: - ret.append(None) - continue - - real_value = vector[2] * self._abs_mean - - if self._type == 'int': - real_value = round(real_value) - - ret.append(real_value) - return ret diff --git a/lightwood/encoder/text/__init__.py b/lightwood/encoder/text/__init__.py deleted file mode 100644 index 3f3220eab..000000000 --- a/lightwood/encoder/text/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from lightwood.encoder.text.pretrained import PretrainedLangEncoder -from lightwood.encoder.text.rnn import RnnEncoder -from lightwood.encoder.text.tfidf import TfidfEncoder -from lightwood.encoder.text.short import ShortTextEncoder -from lightwood.encoder.text.vocab import VocabularyEncoder - - -__all__ = ['PretrainedLangEncoder', 'RnnEncoder', 'TfidfEncoder', 'ShortTextEncoder', 'VocabularyEncoder'] diff --git a/lightwood/encoder/text/helpers/__init__.py b/lightwood/encoder/text/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/encoder/text/helpers/pretrained_helpers.py b/lightwood/encoder/text/helpers/pretrained_helpers.py deleted file mode 100644 index 3f13aa117..000000000 --- a/lightwood/encoder/text/helpers/pretrained_helpers.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -2021.03.05 - -Basic helper functions for PretrainedLangEncoder -""" -import torch -from transformers import AdamW - - -class TextEmbed(torch.utils.data.Dataset): - """ - Dataset class for quick embedding/label retrieval. - Labels should be in the index space. - - If the labels provided are not in torch form, will convert them. 
- """ - - def __init__(self, encodings, labels): - self.encodings = encodings - self.labels = labels - - def __getitem__(self, idx): - item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} - item['labels'] = self.labels[idx] - return item - - def __len__(self): - return len(self.labels) - - -def train_model(model, dataset, device, scheduler=None, log=None, optim=None, n_epochs=4): - """ - Generic training function, given an arbitrary model. - - Given a model, train for n_epochs. - - model - torch.nn model; - dataset - torch.DataLoader; dataset to train - device - torch.device; cuda/cpu - log - lightwood.logger.log; print output - optim - transformers.optimization.AdamW; optimizer - n_epochs - number of epochs to train - - """ - losses = [] - model.train() - if optim is None: - optim = AdamW(model.parameters(), lr=5e-5) - - for epoch in range(n_epochs): - total_loss = 0 - for batch in dataset: - optim.zero_grad() - - inpids = batch['input_ids'].to(device) - attn = batch['attention_mask'].to(device) - labels = batch['labels'].to(device) - outputs = model(inpids, attention_mask=attn, labels=labels) - loss = outputs[0] - - total_loss += loss.item() - - loss.backward() - optim.step() - - if scheduler is not None: - scheduler.step() - - print("Epoch", epoch + 1, "Loss", total_loss) - return model, losses diff --git a/lightwood/encoder/text/helpers/rnn_helpers.py b/lightwood/encoder/text/helpers/rnn_helpers.py deleted file mode 100644 index 3eac106ca..000000000 --- a/lightwood/encoder/text/helpers/rnn_helpers.py +++ /dev/null @@ -1,780 +0,0 @@ -# flake8: noqa -# -*- coding: utf-8 -*- -""" -Translation with a Sequence to Sequence Network and Attention -************************************************************* -**Author**: `Sean Robertson `_ - -In this project we will be teaching a neural network to translate from -French to English. - -:: - - [KEY: > input, = target, < output] - - > il est en train de peindre un tableau . - = he is painting a picture . - < he is painting a picture . - - > pourquoi ne pas essayer ce vin delicieux ? - = why not try that delicious wine ? - < why not try that delicious wine ? - - > elle n est pas poete mais romanciere . - = she is not a poet but a novelist . - < she not not a poet but a novelist . - - > vous etes trop maigre . - = you re too skinny . - < you re all alone . - -... to varying degrees of success. - -This is made possible by the simple but powerful idea of the `sequence -to sequence network `__, in which two -recurrent neural networks work together to transform one sequence to -another. An encoder network condenses an input sequence into a vector, -and a decoder network unfolds that vector into a new sequence. - -.. figure:: /_static/img/seq-seq-images/seq2seq.png - :alt: - -To improve upon this model we'll use an `attention -mechanism `__, which lets the decoder -learn to focus over a specific range of the input sequence. 
- -**Recommended Reading:** - -I assume you have at least installed PyTorch, know Python, and -understand Tensors: - -- https://pytorch.org/ For installation instructions -- :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general -- :doc:`/beginner/pytorch_with_examples` for a wide and deep overview -- :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user - - -It would also be useful to know about Sequence to Sequence networks and -how they work: - -- `Learning Phrase Representations using RNN Encoder-Decoder for - Statistical Machine Translation `__ -- `Sequence to Sequence Learning with Neural - Networks `__ -- `Neural Machine Translation by Jointly Learning to Align and - Translate `__ -- `A Neural Conversational Model `__ - -You will also find the previous tutorials on -:doc:`/intermediate/char_rnn_classification_tutorial` -and :doc:`/intermediate/char_rnn_generation_tutorial` -helpful as those concepts are very similar to the Encoder and Decoder -models, respectively. - -And for more, read the papers that introduced these topics: - -- `Learning Phrase Representations using RNN Encoder-Decoder for - Statistical Machine Translation `__ -- `Sequence to Sequence Learning with Neural - Networks `__ -- `Neural Machine Translation by Jointly Learning to Align and - Translate `__ -- `A Neural Conversational Model `__ - - -**Requirements** -""" -from __future__ import unicode_literals, print_function, division -import math -import time -from io import open -import unicodedata -import string -import re -import random -import operator - -import torch -import torch.nn as nn -from torch import optim -import torch.nn.functional as F -from lightwood.helpers.torch import LightwoodAutocast - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -###################################################################### -# Loading data files -# ================== -# -# The data for this project is a set of many thousands of English to -# French translation pairs. -# -# `This question on Open Data Stack -# Exchange `__ -# pointed me to the open translation site https://tatoeba.org/ which has -# downloads available at https://tatoeba.org/eng/downloads - and better -# yet, someone did the extra work of splitting language pairs into -# individual text files here: https://www.manythings.org/anki/ -# -# The English to French pairs are too big to include in the repo, so -# download to ``data/eng-fra.txt`` before continuing. The file is a tab -# separated list of translation pairs: -# -# :: -# -# I am cold. J'ai froid. -# -# .. Note:: -# Download the data from -# `here `_ -# and extract it to the current directory. - -###################################################################### -# Similar to the character encoding used in the character-level RNN -# tutorials, we will be representing each word in a language as a one-hot -# vector, or giant vector of zeros except for a single one (at the index -# of the word). Compared to the dozens of characters that might exist in a -# language, there are many many more words, so the encoding vector is much -# larger. We will however cheat a bit and trim the data to only use a few -# thousand words per language. -# -# .. figure:: /_static/img/seq-seq-images/word-encoding.png -# :alt: -# -# - - -###################################################################### -# We'll need a unique index per word to use as the inputs and targets of -# the networks later. 
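# --- Illustrative sketch of the word -> index -> one-hot representation
# described above, with a toy vocabulary (not part of the original tutorial code).
import torch

word2index = {"SOS": 0, "EOS": 1, "UNK": 2, "he": 3, "is": 4, "painting": 5}
sentence = "he is painting"
indexes = [word2index.get(w, word2index["UNK"]) for w in sentence.split(" ")]
one_hot = torch.nn.functional.one_hot(torch.tensor(indexes), num_classes=len(word2index))
print(indexes)        # [3, 4, 5]
print(one_hot.shape)  # torch.Size([3, 6])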
To keep track of all this we will use a helper class -# called ``Lang`` which has word → index (``word2index``) and index → word -# (``index2word``) dictionaries, as well as a count of each word -# ``word2count`` to use to later replace rare words. -# - -SOS_token = 0 -EOS_token = 1 -UNK_TOKEN = 2 - - -class Lang: - def __init__(self, name): - self.name = name - self.word2index = {} - self.word2count = {} - self.index2word = {0: "SOS", 1: "EOS", 2: "UNK"} - self.n_words = 3 # Count SOS and EOS - - def addSentence(self, sentence): - for word in sentence.split(' '): - self.addWord(word) - - def addWord(self, word): - if word not in self.word2index: - self.word2index[word] = self.n_words - self.word2count[word] = 0 - self.index2word[self.n_words] = word - self.n_words += 1 - - self.word2count[word] += 1 - - # @NOTE: Very slow, especially for a large language, can be made faster by making index2word a list - def removeWord(self, word): - word_index = self.word2index[word] - del self.word2index[word] - del self.word2count[word] - del self.index2word[word_index] - - self.n_words -= 1 - - for index in [x for x in self.index2word.keys() if x > word_index]: - word = self.index2word[index] - new_index = index - 1 - - self.word2index[word] = new_index - - del self.index2word[index] - self.index2word[new_index] = word - - def getLeastOccurring(self, n=1): - if n == 1: - return min(self.word2count, key=self.word2count.get) - else: - sorted_word2count = sorted(self.word2count.items(), key=operator.itemgetter(1)) - return [x[0] for x in sorted_word2count[:n]] - - -###################################################################### -# The files are all in Unicode, to simplify we will turn Unicode -# characters to ASCII, make everything lowercase, and trim most -# punctuation. -# - -# Turn a Unicode string to plain ASCII, thanks to -# https://stackoverflow.com/a/518232/2809427 -def unicodeToAscii(s): - return ''.join( - c for c in unicodedata.normalize('NFD', s) - if unicodedata.category(c) != 'Mn' - ) - -# Lowercase, trim, and remove non-letter characters - - -def normalizeString(s): - s = unicodeToAscii(s.lower().strip()) - s = re.sub(r"([.!?])", r" \1", s) - s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) - return s - - -###################################################################### -# To read the data file we will split the file into lines, and then split -# lines into pairs. The files are all English → Other Language, so if we -# want to translate from Other Language → English I added the ``reverse`` -# flag to reverse the pairs. -# - -def readLangs(lang1, lang2, reverse=False): - print("Reading lines...") - - # Read the file and split into lines - lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\ - read().strip().split('\n') - - # Split every line into pairs and normalize - pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines] - - # Reverse pairs, make Lang instances - if reverse: - pairs = [list(reversed(p)) for p in pairs] - input_lang = Lang(lang2) - output_lang = Lang(lang1) - else: - input_lang = Lang(lang1) - output_lang = Lang(lang2) - - return input_lang, output_lang, pairs - - -###################################################################### -# Since there are a *lot* of example sentences and we want to train -# something quickly, we'll trim the data set to only relatively short and -# simple sentences. 
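# --- Quick check of the normalization helpers defined above (assumes the
# `normalizeString` function from this module is in scope; the sample string
# is made up).
print(normalizeString("Elle n'est pas poète, mais romancière !"))
# -> "elle n est pas poete mais romanciere !"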
Here the maximum length is 10 words (that includes -# ending punctuation) and we're filtering to sentences that translate to -# the form "I am" or "He is" etc. (accounting for apostrophes replaced -# earlier). -# - -MAX_LENGTH = 100 - - -###################################################################### -# The full process for preparing the data is: -# -# - Read text file and split into lines, split lines into pairs -# - Normalize text, filter by length and content -# - Make word lists from sentences in pairs -# - -def prepareData(lang1, lang2, reverse=False): - input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse) - print("Read %s sentence pairs" % len(pairs)) - #pairs = filterPairs(pairs) - print("Trimmed to %s sentence pairs" % len(pairs)) - print("Counting words...") - for pair in pairs: - input_lang.addSentence(pair[0]) - output_lang.addSentence(pair[1]) - print("Counted words:") - print(input_lang.name, input_lang.n_words) - print(output_lang.name, output_lang.n_words) - return input_lang, output_lang, pairs - - -###################################################################### -# The Seq2Seq Model -# ================= -# -# A Recurrent Neural Network, or RNN, is a network that operates on a -# sequence and uses its own output as input for subsequent steps. -# -# A `Sequence to Sequence network `__, or -# seq2seq network, or `Encoder Decoder -# network `__, is a model -# consisting of two RNNs called the encoder and decoder. The encoder reads -# an input sequence and outputs a single vector, and the decoder reads -# that vector to produce an output sequence. -# -# .. figure:: /_static/img/seq-seq-images/seq2seq.png -# :alt: -# -# Unlike sequence prediction with a single RNN, where every input -# corresponds to an output, the seq2seq model frees us from sequence -# length and order, which makes it ideal for translation between two -# languages. -# -# Consider the sentence "Je ne suis pas le chat noir" → "I am not the -# black cat". Most of the words in the input sentence have a direct -# translation in the output sentence, but are in slightly different -# orders, e.g. "chat noir" and "black cat". Because of the "ne/pas" -# construction there is also one more word in the input sentence. It would -# be difficult to produce a correct translation directly from the sequence -# of input words. -# -# With a seq2seq model the encoder creates a single vector which, in the -# ideal case, encodes the "meaning" of the input sequence into a single -# vector — a single point in some N dimensional space of sentences. -# - - -###################################################################### -# The Encoder -# ----------- -# -# The encoder of a seq2seq network is a RNN that outputs some value for -# every word from the input sentence. For every input word the encoder -# outputs a vector and a hidden state, and uses the hidden state for the -# next input word. -# -# .. 
figure:: /_static/img/seq-seq-images/encoder-network.png -# :alt: -# -# - -class EncoderRNN(nn.Module): - def __init__(self, input_size, hidden_size): - super(EncoderRNN, self).__init__() - self.hidden_size = hidden_size - - self.embedding = nn.Embedding(input_size, hidden_size) - self.gru = nn.GRU(hidden_size, hidden_size) - - def forward(self, input, hidden): - with LightwoodAutocast(): - embedded = self.embedding(input).view(1, 1, -1) - output = embedded - output, hidden = self.gru(output, hidden) - return output, hidden - - def initHidden(self): - return torch.zeros(1, 1, self.hidden_size, device=device) - -###################################################################### -# The Decoder -# ----------- -# -# The decoder is another RNN that takes the encoder output vector(s) and -# outputs a sequence of words to create the translation. -# - - -###################################################################### -# Simple Decoder -# ^^^^^^^^^^^^^^ -# -# In the simplest seq2seq decoder we use only last output of the encoder. -# This last output is sometimes called the *context vector* as it encodes -# context from the entire sequence. This context vector is used as the -# initial hidden state of the decoder. -# -# At every step of decoding, the decoder is given an input token and -# hidden state. The initial input token is the start-of-string ```` -# token, and the first hidden state is the context vector (the encoder's -# last hidden state). -# -# .. figure:: /_static/img/seq-seq-images/decoder-network.png -# :alt: -# -# - -class DecoderRNN(nn.Module): - def __init__(self, hidden_size, output_size): - super(DecoderRNN, self).__init__() - self.hidden_size = hidden_size - - self.embedding = nn.Embedding(output_size, hidden_size) - self.gru = nn.GRU(hidden_size, hidden_size) - self.out = nn.Linear(hidden_size, output_size) - self.softmax = nn.LogSoftmax(dim=1) - - def forward(self, input, hidden): - with LightwoodAutocast(): - output = self.embedding(input).view(1, 1, -1) - output = F.relu(output) - output, hidden = self.gru(output, hidden) - output = self.softmax(self.out(output[0])) - return output, hidden - - def initHidden(self): - return torch.zeros(1, 1, self.hidden_size, device=device) - -###################################################################### -# I encourage you to train and observe the results of this model, but to -# save space we'll be going straight for the gold and introducing the -# Attention Mechanism. -# - - -###################################################################### -# Attention Decoder -# ^^^^^^^^^^^^^^^^^ -# -# If only the context vector is passed betweeen the encoder and decoder, -# that single vector carries the burden of encoding the entire sentence. -# -# Attention allows the decoder network to "focus" on a different part of -# the encoder's outputs for every step of the decoder's own outputs. First -# we calculate a set of *attention weights*. These will be multiplied by -# the encoder output vectors to create a weighted combination. The result -# (called ``attn_applied`` in the code) should contain information about -# that specific part of the input sequence, and thus help the decoder -# choose the right output words. -# -# .. figure:: https://i.imgur.com/1152PYf.png -# :alt: -# -# Calculating the attention weights is done with another feed-forward -# layer ``attn``, using the decoder's input and hidden state as inputs. 
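# --- Shape-level sketch of the attention step described above (illustrative
# only, sizes are arbitrary): a Linear layer scores the concatenation of the
# decoder's input embedding and hidden state, softmax turns the scores into
# weights, and the weights are applied to the encoder outputs with a batched
# matrix product.
import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_size, max_length = 8, 10
attn = nn.Linear(hidden_size * 2, max_length)

embedded = torch.randn(1, hidden_size)             # current decoder input embedding
hidden = torch.randn(1, hidden_size)               # current decoder hidden state
encoder_outputs = torch.randn(max_length, hidden_size)

attn_weights = F.softmax(attn(torch.cat((embedded, hidden), 1)), dim=1)          # (1, max_length)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))  # (1, 1, hidden)
print(attn_weights.shape, attn_applied.shape)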
-# Because there are sentences of all sizes in the training data, to -# actually create and train this layer we have to choose a maximum -# sentence length (input length, for encoder outputs) that it can apply -# to. Sentences of the maximum length will use all the attention weights, -# while shorter sentences will only use the first few. -# -# .. figure:: /_static/img/seq-seq-images/attention-decoder-network.png -# :alt: -# -# - -class AttnDecoderRNN(nn.Module): - def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH): - super(AttnDecoderRNN, self).__init__() - self.hidden_size = hidden_size - self.output_size = output_size - self.dropout_p = dropout_p - self.max_length = max_length - - self.embedding = nn.Embedding(self.output_size, self.hidden_size) - self.attn = nn.Linear(self.hidden_size * 2, self.max_length) - self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size) - self.dropout = nn.Dropout(self.dropout_p) - self.gru = nn.GRU(self.hidden_size, self.hidden_size) - self.out = nn.Linear(self.hidden_size, self.output_size) - - def forward(self, input, hidden, encoder_outputs): - with LightwoodAutocast(): - embedded = self.embedding(input).view(1, 1, -1) - embedded = self.dropout(embedded) - - attn_weights = F.softmax( - self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1) - attn_applied = torch.bmm(attn_weights.unsqueeze(0), - encoder_outputs.unsqueeze(0)) - - output = torch.cat((embedded[0], attn_applied[0]), 1) - output = self.attn_combine(output).unsqueeze(0) - - output = F.relu(output) - output, hidden = self.gru(output, hidden) - - output = F.log_softmax(self.out(output[0]), dim=1) - return output, hidden, attn_weights - - def initHidden(self): - return torch.zeros(1, 1, self.hidden_size, device=device) - - -###################################################################### -# .. note:: There are other forms of attention that work around the length -# limitation by using a relative position approach. Read about "local -# attention" in `Effective Approaches to Attention-based Neural Machine -# Translation `__. -# -# Training -# ======== -# -# Preparing Training Data -# ----------------------- -# -# To train, for each pair we will need an input tensor (indexes of the -# words in the input sentence) and target tensor (indexes of the words in -# the target sentence). While creating these vectors we will append the -# EOS token to both sequences. -# - -def indexesFromSentence(lang, sentence): - - return [lang.word2index[word] if word in lang.word2index else UNK_TOKEN for word in (str(sentence).split(' ') if sentence is not None else [None])] - - -def tensorFromSentence(lang, sentence): - indexes = indexesFromSentence(lang, sentence) - indexes.append(EOS_token) - return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1) - - -def tensorsFromPair(pair, input_lang, output_lang): - input_tensor = tensorFromSentence(input_lang, pair[0]) - target_tensor = tensorFromSentence(output_lang, pair[1]) - return (input_tensor, target_tensor) - - -###################################################################### -# Training the Model -# ------------------ -# -# To train we run the input sentence through the encoder, and keep track -# of every output and the latest hidden state. Then the decoder is given -# the ```` token as its first input, and the last hidden state of the -# encoder as its first hidden state. 
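# --- Tiny illustration of the tensor preparation described above (toy
# vocabulary, not part of the original code): word indexes plus a trailing EOS
# token, shaped (seq_len, 1) as the training loop expects.
import torch

SOS_token, EOS_token, UNK_TOKEN = 0, 1, 2
word2index = {"you": 3, "re": 4, "too": 5, "skinny": 6}

def toy_tensor_from_sentence(sentence):
    idxs = [word2index.get(w, UNK_TOKEN) for w in sentence.split(" ")]
    idxs.append(EOS_token)
    return torch.tensor(idxs, dtype=torch.long).view(-1, 1)

print(toy_tensor_from_sentence("you re too skinny .").tolist())
# [[3], [4], [5], [6], [2], [1]]   (unknown "." maps to UNK, then EOS)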
-# -# "Teacher forcing" is the concept of using the real target outputs as -# each next input, instead of using the decoder's guess as the next input. -# Using teacher forcing causes it to converge faster but `when the trained -# network is exploited, it may exhibit -# instability `__. -# -# You can observe outputs of teacher-forced networks that read with -# coherent grammar but wander far from the correct translation - -# intuitively it has learned to represent the output grammar and can "pick -# up" the meaning once the teacher tells it the first few words, but it -# has not properly learned how to create the sentence from the translation -# in the first place. -# -# Because of the freedom PyTorch's autograd gives us, we can randomly -# choose to use teacher forcing or not with a simple if statement. Turn -# ``teacher_forcing_ratio`` up to use more of it. -# -teacher_forcing_ratio = 0.5 - - -def train( - input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, - max_length=MAX_LENGTH): - encoder_hidden = encoder.initHidden() - - encoder_optimizer.zero_grad() - decoder_optimizer.zero_grad() - - input_length = input_tensor.size(0) - target_length = target_tensor.size(0) - - encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device) - - loss = 0 - - with LightwoodAutocast(): - for ei in range(min(input_length, len(encoder_outputs))): - encoder_output, encoder_hidden = encoder( - input_tensor[ei], encoder_hidden) - encoder_outputs[ei] = encoder_output[0, 0] - - decoder_input = torch.tensor([[SOS_token]], device=device) - - decoder_hidden = encoder_hidden - - use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False - - if use_teacher_forcing: - # Teacher forcing: Feed the target as the next input - for di in range(target_length): - if isinstance(decoder, AttnDecoderRNN): - decoder_output, decoder_hidden, decoder_attention = decoder( - decoder_input, decoder_hidden, encoder_outputs) - else: - decoder_output, decoder_hidden = decoder( - decoder_input, decoder_hidden) - loss += criterion(decoder_output, target_tensor[di]) - decoder_input = target_tensor[di] # Teacher forcing - - else: - # Without teacher forcing: use its own predictions as the next input - for di in range(target_length): - if isinstance(decoder, AttnDecoderRNN): - decoder_output, decoder_hidden, decoder_attention = decoder( - decoder_input, decoder_hidden, encoder_outputs) - else: - decoder_output, decoder_hidden = decoder( - decoder_input, decoder_hidden) - topv, topi = decoder_output.topk(1) - decoder_input = topi.squeeze().detach() # detach from history as input - - loss += criterion(decoder_output, target_tensor[di]) - if decoder_input.item() == EOS_token: - break - - loss.backward() - - encoder_optimizer.step() - decoder_optimizer.step() - - return loss.item() / target_length - - -###################################################################### -# This is a helper function to print time elapsed and estimated time -# remaining given the current time and progress %. 
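# --- Minimal sketch of the teacher-forcing choice made inside train() above:
# with probability `teacher_forcing_ratio` the ground-truth token is fed as the
# next decoder input, otherwise the decoder's own top-1 prediction is used.
# (Illustrative only; `decoder_logits` and `target_token` are placeholders.)
import random
import torch

teacher_forcing_ratio = 0.5
decoder_logits = torch.randn(1, 10)     # fake output distribution over 10 tokens
target_token = torch.tensor([[7]])      # fake ground-truth next token

if random.random() < teacher_forcing_ratio:
    next_input = target_token                         # teacher forcing
else:
    _, topi = decoder_logits.topk(1)
    next_input = topi.squeeze().detach().view(1, 1)   # model's own prediction
print(next_input.shape)                 # torch.Size([1, 1])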
-# - - -def asMinutes(s): - m = math.floor(s / 60) - s -= m * 60 - return '%dm %ds' % (m, s) - - -def timeSince(since, percent): - now = time.time() - s = now - since - es = s / (percent) - rs = es - s - return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) - - -###################################################################### -# The whole training process looks like this: -# -# - Start a timer -# - Initialize optimizers and criterion -# - Create set of training pairs -# - Start empty losses array for plotting -# -# Then we call ``train`` many times and occasionally print the progress (% -# of examples, time so far, estimated time) and average loss. -# - -def trainIters(encoder, decoder, input_lang, output_lang, input_rows, output_rows, n_iters, print_every=1000, - plot_every=100, learning_rate=0.01, loss_breakpoint=0.0001, max_length=MAX_LENGTH): - start = time.time() - plot_losses = [] - print_loss_total = 0 # Reset every print_every - plot_loss_total = 0 # Reset every plot_every - - encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) - decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate) - - random_index = random.randint(0, len(input_rows)) - - training_pairs = [[tensorFromSentence(input_lang, input_rows[random_index]), tensorFromSentence( - output_lang, output_rows[random_index])] for i in range(n_iters)] - criterion = nn.NLLLoss() - - for iter in range(1, n_iters + 1): - training_pair = training_pairs[iter - 1] - input_tensor = training_pair[0] - target_tensor = training_pair[1] - - loss = train(input_tensor, target_tensor, encoder, - decoder, encoder_optimizer, decoder_optimizer, criterion, max_length) - print_loss_total += loss - plot_loss_total += loss - - print_loss_avg = print_loss_total / print_every - - if print_loss_avg < loss_breakpoint: - - print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), - iter, iter / n_iters * 100, print_loss_avg)) - - break - - if iter % print_every == 0: - print_loss_avg = print_loss_total / print_every - print_loss_total = 0 - print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), - iter, iter / n_iters * 100, print_loss_avg)) - - if iter % plot_every == 0: - plot_loss_avg = plot_loss_total / plot_every - plot_losses.append(plot_loss_avg) - plot_loss_total = 0 - - # showPlot(plot_losses) - - -###################################################################### -# Evaluation -# ========== -# -# Evaluation is mostly the same as training, but there are no targets so -# we simply feed the decoder's predictions back to itself for each step. -# Every time it predicts a word we add it to the output string, and if it -# predicts the EOS token we stop there. We also store the decoder's -# attention outputs for display later. 
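# --- Sketch of the greedy decoding loop described above (dummy logits instead
# of a real decoder and a toy index2word): pick the top-1 token at each step
# and stop when EOS is predicted.
import torch

EOS_token = 1
index2word = {0: "SOS", 1: "EOS", 2: "UNK", 3: "he", 4: "is", 5: "painting"}
fake_steps = [torch.tensor([[0.1, 0.0, 0.0, 2.0, 0.0, 0.0]]),   # -> "he"
              torch.tensor([[0.0, 0.0, 0.0, 0.0, 2.0, 0.0]]),   # -> "is"
              torch.tensor([[0.0, 3.0, 0.0, 0.0, 0.0, 0.0]])]   # -> EOS, stop

decoded_words = []
for logits in fake_steps:
    topv, topi = logits.topk(1)
    if topi.item() == EOS_token:
        decoded_words.append('<EOS>')
        break
    decoded_words.append(index2word[topi.item()])
print(decoded_words)   # ['he', 'is', '<EOS>']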
-# - -def evaluate(encoder, decoder, input_lang, output_lang, sentence, max_length=MAX_LENGTH): - with torch.no_grad(): - input_tensor = tensorFromSentence(input_lang, sentence) - input_length = input_tensor.size()[0] - encoder_hidden = encoder.initHidden() - - encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device) - - for ei in range(input_length): - encoder_output, encoder_hidden = encoder(input_tensor[ei], - encoder_hidden) - encoder_outputs[ei] += encoder_output[0, 0] - - decoder_input = torch.tensor([[SOS_token]], device=device) # SOS - - decoder_hidden = encoder_hidden - - decoded_words = [] - decoder_attentions = torch.zeros(max_length, max_length) - - for di in range(max_length): - decoder_output, decoder_hidden, decoder_attention = decoder( - decoder_input, decoder_hidden, encoder_outputs) - decoder_attentions[di] = decoder_attention.data - topv, topi = decoder_output.data.topk(1) - if topi.item() == EOS_token: - decoded_words.append('') - break - else: - decoded_words.append(output_lang.index2word[topi.item()]) - - decoder_input = topi.squeeze().detach() - - return decoded_words, decoder_attentions[:di + 1] - - -###################################################################### -# We can evaluate random sentences from the training set and print out the -# input, target, and output to make some subjective quality judgements: -# - -def evaluateRandomly(encoder, pairs, decoder, n=10, max_length=MAX_LENGTH): - for i in range(n): - pair = random.choice(pairs) - print('>', pair[0]) - print('=', pair[1]) - output_words, attentions = evaluate(encoder, decoder, pair[0], max_length=MAX_LENGTH) - output_sentence = ' '.join(output_words) - print('<', output_sentence) - print('') - - -###################################################################### -# For a better viewing experience we will do the extra work of adding axes -# and labels: -# - -def showAttention(input_sentence, output_words, attentions): - # Set up figure with colorbar - fig = plt.figure() - ax = fig.add_subplot(111) - cax = ax.matshow(attentions.numpy(), cmap='bone') - fig.colorbar(cax) - - # Set up axes - ax.set_xticklabels([''] + input_sentence.split(' ') + - [''], rotation=90) - ax.set_yticklabels([''] + output_words) - - # Show label at every tick - ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) - ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) - - plt.show() diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py deleted file mode 100644 index 75904b437..000000000 --- a/lightwood/encoder/text/pretrained.py +++ /dev/null @@ -1,373 +0,0 @@ -""" -2021.07.16 -Adding flag "embedmode". - -Embed-mode is made for when text is one of many columns in the model. -IF the model is direct (text) -> output, then it's worth just using -the fine-tuned encoder as the "mixer" persay; thus, turn embed-mode OFF. - -This means there are 3 possible modes: - -(1) Classification - -> Fine tuned, output of encoder is [CLS] embedding - -> Fine tuned, output of encoder is the class value -(2) Regression - -> Untrained; output of encoder is [CLS] embedding - -Training with regression is WIP; seems like quantile-binning is the best approach -but using MSE loss while fine-tuning did not demonstrate decent results. Particularly -because the mixer seems to address this. - -2021.03.18 - -## Padding changes the answer slightly in the model. - -The following text encoder uses huggingface's -Distilbert. 
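# --- Hedged sketch of the two output modes described above, using the Hugging
# Face transformers API directly (downloads pretrained weights; the sample text
# and the 3-class setup are illustrative, not taken from this module).
import torch
from transformers import DistilBertModel, DistilBertForSequenceClassification, DistilBertTokenizerFast

tok = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
inp = tok.encode("lightwood encodes text columns", return_tensors="pt")

# Embed-mode ON: the [CLS] position of the last hidden layer is the feature vector.
base = DistilBertModel.from_pretrained("distilbert-base-uncased")
with torch.no_grad():
    cls_embedding = base(inp).last_hidden_state[:, 0]      # shape (1, 768)

# Embed-mode OFF (classification): the classification head's logits are the output.
clf = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
with torch.no_grad():
    logits = clf(inp).logits                               # shape (1, 3)
print(cls_embedding.shape, logits.shape)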
Internal benchmarks suggest -1 epoch of fine tuning is ideal [classification]. -Training ONLY occurs for classification. Regression problems -are not trained, embeddings are directly generated. - -See: https://huggingface.co/transformers/training.html -for further details. - -Currently the model supports only distilbert. - -When instantiating the DistilBertForSeq.Class object, -num_labels indicates whether you use classification or regression. - -See: https://huggingface.co/transformers/model_doc/distilbert.html#distilbertforsequenceclassification -under the 'labels' command - -For classification - we use num_labels = 1 + num_classes *** - -If you do num_classes + 1, we reserve the LAST label -as the "unknown" label; this is different from the original -distilbert model. (prior to 2021.03) - -TODOs: -+ Regression -+ Batch encodes() tokenization step -+ Look into auto-encoding lower dimensional representations -of the output embedding -+ Look into regression tuning (will require grad. clipping) -+ Look into tuning to the encoded space of output. -""" -import time -import torch -from torch.utils.data import DataLoader -import os -import pandas as pd -from lightwood.encoder.text.helpers.pretrained_helpers import TextEmbed -from lightwood.helpers.device import get_devices -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log -from lightwood.helpers.torch import LightwoodAutocast -from lightwood.api import dtype -from transformers import ( - DistilBertModel, - DistilBertForSequenceClassification, - DistilBertTokenizerFast, - AdamW, - get_linear_schedule_with_warmup, -) - - -class PretrainedLangEncoder(BaseEncoder): - is_trainable_encoder: bool = True - - """ - Pretrained language models. - Option to train on a target encoding of choice. - - Args: - is_target ::Bool; data column is the target of ML. - model_name ::str; name of pre-trained model - custom_tokenizer ::function; custom tokenizing function - batch_size ::int; size of batch - max_position_embeddings ::int; max sequence length of input text - custom_train ::Bool; If true, trains model on target procided - frozen ::Bool; If true, freezes transformer layers during training. - epochs ::int; number of epochs to train model with - embed_mode ::Bool; If true, assumes the output of the encode() step is the CLS embedding. - """ - - def __init__( - self, - stop_after: int, - is_target=False, - model_name="distilbert", - custom_tokenizer=None, - batch_size=10, - max_position_embeddings=None, - frozen=False, - epochs=1, - output_type=None, - embed_mode=True, - ): - super().__init__(is_target) - - self.output_type = output_type - self.name = model_name + " text encoder" - log.info(self.name) - - self._max_len = max_position_embeddings - self._frozen = frozen - self._batch_size = batch_size - self._epochs = epochs - - # Model setup - self._tokenizer = custom_tokenizer - self._model = None - self.model_type = None - - # TODO: Other LMs; Distilbert is a good balance of speed/performance - self._classifier_model_class = DistilBertForSequenceClassification - self._embeddings_model_class = DistilBertModel - self._tokenizer_class = DistilBertTokenizerFast - self._pretrained_model_name = "distilbert-base-uncased" - - self.device, _ = get_devices() - self.stop_after = stop_after - - self.embed_mode = embed_mode - self.uses_target = True - self.output_size = None - - # DEBUGGING!!! - if self.embed_mode: - log.info("Embedding mode on. [CLS] embedding dim output of encode()") - else: - log.info("Embedding mode off. 
Logits are output of encode()") - - def prepare(self, train_priming_data: pd.Series, dev_priming_data: pd.Series, encoded_target_values: torch.Tensor): - """ - Prepare the encoder by training on the target. - - Training data must be a dict with "targets" avail. - Automatically assumes this. - """ - os.environ['TOKENIZERS_PARALLELISM'] = 'true' - priming_data = pd.concat([train_priming_data, dev_priming_data]) - priming_data = priming_data.values - if self.is_prepared: - raise Exception("Encoder is already prepared.") - - # TODO: Make tokenizer custom with partial function; feed custom->model - if self._tokenizer is None: - self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name) - - # Replaces empty strings with '' - priming_data = [x if x is not None else "" for x in priming_data] - - # Checks training data details - # TODO: Regression flag; currently training supported for categorical only - - if (self.output_type in (dtype.categorical, dtype.binary)): - log.info("Training model.") - - # Prepare priming data into tokenized form + attention masks - text = self._tokenizer(priming_data, truncation=True, padding=True) - - log.info("\tOutput trained is categorical") - - labels = encoded_target_values.argmax(dim=1) - - # Construct the model - self._model = self._classifier_model_class.from_pretrained( - self._pretrained_model_name, - num_labels=len(encoded_target_values[0]), - ).to(self.device) - - # Construct the dataset for training - xinp = TextEmbed(text, labels) - dataset = DataLoader(xinp, batch_size=self._batch_size, shuffle=True) - - # If max length not set, adjust - if self._max_len is None: - self._max_len = self._model.config.max_position_embeddings - - if self._frozen: - log.info("\tFrozen Model + Training Classifier Layers") - """ - Freeze the base transformer model and train - a linear layer on top - """ - # Freeze all the transformer parameters - for param in self._model.base_model.parameters(): - param.requires_grad = False - - optimizer_grouped_parameters = self._model.parameters() - - else: - log.info("\tFine-tuning model") - """ - Fine-tuning parameters with weight decay - """ - no_decay = [ - "bias", - "LayerNorm.weight", - ] # decay on all terms EXCLUDING bias/layernorms - optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in self._model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": 0.01, - }, - { - "params": [ - p - for n, p in self._model.named_parameters() - if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - - optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5) - scheduler = get_linear_schedule_with_warmup( - optimizer, - num_warmup_steps=0, # default value for GLUE - num_training_steps=len(dataset) * self._epochs, - ) - - # Train model; declare optimizer earlier if desired. - self._tune_model( - dataset, optim=optimizer, scheduler=scheduler, n_epochs=self._epochs - ) - - else: - log.info("Target is not classification; Embeddings Generator only") - - self.model_type = "embeddings_generator" - self._model = self._embeddings_model_class.from_pretrained( - self._pretrained_model_name - ).to(self.device) - - # TODO: Not a great flag - # Currently, if the task is not classification, you must have - # an embedding generator only. 
- if self.embed_mode is False: - log.info("Embedding mode must be ON for non-classification targets.") - self.embed_mode = True - - self.is_prepared = True - encoded = self.encode(priming_data[0:1]) - self.output_size = len(encoded[0]) - - def _tune_model(self, dataset, optim, scheduler, n_epochs=1): - """ - Given a model, train for n_epochs. - Specifically intended for tuning; it does NOT use loss/ - stopping criterion. - - model - torch.nn model; - dataset - torch.DataLoader; dataset to train - device - torch.device; cuda/cpu - log - lightwood.logger.log; log.info output - optim - transformers.optimization.AdamW; optimizer - scheduler - scheduling params - n_epochs - number of epochs to train - - """ - self._model.train() - - if optim is None: - log.info("No opt. provided, setting all params with AdamW.") - optim = AdamW(self._model.parameters(), lr=5e-5) - else: - log.info("Optimizer provided") - - if scheduler is None: - log.info("No scheduler provided.") - else: - log.info("Scheduler provided.") - - started = time.time() - for epoch in range(n_epochs): - total_loss = 0 - if time.time() - started > self.stop_after: - break - - for batch in dataset: - optim.zero_grad() - - with LightwoodAutocast(): - inpids = batch["input_ids"].to(self.device) - attn = batch["attention_mask"].to(self.device) - labels = batch["labels"].to(self.device) - outputs = self._model(inpids, attention_mask=attn, labels=labels) - loss = outputs[0] - - total_loss += loss.item() - - loss.backward() - optim.step() - if scheduler is not None: - scheduler.step() - - self._train_callback(epoch, total_loss / len(dataset)) - - def _train_callback(self, epoch, loss): - log.info(f"{self.name} at epoch {epoch+1} and loss {loss}!") - - def encode(self, column_data): - """ - TODO: Maybe batch the text up; may take too long - Given column data, encode the dataset. - - Currently, returns the embedding of the pre-classifier layer. - - Args: - column_data:: [list[str]] list of text data in str form - - Returns: - encoded_representation:: [torch.Tensor] N_sentences x Nembed_dim - """ - if self.is_prepared is False: - raise Exception("You need to first prepare the encoder.") - - # Set model to testing/eval mode. - self._model.eval() - - encoded_representation = [] - - with torch.no_grad(): - # Set the weights; this is GPT-2 - for text in column_data: - - # Omit NaNs - if text is None: - text = "" - - # Tokenize the text with the built-in tokenizer. - inp = self._tokenizer.encode( - text, truncation=True, return_tensors="pt" - ).to(self.device) - - if self.embed_mode: # Embedding mode ON; return [CLS] - output = self._model.base_model(inp).last_hidden_state[:, 0] - - # If the model has a pre-classifier layer, use this embedding. 
- if hasattr(self._model, "pre_classifier"): - output = self._model.pre_classifier(output) - - else: # Embedding mode off; return classes - output = self._model(inp).logits - - encoded_representation.append(output.detach()) - - return torch.stack(encoded_representation).squeeze(1).to('cpu') - - def decode(self, encoded_values_tensor, max_length=100): - raise Exception("Decoder not implemented.") - - def to(self, device, available_devices): - for v in vars(self): - attr = getattr(self, v) - if isinstance(attr, torch.nn.Module): - attr.to(device) - return self diff --git a/lightwood/encoder/text/rnn.py b/lightwood/encoder/text/rnn.py deleted file mode 100644 index 11f909620..000000000 --- a/lightwood/encoder/text/rnn.py +++ /dev/null @@ -1,104 +0,0 @@ -# flake8: noqa -from lightwood.encoder.text.helpers.rnn_helpers import * -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log -import math - - -class RnnEncoder(BaseEncoder): - - def __init__(self, encoded_vector_size=256, train_iters=75000, stop_on_error=0.0001, - learning_rate=0.01, is_target=False): - super().__init__(is_target) - self._stop_on_error = stop_on_error - self._learning_rate = learning_rate - self._encoded_vector_size = encoded_vector_size - self._train_iters = train_iters - self._input_lang = None - self._output_lang = None - self._encoder = None - self._decoder = None - - def prepare(self, priming_data): - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - - no_null_sentences = [x if x is not None else '' for x in priming_data] - estimated_time = 1 / 937 * self._train_iters * len(no_null_sentences) - log_every = math.ceil(self._train_iters / 100) - log.info('We will train an encoder for this text, on a CPU it will take about {min} minutes'.format( - min=estimated_time)) - - self._input_lang = Lang('input') - self._output_lang = self._input_lang - - for row in no_null_sentences: - if row is not None: - self._input_lang.addSentence(row) - - max_length = max(map(len, no_null_sentences)) - - hidden_size = self._encoded_vector_size - self._encoder = EncoderRNN(self._input_lang.n_words, hidden_size).to(device) - self._decoder = DecoderRNN(hidden_size, self._output_lang.n_words).to(device) - - trainIters(self._encoder, self._decoder, self._input_lang, self._output_lang, no_null_sentences, - no_null_sentences, self._train_iters, int(log_every), - self._learning_rate, self._stop_on_error, max_length) - - self.is_prepared = True - - def encode(self, column_data): - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - no_null_sentences = [x if x is not None else '' for x in column_data] - ret = [] - with torch.no_grad(): - for row in no_null_sentences: - - encoder_hidden = self._encoder.initHidden() - input_tensor = tensorFromSentence(self._input_lang, row) - input_length = input_tensor.size(0) - - #encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device) - - loss = 0 - - for ei in range(input_length): - encoder_output, encoder_hidden = self._encoder( - input_tensor[ei], encoder_hidden) - #encoder_outputs[ei] = encoder_output[0, 0] - - # use the last hidden state as the encoded vector - ret.append(encoder_hidden.tolist()[0][0]) - - return torch.Tensor(ret) - - def decode(self, encoded_values_tensor, max_length=100): - - ret = [] - with torch.no_grad(): - for decoder_hiddens in encoded_values_tensor: - decoder_hidden = torch.FloatTensor([[decoder_hiddens.tolist()]]).to(device) 
- - decoder_input = torch.tensor([[SOS_token]], device=device) # SOS - - decoded_words = [] - - for di in range(max_length): - decoder_output, decoder_hidden = self._decoder( - decoder_input, decoder_hidden) - - topv, topi = decoder_output.data.topk(1) - if topi.item() == EOS_token: - decoded_words.append('') - break - else: - decoded_words.append(self._output_lang.index2word[topi.item()]) - - decoder_input = topi.squeeze().detach() - - ret.append(' '.join(decoded_words)) - - return ret diff --git a/lightwood/encoder/text/short.py b/lightwood/encoder/text/short.py deleted file mode 100644 index c80c6cbdc..000000000 --- a/lightwood/encoder/text/short.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import List -import torch -from lightwood.encoder import BaseEncoder -from lightwood.encoder.categorical import CategoricalAutoEncoder -from lightwood.helpers.text import tokenize_text -from lightwood.helpers.torch import concat_vectors_and_pad, average_vectors -import pandas as pd - - -class ShortTextEncoder(BaseEncoder): - def __init__(self, is_target=False, mode=None): - """ - :param is_target: - :param mode: - None or "concat" or "mean". - When None, it will be set automatically based on is_target: - (is_target) -> 'concat' - (not is_target) -> 'mean' - """ - super().__init__(is_target) - - if mode is None: - if is_target: - self._mode = 'concat' - else: - self._mode = 'mean' - else: - if mode not in ['concat', 'mean']: - self._unexpected_mode() - - if is_target and mode != 'concat': - raise ValueError('mode must be "concat" when is_target=True') - - self._mode = mode - - # Defined in self.prepare() - self._combine_fn = None - self.max_words_per_sent = None - self.cae = CategoricalAutoEncoder(is_target=is_target, max_encoded_length=100) - self.is_prepared = False - - def _unexpected_mode(self): - raise ValueError('unexpected combine value (must be "mean" or "concat")') - - # defining both of these as normal functions because pickle can't deal with lambdas - def _combine_concat(self, vecs): - return concat_vectors_and_pad(vecs, self.max_words_per_sent) - - def _combine_mean(self, vecs): - return average_vectors(vecs) - - def prepare(self, priming_data): - no_null_sentences = (x if x is not None else '' for x in priming_data) - unique_tokens = set() - max_words_per_sent = 0 - for sent in no_null_sentences: - tokens = tokenize_text(sent) - max_words_per_sent = max(max_words_per_sent, len(tokens)) - for tok in tokens: - unique_tokens.add(tok) - - self.cae.prepare(pd.Series(list(unique_tokens)), pd.Series([])) - - if self._mode == 'concat': - self.max_words_per_sent = max_words_per_sent - self._combine_fn = self._combine_concat - elif self._mode == 'mean': - self._combine_fn = self._combine_mean - else: - self._unexpected_mode() - - self.is_prepared = True - encoded = self.encode([priming_data[0]]) - self.output_size = len(encoded[0]) - - def encode(self, column_data: List[str]) -> torch.Tensor: - no_null_sentences = (x if x is not None else '' for x in column_data) - output = [] - for sent in no_null_sentences: - tokens = tokenize_text(sent) - encoded_words = self.cae.encode(tokens) - encoded_sent = self._combine_fn(encoded_words) - output.append(torch.Tensor(encoded_sent)) - output = torch.stack(output) - return output - - def decode(self, vectors): - if self._mode == 'concat': - - vec_size = self.cae.max_encoded_length - - output = [] - for vec in vectors: - - viewed_vec = vec.view(-1, vec_size) - - # Find index of first padding vector - for index, v in enumerate(viewed_vec): - if v.abs().sum() 
== 0: - break - else: - index = viewed_vec.size(0) - - out = self.cae.decode( - viewed_vec[:index] - ) - - output.append(out) - - return output - - elif self._mode == 'mean': - raise ValueError('decode is only defined for mode="concat"') - else: - self._unexpected_mode() diff --git a/lightwood/encoder/text/tfidf.py b/lightwood/encoder/text/tfidf.py deleted file mode 100644 index 68fff44d9..000000000 --- a/lightwood/encoder/text/tfidf.py +++ /dev/null @@ -1,24 +0,0 @@ -import torch -from sklearn.feature_extraction.text import TfidfVectorizer -import numpy as np - -from lightwood.encoder.base import BaseEncoder - - -class TfidfEncoder(BaseEncoder): - def __init__(self, is_target: bool = False): - super().__init__(is_target) - self.ngram_range = (1, 5) - self.max_features = 500 - - def prepare(self, priming_data, training_data=None): - self.tfidf_vectorizer = TfidfVectorizer(ngram_range=self.ngram_range, max_features=self.max_features) - self.tfidf_vectorizer.fit_transform([str(x) for x in priming_data]) - - def encode(self, column_data): - transformed_data = self.tfidf_vectorizer.transform([str(x) for x in column_data]) - dense_transformed_data = [np.array(x.todense())[0] for x in transformed_data] - return torch.Tensor(dense_transformed_data) - - def decode(self, encoded_values_tensor): - raise Exception('This encoder is not bi-directional') diff --git a/lightwood/encoder/text/vocab.py b/lightwood/encoder/text/vocab.py deleted file mode 100644 index 13379f4f8..000000000 --- a/lightwood/encoder/text/vocab.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import torch -from transformers import DistilBertTokenizer -from lightwood.encoder.base import BaseEncoder - - -class VocabularyEncoder(BaseEncoder): - def __init__(self, is_target: bool = False): - super().__init__(is_target) - self._tokenizer_class = DistilBertTokenizer - self._pretrained_model_name = 'distilbert-base-uncased' - self._max_len = None - self._tokenizer = None - self._pad_id = None - - def prepare(self, priming_data): - os.environ['TOKENIZERS_PARALLELISM'] = 'true' - self._max_len = max([len(x) for x in priming_data]) - self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name) - self._pad_id = self._tokenizer.convert_tokens_to_ids([self._tokenizer.pad_token])[0] - - def encode(self, column_data): - vec = [] - for text in column_data: - encoded = self._tokenizer.encode(text[:self._max_len], add_special_tokens=True) - encoded = torch.tensor(encoded + [self._pad_id] * (self._max_len - len(encoded))) - vec.append(encoded) - return torch.stack(vec) - - def decode(self, encoded_values_tensor): - vec = [] - for encoded in encoded_values_tensor: - decoded = self._tokenizer.decode(encoded) - decoded = decoded.split('[PAD]')[0].rstrip().lstrip().lstrip('[CLS] ').rstrip(' [SEP]') - vec.append(decoded) - return vec diff --git a/lightwood/encoder/time_series/__init__.py b/lightwood/encoder/time_series/__init__.py deleted file mode 100644 index 345f4229f..000000000 --- a/lightwood/encoder/time_series/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder - -__all__ = ['TimeSeriesEncoder'] diff --git a/lightwood/encoder/time_series/helpers/__init__.py b/lightwood/encoder/time_series/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/encoder/time_series/helpers/common.py b/lightwood/encoder/time_series/helpers/common.py deleted file mode 100644 index 6e11192d2..000000000 --- a/lightwood/encoder/time_series/helpers/common.py 
+++ /dev/null @@ -1,149 +0,0 @@ -from itertools import product - -import torch -import numpy as np -import pandas as pd -from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder - -from lightwood.api.dtype import dtype - - -class MinMaxNormalizer: - def __init__(self, combination=()): - self.scaler = MinMaxScaler() - self.abs_mean = None - self.combination = combination # tuple with values in grouped-by columns - self.output_size = 1 - - def prepare(self, x: np.ndarray) -> None: - # @TODO: streamline input type - if isinstance(x[0], list): - x = np.vstack(x) - if isinstance(x[0], torch.Tensor): - x = torch.stack(x).numpy() - if len(x.shape) < 2: - x = np.expand_dims(x, axis=1) - - x = x.astype(float) - x[x == None] = 0 # noqa - self.abs_mean = np.mean(np.abs(x)) - self.scaler.fit(x) - - def encode(self, y: np.ndarray) -> torch.Tensor: - if isinstance(y[0], list): - y = np.vstack(y) - if isinstance(y[0], torch.Tensor): - y = torch.stack(y).numpy() - if len(y.shape) < 2: - y = np.expand_dims(y, axis=1) - - shape = y.shape - y = y.astype(float).reshape(-1, self.scaler.n_features_in_) - out = torch.reshape(torch.Tensor(self.scaler.transform(y)), shape) - return out - - def decode(self, y): - return self.scaler.inverse_transform(y) - - -class CatNormalizer: - def __init__(self, encoder_class='one_hot'): - self.encoder_class = encoder_class - if encoder_class == 'one_hot': - self.scaler = OneHotEncoder(sparse=False, handle_unknown='ignore') - else: - self.scaler = OrdinalEncoder() - - self.unk = "" - - def prepare(self, x): - X = [] - for i in x: - for j in i: - X.append(j if j is not None else self.unk) - self.scaler.fit(np.array(X).reshape(-1, 1)) - self.output_size = len(self.scaler.categories_[0]) if self.encoder_class == 'one_hot' else 1 - - def encode(self, Y): - y = np.array([[str(j) if j is not None else self.unk for j in i] for i in Y]) - out = [] - for i in y: - transformed = self.scaler.transform(i.reshape(-1, 1)) - if isinstance(self.scaler, OrdinalEncoder): - transformed = transformed.flatten() - out.append(transformed) - - return torch.Tensor(out) - - def decode(self, y): - return [[i[0] for i in self.scaler.inverse_transform(o)] for o in y] - - -def get_group_matches(data, combination): - """Given a grouped-by combination, return rows of the data that match belong to it. Params: - data: dict with data to filter and group-by columns info. - combination: tuple with values to filter by - return: indexes for rows to normalize, data to normalize - """ - keys = data['group_info'].keys() # which column does each combination value belong to - - if isinstance(data['data'], pd.Series): - data['data'] = np.vstack(data['data']) - if isinstance(data['data'], np.ndarray) and len(data['data'].shape) < 2: - data['data'] = np.expand_dims(data['data'], axis=1) - - if combination == '__default': - idxs = range(len(data['data'])) - return [idxs, np.array(data['data'])[idxs, :]] # return all data - else: - all_sets = [] - for val, key in zip(combination, keys): - all_sets.append(set([i for i, elt in enumerate(data['group_info'][key]) if elt == val])) - if all_sets: - idxs = list(set.intersection(*all_sets)) - return idxs, np.array(data['data'])[idxs, :] - - else: - return [], np.array([]) - - -def generate_target_group_normalizers(data): - """ - Helper function called from data_source. It generates and fits all needed normalizers for a target variable - based on its grouped entities. 
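# --- Usage sketch for the MinMaxNormalizer defined above (pre-removal import
# path; the sample values are made up): fit on training values, then
# encode/decode a few points.
import numpy as np
from lightwood.encoder.time_series.helpers.common import MinMaxNormalizer

norm = MinMaxNormalizer()
train_values = np.array([10.0, 20.0, 40.0, 80.0])
norm.prepare(train_values)                       # fits the underlying MinMaxScaler
encoded = norm.encode(np.array([15.0, 80.0]))    # values scaled into [0, 1]
decoded = norm.decode(encoded)                   # back to the original scale
print(encoded.squeeze().tolist(), decoded.squeeze().tolist())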
- :param data: - :return: modified data with dictionary with normalizers for said target variable based on some grouped-by columns - """ - normalizers = {} - group_combinations = [] - - # categorical normalizers - if data['original_type'] in [dtype.categorical, dtype.binary]: - normalizers['__default'] = CatNormalizer() - normalizers['__default'].prepare(data['data']) - group_combinations.append('__default') - - # numerical normalizers, here we spawn one per each group combination - else: - if data['original_type'] == dtype.tsarray: - data['data'] = data['data'].reshape(-1, 1).astype(float) - - all_group_combinations = list(product(*[set(x) for x in data['group_info'].values()])) - for combination in all_group_combinations: - if combination != (): - combination = frozenset(combination) # freeze so that we can hash with it - _, subset = get_group_matches(data, combination) - if subset.size > 0: - normalizers[combination] = MinMaxNormalizer(combination=combination) - normalizers[combination].prepare(subset) - group_combinations.append(combination) - - # ...plus a default one, used at inference time and fitted with all training data - normalizers['__default'] = MinMaxNormalizer() - normalizers['__default'].prepare(data['data']) - group_combinations.append('__default') - - data['target_normalizers'] = normalizers - data['group_combinations'] = group_combinations - - return data diff --git a/lightwood/encoder/time_series/helpers/rnn_helpers.py b/lightwood/encoder/time_series/helpers/rnn_helpers.py deleted file mode 100644 index 0c87eff38..000000000 --- a/lightwood/encoder/time_series/helpers/rnn_helpers.py +++ /dev/null @@ -1,91 +0,0 @@ -import torch -import torch.nn as nn -import numpy as np -from lightwood.helpers.torch import LightwoodAutocast - - -class DecoderRNNNumerical(nn.Module): - def __init__(self, hidden_size, output_size): - super(DecoderRNNNumerical, self).__init__() - self.hidden_size = hidden_size - self.in_activation = nn.Sigmoid() - self.dropout = nn.Dropout(0.2) - self.gru = nn.GRU(output_size, hidden_size, batch_first=True) - self.out = nn.Linear(hidden_size, output_size) - - def forward(self, input, hidden): - with LightwoodAutocast(): - output = self.in_activation(input.float()) - output, hidden = self.gru(output, hidden) - output = self.dropout(output) - output = self.out(output) - return output, hidden - - def init_hidden(self, device, batch_size=1): - return torch.zeros(1, batch_size, self.hidden_size, device=device) - - def decode(self, data, initial_tensor, criterion, device, hidden_state=None, sos=0): - """This method decodes an input unrolled through time, given an initial hidden state""" - if isinstance(data, tuple): - # when using transformer encoder, data contains a sequence length tensor - data, len_data = data - - if initial_tensor.shape[1] > 1: - # if tensor is a sequence (like the one yielded by the transformer), - # we select only the last timestep for decoding - initial_tensor = initial_tensor[:, -1:, :] - - loss = 0 - next_tensor = torch.full_like(initial_tensor, sos, dtype=torch.float32).to(device) - tensor_target = torch.cat([next_tensor, data], dim=1) # add SOS token at t=0 to true input - if hidden_state is None: - hidden_state = self.init_hidden(device, data.shape[0]) - - for tensor_i in range(data.shape[1] - 1): - rand = np.random.randint(2) - # teach from forward as well as from known tensor alternatively - if rand == 1: - next_tensor, hidden_state = self.forward(tensor_target[:, tensor_i, :].unsqueeze(dim=1), hidden_state) - else: - next_tensor, 
hidden_state = self.forward(next_tensor.detach(), hidden_state) - - loss += criterion(next_tensor, tensor_target[:, tensor_i + 1, :].unsqueeze(dim=1)) - - return next_tensor, hidden_state, loss - - -class EncoderRNNNumerical(nn.Module): - def __init__(self, input_size, hidden_size): - super(EncoderRNNNumerical, self).__init__() - self.hidden_size = hidden_size - self.dropout = nn.Dropout(0.2) - self.gru = nn.GRU(input_size, hidden_size, batch_first=True) - self.out = nn.Linear(hidden_size, input_size) - - def forward(self, input, hidden): - with LightwoodAutocast(): - output, hidden = self.gru(input, hidden) - output = self.dropout(output) - output = self.out(output) - return output, hidden - - def init_hidden(self, device, batch_size=1): - return torch.zeros(1, batch_size, self.hidden_size, device=device) - - def bptt(self, data, criterion, device): - """This method encodes an input unrolled through time""" - loss = 0 - hidden_state = self.init_hidden(device, batch_size=data.shape[0]) - next_tensor = data[:, 0, :].unsqueeze(dim=1) # initial input - - for tensor_i in range(data.shape[1] - 1): - rand = np.random.randint(2) - # teach from forward as well as from known tensor alternatively - if rand == 1: - next_tensor, hidden_state = self.forward(data[:, tensor_i, :].unsqueeze(dim=1), hidden_state) - else: - next_tensor, hidden_state = self.forward(next_tensor.detach(), hidden_state) - - loss += criterion(next_tensor, data[:, tensor_i + 1, :].unsqueeze(dim=1)) - - return next_tensor, hidden_state, loss diff --git a/lightwood/encoder/time_series/helpers/transformer_helpers.py b/lightwood/encoder/time_series/helpers/transformer_helpers.py deleted file mode 100644 index 84e3acad2..000000000 --- a/lightwood/encoder/time_series/helpers/transformer_helpers.py +++ /dev/null @@ -1,118 +0,0 @@ -import math -import torch -import torch.nn as nn -from lightwood.helpers.torch import LightwoodAutocast - - -def len_to_mask(lengths, zeros): - """ - :param lengths: list of ints with the lengths of the sequences - :param zeros: bool. If false, the first lengths[i] values will be True and the rest will be false. - If true, the first values will be False and the rest True - :return: Boolean tensor of dimension (L, T) with L = len(lenghts) and T = lengths.max(), where with rows with lengths[i] True values followed by lengths.max()-lengths[i] False values. The True and False values are inverted if `zeros == True` - """ # noqa - # Clean trick from: - # https://stackoverflow.com/questions/53403306/how-to-batch-convert-sentence-lengths-to-masks-in-pytorch - mask = torch.arange(lengths.max(), device=lengths.device)[None, :] < lengths[:, None] - if zeros: - mask = ~mask # Logical not - return mask - - -def get_chunk(source, source_lengths, start, step): - """Source is 3D tensor, shaped (batch_size, timesteps, n_dimensions), assuming static sequence length""" - # Compute the lengths of the sequences (-1 due to the last element being used as target but not as data! 
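# Sketch of the 50/50 teacher-forcing scheme used by decode()/bptt() above: at each step
# the next input is either the ground-truth timestep or the model's own previous output
# (detached). Standalone toy example with assumed sizes, not taken from the diff.
import numpy as np
import torch
import torch.nn as nn

gru = nn.GRU(input_size=1, hidden_size=8, batch_first=True)
out_layer = nn.Linear(8, 1)
criterion = nn.MSELoss()

data = torch.randn(4, 10, 1)            # (batch, timesteps, n_dims)
hidden = torch.zeros(1, 4, 8)
next_tensor = torch.zeros(4, 1, 1)      # start-of-sequence token at t=0
loss = 0
for t in range(data.shape[1] - 1):
    # alternate between the known timestep and the model's own prediction
    src = data[:, t, :].unsqueeze(1) if np.random.randint(2) else next_tensor.detach()
    out, hidden = gru(src, hidden)
    next_tensor = out_layer(out)
    loss = loss + criterion(next_tensor, data[:, t + 1, :].unsqueeze(1))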
- trunc_seq_len = int(source_lengths[0].item() - 1) - lengths = torch.zeros(source.shape[0]).fill_(trunc_seq_len).to(source.device) - - # This is necessary for MultiHeadedAttention to work - end = min(start + step, trunc_seq_len) - data = source[:, start:end, :] - target = source[:, start + 1:end + 1, :] - - return data, target, lengths - - -class PositionalEncoding(nn.Module): - def __init__(self, d_model, dropout=0.2, max_len=5000): - super(PositionalEncoding, self).__init__() - self.dropout = nn.Dropout(p=dropout) - - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model) - ) - pe[:, 0::2] = torch.sin(position * div_term) - if not d_model % 2: - pe[:, 1::2] = torch.cos(position * div_term) - else: - pe[:, 1::2] = torch.cos(position * div_term)[:, :d_model // 2] - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer("pe", pe) - - def forward(self, x): - with LightwoodAutocast(): - x = x + self.pe[: x.size(0), :] - return self.dropout(x) - - -class TransformerEncoder(nn.Module): - def __init__(self, ninp, nhead, nhid, nlayers, dropout=0.2): - super(TransformerEncoder, self).__init__() - self.src_mask = None - self.src_linear = nn.Linear(ninp, nhid) - self.pos_encoder = PositionalEncoding(nhid, dropout) - encoder_layers = nn.TransformerEncoderLayer(d_model=nhid, nhead=nhead, dim_feedforward=nhid, dropout=dropout) - self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers) - self.src_decoder = nn.Linear(nhid, ninp) - self.init_weights() - - def _generate_square_subsequent_mask(self, sz): - mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) - # Take the logarithm - mask = ( - mask.float() - .masked_fill(mask == 0, float("-inf")) - .masked_fill(mask == 1, float(0.0)) - ) - return mask - - def init_weights(self): - for p in self.parameters(): - if p.dim() > 1: - torch.nn.init.xavier_uniform_(p) - - def forward(self, src, lengths, device): - with LightwoodAutocast(): - if self.src_mask is None or self.src_mask.size(0) != src.size(0): - # Attention mask to avoid attending to upcoming parts of the sequence - self.src_mask = self._generate_square_subsequent_mask(src.size(0)).to( - device - ) - src = self.src_linear(src) - # src = self.pos_encoder(src) # not sure if this is helpful in time series - # The lengths_mask has to be of size [batch, lengths] - lengths_mask = len_to_mask(lengths, zeros=True).to(device) - hidden = self.transformer_encoder( - src, mask=self.src_mask, src_key_padding_mask=lengths_mask - ) - output = self.src_decoder(hidden) - return output, hidden - - def bptt(self, batch, criterion, device): - """This method implements truncated backpropagation through time - Returns: output tensor, None as hidden_state, which does not apply in this case, and loss value""" - loss = 0 - train_batch, len_batch = batch - batch_size, timesteps, _ = train_batch.shape - - for start_chunk in range(0, timesteps, timesteps): - data, targets, lengths_chunk = get_chunk(train_batch, len_batch, start_chunk, timesteps) - # Transformer expects seq_length in first dimension, so we transpose - data = data.transpose(0, 1) - targets = targets.transpose(0, 1) - output, hidden = self.forward(data, lengths_chunk, device) - loss += criterion(output, targets, lengths_chunk) - - return output.transpose(0, 1), hidden, loss diff --git a/lightwood/encoder/time_series/rnn.py b/lightwood/encoder/time_series/rnn.py deleted file mode 100644 index 
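# Sketch of the causal attention mask built by _generate_square_subsequent_mask above:
# position i may attend only to positions <= i; future positions get -inf so that the
# softmax zeroes them out. Illustrative size only.
import torch

sz = 4
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, 0.0)
# row 0: [0, -inf, -inf, -inf]
# row 3: [0,    0,    0,    0]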
dd16ee1d0..000000000 --- a/lightwood/encoder/time_series/rnn.py +++ /dev/null @@ -1,495 +0,0 @@ -import time -from math import gcd -from typing import List -from copy import deepcopy - -import numpy as np -import pandas as pd -import torch -import torch.nn as nn -from torch import optim - -from lightwood.api import dtype -from lightwood.helpers.log import log -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.device import get_devices -from lightwood.helpers.torch import LightwoodAutocast -from lightwood.encoder.datetime import DatetimeNormalizerEncoder -from lightwood.encoder.time_series.helpers.rnn_helpers import EncoderRNNNumerical, DecoderRNNNumerical -from lightwood.encoder.time_series.helpers.common import MinMaxNormalizer, CatNormalizer, get_group_matches -from lightwood.encoder.time_series.helpers.transformer_helpers import TransformerEncoder, get_chunk, len_to_mask - - -class TimeSeriesEncoder(BaseEncoder): - is_timeseries_encoder: bool = True - is_trainable_encoder: bool = True - - def __init__(self, stop_after: int, is_target=False, original_type: str = None, target: str = None, - grouped_by: List[str] = [], encoder_type='rnn'): - super().__init__(is_target) - self.device, _ = get_devices() - self.target = target - self.grouped_by = grouped_by - self._learning_rate = 0.01 - self.output_size = 128 - self._transformer_hidden_size = None - self._epochs = int(1e5) # default training epochs - self._stop_on_n_bad_epochs = 5 # stop training after N epochs where loss is worse than running avg - self._epochs_running_avg = 5 # amount of epochs for running average - self._pytorch_wrapper = torch.FloatTensor - self.is_prepared = False - self._is_setup = False - self._max_ts_length = 0 - self._sos = 0.0 # start of sequence for decoding - self._eos = 0.0 # end of input sequence -- padding value for batches - self._n_dims = 1 - self._normalizer = None - self.dep_norms = {} # dict of dict of normalizers for each dependency (can be grouped-by some column) - self._target_type = None - self._group_combinations = None - self.original_type = original_type - self.stop_after = stop_after - if encoder_type.lower() == 'rnn': - self.encoder_class = EncoderRNNNumerical - elif encoder_type.lower() == 'transformer': - self.encoder_class = TransformerEncoder - - def setup_nn(self, ts_analysis, dependencies=None): - """This method must be executed after initializing, else types are unassigned""" - if self.original_type in (dtype.datetime, dtype.date): - self._normalizer = DatetimeNormalizerEncoder(sinusoidal=True) - self._n_dims *= len(self._normalizer.fields) * 2 # sinusoidal datetime components - elif self.original_type in (dtype.float, dtype.integer): - self._normalizer = MinMaxNormalizer() - - total_dims = self._n_dims - dec_hsize = self.output_size - - if dependencies: - for dep_name, dep in dependencies.items(): - self.dependencies.append(dep_name) - - if dep_name in self.grouped_by: - continue # we only use group column for indexing and selecting rows - - assert dep['original_type'] in (dtype.categorical, dtype.binary, - dtype.integer, dtype.float, dtype.tsarray) - - if f'__mdb_ts_previous_{self.target}' == dep_name: - self.dep_norms[dep_name] = ts_analysis['target_normalizers'] - self._group_combinations = ts_analysis['group_combinations'] - self._target_type = dep['original_type'] - - # if TS analysis yields no normalizers for this dependency, we create a generic one based on its dtype - else: - if dep['original_type'] in (dtype.categorical, dtype.binary): - 
self.dep_norms[dep_name]['__default'] = CatNormalizer() - else: - self.dep_norms[dep_name]['__default'] = MinMaxNormalizer() - - self.dep_norms[dep_name]['__default'].prepare(dep['data']) - self._group_combinations = {'__default': None} - - # add descriptor size to the total encoder output dimensionality - if dep['original_type'] in (dtype.categorical, dtype.binary): - total_dims += len(self.dep_norms[dep_name]['__default'].scaler.categories_[0]) - elif dep['original_type'] in (dtype.integer, dtype.float, dtype.tsarray): - total_dims += 1 - - if self.encoder_class == EncoderRNNNumerical: - self._enc_criterion = nn.MSELoss() - self._dec_criterion = self._enc_criterion - self._encoder = self.encoder_class(input_size=total_dims, - hidden_size=self.output_size).to(self.device) - elif self.encoder_class == TransformerEncoder: - self._enc_criterion = self._masked_criterion - self._dec_criterion = nn.MSELoss() - self._base_criterion = nn.MSELoss(reduction="none") - if self._transformer_hidden_size is None: - self._transformer_hidden_size = total_dims * 2 # arbitrary - - self._encoder = self.encoder_class(ninp=total_dims, - nhead=gcd(dec_hsize, total_dims), - nhid=self._transformer_hidden_size, - nlayers=1).to(self.device) - else: - raise Exception(f"Time series encoder class not supported: {self.encoder_class}") - - self._decoder = DecoderRNNNumerical(output_size=total_dims, hidden_size=dec_hsize).to(self.device) - self._parameters = list(self._encoder.parameters()) + list(self._decoder.parameters()) - self._optimizer = optim.AdamW(self._parameters, lr=self._learning_rate, weight_decay=1e-4) - self._n_dims = total_dims - self._is_setup = True - - def to(self, device, available_devices): - if self._is_setup: - self.device = device - return super().to(device, available_devices) - return self - - def _prepare_raw_data(self, data): - """Convert to array and determine max length""" - out_data = [] - for e in data: - if not isinstance(e, torch.Tensor): - e = np.array(e, dtype=float) - e[np.isnan(e)] = 0.0 - t = torch.tensor(e, dtype=torch.float) - else: - t = e.float() - t[torch.isnan(t)] = 0.0 - out_data.append(t) - lengths = torch.tensor([len(e) for e in data], dtype=torch.float) - return out_data, lengths - - def _get_batch(self, source, start, end): - end = min(end, len(source)) - return source[start:end] - - def prepare(self, train_priming_data: pd.Series, dev_priming_data: pd.Series, dependency_data={}, ts_analysis=None, - feedback_hoop_function=log.info, batch_size=256): - """ - :param priming_data: a list of (self._n_dims)-dimensional time series [[dim1_data], ...] - :param dependency_data: raw data from other columns - :param ts_analysis: dictionary with time analysis info (e.g. 
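# Sketch of the nhead choice above: nn.TransformerEncoderLayer requires its d_model to be
# divisible by nhead. gcd(dec_hsize, total_dims) always divides total_dims (and therefore
# the default hidden size of 2 * total_dims), so the constraint holds for any input
# dimensionality. Numbers below are illustrative.
from math import gcd

dec_hsize, total_dims = 128, 12
nhead = gcd(dec_hsize, total_dims)      # -> 4
assert total_dims % nhead == 0 and (2 * total_dims) % nhead == 0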
normalizers for each target group) - :param feedback_hoop_function: method to use if you want to get feedback on the training process - :param batch_size - """ - priming_data = pd.concat([train_priming_data, dev_priming_data]) - priming_data = list(priming_data.values) - - if self.is_prepared: - raise Exception('You can only call "prepare" once for a given encoder.') - else: - self.setup_nn(ts_analysis, dependency_data) - - started = time.time() - - # Convert to array and determine max length - priming_data, lengths_data = self._prepare_raw_data(priming_data) - self._max_ts_length = int(lengths_data.max()) - - if self._normalizer: - self._normalizer.prepare(priming_data) - priming_data = self._normalizer.encode(priming_data).to(self.device) - if len(priming_data.shape) < 3: - priming_data = priming_data.unsqueeze(-1) - else: - priming_data = torch.stack([d for d in priming_data]).unsqueeze(-1).to(self.device) - - # merge all normalized data into a training batch - normalized_tensors = [] - for dep_name, dep_data in dependency_data.items(): - if dep_name in self.grouped_by: - continue - if dep_data['original_type'] in (dtype.integer, dtype.float): - dep_data['group_info'] = {group: dependency_data[group]['data'] for group in self.grouped_by} - data = torch.zeros((len(priming_data), lengths_data.max().int().item(), 1)) - all_idxs = set(range(len(data))) - for group_name, normalizer in self.dep_norms[dep_name].items(): - if group_name != '__default': - idxs, subset = get_group_matches(dep_data, normalizer.combination) - normalized = normalizer.encode(subset).unsqueeze(-1) - data[idxs, :, :] = normalized - all_idxs -= set(idxs) - if len(all_idxs) > 0 and '__default' in self.dep_norms[dep_name].keys(): - default_norm = self.dep_norms[dep_name]['__default'] - subset = [dep_data['data'][idx] for idx in list(all_idxs)] - data[list(all_idxs), :, :] = torch.Tensor(default_norm.encode(subset)).unsqueeze(-1) - - else: - # categorical has only one normalizer at all times - normalizer = self.dep_norms[dep_name]['__default'] - data = normalizer.encode(dep_data['data'].values) - if len(data.shape) < 3: - data = data.unsqueeze(-1) # add feature dimension - data[torch.isnan(data)] = 0.0 - normalized_tensors.append(data) - - if normalized_tensors: - normalized_data = torch.cat(normalized_tensors, dim=-1).to(self.device) - priming_data = torch.cat([priming_data, normalized_data], dim=-1) - - self._encoder.train() - running_losses = np.full(self._epochs_running_avg, np.nan) - bad_epochs = 0 - - for epoch in range(self._epochs): - average_loss = 0 - - for batch_idx in range(0, len(priming_data), batch_size): - # setup loss and optimizer - self._optimizer.zero_grad() - loss = 0 - - # shape: (batch_size, timesteps, n_dims) - batch = self._get_batch(priming_data, batch_idx, min(batch_idx + batch_size, len(priming_data))) - - # encode and decode through time - with LightwoodAutocast(): - if self.encoder_class == TransformerEncoder: - # pack batch length info tensor - len_batch = self._get_batch(lengths_data, batch_idx, min( - batch_idx + batch_size, len(priming_data))) - batch = batch, len_batch - - next_tensor, hidden_state, dec_loss = self._encoder.bptt( - batch, self._enc_criterion, self.device) - loss += dec_loss - - else: - next_tensor, hidden_state, enc_loss = self._encoder.bptt( - batch, self._enc_criterion, self.device) - loss += enc_loss - - next_tensor, hidden_state, dec_loss = self._decoder.decode( - batch, next_tensor, self._dec_criterion, self.device, hidden_state=hidden_state) - loss += dec_loss - - 
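# Sketch of how prepare() above assembles its training tensor: the normalized target
# series and each normalized dependency are concatenated on the feature axis. Shapes
# below are assumed for illustration.
import torch

target_seq = torch.randn(32, 10, 1)     # (batch, timesteps, 1) after normalization
prev_target = torch.randn(32, 10, 1)    # normalized previous-target dependency
cat_dep = torch.randn(32, 10, 3)        # one-hot encoded categorical dependency
batch = torch.cat([target_seq, prev_target, cat_dep], dim=-1)
assert batch.shape == (32, 10, 5)       # total_dims = 1 + 1 + 3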
loss.backward() - - self._optimizer.step() - average_loss += loss.item() - - average_loss = average_loss / len(priming_data) - batch_idx += batch_size - - if epoch > self._epochs_running_avg and average_loss > np.average(running_losses): - bad_epochs += 1 - - # update running loss - running_losses[:-1] = running_losses[1:] - running_losses[-1] = average_loss - - if feedback_hoop_function is not None: - feedback_hoop_function( - "time series encoder epoch [{epoch_n}/{total}] average_loss = {average_loss}".format( - epoch_n=epoch + 1, total=self._epochs, average_loss=average_loss)) - - if bad_epochs > self._stop_on_n_bad_epochs: - break - elif (time.time() - started) > self.stop_after: - break - - self.is_prepared = True - - def _encode_one(self, data, previous=None, initial_hidden=None, return_next_value=False): - """ - This method encodes one single row of serial data - :param data: multidimensional time series as list of lists [[dim1_data], [dim2_data], ...] - (dim_data: string with format "x11, x12, ... x1n") - :param initial_hidden: if you want to encode from an initial hidden state other than 0s - :param return_next_value: if you want to return the next value in the time series too - - :return: either encoded_value or (encoded_value, next_value) - """ - self._encoder.eval() - with torch.no_grad(): - # Convert to array and determine max length - data, lengths_data = self._prepare_raw_data(data) - self._max_ts_length = int(lengths_data.max()) - - if self._normalizer: - data = self._normalizer.encode(data).to(self.device) - if len(data.shape) < 3: - data = data.unsqueeze(-1) - else: - data = torch.stack([d for d in data]).unsqueeze(-1).to(self.device) - - if previous is not None: - target_tensor = torch.stack(previous).to(self.device) - target_tensor[torch.isnan(target_tensor)] = 0.0 - if len(target_tensor.shape) < 3: - target_tensor = target_tensor.transpose(0, 1).unsqueeze(0) - data_tensor = torch.cat((data, target_tensor), dim=-1) - else: - data_tensor = data - - steps = data_tensor.shape[1] - - if self.encoder_class == EncoderRNNNumerical: - encoder_hidden = self._encoder.init_hidden(self.device) - encoder_hidden = encoder_hidden if initial_hidden is None else initial_hidden - - next_tensor = None - for tensor_i in range(steps): - next_tensor, encoder_hidden = self._encoder.forward(data_tensor[:, tensor_i, :].unsqueeze(dim=0), - encoder_hidden) - - else: - next_tensor = None - len_batch = self._get_batch(lengths_data, 0, len(data)) - batch_size, timesteps, _ = data_tensor.shape - - for start_chunk in range(0, timesteps, timesteps): - data, targets, lengths_chunk = get_chunk(data_tensor, len_batch, start_chunk, timesteps) - data = data.transpose(0, 1) - next_tensor, encoder_hidden = self._encoder.forward(data, lengths_chunk, self.device) - - if return_next_value: - return encoder_hidden, next_tensor - else: - return encoder_hidden - - def encode(self, column_data, dependency_data=None, get_next_count=None): - """ - Encode a list of time series data - :param column_data: a list of (self._n_dims)-dimensional time series [[dim1_data], ...] 
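# Sketch of the early-stopping rule in the training loop above: keep a short running
# window of epoch losses, count an epoch as "bad" when it is worse than the window
# average, and stop after too many bad epochs (or on timeout). Loss values are made up.
import numpy as np

window, max_bad = 5, 5
running_losses = np.full(window, np.nan)
bad_epochs = 0
for epoch, average_loss in enumerate([0.9, 0.7, 0.6, 0.55, 0.52, 0.58, 0.61, 0.63, 0.66, 0.7, 0.75, 0.8]):
    if epoch > window and average_loss > np.average(running_losses):
        bad_epochs += 1
    running_losses[:-1] = running_losses[1:]   # shift the window
    running_losses[-1] = average_loss
    if bad_epochs > max_bad:
        break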
to encode - :param get_next_count: default None, but you can pass a number X and it will return the X following predictions - on the series for each ts_data_point in column_data - :return: a list of encoded time series or if get_next_count !=0 two lists (encoded_values, projected_numbers) - """ - - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - if isinstance(column_data, pd.Series): - data = deepcopy(column_data.values) # get a copy to avoid modifying the actual data frame - else: - data = column_data - - for i in range(len(data)): - if not isinstance(data[i][0], list): - data[i] = [data[i]] # add dimension for 1D timeseries - - # include autoregressive target data - ptd = [] - if dependency_data is not None: - for dep, dep_data in dependency_data.items(): - if dep in self.grouped_by: - continue - # normalize numerical target per group-by - if self._target_type in (dtype.integer, dtype.float, dtype.tsarray): - dep_info = { - 'group_info': {group: dependency_data[group] for group in self.grouped_by}, - 'data': dep_data - } - tensor = torch.zeros((len(dep_data), len(dep_data[0]), 1)).to(self.device) - all_idxs = set(range(len(dep_data))) - - for combination in [c for c in self._group_combinations if c != '__default']: - normalizer = self.dep_norms[dep].get(frozenset(combination), None) - if normalizer is None: - normalizer = self.dep_norms[dep]['__default'] - idxs, subset = get_group_matches(dep_info, normalizer.combination) - if idxs: - tensor[idxs, :, :] = torch.Tensor(normalizer.encode(subset)).unsqueeze(-1).to(self.device) - all_idxs -= set(idxs) - - # encode all remaining rows (not belonging to any grouped combination) with default normalizer - if all_idxs: - default_norm = self.dep_norms[dep]['__default'] - subset = [dep_data[idx] for idx in all_idxs] - tensor[list(all_idxs), :, :] = torch.Tensor( - default_norm.encode(subset)).unsqueeze(-1).to(self.device) - tensor[torch.isnan(tensor)] = 0.0 - - # normalize categorical target - else: - normalizer = self.dep_norms[dep]['__default'] - tensor = normalizer.encode(dep_data) - tensor[torch.isnan(tensor)] = 0.0 - - ptd.append(tensor) - - ret = [] - next = [] - - for i, val in enumerate(data): - if get_next_count is None: - if dependency_data is not None and len(dependency_data) > 0 and len(ptd) > 0: - encoded = self._encode_one(val, previous=[values[i] for values in ptd]) - else: - encoded = self._encode_one(val) - - else: - if get_next_count <= 0: - raise Exception('get_next_count must be greater than 0') - - hidden = None - vector = val - next_i = [] - - for j in range(get_next_count): - hidden, next_reading = self._encode_one(vector, initial_hidden=hidden, return_next_value=True) - vector = [next_reading] - if j == 0: - encoded = hidden - next_i.append(next_reading) - - next_value = next_i[0][0].cpu() - - if self._normalizer: - next_value = torch.Tensor(self._normalizer.decode(next_value)) - - next.append(next_value) - - ret.append(encoded[0][0].cpu()) - - if get_next_count is None: - return torch.stack(ret) - else: - return torch.stack(ret), torch.stack(next) - - def _decode_one(self, hidden, steps): - """ - Decodes a single time series from its encoded representation. 
- :param hidden: time series embedded representation tensor, with size self.output_size - :param steps: as in decode(), defines how many values to output when reconstructing - :return: decoded time series list - """ - self._decoder.eval() - with torch.no_grad(): - ret = [] - next_tensor = torch.full((1, 1, self._n_dims), self._sos, dtype=torch.float32).to(self.device) - timesteps = steps if steps else self._max_ts_length - for _ in range(timesteps): - next_tensor, hidden = self._decoder.forward(next_tensor, hidden) - ret.append(next_tensor) - return torch.stack(ret) - - def decode(self, encoded_data, steps=None): - """ - Decode a list of embedded multidimensional time series - :param encoded_data: a list of embeddings [ e1, e2, ...] to be decoded into time series - :param steps: fixed number of timesteps to reconstruct from each embedding. - If None, encoder will output the largest length encountered during training. - :return: a list of reconstructed time series - """ - if not self.is_prepared: - raise Exception('You need to call "prepare" before calling "encode" or "decode".') - - ret = [] - for _, val in enumerate(encoded_data): - hidden = torch.unsqueeze(torch.unsqueeze(val, dim=0), dim=0).to(self.device) - reconstruction = self._decode_one(hidden, steps).cpu().squeeze().T.numpy() - - if self._n_dims == 1: - reconstruction = reconstruction.reshape(1, -1) - - if self._normalizer: - reconstruction = self._normalizer.decode(reconstruction) - - ret.append(reconstruction) - - return torch.Tensor(ret) - - def _masked_criterion(self, output, targets, lengths): - """ Computes the loss of the first `lengths` items in the chunk """ - # Put in (B, T) format and zero-out the unnecessary values - mask = len_to_mask(lengths, zeros=False).t() - - # Inflate to feature dimension - mask = mask.unsqueeze(-1).repeat(1, 1, output.shape[-1]) - output = output * mask - targets = targets * mask - - # compute the loss with respect to the appropriate lengths and average across the batch-size - # We compute for every output (x_i)_i=1^L and target (y_i)_i=1^L, loss = 1/L \sum (x_i - y_i)^2 - # And average across the mini-batch - losses = self._base_criterion(output, targets).sum(dim=2).sum(dim=0) - - # The TBPTT will compute a slightly different loss, but it is not problematic - loss = torch.dot((1.0 / lengths.float()), losses) / len(losses) - - return loss diff --git a/lightwood/encoder/type_encoder_maps/Array.py b/lightwood/encoder/type_encoder_maps/Array.py deleted file mode 100644 index cb65e3963..000000000 --- a/lightwood/encoder/type_encoder_maps/Array.py +++ /dev/null @@ -1,4 +0,0 @@ -from lightwood.encoder.array.array import ArrayEncoder - - -__all__ = ['ArrayEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Audio.py b/lightwood/encoder/type_encoder_maps/Audio.py deleted file mode 100644 index 7e440d36b..000000000 --- a/lightwood/encoder/type_encoder_maps/Audio.py +++ /dev/null @@ -1,5 +0,0 @@ -from lightwood.encoder.audio import MFCCEncoder - - -__all__ = ['MFCCEncoder'] - diff --git a/lightwood/encoder/type_encoder_maps/Binary.py b/lightwood/encoder/type_encoder_maps/Binary.py deleted file mode 100644 index d1d007435..000000000 --- a/lightwood/encoder/type_encoder_maps/Binary.py +++ /dev/null @@ -1,7 +0,0 @@ -from lightwood.encoder.categorical.onehot import OneHotEncoder -from lightwood.encoder.categorical.binary import BinaryEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder -from lightwood.encoder.identity.identity 
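# Sketch of _masked_criterion above: build a (T, B) validity mask from the sequence
# lengths (same broadcasting trick as len_to_mask), zero out padded positions, and
# average the per-sequence summed errors weighted by 1/length. Standalone toy example.
import torch

T, B, D = 5, 3, 2
lengths = torch.tensor([5, 3, 4])
output = torch.randn(T, B, D)
targets = torch.randn(T, B, D)

mask = (torch.arange(T)[None, :] < lengths[:, None]).t()     # (T, B) boolean mask
mask = mask.unsqueeze(-1).repeat(1, 1, D).float()            # inflate to feature dimension
losses = ((output * mask - targets * mask) ** 2).sum(dim=2).sum(dim=0)   # per-sequence SSE
loss = torch.dot(1.0 / lengths.float(), losses) / len(losses)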
import IdentityEncoder - -__all__ = ['BinaryEncoder', 'OneHotEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'IdentityEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Categorical.py b/lightwood/encoder/type_encoder_maps/Categorical.py deleted file mode 100644 index 079767a0b..000000000 --- a/lightwood/encoder/type_encoder_maps/Categorical.py +++ /dev/null @@ -1,7 +0,0 @@ -from lightwood.encoder.categorical.onehot import OneHotEncoder -from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder - - -__all__ = ['OneHotEncoder', 'CategoricalAutoEncoder', 'TimeSeriesEncoder', 'ArrayEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Date.py b/lightwood/encoder/type_encoder_maps/Date.py deleted file mode 100644 index fe663de22..000000000 --- a/lightwood/encoder/type_encoder_maps/Date.py +++ /dev/null @@ -1,6 +0,0 @@ -from lightwood.encoder.datetime.datetime import DatetimeEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder - - -__all__ = ['DatetimeEncoder', 'TimeSeriesEncoder', 'ArrayEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Datetime.py b/lightwood/encoder/type_encoder_maps/Datetime.py deleted file mode 100644 index fe663de22..000000000 --- a/lightwood/encoder/type_encoder_maps/Datetime.py +++ /dev/null @@ -1,6 +0,0 @@ -from lightwood.encoder.datetime.datetime import DatetimeEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder - - -__all__ = ['DatetimeEncoder', 'TimeSeriesEncoder', 'ArrayEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Float.py b/lightwood/encoder/type_encoder_maps/Float.py deleted file mode 100644 index dbbc5bff6..000000000 --- a/lightwood/encoder/type_encoder_maps/Float.py +++ /dev/null @@ -1,7 +0,0 @@ -from lightwood.encoder.numeric.numeric import NumericEncoder -from lightwood.encoder.numeric.ts_numeric import TsNumericEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder -from lightwood.encoder.identity.identity import IdentityEncoder - -__all__ = ['NumericEncoder', 'TsNumericEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'IdentityEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Image.py b/lightwood/encoder/type_encoder_maps/Image.py deleted file mode 100644 index 9fe5fd448..000000000 --- a/lightwood/encoder/type_encoder_maps/Image.py +++ /dev/null @@ -1,4 +0,0 @@ -from lightwood.encoder.image.img_2_vec import Img2VecEncoder - - -__all__ = ['Img2VecEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Integer.py b/lightwood/encoder/type_encoder_maps/Integer.py deleted file mode 100644 index dbbc5bff6..000000000 --- a/lightwood/encoder/type_encoder_maps/Integer.py +++ /dev/null @@ -1,7 +0,0 @@ -from lightwood.encoder.numeric.numeric import NumericEncoder -from lightwood.encoder.numeric.ts_numeric import TsNumericEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder -from lightwood.encoder.identity.identity import IdentityEncoder - -__all__ = ['NumericEncoder', 'TsNumericEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'IdentityEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Quantity.py b/lightwood/encoder/type_encoder_maps/Quantity.py deleted file mode 100644 index 
dbbc5bff6..000000000 --- a/lightwood/encoder/type_encoder_maps/Quantity.py +++ /dev/null @@ -1,7 +0,0 @@ -from lightwood.encoder.numeric.numeric import NumericEncoder -from lightwood.encoder.numeric.ts_numeric import TsNumericEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder -from lightwood.encoder.identity.identity import IdentityEncoder - -__all__ = ['NumericEncoder', 'TsNumericEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'IdentityEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Rich_Text.py b/lightwood/encoder/type_encoder_maps/Rich_Text.py deleted file mode 100644 index f92de6015..000000000 --- a/lightwood/encoder/type_encoder_maps/Rich_Text.py +++ /dev/null @@ -1,8 +0,0 @@ -from lightwood.encoder.text.short import ShortTextEncoder -from lightwood.encoder.text.vocab import VocabularyEncoder -from lightwood.encoder.text.rnn import RnnEncoder as TextRnnEncoder -from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder -from lightwood.encoder.text.pretrained import PretrainedLangEncoder - - -__all__ = ['ShortTextEncoder', 'VocabularyEncoder', 'TextRnnEncoder', 'CategoricalAutoEncoder', 'PretrainedLangEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Short_Text.py b/lightwood/encoder/type_encoder_maps/Short_Text.py deleted file mode 100644 index f92de6015..000000000 --- a/lightwood/encoder/type_encoder_maps/Short_Text.py +++ /dev/null @@ -1,8 +0,0 @@ -from lightwood.encoder.text.short import ShortTextEncoder -from lightwood.encoder.text.vocab import VocabularyEncoder -from lightwood.encoder.text.rnn import RnnEncoder as TextRnnEncoder -from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder -from lightwood.encoder.text.pretrained import PretrainedLangEncoder - - -__all__ = ['ShortTextEncoder', 'VocabularyEncoder', 'TextRnnEncoder', 'CategoricalAutoEncoder', 'PretrainedLangEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/Tags.py b/lightwood/encoder/type_encoder_maps/Tags.py deleted file mode 100644 index 6b32439eb..000000000 --- a/lightwood/encoder/type_encoder_maps/Tags.py +++ /dev/null @@ -1,7 +0,0 @@ -from lightwood.encoder.categorical.multihot import MultiHotEncoder -from lightwood.encoder.text.pretrained import PretrainedLangEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder -from lightwood.encoder.array.array import ArrayEncoder - - -__all__ = ['MultiHotEncoder', 'PretrainedLangEncoder', 'TimeSeriesEncoder', 'ArrayEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/TimeSeries.py b/lightwood/encoder/type_encoder_maps/TimeSeries.py deleted file mode 100644 index 2c8a7338e..000000000 --- a/lightwood/encoder/type_encoder_maps/TimeSeries.py +++ /dev/null @@ -1,5 +0,0 @@ -from lightwood.encoder.numeric.ts_array_numeric import TsArrayNumericEncoder -from lightwood.encoder.time_series.rnn import TimeSeriesEncoder - - -__all__ = ['TsArrayNumericEncoder', 'TimeSeriesEncoder'] diff --git a/lightwood/encoder/type_encoder_maps/__init__.py b/lightwood/encoder/type_encoder_maps/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/ensemble/base.py b/lightwood/ensemble/base.py deleted file mode 100644 index 49d1affa7..000000000 --- a/lightwood/ensemble/base.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import List - -import pandas as pd - -from lightwood.mixer.base import BaseMixer -from lightwood.data.encoded_ds import EncodedDs -from lightwood.api.types import PredictionArguments - - 
-class BaseEnsemble: - """ - Base class for all ensembles. - - Ensembles wrap sets of Lightwood mixers, with the objective of generating better predictions based on the output of each mixer. - - There are two important methods for any ensemble to work: - 1. `__init__()` should prepare all mixers and internal ensemble logic. - 2. `__call__()` applies any aggregation rules to generate final predictions based on the output of each mixer. - - Class Attributes: - - mixers: List of mixers the ensemble will use. - - supports_proba: For classification tasks, whether the ensemble supports yielding per-class scores rather than only returning the predicted label. - - """ # noqa - data: EncodedDs - mixers: List[BaseMixer] - best_index: int # @TODO: maybe only applicable to BestOf - supports_proba: bool - - def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs) -> None: - self.data = data - self.mixers = mixers - self.best_index = 0 - self.supports_proba = False - - def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame: - raise NotImplementedError() diff --git a/lightwood/ensemble/best_of.py b/lightwood/ensemble/best_of.py deleted file mode 100644 index fed88ed9e..000000000 --- a/lightwood/ensemble/best_of.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import List, Optional - -import numpy as np -import pandas as pd - -from lightwood.helpers.log import log -from lightwood.helpers.numeric import can_be_nan_numeric -from lightwood.mixer.base import BaseMixer -from lightwood.ensemble.base import BaseEnsemble -from lightwood.api.types import PredictionArguments -from lightwood.data.encoded_ds import EncodedDs -from lightwood.helpers.general import evaluate_accuracy - - -class BestOf(BaseEnsemble): - """ - This ensemble acts as a mixer selector. - After evaluating accuracy for all internal mixers with the validation data, it sets the best mixer as the underlying model. 
- """ # noqa - indexes_by_accuracy: List[float] - - def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs, accuracy_functions, - args: PredictionArguments, ts_analysis: Optional[dict] = None) -> None: - super().__init__(target, mixers, data) - - score_list = [] - for _, mixer in enumerate(mixers): - score_dict = evaluate_accuracy( - data.data_frame, - mixer(data, args)['prediction'], - target, - accuracy_functions, - ts_analysis=ts_analysis - ) - avg_score = np.mean(list(score_dict.values())) - log.info(f'Mixer: {type(mixer).__name__} got accuracy: {avg_score}') - - if can_be_nan_numeric(avg_score): - avg_score = -pow(2, 63) - log.warning(f'Change the accuracy of mixer {type(mixer).__name__} to valid value: {avg_score}') - - score_list.append(avg_score) - - self.indexes_by_accuracy = list(reversed(np.array(score_list).argsort())) - self.supports_proba = self.mixers[self.indexes_by_accuracy[0]].supports_proba - log.info(f'Picked best mixer: {type(self.mixers[self.indexes_by_accuracy[0]]).__name__}') - - def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame: - if args.all_mixers: - predictions = {} - for mixer in self.mixers: - predictions[f'__mdb_mixer_{type(mixer).__name__}'] = mixer(ds, args=args)['prediction'] - return pd.DataFrame(predictions) - else: - for mixer_index in self.indexes_by_accuracy: - mixer = self.mixers[mixer_index] - try: - return mixer(ds, args=args) - except Exception as e: - if mixer.stable: - raise(e) - else: - log.warning(f'Unstable mixer {type(mixer).__name__} failed with exception: {e}.\ - Trying next best') diff --git a/lightwood/ensemble/mean_ensemble.py b/lightwood/ensemble/mean_ensemble.py deleted file mode 100644 index 839119d22..000000000 --- a/lightwood/ensemble/mean_ensemble.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import List - -import pandas as pd - -from lightwood.mixer.base import BaseMixer -from lightwood.ensemble.base import BaseEnsemble -from lightwood.api.types import PredictionArguments -from lightwood.data.encoded_ds import EncodedDs -from lightwood import dtype - - -class MeanEnsemble(BaseEnsemble): - def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs, dtype_dict: dict) -> None: - super().__init__(target, mixers, data) - if dtype_dict[target] not in (dtype.float, dtype.integer, dtype.quantity): - raise Exception( - f'This ensemble can only be used regression problems! 
Got target dtype {dtype_dict[target]} instead!') - - def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame: - predictions_df = pd.DataFrame() - for mixer in self.mixers: - predictions_df[f'__mdb_mixer_{type(mixer).__name__}'] = mixer(ds, args=args)['prediction'] - - return pd.DataFrame(predictions_df.mean(axis='columns'), columns=['prediction']) - diff --git a/lightwood/helpers/__init__.py b/lightwood/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/helpers/device.py b/lightwood/helpers/device.py deleted file mode 100644 index 528ef8ff1..000000000 --- a/lightwood/helpers/device.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -import os -from random import randint -from torch.cuda import device_count, get_device_capability - - -def is_cuda_compatible(): - compatible_device_count = 0 - if torch.version.cuda is not None: - for d in range(device_count()): - capability = get_device_capability(d) - major = capability[0] - minor = capability[1] - current_arch = major * 10 + minor - min_arch = min((int(arch.split("_")[1]) for arch in torch.cuda.get_arch_list()), default=35) - if (not current_arch < min_arch - and not torch._C._cuda_getCompiledVersion() <= 9000): - compatible_device_count += 1 - - if compatible_device_count > 0: - return True - return False - - -def get_devices(): - if torch.cuda.is_available() and is_cuda_compatible(): - device_str = "cuda" - available_devices = torch.cuda.device_count() - - if available_devices > 1: - if os.environ.get('RANDOM_GPU', False) in ['1', 'true', 'True', True, 1]: - device_str = 'cuda:' + str(randint(0, available_devices - 1)) - available_devices = 1 - else: - device_str = "cpu" - available_devices = 0 - - return torch.device(device_str), available_devices diff --git a/lightwood/helpers/general.py b/lightwood/helpers/general.py deleted file mode 100644 index 31976ce76..000000000 --- a/lightwood/helpers/general.py +++ /dev/null @@ -1,177 +0,0 @@ -import importlib -from typing import List, Union, Dict, Optional - -import numpy as np -import pandas as pd -from sklearn.metrics import r2_score, f1_score, mean_absolute_error - - -from lightwood.encoder.time_series.helpers.common import get_group_matches - - -# ------------------------- # -# Accuracy metrics -# ------------------------- # -def evaluate_accuracy(data: pd.DataFrame, - predictions: pd.Series, - target: str, - accuracy_functions: List[str], - ts_analysis: Optional[dict] = {}) -> Dict[str, float]: - """ - Dispatcher for accuracy evaluation. - - :param data: original dataframe. - :param predictions: output of a lightwood predictor for the input `data`. - :param target: target column name. - :param accuracy_functions: list of accuracy function names. Support currently exists for `scikit-learn`'s `metrics` module, plus any custom methods that Lightwood exposes. - :param ts_analysis: `lightwood.data.timeseries_analyzer` output, used to compute time series task accuracy. - :return: accuracy metric for a dataset and predictions. 
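# Sketch of the aggregation in MeanEnsemble.__call__ above: one column per mixer, then a
# row-wise mean becomes the final prediction. Mixer names and values are illustrative.
import pandas as pd

predictions_df = pd.DataFrame({
    '__mdb_mixer_Neural': [10.0, 20.0, 30.0],
    '__mdb_mixer_LightGBM': [12.0, 18.0, 33.0],
})
final = pd.DataFrame(predictions_df.mean(axis='columns'), columns=['prediction'])
# -> 11.0, 19.0, 31.5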
- """ # noqa - score_dict = {} - - for accuracy_function_str in accuracy_functions: - if accuracy_function_str == 'evaluate_array_accuracy': - nr_predictions = 1 if not isinstance(predictions.iloc[0], list) else len(predictions.iloc[0]) - cols = [target] + [f'{target}_timestep_{i}' for i in range(1, nr_predictions)] - true_values = data[cols].values.tolist() - score_dict[accuracy_function_str] = evaluate_array_accuracy(list(true_values), - list(predictions), - data, - ts_analysis=ts_analysis) - else: - true_values = data[target].tolist() - accuracy_function = getattr(importlib.import_module('sklearn.metrics'), accuracy_function_str) - score_dict[accuracy_function_str] = accuracy_function(list(true_values), list(predictions)) - - return score_dict - - -def evaluate_regression_accuracy( - true_values, - predictions, - **kwargs -): - """ - Evaluates accuracy for regression tasks. - If predictions have a lower and upper bound, then `within-bound` accuracy is computed: whether the ground truth value falls within the predicted region. - If not, then a (positive bounded) R2 score is returned instead. - - :return: accuracy score as defined above. - """ # noqa - if 'lower' and 'upper' in predictions: - Y = np.array(true_values).astype(float) - within = ((Y >= predictions['lower']) & (Y <= predictions['upper'])) - return sum(within) / len(within) - else: - r2 = r2_score(true_values, predictions['prediction']) - return max(r2, 0) - - -def evaluate_multilabel_accuracy(true_values, predictions, **kwargs): - """ - Evaluates accuracy for multilabel/tag prediction. - - :return: weighted f1 score of predictions and ground truths. - """ - pred_values = predictions['prediction'] - return f1_score(true_values, pred_values, average='weighted') - - -def evaluate_array_accuracy( - true_values: List[List[Union[int, float]]], - predictions: List[List[Union[int, float]]], - data: pd.DataFrame, - **kwargs -) -> float: - """ - Evaluate accuracy in numerical time series forecasting tasks. - Defaults to mean absolute scaled error (MASE) if in-sample residuals are available. - If this is not the case, R2 score is computed instead. - - Scores are computed for each timestep (as determined by `timeseries_settings.nr_predictions`), - and the final accuracy is the reciprocal of the average score through all timesteps. 
- """ - - ts_analysis = kwargs.get('ts_analysis', {}) - naive_errors = ts_analysis.get('ts_naive_mae', {}) - - if not naive_errors: - # use mean R2 method if naive errors are not available - return evaluate_array_r2_accuracy(true_values, predictions, ts_analysis=ts_analysis) - - mases = [] - true_values = np.array(true_values) - predictions = np.array(predictions) - wrapped_data = {'data': data.reset_index(drop=True), - 'group_info': {gcol: data[gcol].tolist() - for gcol in ts_analysis['tss'].group_by} if ts_analysis['tss'].group_by else {} - } - for group in ts_analysis['group_combinations']: - g_idxs, _ = get_group_matches(wrapped_data, group) - - # only evaluate populated groups - if g_idxs: - trues = true_values[g_idxs] - preds = predictions[g_idxs] - - if ts_analysis['tss'].nr_predictions == 1: - preds = np.expand_dims(preds, axis=1) - - # only evaluate accuracy for rows with complete historical context - if len(trues) > ts_analysis['tss'].window: - trues = trues[ts_analysis['tss'].window:] - preds = preds[ts_analysis['tss'].window:] - - # add MASE score for each group (__default only considered if the task is non-grouped) - if len(ts_analysis['group_combinations']) == 1 or group != '__default': - mases.append(mase(trues, preds, ts_analysis['ts_naive_mae'][group], ts_analysis['tss'].nr_predictions)) - - return 1 / max(np.average(mases), 1e-4) # reciprocal to respect "larger -> better" convention - - -def evaluate_array_r2_accuracy( - true_values: List[List[Union[int, float]]], - predictions: List[List[Union[int, float]]], - **kwargs -) -> float: - """ - Default time series forecasting accuracy method. - Returns mean R2 score over all timesteps in the forecasting horizon. - """ - base_acc_fn = kwargs.get('base_acc_fn', lambda t, p: max(0, r2_score(t, p))) - - aggregate = 0.0 - - fh = 1 if not isinstance(predictions[0], list) else len(predictions[0]) - if fh == 1: - predictions = [[p] for p in predictions] - - # only evaluate accuracy for rows with complete historical context - if kwargs.get('ts_analysis', {}).get('tss', False): - true_values = true_values[kwargs['ts_analysis']['tss'].window:] - predictions = predictions[kwargs['ts_analysis']['tss'].window:] - - for i in range(fh): - aggregate += base_acc_fn([t[i] for t in true_values], [p[i] for p in predictions]) - - return aggregate / fh - - -# ------------------------- # -# Helpers -# ------------------------- # -def mase(trues, preds, scale_error, fh): - """ - Computes mean absolute scaled error. - The scale corrective factor is the mean in-sample residual from the naive forecasting method. 
- """ - if scale_error == 0: - scale_error = 1 # cover (rare) case where series is constant - - agg = 0.0 - for i in range(fh): - true = [t[i] for t in trues] - pred = [p[i] for p in preds] - agg += mean_absolute_error(true, pred) - - return (agg / fh) / scale_error diff --git a/lightwood/helpers/io.py b/lightwood/helpers/io.py deleted file mode 100644 index f8f543463..000000000 --- a/lightwood/helpers/io.py +++ /dev/null @@ -1,20 +0,0 @@ -import requests -from lightwood.helpers.log import log -import os - - -def read_from_path_or_url(path: str, load_from_path): - if path.startswith('http'): - response = requests.get(path) - with open(path.split('/')[-1], 'wb') as f: - f.write(response.content) - try: - return load_from_path(path.split('/')[-1]) - except Exception as e: - log.error(e) - finally: - os.remove(path.split('/')[-1]) - else: - # Will automatically resample to 22.05kHz and convert to mono - return load_from_path(path) - diff --git a/lightwood/helpers/log.py b/lightwood/helpers/log.py deleted file mode 100644 index 0f9ce861d..000000000 --- a/lightwood/helpers/log.py +++ /dev/null @@ -1,19 +0,0 @@ -import logging -import os -import colorlog - - -def initialize_log(): - pid = os.getpid() - - handler = colorlog.StreamHandler() - handler.setFormatter(colorlog.ColoredFormatter()) - - logging.basicConfig(handlers=[handler]) - log = logging.getLogger(f'lightwood-{pid}') - log_level = os.environ.get('LIGHTWOOD_LOG', 'DEBUG') - log.setLevel(log_level) - return log - - -log = initialize_log() diff --git a/lightwood/helpers/numeric.py b/lightwood/helpers/numeric.py deleted file mode 100644 index 5e9d1f943..000000000 --- a/lightwood/helpers/numeric.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Iterable - - -def can_be_nan_numeric(value: object) -> bool: - """ - Determines if **value** might be `nan` or `inf` or some other numeric value (i.e. which can be cast as `float`) that is not actually a number. - - Name is vague due to uncertainty that all edge cases of numeric values that have number-like type behavior are covered. 
- """ # noqa - - try: - value = str(value) - value = float(value) - except Exception: - return False - - try: - if isinstance(value, float): - a = int(value) # noqa - isnan = False - except Exception: - isnan = True - return isnan - - -def filter_nan_and_none(series: Iterable) -> list: - return [x for x in series if not can_be_nan_numeric(x) and x is not None] diff --git a/lightwood/helpers/parallelism.py b/lightwood/helpers/parallelism.py deleted file mode 100644 index 01b1a0a7e..000000000 --- a/lightwood/helpers/parallelism.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -from typing import Dict -import psutil -import multiprocessing as mp -from lightwood.helpers.log import log - - -def get_nr_procs(df=None): - if os.name == 'nt': - return 1 - else: - available_mem = psutil.virtual_memory().available - max_per_proc_usage = 0.2 * pow(10, 9) - - if df is not None: - max_per_proc_usage += df.memory_usage(index=True, deep=True).sum() - proc_count = int(min(mp.cpu_count(), available_mem // max_per_proc_usage)) - 1 - - return max(proc_count, 1) - - -def run_mut_method(obj: object, arg: object, method: str, identifier: str) -> str: - try: - obj.__getattribute__(method)(arg) - return obj, identifier - except Exception as e: - log.error(f'Exception {e} when running with identifier {identifier}') - raise e - - -def mut_method_call(object_dict: Dict[str, tuple]) -> Dict[str, object]: - manager = mp.Manager() - return_dict = manager.dict() - - nr_procs = get_nr_procs() - pool = mp.Pool(processes=nr_procs) - promise_arr = [] - for name, data in object_dict.items(): - promise = pool.apply_async(func=run_mut_method, args=(data[0], data[1], data[2], name)) - promise_arr.append(promise) - - for promise in promise_arr: - obj, identifier = promise.get() - return_dict[identifier] = obj - log.info(f'Done running for: {identifier}') - - pool.close() - pool.join() - - return dict(return_dict) diff --git a/lightwood/helpers/seed.py b/lightwood/helpers/seed.py deleted file mode 100644 index 5398505fe..000000000 --- a/lightwood/helpers/seed.py +++ /dev/null @@ -1,11 +0,0 @@ -import torch -import numpy as np -import random - - -def seed(seed_nr: int) -> None: - torch.manual_seed(seed_nr) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - np.random.seed(seed_nr) - random.seed(seed_nr) diff --git a/lightwood/helpers/templating.py b/lightwood/helpers/templating.py deleted file mode 100644 index 5c8ecd542..000000000 --- a/lightwood/helpers/templating.py +++ /dev/null @@ -1,105 +0,0 @@ - - -''' -def is_allowed(v): - print(v) - if v is None: - return True - - if isinstance(v, bool): - return True - - try: - float(v) - return True - except: - pass - - if v in ['True', 'False']: - return True - - if isinstance(v, str): - if v.startswith('"') and v.endswith('"'): - return True - if v.startswith("'") and v.endswith("'"): - return True - - # Predictor member - if v.startswith('self.') and '(' not in v and len(v) < 50: - return True - - # Allowed variable names - if v in ['df', 'data', 'encoded_data', 'train_data', 'encoded_train_data', 'test_data']: - return True - - try: - cv = dict(v) - for k in cv: - ka = is_allowed(k) - ma = is_allowed(cv[k]) - if not ka or not ma: - return False - return True - except Exception: - pass - - try: - cv = list(v) - for ma in cv: - ma = is_allowed(m) - if not ma: - return False - return True - except Exception: - pass - - raise Exception(f'Possible code injection: {v}') -''' - - -def is_allowed(v): - if '(' in str(v): - return False - if 'lambda' in str(v): - 
return False - if '__' in str(v): - return False - - return True - - -def call(entity: dict) -> str: - for k, v in entity['args'].items(): - if not str(v).startswith('$'): - if not is_allowed(v): - raise Exception(f'Invalid value: {v} for arg {k}') - - args = [f'{k}={v}' for k, v in entity['args'].items() if not str(v).startswith('$')] - - for k, v in entity['args'].items(): - if str(v).startswith('$'): - v = str(v).replace('$', 'self.') - args.append(f'{k}={v}') - - args = ','.join(args) - return f"""{entity['module']}({args})""" - - -def inline_dict(obj: dict) -> str: - arr = [] - for k, v in obj.items(): - k = k.replace("'", "\\'").replace('"', '\\"') - arr.append(f"""'{k}': {v}""") - - dict_code = '{\n' + ',\n'.join(arr) + '\n}' - return dict_code - - -def align(code: str, indent: int) -> str: - add_space = '' - for _ in range(indent): - add_space += ' ' - - code_arr = code.split('\n') - code = f'\n{add_space}'.join(code_arr) - return code diff --git a/lightwood/helpers/text.py b/lightwood/helpers/text.py deleted file mode 100644 index 992db70d5..000000000 --- a/lightwood/helpers/text.py +++ /dev/null @@ -1,286 +0,0 @@ -""" -******************************************************* - * Copyright (C) 2017 MindsDB Inc. - * - * This file is part of MindsDB Server. - * - * MindsDB Server can not be copied and/or distributed without the express - * permission of MindsDB Inc - ******************************************************* -""" -from collections import Counter, defaultdict -import string -import json -import re -import hashlib -import numpy as np -import scipy.stats as st -import langdetect -import nltk -from lightwood.api.dtype import dtype - - -try: - nltk.data.find('tokenizers/punkt') -except LookupError: - nltk.download('punkt') - -try: - from nltk.corpus import stopwords - stopwords.words('english') -except LookupError: - nltk.download('stopwords', quiet=True) - - -def get_language_dist(data): - langdetect.DetectorFactory.seed = 0 - lang_dist = defaultdict(lambda: 0) - lang_dist['Unknown'] = 0 - lang_probs_cache = dict() - for text in data: - text = str(text) - text = ''.join([c for c in text if c not in string.punctuation]) - if text not in lang_probs_cache: - try: - lang_probs = langdetect.detect_langs(text) - except langdetect.lang_detect_exception.LangDetectException: - lang_probs = [] - lang_probs_cache[text] = lang_probs - - lang_probs = lang_probs_cache[text] - if len(lang_probs) > 0 and lang_probs[0].prob > 0.90: - lang_dist[lang_probs[0].lang] += 1 - else: - lang_dist['Unknown'] += 1 - - return dict(lang_dist) - - -def analyze_sentences(data): - """ - :param data: list of str - - :returns: - tuple( - int: nr words total, - dict: word_dist, - dict: nr_words_dist - ) - """ - nr_words = 0 - word_dist = defaultdict(int) - nr_words_dist = defaultdict(int) - stop_words = set(stopwords.words('english')) - for text in map(str, data): - text = text.lower() - tokens = tokenize_text(text) - tokens_no_stop = [x for x in tokens if x not in stop_words] - nr_words_dist[len(tokens)] += 1 - nr_words += len(tokens) - for tok in tokens_no_stop: - word_dist[tok] += 1 - - return nr_words, dict(word_dist), dict(nr_words_dist) - - -def word_tokenize(string): - sep_tag = '{#SEP#}' - for separator in WORD_SEPARATORS: - string = str(string).replace(separator, sep_tag) - - words_split = string.split(sep_tag) - num_words = len([word for word in words_split if word and word not in ['', None]]) - return num_words - - -def clean_float(val): - if isinstance(val, (int, float)): - return float(val) - - 
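# Illustration of what the templating helpers above produce. call() renders a constructor
# string, keeping literal args first and rewriting '$'-prefixed args to 'self.'
# references; values containing '(', 'lambda' or '__' are rejected. The entity below is
# assumed for illustration, not taken from the diff.
entity = {
    'module': 'Neural',
    'args': {'fit_on_dev': 'True', 'target': '"price"', 'stop_after': '$problem_definition.seconds_per_mixer'},
}
# call(entity) would return:
#   Neural(fit_on_dev=True,target="price",stop_after=self.problem_definition.seconds_per_mixer)
# and inline_dict({'target': '"price"'}) would return the code string:
#   {
#   'target': "price"
#   }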
if isinstance(val, float): - return val - - val = str(val).strip(' ') - val = val.replace(',', '.') - val = val.rstrip('"').lstrip('"') - - if val == '' or val == 'None' or val == 'nan': - return None - - return float(val) - - -def gen_chars(length, character): - """ - # lambda to Generates a string consisting of `length` consiting of repeating `character` - :param length: - :param character: - :return: - """ - return ''.join([character for i in range(length)]) - - -def cast_string_to_python_type(string): - """ Returns None, an integer, float or a string from a string""" - if string is None or string == '': - return None - - if string.isnumeric(): - # Did you know you can write fractions in unicode, and they are numeric but can't be cast to integers !? - try: - return int(string) - except Exception: - return None - - try: - return clean_float(string) - except Exception: - return string - - -def splitRecursive(word, tokens): - words = [str(word)] - for token in tokens: - new_split = [] - for word in words: - new_split += word.split(token) - words = new_split - words = [word for word in words if word not in ['', None]] - return words - - -def hashtext(cell): - text = json.dumps(cell) - return hashlib.md5(text.encode('utf8')).hexdigest() - - -def _is_foreign_key_name(name): - for endings in ['id', 'ID', 'Id']: - for add in ['-', '_', ' ']: - if name.endswith(add + endings): - return True - for endings in ['ID', 'Id']: - if name.endswith(endings): - return True - return False - - -def _is_identifier_name(name): - for keyword in ['account', 'uuid', 'identifier', 'user']: - if keyword in name: - return True - return False - - -def isascii(string): - """ - Used instead of str.isascii because python 3.6 doesn't have that - """ - return all(ord(c) < 128 for c in string) - - -def extract_digits(point): - return ''.join([char for char in str(point) if char.isdigit()]) - - -def get_pct_auto_increment(data): - int_data = [] - for point in [extract_digits(x) for x in data]: - try: - int_data.append(int(point)) - except Exception: - pass - - int_data = sorted(int_data) - prev_nr = int_data[0] - increase_by_one = 0 - for nr in int_data[1:]: - diff = nr - prev_nr - if diff == 1: - increase_by_one += 1 - prev_nr = nr - - return increase_by_one / (len(data) - 1) - - -def get_identifier_description_mp(arg_tup): - data, column_name, data_dtype = arg_tup - return get_identifier_description(data, column_name, data_dtype) - - -def get_identifier_description(data, column_name, data_dtype): - data = list(data) - unquie_pct = len(set(data)) / len(data) - - spaces = [len(str(x).split(' ')) - 1 for x in data] - mean_spaces = np.mean(spaces) - - # Detect auto incrementing index - # -- some cases where I guess people do want to use this for learning, so ignoring this check for now... 
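# Illustration of the string-coercion helpers defined above (expected return values shown
# as comments, assuming the functions in this file):
# clean_float('3,14')   -> 3.14    (commas are treated as decimal points)
# clean_float('"7"')    -> 7.0     (surrounding quotes are stripped)
# clean_float('nan')    -> None    (as are '' and 'None')
# cast_string_to_python_type('42')    -> 42
# cast_string_to_python_type('3.5')   -> 3.5
# cast_string_to_python_type('hello') -> 'hello'
# splitRecursive('a,b;c', [',', ';']) -> ['a', 'b', 'c']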
- # if data_dtype == dtype.integer: - # if get_pct_auto_increment(data) > 0.98 and unquie_pct > 0.99: - # return 'Auto-incrementing identifier' - - # Detect hash - all_same_length = all(len(str(data[0])) == len(str(x)) for x in data) - uuid_charset = set('0123456789abcdefABCDEF-') - all_uuid_charset = all(set(str(x)).issubset(uuid_charset) for x in data) - is_uuid = all_uuid_charset and all_same_length - - if all_same_length and len(data) == len(set(data)) and data_dtype not in (dtype.integer, dtype.float): - str_data = [str(x) for x in data] - randomness_per_index = [] - for i, _ in enumerate(str_data[0]): - N = len(set(x[i] for x in str_data)) - S = st.entropy([*Counter(x[i] for x in str_data).values()]) - randomness_per_index.append(S / np.log(N)) - - if np.mean(randomness_per_index) > 0.95: - return 'Hash-like identifier' - - # Detect foreign key - if data_dtype == dtype.integer: - if _is_foreign_key_name(column_name): - return 'Foreign key' - - if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary): - if unquie_pct > 0.98: - if is_uuid: - return 'UUID' - else: - return 'Unknown identifier' - - # Everything is unique and it's too short to be rich text - if data_dtype in (dtype.categorical, dtype.short_text, dtype.rich_text) and \ - unquie_pct > 0.99999 and mean_spaces < 1: - return 'Unknown identifier' - - return None - - -def contains_alnum(text): - for c in text: - if c.isalnum(): - return True - return False - - -def decontracted(phrase): - # specific - phrase = re.sub(r"won\'t", "will not", phrase) - phrase = re.sub(r"can\'t", "can not", phrase) - - # general - phrase = re.sub(r"n\'t", " not", phrase) - phrase = re.sub(r"\'re", " are", phrase) - phrase = re.sub(r"\'s", " is", phrase) - phrase = re.sub(r"\'d", " would", phrase) - phrase = re.sub(r"\'ll", " will", phrase) - phrase = re.sub(r"\'t", " not", phrase) - phrase = re.sub(r"\'ve", " have", phrase) - phrase = re.sub(r"\'m", " am", phrase) - return phrase - - -def tokenize_text(text): - return [t.lower() for t in nltk.word_tokenize(decontracted(text)) if contains_alnum(t)] diff --git a/lightwood/helpers/torch.py b/lightwood/helpers/torch.py deleted file mode 100644 index 807519ddd..000000000 --- a/lightwood/helpers/torch.py +++ /dev/null @@ -1,72 +0,0 @@ -import functools -import torch -from torch.nn.functional import pad -from lightwood.helpers.device import get_devices - - -def concat_vectors_and_pad(vec_list, max_): - assert len(vec_list) > 0 - assert len(vec_list) <= max_ - assert max_ > 0 - - cat_vec = torch.cat(list(vec_list), dim=0) - - pad_size = max_ - len(vec_list) - padding = (0, pad_size * vec_list[0].size(0)) - padded = pad(cat_vec[None], padding, 'constant', 0)[0] - - return padded - - -def average_vectors(vec_list): - assert len(vec_list) > 0 - return torch.cat([emb[None] for emb in vec_list], dim=0).mean(0) - - -class LightwoodAutocast: - """ - Equivalent to torch.cuda.amp.autocast, but checks device compute capability - to activate the feature only when the GPU has tensor cores to leverage AMP. 
- """ - active = False - - def __init__(self, enabled=True): - self.major = 0 # GPU major version - torch_version = [int(i) for i in torch.__version__.split('.')[:-1]] - - if not enabled or not torch.cuda.is_available() or torch_version[0] < 1 or torch_version[1] < 6: - self._enabled = False - else: - device, _ = get_devices() - if device.type == 'cuda': - # tensor cores only exist from 7 onwards - # if this is not the case, then AMP is unnecessary overhead - self.major, _ = torch.cuda.get_device_capability(device) - self._enabled = enabled if self.major > 6 else False - else: - self._enabled = False # gpu is available but cpu is forced - - self.prev = self._enabled # necessary reference to exit - LightwoodAutocast.active = self._enabled - - def __enter__(self): - if self._enabled: - self.prev = torch.is_autocast_enabled() - torch.set_autocast_enabled(self._enabled) - torch.autocast_increment_nesting() - - def __exit__(self, *args): - if self._enabled: - # Drop the cache when we exit to a nesting level that's outside any instance of autocast - if torch.autocast_decrement_nesting() == 0: - torch.clear_autocast_cache() - torch.set_autocast_enabled(self.prev) - return False - - def __call__(self, func): - @functools.wraps(func) - def decorate_autocast(*args, **kwargs): - with self: - return func(*args, **kwargs) - - return decorate_autocast diff --git a/lightwood/helpers/ts.py b/lightwood/helpers/ts.py deleted file mode 100644 index 05ddccc3c..000000000 --- a/lightwood/helpers/ts.py +++ /dev/null @@ -1,46 +0,0 @@ -import pandas as pd -from lightwood.api.types import TimeseriesSettings - - -def get_inferred_timestamps(df: pd.DataFrame, col: str, deltas: dict, tss: TimeseriesSettings) -> pd.DataFrame: - nr_predictions = tss.nr_predictions - if tss.group_by: - gby = [f'group_{g}' for g in tss.group_by] - - for (idx, row) in df.iterrows(): - last = row[f'order_{col}'][-1] - - if tss.group_by: - try: - series_delta = deltas[frozenset(row[gby].tolist())][col] - except KeyError: - series_delta = deltas['__default'][col] - else: - series_delta = deltas['__default'][col] - timestamps = [last + t * series_delta for t in range(nr_predictions)] - - if tss.nr_predictions == 1: - timestamps = timestamps[0] # preserves original input format if nr_predictions == 1 - - df[f'order_{col}'].iloc[idx] = timestamps - return df[f'order_{col}'] - - -def add_tn_conf_bounds(data: pd.DataFrame, tss_args: TimeseriesSettings): - """ - Add confidence (and bounds if applicable) to t+n predictions, for n>1 - @TODO: active research question: how to guarantee 1-e coverage for t+n, n>1 - for now, we replicate the width and conf obtained for t+1 - """ - for col in ['confidence', 'lower', 'upper']: - data[col] = data[col].astype(object) - - for idx, row in data.iterrows(): - data['confidence'].iloc[idx] = [row['confidence'] for _ in range(tss_args.nr_predictions)] - - preds = row['prediction'] - width = row['upper'] - row['lower'] - data['lower'].iloc[idx] = [pred - width / 2 for pred in preds] - data['upper'].iloc[idx] = [pred + width / 2 for pred in preds] - - return data diff --git a/lightwood/mixer/__init__.py b/lightwood/mixer/__init__.py deleted file mode 100644 index df5d14843..000000000 --- a/lightwood/mixer/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from lightwood.mixer.unit import Unit -from lightwood.mixer.base import BaseMixer -from lightwood.mixer.neural import Neural -from lightwood.mixer.lightgbm import LightGBM -from lightwood.mixer.lightgbm_array import LightGBMArray -from lightwood.mixer.sktime import SkTime 
-from lightwood.mixer.regression import Regression - -try: - from lightwood.mixer.qclassic import QClassic -except Exception: - QClassic = None - -__all__ = ['BaseMixer', 'Neural', 'LightGBM', 'LightGBMArray', 'Unit', 'Regression', 'SkTime', 'QClassic'] diff --git a/lightwood/mixer/base.py b/lightwood/mixer/base.py deleted file mode 100644 index dcea431c2..000000000 --- a/lightwood/mixer/base.py +++ /dev/null @@ -1,67 +0,0 @@ -import pandas as pd - -from lightwood.data.encoded_ds import EncodedDs -from lightwood.api.types import PredictionArguments - - -class BaseMixer: - """ - Base class for all mixers. - - Mixers are the backbone of all Lightwood machine learning models. They intake encoded feature representations for every column, and are tasked with learning to fulfill the predictive requirements stated in a problem definition. - - There are two important methods for any mixer to work: - 1. `fit()` contains all logic to train the mixer with the training data that has been encoded by all the (already trained) Lightwood encoders for any given task. - 2. `__call__()` is executed to generate predictions once the mixer has been trained using `fit()`. - - An additional `partial_fit()` method is used to update any mixer that has already been trained. - - Class Attributes: - - stable: If set to `True`, this mixer should always work. Any mixer with `stable=False` can be expected to fail under some circumstances. - - fit_data_len: Length of the training data. - - supports_proba: For classification tasks, whether the mixer supports yielding per-class scores rather than only returning the predicted label. - - """ # noqa - stable: bool - fit_data_len: int # @TODO (Patricio): should this really be in `BaseMixer`? - supports_proba: bool - - def __init__(self, stop_after: int): - """ - Initializer a mixer. - - :param stop_after: Time budget to train this mixer. - """ - self.stop_after = stop_after - self.supports_proba = False - - def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - """ - Fits/trains a mixer with training data. - - :param train_data: encoded representations of the training data subset. - :param dev_data: encoded representations of the "dev" data subset. This can be used as an internal validation subset (e.g. it is used for early stopping in the default `Neural` mixer). - - """ # noqa - raise NotImplementedError() - - def __call__(self, ds: EncodedDs, - args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: - """ - Calls a trained mixer to predict the target column given some input data. - - :param ds: encoded representations of input data. - :param args: a `lightwood.api.types.PredictionArguments` object, including all relevant inference-time arguments to customize the behavior. - :return: - """ # noqa - raise NotImplementedError() - - def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - """ - Partially fits/trains a mixer with new training data. This is a somewhat experimental method, and it aims at updating pre-existing Lightwood predictors. - - :param train_data: encoded representations of the new training data subset. - :param dev_data: encoded representations of new the "dev" data subset. As in `fit()`, this can be used as an internal validation subset. 
- - """ # noqa - pass diff --git a/lightwood/mixer/helpers/__init__.py b/lightwood/mixer/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lightwood/mixer/helpers/ar_net.py b/lightwood/mixer/helpers/ar_net.py deleted file mode 100644 index 73c5952f0..000000000 --- a/lightwood/mixer/helpers/ar_net.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch -from torch import nn -from lightwood.mixer.helpers.default_net import DefaultNet -from lightwood.helpers.torch import LightwoodAutocast - - -class ArNet(DefaultNet): - """ - DefaultNet variant that adds a secondary stream (simple linear layer) with constrained - weights to learn autoregressive coefficients for numerical time series targets. - """ - - def __init__(self, - encoder_span: dict, # contains index span for each encoder - target_name: str, - input_size: int = None, - output_size: int = None, - shape: list = None, - max_params: int = 3e7, - num_hidden: int = 1, - dropout: float = 0) -> None: - - self.ar_net = None - super().__init__(input_size=input_size, - output_size=output_size, - shape=shape, - max_params=max_params, - num_hidden=num_hidden, - dropout=dropout - ) - self.target = target_name - self.encoder_span = encoder_span - self.ar_column = f'__mdb_ts_previous_{self.target}' - self.ar_idxs = list(*[range(idx[0], idx[1]) for col, idx in encoder_span.items() if col == self.ar_column]) - dims = [(len(self.ar_idxs), output_size)] - linears = [nn.Linear(in_features=inf, out_features=outf) for inf, outf in dims] - self.ar_net = nn.Sequential(*linears) - self.ar_net.to(self.device) - - def to(self, device=None, available_devices=None): - if self.ar_net: - self.ar_net.to(device) - return super().to(device) - - def forward(self, input): - with LightwoodAutocast(): - if len(input.shape) == 1: - input = input.unsqueeze(0) - - residual_output = self.net(input) - ar_output = self.ar_net(input[:, self.ar_idxs]) - if self.ar_net.training: - self.ar_net._modules['0'].weight = nn.Parameter(torch.clamp(self.ar_net._modules['0'].weight, - 0.0, - 0.999)) # force unit root - return ar_output + residual_output diff --git a/lightwood/mixer/helpers/default_net.py b/lightwood/mixer/helpers/default_net.py deleted file mode 100644 index a62467063..000000000 --- a/lightwood/mixer/helpers/default_net.py +++ /dev/null @@ -1,69 +0,0 @@ -import math -import torch -from lightwood.helpers.torch import LightwoodAutocast -from lightwood.helpers.device import get_devices -from lightwood.helpers.log import log -import numpy as np - - -class DefaultNet(torch.nn.Module): - """ - Default neural network module used in `Neural` mixer to learn the predictive task based on encoded feature representations. - - Small architecture with either a one or two linear layers. Hidden size might be constrained to have at most `max_params` in the network, trading off accuracy for speed. 
- """ # noqa - - def __init__(self, - input_size: int = None, - output_size: int = None, - shape: list = None, - max_params: int = int(3e7), - num_hidden: int = 1, - dropout: float = 0) -> None: - - super(DefaultNet, self).__init__() - if input_size is not None and output_size is not None: - self.input_size = input_size - self.output_size = output_size - hidden_size = max([self.input_size * 2, self.output_size * 2, 400]) - shape = [self.input_size] + [hidden_size] * num_hidden + [self.output_size] - - # If the network is too big, shrink it - if np.sum([shape[i] * shape[i + 1] for i in range(len(shape) - 1)]) > max_params: - log.warning('Shrinking network!') - hidden_size = math.floor(max_params / (self.input_size * self.output_size)) - - if hidden_size > self.output_size: - shape = [self.input_size, hidden_size, self.output_size] - else: - shape = [self.input_size, self.output_size] - if shape is not None: - layers = [] - for ind in range(len(shape) - 1): - layers.append(torch.nn.Linear(shape[ind], shape[ind + 1])) - if ind < len(shape) - 2: - layers.append(torch.nn.SELU()) - if dropout > 0.001: - layers.append(torch.nn.Dropout(p=dropout)) - else: - raise Exception('You must specify other a shape or an input and output size when creating a DefaultNet!') - - self.net = torch.nn.Sequential(*layers) - self.to(get_devices()[0]) - - def to(self, device: torch.device) -> torch.nn.Module: - if 'cuda' not in str(torch.device) == 0: - log.warning( - 'Creating neural network on CPU, it will be significantly slower than using a GPU, consider using a GPU instead') # noqa - self.net = self.net.to(device) - - self.device = device - return self - - def forward(self, input): - try: - with LightwoodAutocast(): - output = self.net(input) - except Exception: - output = self.net(input) - return output diff --git a/lightwood/mixer/helpers/qclassic_net.py b/lightwood/mixer/helpers/qclassic_net.py deleted file mode 100644 index 28563a66d..000000000 --- a/lightwood/mixer/helpers/qclassic_net.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -import qiskit -import numpy as np - -from lightwood.mixer.helpers.default_net import DefaultNet -from lightwood.helpers.torch import LightwoodAutocast - - -class QuantumCircuit: - """ - This class provides a simple interface for interaction - with the quantum circuit - """ - - def __init__(self, n_qubits, backend, shots): - # --- Circuit definition --- - self._circuit = qiskit.QuantumCircuit(n_qubits) - - all_qubits = [i for i in range(n_qubits)] - self.theta = qiskit.circuit.Parameter('theta') - - self._circuit.h(all_qubits) - self._circuit.barrier() - self._circuit.ry(self.theta, all_qubits) - - self._circuit.measure_all() - # --------------------------- - - self.backend = backend - self.shots = shots - - def run(self, thetas): - job = self.backend.run([self._circuit.bind_parameters({self.theta: t}) - for t in thetas], shots=self.shots) - results = job.result().get_counts() - - final = [] - for result in results: - counts = np.array(list(result.values())) - states = np.array(list(result.keys())).astype(float) - - # Compute probabilities for each state - probabilities = counts / self.shots - - # Get state expectation - expectation = np.sum(states * probabilities) - final.append(expectation) - return np.array(final) - - -class HybridSingleFunction(torch.autograd.Function): - """ Hybrid quantum - classical function definition """ - - @staticmethod - def forward(ctx, input, quantum_circuit, shift): - """ Forward pass computation """ - ctx.shift = shift - ctx.quantum_circuit = 
quantum_circuit - - expectation_z = ctx.quantum_circuit.run(input.tolist()) - result = torch.tensor(expectation_z) - ctx.save_for_backward(input, result) - - return result - - @staticmethod - def backward(ctx, grad_output): - """ Backward pass computation """ - input, expectation_z = ctx.saved_tensors - input_list = np.array(input.tolist()) - - shift_right = input_list + np.ones(input_list.shape) * ctx.shift - shift_left = input_list - np.ones(input_list.shape) * ctx.shift - - expectation_left = ctx.quantum_circuit.run(shift_left) - expectation_right = ctx.quantum_circuit.run(shift_right) - gradients = torch.tensor([expectation_right]) - torch.tensor([expectation_left]) - gradients = np.array(gradients).T - return torch.tensor(gradients).float() * grad_output.float(), None, None - - -class HybridSingle(torch.nn.Module): - """ Hybrid quantum - classical layer definition """ - - def __init__(self, backend, shots, shift): - super(HybridSingle, self).__init__() - self.quantum_circuit = QuantumCircuit(1, backend, shots) - self.shift = shift - - def forward(self, input): - return HybridSingleFunction.apply(input, self.quantum_circuit, self.shift) - - -class QClassicNet(DefaultNet): - """ - DefaultNet variant that uses qiskit to add a final quantum layer - """ - - def __init__(self, - input_size: int = None, - output_size: int = None, - shape: list = None, - max_params: int = 3e7, - num_hidden: int = 1, - dropout: float = 0) -> None: - hidden_size = max([input_size * 2, output_size * 2, 400]) - super().__init__(input_size=input_size, - output_size=hidden_size, - shape=shape, - max_params=max_params, - num_hidden=num_hidden, - dropout=dropout) - self.fc = torch.nn.Linear(hidden_size, output_size) - self.hybrid = HybridSingle(qiskit.Aer.get_backend('aer_simulator'), 100, np.pi / 2) - - def to(self, device=None, available_devices=None): - return super().to(device) - - def forward(self, input): - with LightwoodAutocast(): - if len(input.shape) == 1: - input = input.unsqueeze(0) - classical_output = self.fc(self.net(input)) - full_output = torch.stack([self.hybrid(i) for i in classical_output]) - return full_output.float() diff --git a/lightwood/mixer/helpers/ranger.py b/lightwood/mixer/helpers/ranger.py deleted file mode 100644 index c850f1416..000000000 --- a/lightwood/mixer/helpers/ranger.py +++ /dev/null @@ -1,130 +0,0 @@ -import math -import torch -from torch.optim.optimizer import Optimizer - - -class Ranger(Optimizer): - def __init__( - self, params, lr=0.0005, alpha=0.5, k=5, N_sma_threshold=5, betas=(0.9, 0.999), - eps=1e-5, weight_decay=0.000): - # parameter checks - if not 0.0 <= alpha <= 1.0: - raise ValueError(f'Invalid slow update rate: {alpha}') - if not 1 <= k: - raise ValueError(f'Invalid lookahead steps: {k}') - if not lr > 0: - raise ValueError(f'Invalid Learning Rate: {lr}') - if not eps > 0: - raise ValueError(f'Invalid eps: {eps}') - - # parameter comments: - # beta1 (momentum) of .95 seems to work better than .90... - # N_sma_threshold of 5 seems better in testing than 4. - # In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you. 
- # @TODO Implement the above testing with AX ^ - - # prep defaults and init torch.optim base - defaults = dict(lr=lr, alpha=alpha, k=k, betas=betas, - N_sma_threshold=N_sma_threshold, eps=eps, weight_decay=weight_decay) - super().__init__(params, defaults) - - # Since we keep LR the same for all param groups, - # store it here for now for quick&easy access if we want to know it - self.lr = lr - - # adjustable threshold - self.N_sma_threshold = N_sma_threshold - - # look ahead params - self.initial_lr = lr - self.alpha = alpha - self.k = k - - # radam buffer for state - self.radam_buffer = [[None, None, None] for ind in range(10)] - - def __setstate__(self, state): - super(Ranger, self).__setstate__(state) - - def step(self, closure=None): - loss = None - - if closure is not None: - loss = closure() - - # Evaluate averages and grad, update param tensors - for group in self.param_groups: - for p in group['params']: - if p.grad is not None: - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('Ranger optimizer does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] # get state dict for this param - - # On the first run initialize the dictionary for each weight group - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - - # look ahead weight storage now in state dict - state['slow_buffer'] = torch.empty_like(p.data) - state['slow_buffer'].copy_(p.data) - # @TODO Couldn't this branch happen after the if above is entered - # in thus replacing torch.zero_like) ?? - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - # begin computations - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - # compute variance mov avg - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - # compute mean moving avg - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - - state['step'] += 1 - - buffered = self.radam_buffer[int(state['step'] % 10)] - - if state['step'] == buffered[0]: - N_sma, step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - if N_sma > self.N_sma_threshold: - step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / - N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - else: - step_size = 1.0 / (1 - beta1 ** state['step']) - buffered[2] = step_size - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - - if N_sma > self.N_sma_threshold: - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) - else: - p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr']) - - p.data.copy_(p_data_fp32) - - # integrated look ahead... 
- # we do it at the param level instead of group level - if state['step'] % group['k'] == 0: - slow_p = state['slow_buffer'] - # Find the interpolated weight between the slower buffer (the weight `k` steps ago) - # and the current weight, set that as the state for RAdam - slow_p.add_(p.data - slow_p, alpha=self.alpha) - p.data.copy_(slow_p) - - return loss diff --git a/lightwood/mixer/helpers/residual_net.py b/lightwood/mixer/helpers/residual_net.py deleted file mode 100644 index d8a738cc5..000000000 --- a/lightwood/mixer/helpers/residual_net.py +++ /dev/null @@ -1,74 +0,0 @@ -from typing import List -import torch -from torch import nn -from lightwood.helpers.torch import LightwoodAutocast -from lightwood.helpers.device import get_devices -from lightwood.helpers.log import log - - -class ResidualModule(nn.Module): - def __init__( - self, - input_size - ) -> None: - """Initialize self.""" - intermediate_size = max([input_size * 2, 400]) - super().__init__() - self.normalization = nn.BatchNorm1d(input_size) - self.linear_first = nn.Linear(input_size, intermediate_size) - self.activation_first = nn.SELU() - self.linear_second = nn.Linear(intermediate_size, input_size) - self.dropout = nn.Dropout(0.1) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Perform the forward pass.""" - x_input = x - if self.training: - x = self.normalization(x) - x = self.linear_first(x) - x = self.activation_first(x) - x = self.dropout(x) - x = self.linear_second(x) - x = x_input + x - return x - - -class ResidualNet(torch.nn.Module): - def __init__(self, - input_size: int = None, - output_size: int = None, - shape: List[int] = None, - max_params: int = int(3e5)) -> None: - super(ResidualNet, self).__init__() - self.net = torch.nn.Sequential( - * - ([ResidualModule(input_size) for _ in range(1)] + - [nn.Linear(input_size, max([input_size * 2, output_size * 2, 400])), - nn.Linear(max([input_size * 2, output_size * 2, 400]), - output_size)])) - self.to(*get_devices()) - - def to(self, device: torch.device, available_devices: int) -> torch.nn.Module: - self.net = self.net.to(device) - if available_devices > 1: - self.dp_wrapper_net = torch.nn.DataParallel(self.net) - else: - self.dp_wrapper_net = self.net - - self.device = device - self.available_devices = available_devices - return self - - def forward(self, input): - try: - with LightwoodAutocast(): - output = self.net(input) - except Exception as e: - # Data parallel error - if 'nccl' in str(e).lower(): - self.dp_wrapper_net = self.net - log.warn(f'Data parallel not working: {e}') - else: - raise e - - return output diff --git a/lightwood/mixer/helpers/transform_corss_entropy_loss.py b/lightwood/mixer/helpers/transform_corss_entropy_loss.py deleted file mode 100644 index 94722a621..000000000 --- a/lightwood/mixer/helpers/transform_corss_entropy_loss.py +++ /dev/null @@ -1,14 +0,0 @@ -import torch -from lightwood.helpers.torch import LightwoodAutocast - - -# Basically cross entropy loss that does the one hot decoding of the targets inside of it... 
useful for code-logic reasons to have it setup like this # noqa -class TransformCrossEntropyLoss(torch.nn.Module): - def __init__(self, **kwargs): - super().__init__() - self.cross_entropy_loss = torch.nn.CrossEntropyLoss(**kwargs) - - def forward(self, preds, target): - with LightwoodAutocast(): - cat_labels = target.max(1).indices - return self.cross_entropy_loss(preds, cat_labels) diff --git a/lightwood/mixer/lightgbm_array.py b/lightwood/mixer/lightgbm_array.py deleted file mode 100644 index 36aedd016..000000000 --- a/lightwood/mixer/lightgbm_array.py +++ /dev/null @@ -1,69 +0,0 @@ -import numpy as np -import pandas as pd -from typing import Dict, List, Union - -from lightwood.api import dtype -from lightwood.helpers.log import log -from lightwood.mixer.base import BaseMixer -from lightwood.mixer.lightgbm import LightGBM -from lightwood.api.types import PredictionArguments -from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs - - -class LightGBMArray(BaseMixer): - """LightGBM-based model, intended for usage in time series tasks.""" - models: List[LightGBM] - n_ts_predictions: int - submodel_stop_after: float - target: str - supports_proba: bool - - def __init__( - self, stop_after: int, target: str, dtype_dict: Dict[str, str], - input_cols: List[str], - n_ts_predictions: int, fit_on_dev: bool): - super().__init__(stop_after) - self.submodel_stop_after = stop_after / n_ts_predictions - self.target = target - dtype_dict[target] = dtype.float - self.models = [LightGBM(self.submodel_stop_after, target, dtype_dict, input_cols, fit_on_dev, use_optuna=False) - for _ in range(n_ts_predictions)] - self.n_ts_predictions = n_ts_predictions # for time series tasks, how long is the forecast horizon - self.supports_proba = False - self.stable = True - - def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - log.info('Started fitting LGBM models for array prediction') - - for timestep in range(self.n_ts_predictions): - if timestep > 0: - train_data.data_frame[self.target] = train_data.data_frame[f'{self.target}_timestep_{timestep}'] - dev_data.data_frame[self.target] = dev_data.data_frame[f'{self.target}_timestep_{timestep}'] - - self.models[timestep].fit(train_data, dev_data) # @TODO: this call could be parallelized - - def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - log.info('Updating array of LGBM models...') - - for timestep in range(self.n_ts_predictions): - if timestep > 0: - train_data.data_frame[self.target] = train_data.data_frame[f'{self.target}_timestep_{timestep}'] - dev_data.data_frame[self.target] = dev_data.data_frame[f'{self.target}_timestep_{timestep}'] - - self.models[timestep].partial_fit(train_data, dev_data) # @TODO: this call could be parallelized - - def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs], - args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: - if args.predict_proba: - log.warning('This model does not output probability estimates') - - length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds) - ydf = pd.DataFrame(0, # zero-filled - index=np.arange(length), - columns=[f'prediction_{i}' for i in range(self.n_ts_predictions)]) - - for timestep in range(self.n_ts_predictions): - ydf[f'prediction_{timestep}'] = self.models[timestep](ds, args) - - ydf['prediction'] = ydf.values.tolist() - return ydf[['prediction']] diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py deleted file mode 100644 index bf9dd3543..000000000 --- a/lightwood/mixer/neural.py 
+++ /dev/null @@ -1,347 +0,0 @@ -import time -from copy import deepcopy -from typing import Dict, List - -import torch -import numpy as np -import pandas as pd -from torch import nn -import torch_optimizer as ad_optim -from sklearn.metrics import r2_score -from torch.cuda.amp import GradScaler -from torch.utils.data import DataLoader -from torch.nn.modules.loss import MSELoss -from torch.optim.optimizer import Optimizer - -from lightwood.api import dtype -from lightwood.helpers.log import log -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.torch import LightwoodAutocast -from lightwood.data.encoded_ds import EncodedDs -from lightwood.mixer.base import BaseMixer -from lightwood.mixer.helpers.ar_net import ArNet -from lightwood.mixer.helpers.default_net import DefaultNet -from lightwood.api.types import TimeseriesSettings, PredictionArguments -from lightwood.mixer.helpers.transform_corss_entropy_loss import TransformCrossEntropyLoss - - -class Neural(BaseMixer): - model: nn.Module - dtype_dict: dict - target: str - epochs_to_best: int - fit_on_dev: bool - supports_proba: bool - - def __init__( - self, stop_after: int, target: str, dtype_dict: Dict[str, str], - timeseries_settings: TimeseriesSettings, target_encoder: BaseEncoder, net: str, fit_on_dev: bool, - search_hyperparameters: bool): - """ - The Neural mixer trains a fully connected dense network from concatenated encoded outputs of each of the features in the dataset to predicted the encoded output. - - :param stop_after: How long the total fitting process should take - :param target: Name of the target column - :param dtype_dict: Data type dictionary - :param timeseries_settings: TimeseriesSettings object for time-series tasks, refer to its documentation for available settings. 
- :param target_encoder: Reference to the encoder used for the target - :param net: The network type to use (`DeafultNet` or `ArNet`) - :param fit_on_dev: If we should fit on the dev dataset - :param search_hyperparameters: If the network should run a more through hyperparameter search (currently disabled) - """ # noqa - super().__init__(stop_after) - self.dtype_dict = dtype_dict - self.target = target - self.timeseries_settings = timeseries_settings - self.target_encoder = target_encoder - self.epochs_to_best = 0 - self.fit_on_dev = fit_on_dev - self.net_class = DefaultNet if net == 'DefaultNet' else ArNet - self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical] - self.search_hyperparameters = search_hyperparameters - self.stable = True - - def _final_tuning(self, data): - if self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.quantity): - self.model = self.model.eval() - with torch.no_grad(): - acc_dict = {} - for decode_log in [True, False]: - self.target_encoder.decode_log = decode_log - decoded_predictions = [] - decoded_real_values = [] - for X, Y in data: - X = X.to(self.model.device) - Y = Y.to(self.model.device) - Yh = self.model(X) - - Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh - Y = torch.unsqueeze(Y, 0) if len(Y.shape) < 2 else Y - - decoded_predictions.extend(self.target_encoder.decode(Yh)) - decoded_real_values.extend(self.target_encoder.decode(Y)) - - acc_dict[decode_log] = r2_score(decoded_real_values, decoded_predictions) - - self.target_encoder.decode_log = acc_dict[True] > acc_dict[False] - - def _select_criterion(self) -> torch.nn.Module: - if self.dtype_dict[self.target] in (dtype.categorical, dtype.binary): - criterion = TransformCrossEntropyLoss(weight=self.target_encoder.index_weights.to(self.model.device)) - elif self.dtype_dict[self.target] in (dtype.tags): - criterion = nn.BCEWithLogitsLoss() - elif (self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.tsarray, dtype.quantity) - and self.timeseries_settings.is_timeseries): - criterion = nn.L1Loss() - elif self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.quantity): - criterion = MSELoss() - else: - criterion = MSELoss() - - return criterion - - def _select_optimizer(self) -> Optimizer: - # ad_optim.Ranger - # torch.optim.AdamW - if self.timeseries_settings.is_timeseries: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr) - else: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr, weight_decay=2e-2) - - return optimizer - - def _find_lr(self, dl): - optimizer = self._select_optimizer() - criterion = self._select_criterion() - scaler = GradScaler() - - running_losses: List[float] = [] - cum_loss = 0 - lr_log = [] - best_model = self.model - stop = False - batches = 0 - for epoch in range(1, 101): - if stop: - break - - for i, (X, Y) in enumerate(dl): - if stop: - break - - batches += len(X) - X = X.to(self.model.device) - Y = Y.to(self.model.device) - with LightwoodAutocast(): - optimizer.zero_grad() - Yh = self.model(X) - loss = criterion(Yh, Y) - if LightwoodAutocast.active: - scaler.scale(loss).backward() - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - optimizer.step() - cum_loss += loss.item() - - # Account for ranger lookahead update - if (i + 1) * epoch % 6: - batches = 0 - lr = optimizer.param_groups[0]['lr'] - log.info(f'Loss of {cum_loss} with learning rate {lr}') - running_losses.append(cum_loss) - lr_log.append(lr) - cum_loss = 0 - if len(running_losses) < 2 or 
np.mean(running_losses[:-1]) > np.mean(running_losses): - optimizer.param_groups[0]['lr'] = lr * 1.4 - # Time saving since we don't have to start training fresh - best_model = deepcopy(self.model) - else: - stop = True - - best_loss_lr = lr_log[np.argmin(running_losses)] - lr = best_loss_lr - log.info(f'Found learning rate of: {lr}') - return lr, best_model - - def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after): - started = time.time() - epochs_to_best = 0 - best_dev_error = pow(2, 32) - running_errors = [] - best_model = self.model - - for epoch in range(1, return_model_after + 1): - self.model = self.model.train() - running_losses: List[float] = [] - for i, (X, Y) in enumerate(train_dl): - X = X.to(self.model.device) - Y = Y.to(self.model.device) - with LightwoodAutocast(): - optimizer.zero_grad() - Yh = self.model(X) - loss = criterion(Yh, Y) - if LightwoodAutocast.active: - scaler.scale(loss).backward() - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - optimizer.step() - - running_losses.append(loss.item()) - - train_error = np.mean(running_losses) - epoch_error = self._error(dev_dl, criterion) - running_errors.append(epoch_error) - log.debug(f'Loss @ epoch {epoch}: {epoch_error}') - - if np.isnan(train_error) or np.isnan( - running_errors[-1]) or np.isinf(train_error) or np.isinf( - running_errors[-1]): - break - - if best_dev_error > running_errors[-1]: - best_dev_error = running_errors[-1] - best_model = deepcopy(self.model) - epochs_to_best = epoch - - if len(running_errors) >= 5: - delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)], - weights=[(1 / 2)**i for i in range(1, 5)]) - if delta_mean <= 0: - break - elif (time.time() - started) > stop_after: - break - elif running_errors[-1] < 0.0001 or train_error < 0.0001: - break - - if np.isnan(best_dev_error): - best_dev_error = pow(2, 32) - return best_model, epochs_to_best, best_dev_error - - def _error(self, dev_dl, criterion) -> float: - self.model = self.model.eval() - running_losses: List[float] = [] - with torch.no_grad(): - for X, Y in dev_dl: - X = X.to(self.model.device) - Y = Y.to(self.model.device) - Yh = self.model(X) - running_losses.append(criterion(Yh, Y).item()) - return np.mean(running_losses) - - def _init_net(self, ds: EncodedDs): - net_kwargs = {'input_size': len(ds[0][0]), - 'output_size': len(ds[0][1]), - 'num_hidden': self.num_hidden, - 'dropout': 0} - - if self.net_class == ArNet: - net_kwargs['encoder_span'] = ds.encoder_spans - net_kwargs['target_name'] = self.target - - self.model = self.net_class(**net_kwargs) - - # @TODO: Compare partial fitting fully on and fully off on the benchmarks! 
- # @TODO: Writeup on the methodology for partial fitting - def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - """ - Fits the Neural mixer on some data, making it ready to predit - - :param train_data: The EncodedDs on which to train the network - :param dev_data: Data used for early stopping and hyperparameter determination - """ - # ConcatedEncodedDs - self.batch_size = min(200, int(len(train_data) / 10)) - self.batch_size = max(40, self.batch_size) - - dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=False) - train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=False) - - self.lr = 1e-4 - self.num_hidden = 1 - - # Find learning rate - # keep the weights - self._init_net(train_data) - self.lr, self.model = self._find_lr(train_dl) - - # Keep on training - optimizer = self._select_optimizer() - criterion = self._select_criterion() - scaler = GradScaler() - - self.model, epoch_to_best_model, err = self._max_fit( - train_dl, dev_dl, criterion, optimizer, scaler, self.stop_after, return_model_after=20000) - - self.epochs_to_best += epoch_to_best_model - - if self.fit_on_dev: - self.partial_fit(dev_data, train_data) - self._final_tuning(dev_data) - - def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - """ - Augments the mixer's fit with new data, nr of epochs is based on the amount of epochs the original fitting took - - :param train_data: The EncodedDs on which to train the network - :param dev_data: Data used for early stopping and hyperparameter determination - """ - - # Based this on how long the initial training loop took, at a low learning rate as to not mock anything up tooo badly # noqa - train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=True) - dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=True) - optimizer = self._select_optimizer() - criterion = self._select_criterion() - scaler = GradScaler() - - self.model, _, _ = self._max_fit(train_dl, dev_dl, criterion, optimizer, scaler, - self.stop_after, max(1, int(self.epochs_to_best / 3))) - - def __call__(self, ds: EncodedDs, - args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: - """ - Make predictions based on datasource similar to the one used to fit (sans the target column) - - :param ds: The EncodedDs for which to generate the predictions - :param arg: Argument for predicting - - :returns: A dataframe cotaining the decoded predictions and (depending on the args) additional information such as the probabilites for each target class - """ # noqa - self.model = self.model.eval() - decoded_predictions: List[object] = [] - all_probs: List[List[float]] = [] - rev_map = {} - - with torch.no_grad(): - for idx, (X, Y) in enumerate(ds): - X = X.to(self.model.device) - Yh = self.model(X) - Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh - - kwargs = {} - for dep in self.target_encoder.dependencies: - kwargs['dependency_data'] = {dep: ds.data_frame.iloc[idx][[dep]].values} - - if args.predict_proba and self.supports_proba: - kwargs['return_raw'] = True - decoded_prediction, probs, rev_map = self.target_encoder.decode(Yh, **kwargs) - all_probs.append(probs) - else: - decoded_prediction = self.target_encoder.decode(Yh, **kwargs) - - if not self.timeseries_settings.is_timeseries or self.timeseries_settings.nr_predictions == 1: - decoded_predictions.extend(decoded_prediction) - else: - decoded_predictions.append(decoded_prediction) - - ydf = pd.DataFrame({'prediction': decoded_predictions}) - - if 
args.predict_proba and self.supports_proba: - raw_predictions = np.array(all_probs).squeeze() - for idx, label in enumerate(rev_map.values()): - ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] - - return ydf diff --git a/lightwood/mixer/qclassic.py b/lightwood/mixer/qclassic.py deleted file mode 100644 index 760989f60..000000000 --- a/lightwood/mixer/qclassic.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import Dict, List - -from lightwood.encoder.base import BaseEncoder -from lightwood.mixer.neural import Neural -from lightwood.mixer.helpers.qclassic_net import QClassicNet -from lightwood.api.types import TimeseriesSettings - - -class QClassic(Neural): - # wrapper class to be combined with Neural class when performance stabilizes - def __init__( - self, stop_after: int, target: str, dtype_dict: Dict[str, str], - input_cols: List[str], - timeseries_settings: TimeseriesSettings, target_encoder: BaseEncoder, net: str, fit_on_dev: bool, - search_hyperparameters: bool): - super().__init__(stop_after, target, dtype_dict, - input_cols, timeseries_settings, target_encoder, - net, fit_on_dev, search_hyperparameters) - - quantum_nets = {"QClassic": QClassicNet} - self.net_class = quantum_nets.get(net, QClassicNet) diff --git a/lightwood/mixer/regression.py b/lightwood/mixer/regression.py deleted file mode 100644 index fae39bcc6..000000000 --- a/lightwood/mixer/regression.py +++ /dev/null @@ -1,63 +0,0 @@ -import torch -import pandas as pd -from scipy.special import softmax -from sklearn.linear_model import LinearRegression - -from lightwood.helpers.log import log -from lightwood.api.dtype import dtype -from lightwood.mixer import BaseMixer -from lightwood.encoder.base import BaseEncoder -from lightwood.api.types import PredictionArguments -from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs - - -class Regression(BaseMixer): - model: LinearRegression - label_map: dict - supports_proba: bool - - def __init__(self, stop_after: int, target_encoder: BaseEncoder, dtype_dict: dict, target: str): - super().__init__(stop_after) - self.target_encoder = target_encoder - self.target_dtype = dtype_dict[target] - self.supports_proba = self.target_dtype in [dtype.binary, dtype.categorical] - self.label_map = {} - self.stable = False - - def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - if self.target_dtype not in (dtype.float, dtype.integer, dtype.quantity): - raise Exception(f'Unspported {self.target_dtype} type for regression') - log.info('Fitting Linear Regression model') - X = [] - Y = [] - for x, y in ConcatedEncodedDs([train_data, dev_data]): - X.append(x.tolist()) - Y.append(y.tolist()) - - if self.supports_proba: - self.label_map = self.target_encoder.rev_map - - self.model = LinearRegression().fit(X, Y) - log.info(f'Regression based correlation of: {self.model.score(X, Y)}') - - def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - self.fit(train_data, dev_data) - - def __call__(self, ds: EncodedDs, - args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: - X = [] - for x, _ in ds: - X.append(x.tolist()) - - Yh = self.model.predict(X) - - decoded_predictions = self.target_encoder.decode(torch.Tensor(Yh)) - - ydf = pd.DataFrame({'prediction': decoded_predictions}) - - if args.predict_proba and self.label_map: - raw_predictions = softmax(Yh.squeeze(), axis=1) - for idx, label in enumerate(self.target_encoder.rev_map.values()): - ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] - - return ydf diff --git 
a/lightwood/mixer/unit.py b/lightwood/mixer/unit.py deleted file mode 100644 index fbc17dcc3..000000000 --- a/lightwood/mixer/unit.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -2021.07.16 - -For encoders that already fine-tune on the targets (namely text) -the unity mixer just arg-maxes the output of the encoder. -""" - -from typing import List - -import torch -import pandas as pd - -from lightwood.helpers.log import log -from lightwood.mixer.base import BaseMixer -from lightwood.encoder.base import BaseEncoder -from lightwood.data.encoded_ds import EncodedDs -from lightwood.api.types import PredictionArguments - - -class Unit(BaseMixer): - def __init__(self, stop_after: int, target_encoder: BaseEncoder): - super().__init__(stop_after) - self.target_encoder = target_encoder - self.supports_proba = False - self.stable = True - - def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - log.info("Unit Mixer just borrows from encoder") - - def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: - pass - - def __call__(self, ds: EncodedDs, - args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: - if args.predict_proba: - # @TODO: depending on the target encoder, this might be enabled - log.warning('This model does not output probability estimates') - - decoded_predictions: List[object] = [] - - for X, _ in ds: - decoded_prediction = self.target_encoder.decode(torch.unsqueeze(X, 0)) - decoded_predictions.extend(decoded_prediction) - - ydf = pd.DataFrame({"prediction": decoded_predictions}) - return ydf diff --git a/pyproject.toml b/pyproject.toml index 97f5b7439..dd7a523f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,4 +4,12 @@ requires = [ "setuptools", "wheel", -] \ No newline at end of file +] + +[tool.vulture] +ignore_decorators = ["@app.route", "@require_*"] +make_whitelist = true +min_confidence = 100 # 100% confidence +paths = ["setup.py", "lightwood", "tests", "docssrc"] +sort_by_size = true +verbose = true \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2fae6f6cb..f8de7f683 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,3 +25,5 @@ pmdarima >=1.8.0,<=1.8.3 black >=21.9b0 typing_extensions colorlog ==6.5.0 +vulture==2.3 +pre-commit==2.15.0
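
Note on the dead-code tooling introduced above (the [tool.vulture] table added to pyproject.toml and the vulture==2.3 / pre-commit==2.15.0 pins added to requirements.txt): the same scan can also be run ad hoc from Python. The sketch below is a minimal, unofficial example that mirrors those settings; it assumes Vulture 2.3 exposes `Vulture`, `scavenge()` and `get_unused_code()` as in its documented API, so the signatures should be verified before relying on it.

# Minimal sketch, assuming Vulture 2.3's Python API (Vulture, scavenge, get_unused_code).
# The paths, ignored decorators and confidence threshold mirror the [tool.vulture] section
# above; the programmatic API does not read pyproject.toml on its own, so they are passed
# explicitly here.
import vulture

v = vulture.Vulture(verbose=True, ignore_decorators=["@app.route", "@require_*"])
v.scavenge(["setup.py", "lightwood", "tests", "docssrc"])

# Only report findings Vulture is 100% confident about, largest spans first.
for item in v.get_unused_code(min_confidence=100, sort_by_size=True):
    print(item.get_report())

Running this (or the vulture CLI with the same configuration) before committing gives a quick local preview of what the pre-commit check will flag.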