Merge pull request #1211 from mindsdb/staging
Release 24.3.3.0
paxcema authored Mar 19, 2024
2 parents 411343b + 8d1559c commit 1516448
Showing 38 changed files with 6,495 additions and 1,194 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/doc_build.yml
@@ -10,6 +10,8 @@ on:
jobs:
doc_build:
runs-on: ubuntu-latest
permissions:
contents: write

steps:
- name: checkout and set up
9 changes: 4 additions & 5 deletions .github/workflows/ligthtwood.yml
@@ -26,9 +26,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --no-cache-dir -e .
pip install -r requirements_image.txt
pip install flake8
python -m pip install setuptools poetry
poetry install -E dev -E image
- name: Install dependencies OSX
run: |
if [ "$RUNNER_OS" == "macOS" ]; then
@@ -39,11 +38,11 @@ jobs:
CHECK_FOR_UPDATES: False
- name: Lint with flake8
run: |
python -m flake8 .
poetry run python -m flake8 .
- name: Test with unittest
run: |
# Run all the "standard" tests
python -m unittest discover tests
poetry run python -m unittest discover tests
deploy:
runs-on: ubuntu-latest
1 change: 0 additions & 1 deletion MANIFEST.in

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
@@ -45,7 +45,7 @@ We predominantly use PyTorch based approaches, but can support other models.

## Usage

We invite you to check out our [documentation](https://lightwood.io) for specific guidelines and tutorials! Please stay tuned for updates and changes.
We invite you to check out our [documentation](https://mindsdb.github.io/lightwood/) for specific guidelines and tutorials! Please stay tuned for updates and changes.

### Quick use cases
Lightwood works with `pandas.DataFrames`. Once a DataFrame is loaded, define a `ProblemDefinition` via a dictionary. The only thing a user needs to specify is the name of the column to predict (via the key `target`).
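A minimal sketch of that flow, assuming the high-level `predictor_from_problem` entrypoint and a hypothetical `home_rentals.csv` dataset:

```python
import pandas as pd
from lightwood.api.high_level import predictor_from_problem
from lightwood.api.types import ProblemDefinition

df = pd.read_csv('home_rentals.csv')  # hypothetical dataset path

# The only required key is `target`: the name of the column to predict.
pdef = ProblemDefinition.from_dict({'target': 'rental_price'})

predictor = predictor_from_problem(df, pdef)
predictor.learn(df)                   # generate, train and select models
predictions = predictor.predict(df)   # DataFrame with one row per input sample
```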
4 changes: 2 additions & 2 deletions docssrc/source/index.rst
@@ -19,7 +19,7 @@ Lightwood works with a variety of data types such as numbers, dates, categories,

Our JSON-AI syntax allows users to change any and all parts of the models Lightwood automatically generates. The syntax outlines the specific details in each step of the modeling pipeline. Users may override default values (for example, changing the type of a column) or, alternatively, entirely replace steps with their own methods (ex: use a random forest model for a predictor). Lightwood creates a "JSON-AI" object from this syntax which can then be used to automatically generate Python code to represent your pipeline.

For details as to how Lightwood works, check out the `Lightwood Philosophy <https://lightwood.io/lightwood_philosophy.html>`_ .
For details as to how Lightwood works, check out the `Lightwood Philosophy <https://mindsdb.github.io/lightwood/lightwood_philosophy.html>`_ .

Quick Guide
=======================
@@ -124,7 +124,7 @@ BYOM: Bring your own models

Lightwood supports user architectures/approaches so long as you follow the abstractions provided within each step.

Our `tutorials <https://lightwood.io/tutorials.html>`_ provide specific use cases for how to introduce customization into your pipeline. Check out "custom cleaner", "custom splitter", "custom explainer", and "custom mixer". Stay tuned for further updates.
Our `tutorials <https://mindsdb.github.io/lightwood/tutorials.html>`_ provide specific use cases for how to introduce customization into your pipeline. Check out "custom cleaner", "custom splitter", "custom explainer", and "custom mixer". Stay tuned for further updates.


Contribute to Lightwood
2 changes: 1 addition & 1 deletion lightwood/__about__.py
@@ -1,6 +1,6 @@
__title__ = 'lightwood'
__package_name__ = 'lightwood'
__version__ = '23.12.4.0'
__version__ = '24.3.3.0'
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
__email__ = "[email protected]"
__author__ = 'MindsDB Inc'
5 changes: 5 additions & 0 deletions lightwood/api/json_ai.py
@@ -91,6 +91,11 @@ def lookup_encoder(
"positive_domain"
] = "$statistical_analysis.positive_domain"

if problem_defintion.target_weights is not None:
encoder_dict["args"][
"target_weights"
] = problem_defintion.target_weights

# Time-series representations require more advanced flags
if tss.is_timeseries:
gby = tss.group_by if tss.group_by is not None else []
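This hunk forwards any `target_weights` from the `ProblemDefinition` into the target encoder's arguments. A hedged sketch of how a user would trigger that path (column names and weights are made up):

```python
from lightwood.api.types import ProblemDefinition

# For a numerical target, the keys act as interval edges and the values as
# sample weights (see the NumericEncoder.get_weights change further down).
pdef = ProblemDefinition.from_dict({
    'target': 'rental_price',
    'target_weights': {0.0: 1.0, 2000.0: 5.0},  # hypothetical weighting
})
```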
20 changes: 18 additions & 2 deletions lightwood/api/predictor.py
@@ -28,6 +28,7 @@ class PredictorInterface:
You can also use the predictor to now estimate new data:
- ``predict``: Deploys the chosen best model, and evaluates the given data to provide target estimates.
- ``test``: Similar to predict, but user also passes an accuracy function that will be used to compute a metric with the generated predictions.
- ``save``: Saves the Predictor object for further use.
The ``PredictorInterface`` is created via J{ai}son's custom code creation. A problem inherits from this class with pre-populated routines to fill out expected results, given the nature of each problem type.
@@ -127,12 +128,27 @@ def adjust(self, new_data: pd.DataFrame, old_data: Optional[pd.DataFrame] = None

def predict(self, data: pd.DataFrame, args: Dict[str, object] = {}) -> pd.DataFrame:
"""
Intakes raw data to provide predicted values for your trained model.
Intakes raw data to provide model predictions.
:param data: Data (n_samples, n_columns) that the model will use as input to predict the corresponding target value for each sample.
:param args: any parameters used to customize inference behavior. Wrapped as a ``PredictionArguments`` object.
:returns: A dataframe containing predictions and additional sample-wise information. `n_samples` rows.
""" # noqa
pass

def test(
self, data: pd.DataFrame, metrics: list, args: Dict[str, object] = {}, strict: bool = False
) -> pd.DataFrame:
"""
Intakes raw data to compute values for a list of provided metrics using a Lightwood predictor.
:param data: Data (n_samples, n_columns) that the model(s) will evaluate on to provide target predictions.
:param metrics: A list of metrics to evaluate the model's performance on.
:param args: parameters needed to update the predictor ``PredictionArguments`` object, which holds any parameters relevant for prediction.
:param strict: If True, the function will raise an error if the model does not support any of the requested metrics. Otherwise it skips them.
:returns: A dataframe of predictions of the same length of input.
:returns: A dataframe with `n_metrics` columns, each cell containing the respective score of each metric.
""" # noqa
pass

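A short sketch of how the new `test` method pairs with `predict` on a trained predictor (the metric name assumes Lightwood's standard `r2_score` accuracy function):

```python
# `predictor` is a trained Lightwood predictor (see the README example above).
preds = predictor.predict(test_df)  # one row per input sample

# Compute one or more metrics in a single call; unsupported metrics are
# skipped unless strict=True, in which case they raise an error instead.
scores = predictor.test(test_df, metrics=['r2_score'])
print(scores)  # one column per metric, each cell holding that metric's score
```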
2 changes: 1 addition & 1 deletion lightwood/data/encoded_ds.py
@@ -14,7 +14,7 @@ def __init__(self, encoders: Dict[str, BaseEncoder], data_frame: pd.DataFrame, t
Note: normal behavior is to cache encoded representations to avoid duplicated computations. If you want an option to disable this, please open an issue.
:param encoders: list of Lightwood encoders used to encode the data per each column.
:param encoders: dictionary of Lightwood encoders used to encode the data per each column.
:param data_frame: original dataframe.
:param target: name of the target column to predict.
""" # noqa
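The docstring fix reflects that `EncodedDs` takes a dictionary mapping column names to encoders rather than a list. A minimal sketch (column names and encoder choices are hypothetical):

```python
import pandas as pd
from lightwood.data.encoded_ds import EncodedDs
from lightwood.encoder import NumericEncoder

df = pd.DataFrame({'sqft': [700.0, 1200.0], 'price': [1000.0, 2100.0]})

# One encoder per column, keyed by column name; each must be prepared first.
encoders = {'sqft': NumericEncoder(), 'price': NumericEncoder(is_target=True)}
for col, enc in encoders.items():
    enc.prepare(df[col])

ds = EncodedDs(encoders=encoders, data_frame=df, target='price')
```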
3 changes: 1 addition & 2 deletions lightwood/encoder/__init__.py
@@ -8,7 +8,6 @@
from lightwood.encoder.array.ts_num_array import TsArrayNumericEncoder
from lightwood.encoder.text.short import ShortTextEncoder
from lightwood.encoder.text.vocab import VocabularyEncoder
from lightwood.encoder.text.rnn import RnnEncoder as TextRnnEncoder
from lightwood.encoder.categorical.simple_label import SimpleLabelEncoder
from lightwood.encoder.categorical.onehot import OneHotEncoder
from lightwood.encoder.categorical.binary import BinaryEncoder
@@ -22,7 +21,7 @@


__all__ = ['BaseEncoder', 'DatetimeEncoder', 'Img2VecEncoder', 'NumericEncoder', 'TsNumericEncoder',
'TsArrayNumericEncoder', 'ShortTextEncoder', 'VocabularyEncoder', 'TextRnnEncoder', 'OneHotEncoder',
'TsArrayNumericEncoder', 'ShortTextEncoder', 'VocabularyEncoder', 'OneHotEncoder',
'CategoricalAutoEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'MultiHotEncoder', 'TsCatArrayEncoder',
'NumArrayEncoder', 'CatArrayEncoder', 'SimpleLabelEncoder',
'PretrainedLangEncoder', 'BinaryEncoder', 'DatetimeNormalizerEncoder', 'MFCCEncoder']
44 changes: 38 additions & 6 deletions lightwood/encoder/numeric/numeric.py
@@ -1,5 +1,6 @@
import math
from typing import Union
from typing import Union, Dict
from copy import deepcopy as dc

import torch
import numpy as np
@@ -20,11 +21,15 @@ class NumericEncoder(BaseEncoder):
The ``absolute_mean`` is computed in the ``prepare`` method and is just the mean of the absolute values of all numbers fed to prepare (which are not none).
``none`` stands for any number that is an actual Python ``None`` value or any sort of non-numeric value (a string, nan, inf).
""" # noqa
""" # noqa

def __init__(self, data_type: dtype = None, is_target: bool = False, positive_domain: bool = False):
def __init__(self, data_type: dtype = None,
target_weights: Dict[float, float] = None,
is_target: bool = False,
positive_domain: bool = False):
"""
:param data_type: The data type of the number (integer, float, quantity)
:param target_weights: a dictionary of weights to use on the examples.
:param is_target: Indicates whether the encoder refers to a target column or feature column (True==target)
:param positive_domain: Forces the encoder to always output positive values
"""
Expand All @@ -34,12 +39,19 @@ def __init__(self, data_type: dtype = None, is_target: bool = False, positive_do
self.decode_log = False
self.output_size = 4 if not self.is_target else 3

# Weight-balance info if encoder represents target
self.target_weights = None
self.index_weights = None
if self.is_target and target_weights is not None:
self.target_weights = dc(target_weights)
self.index_weights = torch.tensor(list(self.target_weights.values()))

def prepare(self, priming_data: pd.Series):
"""
"NumericalEncoder" uses a rule-based form to prepare results on training (priming) data. The averages etc. are taken from this distribution.
:param priming_data: an iterable data structure containing numbers numbers which will be used to compute the values used for normalizing the encoded representations
""" # noqa
""" # noqa
if self.is_prepared:
raise Exception('You can only call "prepare" once for a given encoder.')

@@ -57,7 +69,8 @@ def encode(self, data: Union[np.ndarray, pd.Series]):
if isinstance(data, pd.Series):
data = data.values

inp_data = np.nan_to_num(data.astype(float), nan=0, posinf=np.finfo(np.float32).max, neginf=np.finfo(np.float32).min) # noqa
inp_data = np.nan_to_num(data.astype(float), nan=0, posinf=np.finfo(np.float32).max,
neginf=np.finfo(np.float32).min) # noqa
if not self.positive_domain:
sign = np.vectorize(self._sign_fn, otypes=[float])(inp_data)
else:
@@ -97,7 +110,7 @@ def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list:
:param decode_log: Whether to decode the ``log`` or ``linear`` part of the representation, since the encoded vector contains both a log and a linear part
:returns: The decoded array
""" # noqa
""" # noqa

if not self.is_prepared:
raise Exception('You need to call "prepare" before calling "encode" or "decode".')
@@ -145,3 +158,22 @@ def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list:
ret[mask_none] = None

return ret.tolist() # TODO: update signature on BaseEncoder and replace all encs to return ndarrays

def get_weights(self, label_data):
# get a sorted list of intervals to assign weights. Keys are the interval edges.
target_weight_keys = np.array(list(self.target_weights.keys()))
target_weight_values = np.array(list(self.target_weights.values()))
sorted_indices = np.argsort(target_weight_keys)

# get sorted arrays for vector numpy operations
target_weight_keys = target_weight_keys[sorted_indices]
target_weight_values = target_weight_values[sorted_indices]

# find the indices of the bins according to the keys; clip to the length of the weight values (searchsorted
# returns indices from 0 to N, with N = len(target_weight_keys)).
assigned_target_weight_indices = np.clip(a=np.searchsorted(target_weight_keys, label_data),
a_min=0,
a_max=len(target_weight_keys) - 1).astype(np.int32)

return target_weight_values[assigned_target_weight_indices]
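The interval scheme in `get_weights` can be reproduced standalone with plain NumPy; a sketch with made-up edges and labels:

```python
import numpy as np

# Hypothetical weighting: keys are interval edges, values are sample weights.
target_weights = {0.0: 1.0, 100.0: 2.0}

keys = np.array(sorted(target_weights))
values = np.array([target_weights[k] for k in sorted(target_weights)])

labels = np.array([-10.0, 50.0, 150.0])

# Each label takes the weight of the first edge at or above it; labels past
# the last edge clip to the last weight.
idx = np.clip(np.searchsorted(keys, labels), 0, len(keys) - 1)
print(values[idx])  # [1. 2. 2.]
```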

3 changes: 1 addition & 2 deletions lightwood/encoder/text/__init__.py
@@ -1,8 +1,7 @@
from lightwood.encoder.text.pretrained import PretrainedLangEncoder
from lightwood.encoder.text.rnn import RnnEncoder
from lightwood.encoder.text.tfidf import TfidfEncoder
from lightwood.encoder.text.short import ShortTextEncoder
from lightwood.encoder.text.vocab import VocabularyEncoder


__all__ = ['PretrainedLangEncoder', 'RnnEncoder', 'TfidfEncoder', 'ShortTextEncoder', 'VocabularyEncoder']
__all__ = ['PretrainedLangEncoder', 'TfidfEncoder', 'ShortTextEncoder', 'VocabularyEncoder']
46 changes: 0 additions & 46 deletions lightwood/encoder/text/helpers/pretrained_helpers.py
@@ -4,7 +4,6 @@
Basic helper functions for PretrainedLangEncoder
"""
import torch
from transformers import AdamW


class TextEmbed(torch.utils.data.Dataset):
@@ -26,48 +25,3 @@ def __getitem__(self, idx):

def __len__(self):
return len(self.labels)


def train_model(model, dataset, device, scheduler=None, log=None, optim=None, n_epochs=4):
"""
Generic training function, given an arbitrary model.
Given a model, train for n_epochs.
model - torch.nn model;
dataset - torch.DataLoader; dataset to train
device - torch.device; cuda/cpu
log - lightwood.logger.log; print output
optim - transformers.optimization.AdamW; optimizer
n_epochs - number of epochs to train
"""
if log is None:
from lightwood.helpers.log import log
log = log.debug
losses = []
model.train()
if optim is None:
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(n_epochs):
total_loss = 0
for batch in dataset:
optim.zero_grad()

inpids = batch['input_ids'].to(device)
attn = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
outputs = model(inpids, attention_mask=attn, labels=labels)
loss = outputs[0]

total_loss += loss.item()

loss.backward()
optim.step()

if scheduler is not None:
scheduler.step()

log("Epoch", epoch + 1, "Loss", total_loss)
return model, losses
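For context, the surviving `TextEmbed` class above is the standard Hugging Face fine-tuning dataset wrapper; a hedged usage sketch (the tokenizer choice and the constructor signature `TextEmbed(encodings, labels)` are assumptions based on that pattern):

```python
import torch
from transformers import AutoTokenizer
from lightwood.encoder.text.helpers.pretrained_helpers import TextEmbed

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
encodings = tokenizer(['first text', 'second text'], truncation=True, padding=True)

dataset = TextEmbed(encodings, labels=[0, 1])
loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
```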