Commit
Merge pull request #26 from AutoResearch/25-make-return-value-dataframe
feat: make return values of sampler and pooler pd.Dataframe
younesStrittmatter authored Jul 26, 2024
2 parents a9817a0 + 99c231a commit efdb238
Showing 7 changed files with 1,135 additions and 326 deletions.
1,133 changes: 933 additions & 200 deletions docs/Basic Usage.ipynb

Large diffs are not rendered by default.
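The headline change: pool() and sample() (and their falsification_* aliases) now return a pandas DataFrame, reusing the caller's column names whenever the conditions were supplied as a DataFrame. Below is a minimal usage sketch of that contract; the variables, data, and the LinearRegression model are illustrative assumptions, and the Variable/VariableCollection constructor arguments are assumed from autora-core rather than taken from this diff.

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression

    from autora.experimentalist.falsification import falsification_sample
    from autora.variable import Variable, VariableCollection

    # Hypothetical one-IV, one-DV setup (constructor usage assumed from autora-core).
    metadata = VariableCollection(
        independent_variables=[Variable(name="x", value_range=(0, 1))],
        dependent_variables=[Variable(name="y")],
    )

    # Illustrative reference data and a simple model trained on it.
    reference_conditions = pd.DataFrame({"x": np.linspace(0, 1, 20)})
    reference_observations = np.sin(np.pi * reference_conditions["x"].to_numpy())
    model = LinearRegression().fit(reference_conditions, reference_observations)

    # Candidate conditions to be scored by the falsification sampler.
    candidate_conditions = pd.DataFrame({"x": np.linspace(0, 1, 100)})

    new_conditions = falsification_sample(
        conditions=candidate_conditions,
        model=model,
        reference_conditions=reference_conditions,
        reference_observations=reference_observations,
        metadata=metadata,
        num_samples=5,
    )

    # After this change the result is a DataFrame that keeps the input's column names.
    assert isinstance(new_conditions, pd.DataFrame)
    assert list(new_conditions.columns) == ["x"]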

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -11,7 +11,8 @@ license = {file = "LICENSE"}
dependencies = [
"autora-core>=3.1.0",
"torch",
"pandas"
"pandas",
"numpy<2,>=1"
]

[project.optional-dependencies]
148 changes: 94 additions & 54 deletions src/autora/experimentalist/falsification/__init__.py
@@ -1,14 +1,22 @@
from typing import Iterable, Optional, Union

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from torch.autograd import Variable

from autora.experimentalist.falsification.popper_net import (
train_popper_net,
train_popper_net_with_model,
)
from autora.experimentalist.falsification.utils import (
align_dataframe_to_ivs,
class_to_onehot,
get_iv_limits,
)
from autora.utils.deprecation import deprecated_alias
from autora.variable import ValueType, VariableCollection


def pool(
@@ -60,7 +68,9 @@ def pool(
# format input

if isinstance(reference_conditions, pd.DataFrame):
reference_conditions = align_dataframe_to_ivs(
reference_conditions, metadata.independent_variables
)

reference_conditions_np = np.array(reference_conditions)
if len(reference_conditions_np.shape) == 1:
@@ -75,20 +85,24 @@
if metadata.dependent_variables[0].type == ValueType.CLASS:
# find all unique values in reference_observations
num_classes = len(np.unique(reference_observations))
reference_observations = class_to_onehot(
reference_observations, n_classes=num_classes
)

reference_conditions_tensor = torch.from_numpy(reference_conditions_np).float()

iv_limit_list = get_iv_limits(reference_conditions_np, metadata)

popper_net, model_loss = train_popper_net_with_model(
model,
reference_conditions_np,
reference_observations,
metadata,
iv_limit_list,
training_epochs,
training_lr,
plot,
)

# now that the popper network is trained we can sample new data points
# to sample data points we need to provide the popper network with an initial
@@ -170,8 +184,12 @@ def pool(
iv_clipped_scaled_value = iv_clipped_value

x[condition, idx] = iv_clipped_scaled_value
if isinstance(reference_conditions, pd.DataFrame):
new_conditions = pd.DataFrame(x, columns=reference_conditions.columns)
else:
new_conditions = pd.DataFrame(x)
return new_conditions

return iter(x)

def sample(
conditions: Union[pd.DataFrame, np.ndarray],
@@ -186,17 +204,18 @@
):
"""
A Sampler that generates samples of experimental conditions with the objective of maximizing the
(approximated) loss of a model relating experimental conditions to observations. The samples are
generated by first training a neural network to approximate the loss of a model for all patterns
in the training data. Once trained, the network is then provided with the candidate samples of
experimental conditions and selects those with the highest loss.
Args:
conditions: The candidate samples of experimental conditions to be evaluated.
model: Scikit-learn model, could be either a classification or regression model
reference_conditions: Experimental conditions that the model was trained on
reference_observations: Observations that the model was trained to predict
metadata: Meta-data about the dependent and independent variables specifying the
experimental conditions
num_samples: Number of samples to return
training_epochs: Number of epochs to train the popper network for approximating the
error of the model
@@ -249,7 +268,11 @@ def sample(
)

if isinstance(condition_pool_copy, pd.DataFrame):
new_conditions = pd.DataFrame(
new_conditions, columns=condition_pool_copy.columns
)
else:
new_conditions = pd.DataFrame(new_conditions)

return new_conditions

@@ -267,17 +290,18 @@ def falsification_score_sample(
):
"""
A Sampler that generates samples of experimental conditions with the objective of maximizing the
(approximated) loss of a model relating experimental conditions to observations. The samples are
generated by first training a neural network to approximate the loss of a model for all patterns
in the training data. Once trained, the network is then provided with the candidate samples of
experimental conditions and selects those with the highest loss.
Args:
conditions: The candidate samples of experimental conditions to be evaluated.
model: Scikit-learn model, could be either a classification or regression model
reference_conditions: Experimental conditions that the model was trained on
reference_observations: Observations that the model was trained to predict
metadata: Meta-data about the dependent and independent variables specifying
the experimental conditions
num_samples: Number of samples to return
training_epochs: Number of epochs to train the popper network for approximating the
error of the model
@@ -303,18 +327,22 @@ def falsification_score_sample(

predicted_observations = model.predict(reference_conditions)

new_conditions, new_scores = falsification_score_sample_from_predictions(
conditions,
predicted_observations,
reference_conditions,
reference_observations,
metadata,
num_samples,
training_epochs,
training_lr,
plot,
)

if isinstance(condition_pool_copy, pd.DataFrame):
sorted_conditions = pd.DataFrame(
new_conditions, columns=condition_pool_copy.columns
)
else:
sorted_conditions = pd.DataFrame(new_conditions)

@@ -336,17 +364,19 @@ def falsification_score_sample_from_predictions(
):
"""
A Sampler that generates samples of experimental conditions with the objective of maximizing the
(approximated) loss of a model relating experimental conditions to observations. The samples are
generated by first training a neural network to approximate the loss of a model for all patterns
in the training data. Once trained, the network is then provided with the candidate samples of
experimental conditions and selects those with the highest loss.
Args:
conditions: The candidate samples of experimental conditions to be evaluated.
predicted_observations: Prediction obtained from the model for the set of
reference experimental conditions
reference_conditions: Experimental conditions that the model was trained on
reference_observations: Observations that the model was trained to predict
metadata: Meta-data about the dependent and independent variables specifying
the experimental conditions
num_samples: Number of samples to return
training_epochs: Number of epochs to train the popper network for approximating the
error of the model
@@ -381,19 +411,23 @@ def falsification_score_sample_from_predictions(
if metadata.dependent_variables[0].type == ValueType.CLASS:
# find all unique values in reference_observations
num_classes = len(np.unique(reference_observations))
reference_observations = class_to_onehot(
reference_observations, n_classes=num_classes
)

# create list of IV limits
iv_limit_list = get_iv_limits(reference_conditions, metadata)

popper_net, model_loss = train_popper_net(
predicted_observations,
reference_conditions,
reference_observations,
metadata,
iv_limit_list,
training_epochs,
training_lr,
plot,
)

# now that the popper network is trained we can assign losses to all data points to be evaluated
popper_input = Variable(torch.from_numpy(conditions)).float()
@@ -407,12 +441,18 @@

return sorted_conditions[0:num_samples], sorted_score[0:num_samples]


falsification_pool = pool
falsification_pool.__doc__ = """Alias for pool"""
falsification_pooler = deprecated_alias(falsification_pool, "falsification_pooler")

falsification_sample = sample
falsification_pool.__doc__ = """Alias for sample"""
falsification_sampler = deprecated_alias(falsification_sample, "falsification_sampler")
falsification_score_sampler = deprecated_alias(
falsification_score_sample, "falsification_score_sampler"
)
falsification_score_sampler_from_predictions = deprecated_alias(
falsification_score_sample_from_predictions,
"falsification_score_sampler_from_predictions",
)
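Every return site above follows the same pattern: wrap the numpy result in a DataFrame, reusing the caller's column labels when the input was itself a DataFrame. A stripped-down sketch of that pattern, with a hypothetical helper name:

    import numpy as np
    import pandas as pd

    def _wrap_conditions(result: np.ndarray, original) -> pd.DataFrame:
        # Keep the caller's column labels if a DataFrame was passed in;
        # otherwise fall back to pandas' default integer column labels.
        if isinstance(original, pd.DataFrame):
            return pd.DataFrame(result, columns=original.columns)
        return pd.DataFrame(result)

Factoring the check into a helper like this would avoid repeating the isinstance branch in pool, sample, and falsification_score_sample; the commit keeps the branches inline at each return site.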
40 changes: 25 additions & 15 deletions src/autora/experimentalist/falsification/popper_net.py
@@ -1,12 +1,16 @@
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.autograd import Variable

from autora.variable import VariableCollection

from .utils import plot_falsification_diagnostics


# define the network
class PopperNet(nn.Module):
def __init__(self, n_input: torch.Tensor, n_output: torch.Tensor):
@@ -81,7 +85,9 @@ def train_popper_net(
n_output = 1 # only predicting one MSE

# get input pattern for popper net
popper_input = Variable(
torch.from_numpy(reference_conditions), requires_grad=False
).float()

# get target pattern for popper net
if isinstance(model_prediction, np.ndarray) is False:
@@ -126,7 +132,9 @@

if plot:
if len(iv_limit_list) > 1:
Warning("Plotting currently not supported for more than two independent variables.")
Warning(
"Plotting currently not supported for more than two independent variables."
)
else:
popper_input_full = np.linspace(
iv_limit_list[0][0], iv_limit_list[0][1], 1000
@@ -189,11 +197,13 @@ def train_popper_net_with_model(

model_prediction = model_predict(reference_conditions)

return train_popper_net(
model_prediction,
reference_conditions,
reference_observations,
metadata,
iv_limit_list,
training_epochs,
training_lr,
plot,
)
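For orientation, the sampling step that builds on this training (implemented in the samplers in __init__.py above) is simple: the trained popper network assigns a predicted loss to every candidate condition, the candidates are sorted by that score, and the top num_samples are returned. A self-contained sketch of that step; the tiny network and the data below are placeholders, not the PopperNet defined in this file:

    import numpy as np
    import torch

    # Placeholder standing in for a trained popper net: maps conditions to a predicted loss.
    net = torch.nn.Sequential(torch.nn.Linear(1, 8), torch.nn.Sigmoid(), torch.nn.Linear(8, 1))

    candidates = np.linspace(0, 1, 100, dtype=np.float32).reshape(-1, 1)
    with torch.no_grad():
        predicted_loss = net(torch.from_numpy(candidates)).numpy().ravel()

    num_samples = 5
    order = np.argsort(predicted_loss)[::-1]  # highest predicted loss first
    selected = candidates[order[:num_samples]]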