Commit
Merge pull request #26 from AutoResearch/25-make-return-value-dataframe
feat: make return values of sampler and pooler pd.Dataframe
younesStrittmatter authored Jul 26, 2024
2 parents a9817a0 + 99c231a commit efdb238
Showing 7 changed files with 1,135 additions and 326 deletions.
1,133 changes: 933 additions & 200 deletions docs/Basic Usage.ipynb

Large diffs are not rendered by default.
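The headline change: pool() and sample() (and their falsification_* aliases) now return a pandas DataFrame, reusing the caller's column names whenever the conditions were supplied as a DataFrame. Below is a minimal usage sketch of that contract; the variables, data, and the LinearRegression model are illustrative assumptions, and the Variable/VariableCollection constructor arguments are assumed from autora-core rather than taken from this diff.

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression

    from autora.experimentalist.falsification import falsification_sample
    from autora.variable import Variable, VariableCollection

    # Hypothetical one-IV, one-DV setup (constructor usage assumed from autora-core).
    metadata = VariableCollection(
        independent_variables=[Variable(name="x", value_range=(0, 1))],
        dependent_variables=[Variable(name="y")],
    )

    # Illustrative reference data and a simple model trained on it.
    reference_conditions = pd.DataFrame({"x": np.linspace(0, 1, 20)})
    reference_observations = np.sin(np.pi * reference_conditions["x"].to_numpy())
    model = LinearRegression().fit(reference_conditions, reference_observations)

    # Candidate conditions to be scored by the falsification sampler.
    candidate_conditions = pd.DataFrame({"x": np.linspace(0, 1, 100)})

    new_conditions = falsification_sample(
        conditions=candidate_conditions,
        model=model,
        reference_conditions=reference_conditions,
        reference_observations=reference_observations,
        metadata=metadata,
        num_samples=5,
    )

    # After this change the result is a DataFrame that keeps the input's column names.
    assert isinstance(new_conditions, pd.DataFrame)
    assert list(new_conditions.columns) == ["x"]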

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -11,7 +11,8 @@ license = {file = "LICENSE"}
dependencies = [
"autora-core>=3.1.0",
"torch",
"pandas"
"pandas",
"numpy<2,>=1"
]

[project.optional-dependencies]
148 changes: 94 additions & 54 deletions src/autora/experimentalist/falsification/__init__.py
@@ -1,14 +1,22 @@
from typing import Iterable, Optional, Union

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from torch.autograd import Variable

from autora.experimentalist.falsification.popper_net import (
train_popper_net,
train_popper_net_with_model,
)
from autora.experimentalist.falsification.utils import (
align_dataframe_to_ivs,
class_to_onehot,
get_iv_limits,
)
from autora.utils.deprecation import deprecated_alias
from autora.variable import ValueType, VariableCollection


def pool(
@@ -60,7 +68,9 @@ def pool(
# format input

if isinstance(reference_conditions, pd.DataFrame):
reference_conditions = align_dataframe_to_ivs(
reference_conditions, metadata.independent_variables
)

reference_conditions_np = np.array(reference_conditions)
if len(reference_conditions_np.shape) == 1:
@@ -75,20 +85,24 @@
if metadata.dependent_variables[0].type == ValueType.CLASS:
# find all unique values in reference_observations
num_classes = len(np.unique(reference_observations))
reference_observations = class_to_onehot(
reference_observations, n_classes=num_classes
)

reference_conditions_tensor = torch.from_numpy(reference_conditions_np).float()

iv_limit_list = get_iv_limits(reference_conditions_np, metadata)

popper_net, model_loss = train_popper_net_with_model(
model,
reference_conditions_np,
reference_observations,
metadata,
iv_limit_list,
training_epochs,
training_lr,
plot,
)

# now that the popper network is trained we can sample new data points
# to sample data points we need to provide the popper network with an initial
@@ -170,8 +184,12 @@ def pool(
iv_clipped_scaled_value = iv_clipped_value

x[condition, idx] = iv_clipped_scaled_value
if isinstance(reference_conditions, pd.DataFrame):
new_conditions = pd.DataFrame(x, columns=reference_conditions.columns)
else:
new_conditions = pd.DataFrame(x)
return new_conditions

return iter(x)

def sample(
conditions: Union[pd.DataFrame, np.ndarray],
@@ -186,17 +204,18 @@
):
"""
A Sampler that generates samples of experimental conditions with the objective of maximizing the
(approximated) loss of a model relating experimental conditions to observations. The samples are
generated by first training a neural network to approximate the loss of a model for all patterns
in the training data. Once trained, the network is then provided with the candidate samples of
experimental conditions and selects those with the highest loss.
Args:
conditions: The candidate samples of experimental conditions to be evaluated.
model: Scikit-learn model, could be either a classification or regression model
reference_conditions: Experimental conditions that the model was trained on
reference_observations: Observations that the model was trained to predict
metadata: Meta-data about the dependent and independent variables specifying the
experimental conditions
num_samples: Number of samples to return
training_epochs: Number of epochs to train the popper network for approximating the
error of the model
@@ -249,7 +268,11 @@ def sample(
)

if isinstance(condition_pool_copy, pd.DataFrame):
new_conditions = pd.DataFrame(
new_conditions, columns=condition_pool_copy.columns
)
else:
new_conditions = pd.DataFrame(new_conditions)

return new_conditions

@@ -267,17 +290,18 @@ def falsification_score_sample(
):
"""
A Sampler that generates samples of experimental conditions with the objective of maximizing the
(approximated) loss of a model relating experimental conditions to observations. The samples are
generated by first training a neural network to approximate the loss of a model for all patterns
in the training data. Once trained, the network is then provided with the candidate samples of
experimental conditions and selects those with the highest loss.
Args:
conditions: The candidate samples of experimental conditions to be evaluated.
model: Scikit-learn model, could be either a classification or regression model
reference_conditions: Experimental conditions that the model was trained on
reference_observations: Observations that the model was trained to predict
metadata: Meta-data about the dependent and independent variables specifying
the experimental conditions
num_samples: Number of samples to return
training_epochs: Number of epochs to train the popper network for approximating the
error of the model
@@ -303,18 +327,22 @@ def falsification_score_sample(

predicted_observations = model.predict(reference_conditions)

new_conditions, new_scores = falsification_score_sample_from_predictions(
conditions,
predicted_observations,
reference_conditions,
reference_observations,
metadata,
num_samples,
training_epochs,
training_lr,
plot,
)

if isinstance(condition_pool_copy, pd.DataFrame):
sorted_conditions = pd.DataFrame(
new_conditions, columns=condition_pool_copy.columns
)
else:
sorted_conditions = pd.DataFrame(new_conditions)

@@ -336,17 +364,19 @@ def falsification_score_sample_from_predictions(
):
"""
A Sampler that generates samples of experimental conditions with the objective of maximizing the
(approximated) loss of a model relating experimental conditions to observations. The samples are
generated by first training a neural network to approximate the loss of a model for all patterns
in the training data. Once trained, the network is then provided with the candidate samples of
experimental conditions and selects those with the highest loss.
Args:
conditions: The candidate samples of experimental conditions to be evaluated.
predicted_observations: Prediction obtained from the model for the set of
reference experimental conditions
reference_conditions: Experimental conditions that the model was trained on
reference_observations: Observations that the model was trained to predict
metadata: Meta-data about the dependent and independent variables specifying
the experimental conditions
num_samples: Number of samples to return
training_epochs: Number of epochs to train the popper network for approximating the
error of the model
@@ -381,19 +411,23 @@ def falsification_score_sample_from_predictions(
if metadata.dependent_variables[0].type == ValueType.CLASS:
# find all unique values in reference_observations
num_classes = len(np.unique(reference_observations))
reference_observations = class_to_onehot(
reference_observations, n_classes=num_classes
)

# create list of IV limits
iv_limit_list = get_iv_limits(reference_conditions, metadata)

popper_net, model_loss = train_popper_net(
predicted_observations,
reference_conditions,
reference_observations,
metadata,
iv_limit_list,
training_epochs,
training_lr,
plot,
)

# now that the popper network is trained we can assign losses to all data points to be evaluated
popper_input = Variable(torch.from_numpy(conditions)).float()
@@ -407,12 +441,18 @@

return sorted_conditions[0:num_samples], sorted_score[0:num_samples]


falsification_pool = pool
falsification_pool.__doc__ = """Alias for pool"""
falsification_pooler = deprecated_alias(falsification_pool, "falsification_pooler")

falsification_sample = sample
falsification_pool.__doc__ = """Alias for sample"""
falsification_sampler = deprecated_alias(falsification_sample, "falsification_sampler")
falsification_score_sampler = deprecated_alias(
falsification_score_sample, "falsification_score_sampler"
)
falsification_score_sampler_from_predictions = deprecated_alias(
falsification_score_sample_from_predictions,
"falsification_score_sampler_from_predictions",
)
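Every return site above follows the same pattern: wrap the numpy result in a DataFrame, reusing the caller's column labels when the input was itself a DataFrame. A stripped-down sketch of that pattern, with a hypothetical helper name:

    import numpy as np
    import pandas as pd

    def _wrap_conditions(result: np.ndarray, original) -> pd.DataFrame:
        # Keep the caller's column labels if a DataFrame was passed in;
        # otherwise fall back to pandas' default integer column labels.
        if isinstance(original, pd.DataFrame):
            return pd.DataFrame(result, columns=original.columns)
        return pd.DataFrame(result)

Factoring the check into a helper like this would avoid repeating the isinstance branch in pool, sample, and falsification_score_sample; the commit keeps the branches inline at each return site.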
40 changes: 25 additions & 15 deletions src/autora/experimentalist/falsification/popper_net.py
@@ -1,12 +1,16 @@
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.autograd import Variable

from autora.variable import VariableCollection

from .utils import plot_falsification_diagnostics


# define the network
class PopperNet(nn.Module):
def __init__(self, n_input: torch.Tensor, n_output: torch.Tensor):
@@ -81,7 +85,9 @@ def train_popper_net(
n_output = 1 # only predicting one MSE

# get input pattern for popper net
popper_input = Variable(
torch.from_numpy(reference_conditions), requires_grad=False
).float()

# get target pattern for popper net
if isinstance(model_prediction, np.ndarray) is False:
@@ -126,7 +132,9 @@

if plot:
if len(iv_limit_list) > 1:
Warning("Plotting currently not supported for more than two independent variables.")
Warning(
"Plotting currently not supported for more than two independent variables."
)
else:
popper_input_full = np.linspace(
iv_limit_list[0][0], iv_limit_list[0][1], 1000
@@ -189,11 +197,13 @@ def train_popper_net_with_model(

model_prediction = model_predict(reference_conditions)

return train_popper_net(
model_prediction,
reference_conditions,
reference_observations,
metadata,
iv_limit_list,
training_epochs,
training_lr,
plot,
)
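For orientation, the sampling step that builds on this training (implemented in the samplers in __init__.py above) is simple: the trained popper network assigns a predicted loss to every candidate condition, the candidates are sorted by that score, and the top num_samples are returned. A self-contained sketch of that step; the tiny network and the data below are placeholders, not the PopperNet defined in this file:

    import numpy as np
    import torch

    # Placeholder standing in for a trained popper net: maps conditions to a predicted loss.
    net = torch.nn.Sequential(torch.nn.Linear(1, 8), torch.nn.Sigmoid(), torch.nn.Linear(8, 1))

    candidates = np.linspace(0, 1, 100, dtype=np.float32).reshape(-1, 1)
    with torch.no_grad():
        predicted_loss = net(torch.from_numpy(candidates)).numpy().ravel()

    num_samples = 5
    order = np.argsort(predicted_loss)[::-1]  # highest predicted loss first
    selected = candidates[order[:num_samples]]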