Commit 635abfc — Add files via upload
ritalyu17 authored Dec 17, 2024 · 1 parent 09b1f29
Showing 4 changed files with 2,205 additions and 82 deletions.

273 changes: 195 additions & 78 deletions — benchmarks/domains/CrabNet_AdvOpt.py
"""Crabnet hyperparameter function with 20 continuous and 3 categorical input.
"""
@Time : 2024/11/06 16:21:18
@Author : Rita Lyu
@Version : 1.0
@Contact : [email protected]
@Desc : Crabnet hyperparameter benchmarking, a minimization task on Crabnet hyperparameters.
Crabnet hyperparameter function with 20 continuous (treat as discrete for simplicity) and 3 categorical input.
This code interacts with an external API hosted on Hugging Face Spaces:
https://huggingface.co/spaces/AccelerationConsortium/crabnet-hyperparameter
from __future__ import annotations

import os
import time

import numpy as np
import pandas as pd
from pandas import DataFrame

from baybe.campaign import Campaign
from baybe.parameters import CategoricalParameter, NumericalDiscreteParameter, TaskParameter
from baybe.searchspace import SearchSpace
from baybe.targets import NumericalTarget, TargetMode
from baybe.recommenders.pure.nonpredictive.sampling import RandomRecommender
from baybe.simulation import simulate_scenarios
from baybe.utils.random import set_random_seed
Expand All @@ -27,10 +34,7 @@
ConvergenceExperimentSettings,
)

# Wake up the Hugging Face space
from gradio_client import Client

client = Client("AccelerationConsortium/crabnet-hyperparameter")
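# The Client proxies the hosted CrabNet surrogate model. Free Hugging Face
# Spaces go to sleep when idle, hence the wake-up helper below.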

Expand All @@ -56,136 +60,249 @@ def wake_up_hfspace(client, max_retries=2, wait_time=150):
except Exception as e:
print(f"Attempt {attempt + 1}: Space is asleep. Retrying in {wait_time} seconds...")
time.sleep(wait_time)

raise RuntimeError("Hugging Face space is still asleep after maximum retries.")

wake_up_hfspace(client)

# Define the function to evaluate
def adv_opt(c1, c2, c3, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
            x11, x12, x13, x14, x15, x16, x17, x18, x19, x20):
    """Optimization function with 20 continuous variables and 3 categorical parameters."""
    # Assertion checks for the continuous parameters
    try:
        for xi, name in zip(
            [x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
             x11, x12, x13, x14, x15, x16, x17, x18, x19, x20],
            ["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
             "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20"],
        ):
            assert 0.0 <= xi <= 1.0, f"{name} must be in the range [0.0, 1.0]."
    except AssertionError as e:
        raise ValueError(f"Assertion failed for continuous variables: {e}")

    # Assertion checks for the categorical parameters
    try:
        assert c1 in ["c1_0", "c1_1"], f"Invalid value for c1: {c1}."
        assert c2 in ["c2_0", "c2_1"], f"Invalid value for c2: {c2}."
        assert c3 in ["c3_0", "c3_1", "c3_2"], f"Invalid value for c3: {c3}."
    except AssertionError as e:
        raise ValueError(f"Assertion failed for categorical variables: {e}")

    # Execute the prediction via the Gradio client
    result = client.predict(
        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
        x11, x12, x13, x14, x15, x16, x17, x18, x19, x20,  # continuous variables
        c1, c2, c3,  # categorical variables
        0.5,  # fidelity
        api_name="/predict",
    )
    return result["data"][0][0]  # return only the y1 (RMSE) value
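
# Example usage (a sketch; requires the Hugging Face space to be awake, and
# the midpoint inputs below are illustrative only):
#   y1 = adv_opt("c1_0", "c2_0", "c3_0", *[0.5] * 20)
#   print(f"Predicted RMSE: {y1}")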

#%%
# Run this cell and the ones above if "CrabNet_lookup_testing_y1.csv" and
# "CrabNet_lookup_training_y2.csv" are not provided, or to regenerate data for
# the benchmark. (adv_opt_y1 / adv_opt_y2 denote variants of adv_opt that
# return the y1 or y2 output of the API, respectively.)

# def generate_parameters():
#     while True:
#         # Random float values for x1 to x20 between 0.0 and 1.0
#         params = {f"x{i}": np.random.uniform(0.0, 1.0) for i in range(1, 21)}

#         # Random categorical values for c1, c2, c3
#         params["c1"] = np.random.choice(["c1_0", "c1_1"])
#         params["c2"] = np.random.choice(["c2_0", "c2_1"])
#         params["c3"] = np.random.choice(["c3_0", "c3_1", "c3_2"])

#         # Check the constraints; resample until both are satisfied
#         if params["x19"] < params["x20"] and params["x6"] + params["x15"] <= 1.0:
#             return params

# # Create a DataFrame with 20 initial points
# data = [generate_parameters() for _ in range(20)]
# initial_points = pd.DataFrame(data)
# # Make sure c1, c2, c3 are str type
# initial_points["c1"] = initial_points["c1"].apply(str)
# initial_points["c2"] = initial_points["c2"].apply(str)
# initial_points["c3"] = initial_points["c3"].apply(str)

# # Create dataframes holding the initial points for y1 and y2, with a Target
# # column for the y1/y2 value and a Function column for the function used
# lookup_training_y2 = initial_points.copy()
# lookup_training_y2["Target"] = lookup_training_y2.apply(lambda x: adv_opt_y2(**x), axis=1)
# lookup_training_y2["Function"] = "TrainingY2"

# lookup_testing_y1 = initial_points.copy()
# lookup_testing_y1["Target"] = lookup_testing_y1.apply(lambda x: adv_opt_y1(**x), axis=1)
# lookup_testing_y1["Function"] = "TestingY1"

# # Save lookup_training_y2 and lookup_testing_y1 to csv
# lookup_testing_y1.to_csv("CrabNet_lookup_testing_y1.csv", index=False)
# lookup_training_y2.to_csv("CrabNet_lookup_training_y2.csv", index=False)
#%%
# Import the lookup data
strHomeDir = os.getcwd()  # assumes the script is run from the repository root
df_trainingY2 = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "CrabNet_lookup_training_y2.csv")
)
df_testingY1 = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "CrabNet_lookup_testing_y1.csv")
)

# Concatenate the two dataframes and drop the "Target" column to obtain the
# candidate search space
dfSearchSpace = pd.concat([df_testingY1, df_trainingY2], ignore_index=True)
dfSearchSpace = dfSearchSpace.drop(columns=["Target"])
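
# Optional sanity check of the assembled search space (a minimal sketch,
# assuming the CSV schema used above: columns x1-x20, c1-c3, Function):
#   print(dfSearchSpace.shape)
#   print(dfSearchSpace["Function"].value_counts())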

def advopt(settings: ConvergenceExperimentSettings) -> DataFrame:
    """CrabNet hyperparameter optimization with 20 discrete and 3 categorical inputs.

    Compares the random, default, and no-task-parameter setups.
    Inputs:
        x1-x20  discrete     0 ≤ xi ≤ 1 for i ∈ {1, ..., 20}
                             (x1-x20 are continuous hyperparameters, treated
                             as discrete for simplicity)
        c1      categorical  c1 ∈ {c1_0, c1_1}
        c2      categorical  c2 ∈ {c2_0, c2_1}
        c3      categorical  c3 ∈ {c3_0, c3_1, c3_2}
    Output: continuous
    Objective: minimization
    """
    # Define and create the search space
    lstParameters = []
    lstParameters_noTask = []
    # All columns except c1, c2, c3, and Function are numerical inputs
    for col in dfSearchSpace.columns[:-4]:
        # Create a NumericalDiscreteParameter
        temp_parameter = NumericalDiscreteParameter(
            name=col,
            values=np.unique(dfSearchSpace[col]),
            tolerance=0.0,
        )
        # Append the parameter to both lists
        lstParameters.append(temp_parameter)
        lstParameters_noTask.append(temp_parameter)

    categorical_parameters = [
        CategoricalParameter(name="c1", values=["c1_0", "c1_1"], encoding="OHE"),
        CategoricalParameter(name="c2", values=["c2_0", "c2_1"], encoding="OHE"),
        CategoricalParameter(name="c3", values=["c3_0", "c3_1", "c3_2"], encoding="OHE"),
    ]
    lstParameters.extend(categorical_parameters)
    lstParameters_noTask.extend(categorical_parameters)

    task_parameter = TaskParameter(
        name="Function",
        values=["TrainingY2", "TestingY1"],
        active_values=["TestingY1"],
    )

    # Append the task parameter to the list
    lstParameters.append(task_parameter)

    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters)
    searchspace_noTask = SearchSpace.from_dataframe(
        df_testingY1.drop(columns=["Target", "Function"]),
        parameters=lstParameters_noTask,
    )

    # Define the objective
    objective = NumericalTarget(name="Target", mode=TargetMode.MIN).to_objective()
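
    # With the TaskParameter, rows labeled "TrainingY2" only inform the
    # surrogate model; recommendations are restricted to the active
    # "TestingY1" task.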

    # Define the scenarios
    scenarios: dict[str, Campaign] = {
        "Random Recommender": Campaign(
            searchspace=searchspace_noTask,
            recommender=RandomRecommender(),
            objective=objective,
        ),
        "Default Recommender": Campaign(
            searchspace=search_space,
            objective=objective,
        ),
        "noTask": Campaign(
            searchspace=searchspace_noTask,
            objective=objective,
        ),
    }

    # Simulate the scenarios
    return simulate_scenarios(
        scenarios,
        df_testingY1,
        batch_size=settings.batch_size,
        n_doe_iterations=settings.n_doe_iterations,
        n_mc_iterations=settings.n_mc_iterations,
        impute_mode="error",
    )
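
# Example usage (a sketch; a full run is expensive since it simulates every
# scenario over all Monte Carlo repetitions):
#   df_results = advopt(ConvergenceExperimentSettings(
#       batch_size=1, n_doe_iterations=5, n_mc_iterations=2))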


def advopt_transfer_learning(settings: ConvergenceExperimentSettings) -> DataFrame:
    """CrabNet hyperparameter optimization with 20 discrete and 3 categorical inputs.

    Transfer learning with different initial data sizes.
    Inputs:
        x1-x20  discrete     0 ≤ xi ≤ 1 for i ∈ {1, ..., 20}
                             (x1-x20 are continuous hyperparameters, treated
                             as discrete for simplicity)
        c1      categorical  c1 ∈ {c1_0, c1_1}
        c2      categorical  c2 ∈ {c2_0, c2_1}
        c3      categorical  c3 ∈ {c3_0, c3_1, c3_2}
    Output: continuous
    Objective: minimization
    """
    # Define and create the search space
    lstParameters = []
    for col in dfSearchSpace.columns[:-4]:
        # Create a NumericalDiscreteParameter and append it to the list
        lstParameters.append(
            NumericalDiscreteParameter(
                name=col,
                values=np.unique(dfSearchSpace[col]),
                tolerance=0.0,
            )
        )

    categorical_parameters = [
        CategoricalParameter(name="c1", values=["c1_0", "c1_1"], encoding="OHE"),
        CategoricalParameter(name="c2", values=["c2_0", "c2_1"], encoding="OHE"),
        CategoricalParameter(name="c3", values=["c3_0", "c3_1", "c3_2"], encoding="OHE"),
    ]
    lstParameters.extend(categorical_parameters)

    # Append the task parameter to the list
    lstParameters.append(
        TaskParameter(
            name="Function",
            values=["TrainingY2", "TestingY1"],
            active_values=["TestingY1"],
        )
    )

    # Define the objective
    objective = NumericalTarget(name="Target", mode=TargetMode.MIN).to_objective()

    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters)

    # Run one simulation per initial-data size and concatenate the results
    results = []
    for n in (50, 100, 500, 700, 1000):
        campaign_temp = Campaign(searchspace=search_space, objective=objective)
        initial_data_temp = [df_trainingY2.sample(n) for _ in range(settings.n_mc_iterations)]

        results.append(
            simulate_scenarios(
                {f"{n} Initial Data": campaign_temp},
                df_testingY1,
                initial_data=initial_data_temp,
                batch_size=settings.batch_size,
                n_doe_iterations=settings.n_doe_iterations,
                impute_mode="error",
            )
        )

    return pd.concat(results, ignore_index=True)
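
# Note on advopt_transfer_learning: each element of initial_data seeds one
# Monte Carlo repetition, which is why n_mc_iterations is not passed to
# simulate_scenarios there.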

#%%
benchmark_config = ConvergenceExperimentSettings(
    batch_size=1,
    n_doe_iterations=30,
    n_mc_iterations=5,
)

# Define the benchmarks
crabnet_benchmark = Benchmark(
    function=advopt,
    best_possible_result=None,
    settings=benchmark_config,
    optimal_function_inputs=None,
)

crabnet_transfer_learning_benchmark = Benchmark(
    function=advopt_transfer_learning,
    best_possible_result=None,
    settings=benchmark_config,
    optimal_function_inputs=None,
)

# ---------------- to do
# reasonable visualization of the benchmark (e.g., a dataframe summary)
# to help understand the benchmark

if __name__ == "__main__":
    # Describe the benchmark task
    print("CrabNet optimization is a minimization task that tunes 20 numerical and 3 categorical hyperparameters.")
    print("The numerical hyperparameters include the number of attention layers, the learning rate, the epoch step size, etc.")
    print("Details can be found in Table 1 of Baird, S. G.; Liu, M.; Sparks, T. D. High-Dimensional Bayesian Optimization of 23 Hyperparameters over 100 Iterations for an Attention-Based Network to Predict Materials Property: A Case Study on CrabNet Using Ax Platform and SAASBO. Computational Materials Science 2022, 211, 111505.")
    print("The categorical hyperparameters include boolean values for the bias residual network, the loss function, and the elemental feature vector.")
    print("The numerical hyperparameters are normalized to the range [0.0, 1.0], while the categorical hyperparameters are one-hot encoded.")
    print("")
    print("The objective is to minimize y1, the RMSE of the CrabNet hyperparameter function. A y1 value greater than 0.2 is considered bad.")
    print("")
    print("The CrabNet benchmark compares the random, default, and no-task-parameter setups.")
    print("")
    print("The CrabNet transfer learning benchmark compares different initial data sizes.")