
pep 8
kilianFatras committed Nov 10, 2023
1 parent 703dbb6 commit 6acfca7
Showing 8 changed files with 156 additions and 92 deletions.
2 changes: 2 additions & 0 deletions examples/notebooks/mnist_example.ipynb
@@ -253,6 +253,8 @@
"outputs": [],
"source": [
"# follows example from https://github.com/google-research/torchsde/blob/master/examples/cont_ddpm.py\n",
"\n",
"\n",
"class SDE(torch.nn.Module):\n",
" noise_type = \"diagonal\"\n",
" sde_type = \"ito\"\n",
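The hunk above only adds the two blank lines PEP 8 expects before a top-level class definition. For context on the cell it touches, a torchsde-style wrapper of this kind generally has the shape sketched below. This is a hypothetical toy example, not the notebook's actual SDE: the drift `f` and diffusion `g` are placeholders, and only the `noise_type`/`sde_type` attributes and the `torchsde.sdeint` call reflect the real torchsde interface.

```python
# Hypothetical torchsde-style wrapper: placeholder drift/diffusion, real interface.
import torch
import torchsde


class ToySDE(torch.nn.Module):
    noise_type = "diagonal"  # g returns one diffusion value per state dimension
    sde_type = "ito"

    def __init__(self, sigma=0.5):
        super().__init__()
        self.sigma = sigma

    def f(self, t, y):
        # Drift term (placeholder): pull the state towards zero.
        return -y

    def g(self, t, y):
        # Diffusion term; for "diagonal" noise it must have the same shape as y.
        return self.sigma * torch.ones_like(y)


y0 = torch.randn(16, 2)                 # batch of initial states
ts = torch.linspace(0.0, 1.0, 20)       # integration times
ys = torchsde.sdeint(ToySDE(), y0, ts)  # shape [len(ts), 16, 2]
```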
2 changes: 2 additions & 0 deletions examples/notebooks/training-8gaussians-to-moons.ipynb
@@ -843,6 +843,8 @@
],
"source": [
"# %%time\n",
"\n",
"\n",
"class MLP2(torch.nn.Module):\n",
" def __init__(self, dim, out_dim=None, w=64, time_varying=False):\n",
" super().__init__()\n",
4 changes: 2 additions & 2 deletions examples/tabular/README.md
@@ -1,6 +1,6 @@
# Forest-Flow experiment on the Iris dataset using TorchCFM

This notebook is a self-contained example showing how to train the novel Forest-Flow method to generate tabular data [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The idea behind Forest-Flow is to **learn Independent Conditional Flow-Matching's vector field with XGBoost models** instead of neural networks. The motivation is that forests are currently known to work better than neural networks on tabular data tasks. This idea comes with some difficulties, for instance how to approximate Flow Matching's loss, and this notebook shows how to do so on a minimal example. The method, its training procedure, and the experiments are described in [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The full code can be found [here](https://github.com/SamsungSAILMontreal/ForestDiffusion).

To run our Jupyter notebooks, install our package:

@@ -20,4 +20,4 @@ conda install -c anaconda ipykernel
python -m ipykernel install --user --name=torchcfm

# launch our notebooks with the torchcfm kernel
```
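To make the README's description concrete before the notebook diff that follows, here is a minimal, self-contained sketch (not part of this commit) of the core idea: regress the Independent CFM target, the constant velocity `ut = x1 - x0` evaluated at the interpolant `xt = (1 - t) * x0 + t * x1`, with one XGBoost regressor per flow step and per output dimension, then generate by Euler-integrating the learned field. Toy Gaussian data, `sigma = 0`, and no class conditioning are simplifying assumptions here; the actual notebook below also handles class labels, noise-sample duplication, and min-max scaling.

```python
# Illustrative sketch only (toy data, no conditioning); see the notebook diff
# below for the real pipeline.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)

# x1 = "real" samples, x0 = Gaussian noise, paired independently.
x1 = rng.normal(loc=2.0, scale=0.5, size=(256, 4))
x0 = rng.standard_normal(x1.shape)
n, c = x1.shape

n_t = 10  # number of flow steps
t_levels = np.linspace(1e-3, 1.0, num=n_t)

# One regressor per (flow step, output dimension), mirroring the notebook below.
models = [[None] * c for _ in range(n_t)]
for i, t in enumerate(t_levels):
    xt = (1.0 - t) * x0 + t * x1  # I-CFM interpolant (assuming sigma = 0)
    ut = x1 - x0                  # conditional target velocity
    for k in range(c):
        reg = xgb.XGBRegressor(n_estimators=50, max_depth=4, tree_method="hist")
        reg.fit(xt, ut[:, k])
        models[i][k] = reg

# Sampling: Euler-integrate the learned field from fresh noise, t = 0 -> 1.
x = rng.standard_normal((64, c))
h = 1.0 / (n_t - 1)
for i in range(n_t - 1):
    v = np.column_stack([models[i][k].predict(x) for k in range(c)])
    x = x + h * v
```

Fitting one booster per (time step, dimension) turns the vector-field regression into a collection of standard tabular regressions, at the price of training `n_t * c` separate models.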
@@ -28,15 +28,17 @@
},
"outputs": [],
"source": [
"from sklearn.datasets import load_iris\n",
"import numpy as np\n",
"import copy\n",
"import xgboost as xgb\n",
"from functools import partial\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from joblib import delayed, Parallel\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import torch\n",
"import xgboost as xgb\n",
"from joblib import Parallel, delayed\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"from torchcfm.conditional_flow_matching import ConditionalFlowMatcher"
]
},
@@ -46,13 +48,13 @@
"metadata": {},
"outputs": [],
"source": [
"#set seed\n",
"seed=1980\n",
"# set seed\n",
"seed = 1980\n",
"np.random.seed(seed)\n",
"torch.manual_seed(seed)\n",
"torch.cuda.manual_seed(seed)\n",
"torch.cuda.manual_seed_all(seed)\n",
"torch.backends.cudnn.benchmark=True"
"torch.backends.cudnn.benchmark = True"
]
},
{
@@ -74,7 +76,7 @@
"source": [
"# Iris: numpy dataset with 4 variables (all numerical) and 1 outcome (categorical; 3 categories)\n",
"my_data = load_iris()\n",
"X, y = my_data['data'], my_data['target']\n",
"X, y = my_data[\"data\"], my_data[\"target\"]\n",
"\n",
"# shuffle the observations\n",
"new_perm = np.random.permutation(X.shape[0])\n",
@@ -165,17 +167,17 @@
"outputs": [],
"source": [
"# Main hyperparameters\n",
"n_t=50 # number of flow steps (higher is better, 50 is enough for great performance)\n",
"duplicate_K=100 # number of different noise sample per real data sample (higher is better)\n",
"n_t = 50 # number of flow steps (higher is better, 50 is enough for great performance)\n",
"duplicate_K = 100 # number of different noise sample per real data sample (higher is better)\n",
"\n",
"# XGBoost hyperparameters\n",
"max_depth=7\n",
"n_estimators=100\n",
"eta=0.3\n",
"tree_method='hist'\n",
"reg_lambda=0.0\n",
"reg_alpha=0.0\n",
"subsample=1.0"
"max_depth = 7\n",
"n_estimators = 100\n",
"eta = 0.3\n",
"tree_method = \"hist\"\n",
"reg_lambda = 0.0\n",
"reg_alpha = 0.0\n",
"subsample = 1.0"
]
},
{
@@ -214,13 +216,13 @@
"\n",
"# Saving the freqency of the classes and storing label masks for later\n",
"y_uniques, y_probs = np.unique(y, return_counts=True)\n",
"y_probs = y_probs/np.sum(y_probs)\n",
"mask_y = {} # mask for which observations has a specific value of y\n",
"y_probs = y_probs / np.sum(y_probs)\n",
"mask_y = {} # mask for which observations has a specific value of y\n",
"for i in range(len(y_uniques)):\n",
" mask_y[y_uniques[i]] = np.zeros(b, dtype=bool)\n",
" mask_y[y_uniques[i]][y == y_uniques[i]] = True\n",
" mask_y[y_uniques[i]] = np.tile(mask_y[y_uniques[i]], (duplicate_K))\n",
"n_y = len(y_uniques) # number of classes"
"n_y = len(y_uniques) # number of classes"
]
},
{
@@ -240,15 +242,17 @@
"t_levels = np.linspace(1e-3, 1, num=n_t)\n",
"\n",
"# Interpolation between x0 and x1 (xt)\n",
"X_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"X_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"\n",
"# Output to predict (ut)\n",
"y_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"y_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"\n",
"# Fill with xt and ut\n",
"for i in range(n_t):\n",
" t = torch.ones(X0.shape[0])*t_levels[i] # current t\n",
" _, xt, ut = FM.sample_location_and_conditional_flow(torch.from_numpy(X0), torch.from_numpy(X1), t=t)\n",
" t = torch.ones(X0.shape[0]) * t_levels[i] # current t\n",
" _, xt, ut = FM.sample_location_and_conditional_flow(\n",
" torch.from_numpy(X0), torch.from_numpy(X1), t=t\n",
" )\n",
" X_train[i], y_train[i] = xt.numpy(), ut.numpy()"
]
},
@@ -270,9 +274,21 @@
"outputs": [],
"source": [
"# Function used for training one model\n",
"\n",
"\n",
"def train_parallel(X_train, y_train):\n",
" model = xgb.XGBRegressor(n_estimators=n_estimators, objective='reg:squarederror', eta=eta, max_depth=max_depth,\n",
" reg_lambda=reg_lambda, reg_alpha=reg_alpha, subsample=subsample, seed=666, tree_method=tree_method, device='cpu')\n",
" model = xgb.XGBRegressor(\n",
" n_estimators=n_estimators,\n",
" objective=\"reg:squarederror\",\n",
" eta=eta,\n",
" max_depth=max_depth,\n",
" reg_lambda=reg_lambda,\n",
" reg_alpha=reg_alpha,\n",
" subsample=subsample,\n",
" seed=666,\n",
" tree_method=tree_method,\n",
" device=\"cpu\",\n",
" )\n",
"\n",
" y_no_miss = ~np.isnan(y_train)\n",
" model.fit(X_train[y_no_miss, :], y_train[y_no_miss])\n",
@@ -304,12 +320,16 @@
"%%time\n",
"# Train all model(s); fast if you have a decent multi-core CPU, but extremely slow on Google Colab because it uses a weak 2-core CPU\n",
"\n",
"regr = Parallel(n_jobs=-1)( # using all cpus\n",
" delayed(train_parallel)(\n",
" X_train.reshape(n_t, b*duplicate_K, c)[i][mask_y[j], :],\n",
" y_train.reshape(n_t, b*duplicate_K, c)[i][mask_y[j], k]\n",
" ) for i in range(n_t) for j in y_uniques for k in range(c)\n",
" )\n",
"\n",
"regr = Parallel(n_jobs=-1)( # using all cpus\n",
" delayed(train_parallel)(\n",
" X_train.reshape(n_t, b * duplicate_K, c)[i][mask_y[j], :],\n",
" y_train.reshape(n_t, b * duplicate_K, c)[i][mask_y[j], k],\n",
" )\n",
" for i in range(n_t)\n",
" for j in y_uniques\n",
" for k in range(c)\n",
")\n",
"\n",
"# Replace fits with doubly loops to make things easier\n",
"regr_ = [[[None for k in range(c)] for i in range(n_t)] for j in y_uniques]\n",
@@ -339,7 +359,7 @@
},
"outputs": [],
"source": [
"batch_size = 150 # number of generated samples"
"batch_size = 150 # number of generated samples"
]
},
{
@@ -351,18 +371,20 @@
"outputs": [],
"source": [
"# Return the flow at time t using the XGBoost models\n",
"\n",
"\n",
"def my_model(t, xt, mask_y=None):\n",
" # xt is [b*c]\n",
" xt = xt.reshape(xt.shape[0] // c, c) # [b, c]\n",
" xt = xt.reshape(xt.shape[0] // c, c) # [b, c]\n",
"\n",
" # Output from the models\n",
" out = np.zeros(xt.shape) # [b, c]\n",
" i = int(round(t*(n_t-1)))\n",
" out = np.zeros(xt.shape) # [b, c]\n",
" i = int(round(t * (n_t - 1)))\n",
" for j, label in enumerate(y_uniques):\n",
" for k in range(c):\n",
" out[mask_y[label], k] = regr[j][i][k].predict(xt[mask_y[label], :])\n",
"\n",
" out = out.reshape(-1) # [b*c]\n",
" out = out.reshape(-1) # [b*c]\n",
" return out"
]
},
@@ -375,13 +397,15 @@
"outputs": [],
"source": [
"# Simple Euler ODE solver (nothing fancy)\n",
"\n",
"\n",
"def euler_solve(x0, my_model, N=100):\n",
" h = 1 / (N-1)\n",
" h = 1 / (N - 1)\n",
" x_fake = x0\n",
" t = 0\n",
" # from t=0 to t=1\n",
" for i in range(N-1):\n",
" x_fake = x_fake + h*my_model(t=t, xt=x_fake)\n",
" for i in range(N - 1):\n",
" x_fake = x_fake + h * my_model(t=t, xt=x_fake)\n",
" t = t + h\n",
" return x_fake"
]
@@ -399,23 +423,25 @@
"\n",
"# Generate random labels for the outcome\n",
"label_y_fake = y_uniques[np.argmax(np.random.multinomial(1, y_probs, size=x0.shape[0]), axis=1)]\n",
"mask_y_fake = {} # mask for which observations has a specific value of y\n",
"mask_y_fake = {} # mask for which observations has a specific value of y\n",
"for i in range(len(y_uniques)):\n",
" mask_y_fake[y_uniques[i]] = np.zeros(x0.shape[0], dtype=bool)\n",
" mask_y_fake[y_uniques[i]][label_y_fake == y_uniques[i]] = True\n",
"\n",
"# ODE solve\n",
"ode_solved = euler_solve(my_model=partial(my_model, mask_y=mask_y_fake), x0=x0.reshape(-1), N=n_t) # [t, b*c]\n",
"solution = ode_solved.reshape(batch_size, c) # [b, c]\n",
"ode_solved = euler_solve(\n",
" my_model=partial(my_model, mask_y=mask_y_fake), x0=x0.reshape(-1), N=n_t\n",
") # [t, b*c]\n",
"solution = ode_solved.reshape(batch_size, c) # [b, c]\n",
"\n",
"# invert the min-max normalization\n",
"solution = scaler.inverse_transform(solution)\n",
"\n",
"# clip to min/max values\n",
"small = (solution < X_min).astype(float)\n",
"solution = small*X_min + (1-small)*solution\n",
"solution = small * X_min + (1 - small) * solution\n",
"big = (solution > X_max).astype(float)\n",
"solution = big*X_max + (1-big)*solution\n",
"solution = big * X_max + (1 - big) * solution\n",
"\n",
"# Concatenate the y label\n",
"Xy_fake = np.concatenate((solution, np.expand_dims(label_y_fake, axis=1)), axis=1)"
@@ -462,7 +488,7 @@
}
],
"source": [
"Xy_true[0:10] # Real data"
"Xy_true[0:10] # Real data"
]
},
{
@@ -497,7 +523,7 @@
}
],
"source": [
"Xy_fake[0:10] # Flow generated data"
"Xy_fake[0:10] # Flow generated data"
]
},
{
@@ -526,13 +552,21 @@
"source": [
"_, (ax1, ax2) = plt.subplots(2)\n",
"# Real data\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:,-1])\n",
"ax1.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax1.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:, -1])\n",
"ax1.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax1.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")\n",
"# Fake data\n",
"scatter = ax2.scatter(Xy_fake[:, 0], Xy_fake[:, 1], c=Xy_fake[:,-1])\n",
"ax2.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax2.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")"
"scatter = ax2.scatter(Xy_fake[:, 0], Xy_fake[:, 1], c=Xy_fake[:, -1])\n",
"ax2.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax2.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")"
]
},
{
@@ -567,7 +601,19 @@
"source": [
"%%time\n",
"from ForestDiffusion import ForestDiffusionModel as ForestFlowModel\n",
"forest_model = ForestFlowModel(X, label_y=y, n_t=50, duplicate_K=100, bin_indexes=[], cat_indexes=[], int_indexes=[], diffusion_type='flow', n_jobs=-1, seed=1)\n",
"\n",
"forest_model = ForestFlowModel(\n",
" X,\n",
" label_y=y,\n",
" n_t=50,\n",
" duplicate_K=100,\n",
" bin_indexes=[],\n",
" cat_indexes=[],\n",
" int_indexes=[],\n",
" diffusion_type=\"flow\",\n",
" n_jobs=-1,\n",
" seed=1,\n",
")\n",
"Xy_fake_ = forest_model.generate(batch_size=X.shape[0])"
]
},
@@ -597,13 +643,21 @@
"source": [
"_, (ax1, ax2) = plt.subplots(2)\n",
"# Real data\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:,-1])\n",
"ax1.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax1.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:, -1])\n",
"ax1.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax1.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")\n",
"# Fake data\n",
"scatter = ax2.scatter(Xy_fake_[:, 0], Xy_fake_[:, 1], c=Xy_fake_[:,-1])\n",
"ax2.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax2.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")"
"scatter = ax2.scatter(Xy_fake_[:, 0], Xy_fake_[:, 1], c=Xy_fake_[:, -1])\n",
"ax2.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax2.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")"
]
},
{
2 changes: 1 addition & 1 deletion requirements.txt
@@ -18,4 +18,4 @@ pytest
# Forest-flow example
xgboost
scikit-learn
ForestDiffusion
4 changes: 1 addition & 3 deletions setup.py
@@ -36,7 +36,5 @@
long_description=readme,
long_description_content_type="text/markdown",
packages=find_packages(),
extras_require = {
'forest-flow': ['xgboost', 'scikit-learn', 'ForestDiffusion']
}
extras_require={"forest-flow": ["xgboost", "scikit-learn", "ForestDiffusion"]},
)
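For context, an optional dependency group declared through `extras_require` is installed by naming the extra at install time; assuming the distribution keeps the `torchcfm` name used elsewhere in this repository, that would be `pip install "torchcfm[forest-flow]"`.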