diff --git a/examples/notebooks/mnist_example.ipynb b/examples/notebooks/mnist_example.ipynb index f93334f..eb81c7a 100644 --- a/examples/notebooks/mnist_example.ipynb +++ b/examples/notebooks/mnist_example.ipynb @@ -253,6 +253,8 @@ "outputs": [], "source": [ "# follows example from https://github.com/google-research/torchsde/blob/master/examples/cont_ddpm.py\n", + "\n", + "\n", "class SDE(torch.nn.Module):\n", " noise_type = \"diagonal\"\n", " sde_type = \"ito\"\n", diff --git a/examples/notebooks/training-8gaussians-to-moons.ipynb b/examples/notebooks/training-8gaussians-to-moons.ipynb index 3123030..e8d69e0 100644 --- a/examples/notebooks/training-8gaussians-to-moons.ipynb +++ b/examples/notebooks/training-8gaussians-to-moons.ipynb @@ -843,6 +843,8 @@ ], "source": [ "# %%time\n", + "\n", + "\n", "class MLP2(torch.nn.Module):\n", " def __init__(self, dim, out_dim=None, w=64, time_varying=False):\n", " super().__init__()\n", diff --git a/examples/tabular/README.md b/examples/tabular/README.md index d02264e..1d7e471 100644 --- a/examples/tabular/README.md +++ b/examples/tabular/README.md @@ -1,6 +1,6 @@ # Forest-Flow experiment on the Iris dataset using TorchCFM -This notebook is a self-contained example showing how to train the novel Forest-Flow method to generate tabular data [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The idea behind Forest-Flow is to **learn Independent Conditional Flow-Matching's vector field with XGBoost models** instead of neural networks. The motivation is that it is known that Forests work currently better than neural networks on Tabular data tasks. This idea comes with some difficulties, for instance how to approximate Flow Matching's loss, and this notebook shows how to do it on a minimal example. The method, its training procedure and the experiments are described in [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The full code can be found [here](https://github.com/SamsungSAILMontreal/ForestDiffusion). +This notebook is a self-contained example showing how to train the novel Forest-Flow method to generate tabular data [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The idea behind Forest-Flow is to **learn Independent Conditional Flow-Matching's vector field with XGBoost models** instead of neural networks. The motivation is that tree-based models currently tend to outperform neural networks on tabular data tasks. This idea comes with some difficulties, for instance how to approximate the Flow Matching loss, and this notebook shows how to do so in a minimal example. The method, its training procedure and the experiments are described in [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The full code can be found [here](https://github.com/SamsungSAILMontreal/ForestDiffusion).
To run our jupyter notebooks, installing our package: @@ -20,4 +20,4 @@ conda install -c anaconda ipykernel python -m ipykernel install --user --name=torchcfm # launch our notebooks with the torchcfm kernel -``` \ No newline at end of file +``` diff --git a/examples/tabular/Tabular_Data_Generation_with_XGBoost_Conditional_Flow_Matching.ipynb b/examples/tabular/Tabular_Data_Generation_with_XGBoost_Conditional_Flow_Matching.ipynb index 0e5bc04..78e6203 100644 --- a/examples/tabular/Tabular_Data_Generation_with_XGBoost_Conditional_Flow_Matching.ipynb +++ b/examples/tabular/Tabular_Data_Generation_with_XGBoost_Conditional_Flow_Matching.ipynb @@ -28,15 +28,17 @@ }, "outputs": [], "source": [ - "from sklearn.datasets import load_iris\n", - "import numpy as np\n", "import copy\n", - "import xgboost as xgb\n", "from functools import partial\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "from joblib import delayed, Parallel\n", + "\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import torch\n", + "import xgboost as xgb\n", + "from joblib import Parallel, delayed\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", "from torchcfm.conditional_flow_matching import ConditionalFlowMatcher" ] }, @@ -46,13 +48,13 @@ "metadata": {}, "outputs": [], "source": [ - "#set seed\n", - "seed=1980\n", + "# set seed\n", + "seed = 1980\n", "np.random.seed(seed)\n", "torch.manual_seed(seed)\n", "torch.cuda.manual_seed(seed)\n", "torch.cuda.manual_seed_all(seed)\n", - "torch.backends.cudnn.benchmark=True" + "torch.backends.cudnn.benchmark = True" ] }, { @@ -74,7 +76,7 @@ "source": [ "# Iris: numpy dataset with 4 variables (all numerical) and 1 outcome (categorical; 3 categories)\n", "my_data = load_iris()\n", - "X, y = my_data['data'], my_data['target']\n", + "X, y = my_data[\"data\"], my_data[\"target\"]\n", "\n", "# shuffle the observations\n", "new_perm = np.random.permutation(X.shape[0])\n", @@ -165,17 +167,17 @@ "outputs": [], "source": [ "# Main hyperparameters\n", - "n_t=50 # number of flow steps (higher is better, 50 is enough for great performance)\n", - "duplicate_K=100 # number of different noise sample per real data sample (higher is better)\n", + "n_t = 50 # number of flow steps (higher is better, 50 is enough for great performance)\n", + "duplicate_K = 100 # number of different noise sample per real data sample (higher is better)\n", "\n", "# XGBoost hyperparameters\n", - "max_depth=7\n", - "n_estimators=100\n", - "eta=0.3\n", - "tree_method='hist'\n", - "reg_lambda=0.0\n", - "reg_alpha=0.0\n", - "subsample=1.0" + "max_depth = 7\n", + "n_estimators = 100\n", + "eta = 0.3\n", + "tree_method = \"hist\"\n", + "reg_lambda = 0.0\n", + "reg_alpha = 0.0\n", + "subsample = 1.0" ] }, { @@ -214,13 +216,13 @@ "\n", "# Saving the freqency of the classes and storing label masks for later\n", "y_uniques, y_probs = np.unique(y, return_counts=True)\n", - "y_probs = y_probs/np.sum(y_probs)\n", - "mask_y = {} # mask for which observations has a specific value of y\n", + "y_probs = y_probs / np.sum(y_probs)\n", + "mask_y = {} # mask for which observations has a specific value of y\n", "for i in range(len(y_uniques)):\n", " mask_y[y_uniques[i]] = np.zeros(b, dtype=bool)\n", " mask_y[y_uniques[i]][y == y_uniques[i]] = True\n", " mask_y[y_uniques[i]] = np.tile(mask_y[y_uniques[i]], (duplicate_K))\n", - "n_y = len(y_uniques) # number of classes" + "n_y = len(y_uniques) # number of classes" ] }, { @@ -240,15 +242,17 @@ "t_levels = 
np.linspace(1e-3, 1, num=n_t)\n", "\n", "# Interpolation between x0 and x1 (xt)\n", - "X_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n", + "X_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n", "\n", "# Output to predict (ut)\n", - "y_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n", + "y_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n", "\n", "# Fill with xt and ut\n", "for i in range(n_t):\n", - " t = torch.ones(X0.shape[0])*t_levels[i] # current t\n", - " _, xt, ut = FM.sample_location_and_conditional_flow(torch.from_numpy(X0), torch.from_numpy(X1), t=t)\n", + " t = torch.ones(X0.shape[0]) * t_levels[i] # current t\n", + " _, xt, ut = FM.sample_location_and_conditional_flow(\n", + " torch.from_numpy(X0), torch.from_numpy(X1), t=t\n", + " )\n", " X_train[i], y_train[i] = xt.numpy(), ut.numpy()" ] }, @@ -270,9 +274,21 @@ "outputs": [], "source": [ "# Function used for training one model\n", + "\n", + "\n", "def train_parallel(X_train, y_train):\n", - " model = xgb.XGBRegressor(n_estimators=n_estimators, objective='reg:squarederror', eta=eta, max_depth=max_depth,\n", - " reg_lambda=reg_lambda, reg_alpha=reg_alpha, subsample=subsample, seed=666, tree_method=tree_method, device='cpu')\n", + " model = xgb.XGBRegressor(\n", + " n_estimators=n_estimators,\n", + " objective=\"reg:squarederror\",\n", + " eta=eta,\n", + " max_depth=max_depth,\n", + " reg_lambda=reg_lambda,\n", + " reg_alpha=reg_alpha,\n", + " subsample=subsample,\n", + " seed=666,\n", + " tree_method=tree_method,\n", + " device=\"cpu\",\n", + " )\n", "\n", " y_no_miss = ~np.isnan(y_train)\n", " model.fit(X_train[y_no_miss, :], y_train[y_no_miss])\n", @@ -304,12 +320,16 @@ "%%time\n", "# Train all model(s); fast if you have a decent multi-core CPU, but extremely slow on Google Colab because it uses a weak 2-core CPU\n", "\n", - "regr = Parallel(n_jobs=-1)( # using all cpus\n", - " delayed(train_parallel)(\n", - " X_train.reshape(n_t, b*duplicate_K, c)[i][mask_y[j], :],\n", - " y_train.reshape(n_t, b*duplicate_K, c)[i][mask_y[j], k]\n", - " ) for i in range(n_t) for j in y_uniques for k in range(c)\n", - " )\n", + "\n", + "regr = Parallel(n_jobs=-1)( # using all cpus\n", + " delayed(train_parallel)(\n", + " X_train.reshape(n_t, b * duplicate_K, c)[i][mask_y[j], :],\n", + " y_train.reshape(n_t, b * duplicate_K, c)[i][mask_y[j], k],\n", + " )\n", + " for i in range(n_t)\n", + " for j in y_uniques\n", + " for k in range(c)\n", + ")\n", "\n", "# Replace fits with doubly loops to make things easier\n", "regr_ = [[[None for k in range(c)] for i in range(n_t)] for j in y_uniques]\n", @@ -339,7 +359,7 @@ }, "outputs": [], "source": [ - "batch_size = 150 # number of generated samples" + "batch_size = 150 # number of generated samples" ] }, { @@ -351,18 +371,20 @@ "outputs": [], "source": [ "# Return the flow at time t using the XGBoost models\n", + "\n", + "\n", "def my_model(t, xt, mask_y=None):\n", " # xt is [b*c]\n", - " xt = xt.reshape(xt.shape[0] // c, c) # [b, c]\n", + " xt = xt.reshape(xt.shape[0] // c, c) # [b, c]\n", "\n", " # Output from the models\n", - " out = np.zeros(xt.shape) # [b, c]\n", - " i = int(round(t*(n_t-1)))\n", + " out = np.zeros(xt.shape) # [b, c]\n", + " i = int(round(t * (n_t - 1)))\n", " for j, label in enumerate(y_uniques):\n", " for k in range(c):\n", " out[mask_y[label], k] = regr[j][i][k].predict(xt[mask_y[label], :])\n", "\n", - " out = out.reshape(-1) # [b*c]\n", + " out = out.reshape(-1) # [b*c]\n", " return out" ] }, @@ -375,13 
+397,15 @@ "outputs": [], "source": [ "# Simple Euler ODE solver (nothing fancy)\n", + "\n", + "\n", "def euler_solve(x0, my_model, N=100):\n", - " h = 1 / (N-1)\n", + " h = 1 / (N - 1)\n", " x_fake = x0\n", " t = 0\n", " # from t=0 to t=1\n", - " for i in range(N-1):\n", - " x_fake = x_fake + h*my_model(t=t, xt=x_fake)\n", + " for i in range(N - 1):\n", + " x_fake = x_fake + h * my_model(t=t, xt=x_fake)\n", " t = t + h\n", " return x_fake" ] @@ -399,23 +423,25 @@ "\n", "# Generate random labels for the outcome\n", "label_y_fake = y_uniques[np.argmax(np.random.multinomial(1, y_probs, size=x0.shape[0]), axis=1)]\n", - "mask_y_fake = {} # mask for which observations has a specific value of y\n", + "mask_y_fake = {} # mask for which observations has a specific value of y\n", "for i in range(len(y_uniques)):\n", " mask_y_fake[y_uniques[i]] = np.zeros(x0.shape[0], dtype=bool)\n", " mask_y_fake[y_uniques[i]][label_y_fake == y_uniques[i]] = True\n", "\n", "# ODE solve\n", - "ode_solved = euler_solve(my_model=partial(my_model, mask_y=mask_y_fake), x0=x0.reshape(-1), N=n_t) # [t, b*c]\n", - "solution = ode_solved.reshape(batch_size, c) # [b, c]\n", + "ode_solved = euler_solve(\n", + " my_model=partial(my_model, mask_y=mask_y_fake), x0=x0.reshape(-1), N=n_t\n", + ") # [t, b*c]\n", + "solution = ode_solved.reshape(batch_size, c) # [b, c]\n", "\n", "# invert the min-max normalization\n", "solution = scaler.inverse_transform(solution)\n", "\n", "# clip to min/max values\n", "small = (solution < X_min).astype(float)\n", - "solution = small*X_min + (1-small)*solution\n", + "solution = small * X_min + (1 - small) * solution\n", "big = (solution > X_max).astype(float)\n", - "solution = big*X_max + (1-big)*solution\n", + "solution = big * X_max + (1 - big) * solution\n", "\n", "# Concatenate the y label\n", "Xy_fake = np.concatenate((solution, np.expand_dims(label_y_fake, axis=1)), axis=1)" @@ -462,7 +488,7 @@ } ], "source": [ - "Xy_true[0:10] # Real data" + "Xy_true[0:10] # Real data" ] }, { @@ -497,7 +523,7 @@ } ], "source": [ - "Xy_fake[0:10] # Flow generated data" + "Xy_fake[0:10] # Flow generated data" ] }, { @@ -526,13 +552,21 @@ "source": [ "_, (ax1, ax2) = plt.subplots(2)\n", "# Real data\n", - "scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:,-1])\n", - "ax1.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n", - "_ = ax1.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")\n", + "scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:, -1])\n", + "ax1.set(\n", + " xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n", + ")\n", + "_ = ax1.legend(\n", + " scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n", + ")\n", "# Fake data\n", - "scatter = ax2.scatter(Xy_fake[:, 0], Xy_fake[:, 1], c=Xy_fake[:,-1])\n", - "ax2.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n", - "_ = ax2.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")" + "scatter = ax2.scatter(Xy_fake[:, 0], Xy_fake[:, 1], c=Xy_fake[:, -1])\n", + "ax2.set(\n", + " xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n", + ")\n", + "_ = ax2.legend(\n", + " scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n", + ")" ] }, { @@ -567,7 +601,19 @@ "source": [ "%%time\n", 
"from ForestDiffusion import ForestDiffusionModel as ForestFlowModel\n", - "forest_model = ForestFlowModel(X, label_y=y, n_t=50, duplicate_K=100, bin_indexes=[], cat_indexes=[], int_indexes=[], diffusion_type='flow', n_jobs=-1, seed=1)\n", + "\n", + "forest_model = ForestFlowModel(\n", + " X,\n", + " label_y=y,\n", + " n_t=50,\n", + " duplicate_K=100,\n", + " bin_indexes=[],\n", + " cat_indexes=[],\n", + " int_indexes=[],\n", + " diffusion_type=\"flow\",\n", + " n_jobs=-1,\n", + " seed=1,\n", + ")\n", "Xy_fake_ = forest_model.generate(batch_size=X.shape[0])" ] }, @@ -597,13 +643,21 @@ "source": [ "_, (ax1, ax2) = plt.subplots(2)\n", "# Real data\n", - "scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:,-1])\n", - "ax1.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n", - "_ = ax1.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")\n", + "scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:, -1])\n", + "ax1.set(\n", + " xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n", + ")\n", + "_ = ax1.legend(\n", + " scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n", + ")\n", "# Fake data\n", - "scatter = ax2.scatter(Xy_fake_[:, 0], Xy_fake_[:, 1], c=Xy_fake_[:,-1])\n", - "ax2.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n", - "_ = ax2.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")" + "scatter = ax2.scatter(Xy_fake_[:, 0], Xy_fake_[:, 1], c=Xy_fake_[:, -1])\n", + "ax2.set(\n", + " xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n", + ")\n", + "_ = ax2.legend(\n", + " scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n", + ")" ] }, { diff --git a/requirements.txt b/requirements.txt index cec9f3c..e72bef8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,4 @@ pytest # Forest-flow example xgboost scikit-learn -ForestDiffusion \ No newline at end of file +ForestDiffusion diff --git a/setup.py b/setup.py index c9cf834..b976bb2 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,5 @@ long_description=readme, long_description_content_type="text/markdown", packages=find_packages(), - extras_require = { - 'forest-flow': ['xgboost', 'scikit-learn', 'ForestDiffusion'] - } + extras_require={"forest-flow": ["xgboost", "scikit-learn", "ForestDiffusion"]}, ) diff --git a/test/test_time_t.py b/test/test_time_t.py index b0486f7..21ad5cc 100644 --- a/test/test_time_t.py +++ b/test/test_time_t.py @@ -1,31 +1,33 @@ -"""Tests for time Tensor t""" +"""Tests for time Tensor t.""" # Author: Kilian Fatras +import pytest import torch + from torchcfm.conditional_flow_matching import ( ConditionalFlowMatcher, ExactOptimalTransportConditionalFlowMatcher, - TargetConditionalFlowMatcher, SchrodingerBridgeConditionalFlowMatcher, - VariancePreservingConditionalFlowMatcher + TargetConditionalFlowMatcher, + VariancePreservingConditionalFlowMatcher, ) -import pytest +seed = 1994 +batch_size = 128 -seed=1994 -batch_size=128 @pytest.mark.parametrize( - "FM", - [ConditionalFlowMatcher(sigma=0.0), + "FM", + [ + ConditionalFlowMatcher(sigma=0.0), ExactOptimalTransportConditionalFlowMatcher(sigma=0.0), TargetConditionalFlowMatcher(sigma=0.0), SchrodingerBridgeConditionalFlowMatcher(sigma=0.0), - 
VariancePreservingConditionalFlowMatcher(sigma=0.0)] + VariancePreservingConditionalFlowMatcher(sigma=0.0), + ], ) def test_random_Tensor_t(FM): - print('HERE') # Test sample_location_and_conditional_flow functions x0 = torch.randn(batch_size, 2) x1 = torch.randn(batch_size, 2) @@ -33,30 +35,36 @@ def test_random_Tensor_t(FM): torch.manual_seed(seed) t_given = torch.rand(batch_size) t_given, xt, ut = FM.sample_location_and_conditional_flow(x0, x1, t=t_given) - + torch.manual_seed(seed) t_random, xt, ut = FM.sample_location_and_conditional_flow(x0, x1, t=None) - - assert any(t_given==t_random) + + assert any(t_given == t_random) + @pytest.mark.parametrize( - "FM", - [ExactOptimalTransportConditionalFlowMatcher(sigma=0.0), - SchrodingerBridgeConditionalFlowMatcher(sigma=0.0)] + "FM", + [ + ExactOptimalTransportConditionalFlowMatcher(sigma=0.0), + SchrodingerBridgeConditionalFlowMatcher(sigma=0.0), + ], ) def test_guided_random_Tensor_t(FM): # Test guided_sample_location_and_conditional_flow functions x0 = torch.randn(batch_size, 2) - y0 = torch.randint(high=10, size=(batch_size,1)) + y0 = torch.randint(high=10, size=(batch_size, 1)) x1 = torch.randn(batch_size, 2) - y1 = torch.randint(high=10, size=(batch_size,1)) - + y1 = torch.randint(high=10, size=(batch_size, 1)) + torch.manual_seed(seed) t_given = torch.rand(batch_size) - t_given, xt, ut, y0, y1 = FM.guided_sample_location_and_conditional_flow(x0, x1, y0=y0, y1=y1, t=t_given) - + t_given, xt, ut, y0, y1 = FM.guided_sample_location_and_conditional_flow( + x0, x1, y0=y0, y1=y1, t=t_given + ) + torch.manual_seed(seed) - t_random, xt, ut, y0, y1 = FM.guided_sample_location_and_conditional_flow(x0, x1, y0=y0, y1=y1, t=None) + t_random, xt, ut, y0, y1 = FM.guided_sample_location_and_conditional_flow( + x0, x1, y0=y0, y1=y1, t=None + ) - assert any(t_given==t_random) - \ No newline at end of file + assert any(t_given == t_random) diff --git a/torchcfm/conditional_flow_matching.py b/torchcfm/conditional_flow_matching.py index 699ea40..23cb76f 100644 --- a/torchcfm/conditional_flow_matching.py +++ b/torchcfm/conditional_flow_matching.py @@ -185,7 +185,7 @@ def sample_location_and_conditional_flow(self, x0, x1, t=None, return_noise=Fals """ if t is None: t = torch.rand(x0.shape[0]).type_as(x0) - assert len(t)==x0.shape[0], "t has to have batch size dimension" + assert len(t) == x0.shape[0], "t has to have batch size dimension" eps = self.sample_noise_like(x0) xt = self.sample_xt(x0, x1, t, eps)
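For context on the `test/test_time_t.py` changes and the reformatted assert in `conditional_flow_matching.py`: `sample_location_and_conditional_flow` accepts an optional per-sample time tensor `t`, and the assert rejects a `t` whose length does not match the batch. A minimal sketch of that usage, mirroring the tests above (shapes and sigma values are taken from the test file; everything else is illustrative and not part of the patch):

```python
import torch

from torchcfm.conditional_flow_matching import (
    ConditionalFlowMatcher,
    ExactOptimalTransportConditionalFlowMatcher,
)

batch_size = 128
x0 = torch.randn(batch_size, 2)  # source minibatch
x1 = torch.randn(batch_size, 2)  # target minibatch

FM = ConditionalFlowMatcher(sigma=0.0)

# Caller-provided times: one value in [0, 1) per sample in the batch.
t_given = torch.rand(batch_size)
t, xt, ut = FM.sample_location_and_conditional_flow(x0, x1, t=t_given)

# With t=None (the default), the times are drawn internally from U[0, 1).
t_rand, xt_rand, ut_rand = FM.sample_location_and_conditional_flow(x0, x1)

# The guided variant additionally threads class labels through the sampled pairs.
y0 = torch.randint(high=10, size=(batch_size, 1))
y1 = torch.randint(high=10, size=(batch_size, 1))
OT_FM = ExactOptimalTransportConditionalFlowMatcher(sigma=0.0)
t, xt, ut, y0_out, y1_out = OT_FM.guided_sample_location_and_conditional_flow(
    x0, x1, y0=y0, y1=y1, t=t_given
)

# A t with the wrong length now fails fast with
# "t has to have batch size dimension".
```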
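The tabular notebook in this patch carries the full Forest-Flow pipeline (min-max scaling, per-class masks, `duplicate_K` noise draws per row, joblib parallelism, clipping). Stripped of those details, the core recipe it implements, fitting one XGBoost regressor per time level and output dimension on (xt, ut) pairs from the flow matcher and then Euler-integrating the learned field, looks roughly like the sketch below. This uses random stand-in data and omits class conditioning; it is not the notebook's code.

```python
import numpy as np
import torch
import xgboost as xgb

from torchcfm.conditional_flow_matching import ConditionalFlowMatcher

n_t = 50                           # number of discretized flow steps
X1 = np.random.rand(150, 4)        # stand-in for the (scaled) real data
X0 = np.random.randn(*X1.shape)    # Gaussian noise paired with each row
c = X1.shape[1]                    # number of output dimensions

FM = ConditionalFlowMatcher(sigma=0.0)
t_levels = np.linspace(1e-3, 1, num=n_t)

# models[i][k] predicts dimension k of the target velocity ut at time level i.
models = []
for t_val in t_levels:
    t = torch.ones(X0.shape[0]) * t_val
    _, xt, ut = FM.sample_location_and_conditional_flow(
        torch.from_numpy(X0), torch.from_numpy(X1), t=t
    )
    per_dim = []
    for k in range(c):
        reg = xgb.XGBRegressor(n_estimators=100, max_depth=7, tree_method="hist")
        reg.fit(xt.numpy(), ut[:, k].numpy())
        per_dim.append(reg)
    models.append(per_dim)

# Euler integration of the learned vector field from noise (t=0) to data (t=1).
x = np.random.randn(*X1.shape)
h = 1 / (n_t - 1)
for i in range(n_t - 1):
    v = np.stack([models[i][k].predict(x) for k in range(c)], axis=1)
    x = x + h * v
```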