
pep 8
kilianFatras committed Nov 10, 2023
1 parent 703dbb6 commit 6acfca7
Showing 8 changed files with 156 additions and 92 deletions.
2 changes: 2 additions & 0 deletions examples/notebooks/mnist_example.ipynb
@@ -253,6 +253,8 @@
"outputs": [],
"source": [
"# follows example from https://github.com/google-research/torchsde/blob/master/examples/cont_ddpm.py\n",
"\n",
"\n",
"class SDE(torch.nn.Module):\n",
" noise_type = \"diagonal\"\n",
" sde_type = \"ito\"\n",
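The hunk above only adds the two blank lines PEP 8 expects before a top-level class definition. For context on the cell it touches, a torchsde-style wrapper of this kind generally has the shape sketched below. This is a hypothetical toy example, not the notebook's actual SDE: the drift `f` and diffusion `g` are placeholders, and only the `noise_type`/`sde_type` attributes and the `torchsde.sdeint` call reflect the real torchsde interface.

```python
# Hypothetical torchsde-style wrapper: placeholder drift/diffusion, real interface.
import torch
import torchsde


class ToySDE(torch.nn.Module):
    noise_type = "diagonal"  # g returns one diffusion value per state dimension
    sde_type = "ito"

    def __init__(self, sigma=0.5):
        super().__init__()
        self.sigma = sigma

    def f(self, t, y):
        # Drift term (placeholder): pull the state towards zero.
        return -y

    def g(self, t, y):
        # Diffusion term; for "diagonal" noise it must have the same shape as y.
        return self.sigma * torch.ones_like(y)


y0 = torch.randn(16, 2)                 # batch of initial states
ts = torch.linspace(0.0, 1.0, 20)       # integration times
ys = torchsde.sdeint(ToySDE(), y0, ts)  # shape [len(ts), 16, 2]
```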
2 changes: 2 additions & 0 deletions examples/notebooks/training-8gaussians-to-moons.ipynb
@@ -843,6 +843,8 @@
],
"source": [
"# %%time\n",
"\n",
"\n",
"class MLP2(torch.nn.Module):\n",
" def __init__(self, dim, out_dim=None, w=64, time_varying=False):\n",
" super().__init__()\n",
4 changes: 2 additions & 2 deletions examples/tabular/README.md
@@ -1,6 +1,6 @@
# Forest-Flow experiment on the Iris dataset using TorchCFM

This notebook is a self-contained example showing how to train the novel Forest-Flow method to generate tabular data [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The idea behind Forest-Flow is to **learn Independent Conditional Flow-Matching's vector field with XGBoost models** instead of neural networks. The motivation is that forests are currently known to work better than neural networks on tabular data tasks. This idea comes with some difficulties, for instance how to approximate Flow Matching's loss, and this notebook shows how to do so on a minimal example. The method, its training procedure, and the experiments are described in [(Jolicoeur-Martineau et al. 2023)](https://arxiv.org/abs/2309.09968). The full code can be found [here](https://github.com/SamsungSAILMontreal/ForestDiffusion).

To run our Jupyter notebooks, install our package:

@@ -20,4 +20,4 @@ conda install -c anaconda ipykernel
python -m ipykernel install --user --name=torchcfm

# launch our notebooks with the torchcfm kernel
```
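To make the README's description concrete before the notebook diff that follows, here is a minimal, self-contained sketch (not part of this commit) of the core idea: regress the Independent CFM target, the constant velocity `ut = x1 - x0` evaluated at the interpolant `xt = (1 - t) * x0 + t * x1`, with one XGBoost regressor per flow step and per output dimension, then generate by Euler-integrating the learned field. Toy Gaussian data, `sigma = 0`, and no class conditioning are simplifying assumptions here; the actual notebook below also handles class labels, noise-sample duplication, and min-max scaling.

```python
# Illustrative sketch only (toy data, no conditioning); see the notebook diff
# below for the real pipeline.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)

# x1 = "real" samples, x0 = Gaussian noise, paired independently.
x1 = rng.normal(loc=2.0, scale=0.5, size=(256, 4))
x0 = rng.standard_normal(x1.shape)
n, c = x1.shape

n_t = 10  # number of flow steps
t_levels = np.linspace(1e-3, 1.0, num=n_t)

# One regressor per (flow step, output dimension), mirroring the notebook below.
models = [[None] * c for _ in range(n_t)]
for i, t in enumerate(t_levels):
    xt = (1.0 - t) * x0 + t * x1  # I-CFM interpolant (assuming sigma = 0)
    ut = x1 - x0                  # conditional target velocity
    for k in range(c):
        reg = xgb.XGBRegressor(n_estimators=50, max_depth=4, tree_method="hist")
        reg.fit(xt, ut[:, k])
        models[i][k] = reg

# Sampling: Euler-integrate the learned field from fresh noise, t = 0 -> 1.
x = rng.standard_normal((64, c))
h = 1.0 / (n_t - 1)
for i in range(n_t - 1):
    v = np.column_stack([models[i][k].predict(x) for k in range(c)])
    x = x + h * v
```

Fitting one booster per (time step, dimension) turns the vector-field regression into a collection of standard tabular regressions, at the price of training `n_t * c` separate models.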
@@ -28,15 +28,17 @@
},
"outputs": [],
"source": [
"from sklearn.datasets import load_iris\n",
"import numpy as np\n",
"import copy\n",
"import xgboost as xgb\n",
"from functools import partial\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from joblib import delayed, Parallel\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import torch\n",
"import xgboost as xgb\n",
"from joblib import Parallel, delayed\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"from torchcfm.conditional_flow_matching import ConditionalFlowMatcher"
]
},
@@ -46,13 +48,13 @@
"metadata": {},
"outputs": [],
"source": [
"#set seed\n",
"seed=1980\n",
"# set seed\n",
"seed = 1980\n",
"np.random.seed(seed)\n",
"torch.manual_seed(seed)\n",
"torch.cuda.manual_seed(seed)\n",
"torch.cuda.manual_seed_all(seed)\n",
"torch.backends.cudnn.benchmark=True"
"torch.backends.cudnn.benchmark = True"
]
},
{
@@ -74,7 +76,7 @@
"source": [
"# Iris: numpy dataset with 4 variables (all numerical) and 1 outcome (categorical; 3 categories)\n",
"my_data = load_iris()\n",
"X, y = my_data['data'], my_data['target']\n",
"X, y = my_data[\"data\"], my_data[\"target\"]\n",
"\n",
"# shuffle the observations\n",
"new_perm = np.random.permutation(X.shape[0])\n",
@@ -165,17 +167,17 @@
"outputs": [],
"source": [
"# Main hyperparameters\n",
"n_t=50 # number of flow steps (higher is better, 50 is enough for great performance)\n",
"duplicate_K=100 # number of different noise sample per real data sample (higher is better)\n",
"n_t = 50 # number of flow steps (higher is better, 50 is enough for great performance)\n",
"duplicate_K = 100 # number of different noise sample per real data sample (higher is better)\n",
"\n",
"# XGBoost hyperparameters\n",
"max_depth=7\n",
"n_estimators=100\n",
"eta=0.3\n",
"tree_method='hist'\n",
"reg_lambda=0.0\n",
"reg_alpha=0.0\n",
"subsample=1.0"
"max_depth = 7\n",
"n_estimators = 100\n",
"eta = 0.3\n",
"tree_method = \"hist\"\n",
"reg_lambda = 0.0\n",
"reg_alpha = 0.0\n",
"subsample = 1.0"
]
},
{
@@ -214,13 +216,13 @@
"\n",
"# Saving the freqency of the classes and storing label masks for later\n",
"y_uniques, y_probs = np.unique(y, return_counts=True)\n",
"y_probs = y_probs/np.sum(y_probs)\n",
"mask_y = {} # mask for which observations has a specific value of y\n",
"y_probs = y_probs / np.sum(y_probs)\n",
"mask_y = {} # mask for which observations has a specific value of y\n",
"for i in range(len(y_uniques)):\n",
" mask_y[y_uniques[i]] = np.zeros(b, dtype=bool)\n",
" mask_y[y_uniques[i]][y == y_uniques[i]] = True\n",
" mask_y[y_uniques[i]] = np.tile(mask_y[y_uniques[i]], (duplicate_K))\n",
"n_y = len(y_uniques) # number of classes"
"n_y = len(y_uniques) # number of classes"
]
},
{
@@ -240,15 +242,17 @@
"t_levels = np.linspace(1e-3, 1, num=n_t)\n",
"\n",
"# Interpolation between x0 and x1 (xt)\n",
"X_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"X_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"\n",
"# Output to predict (ut)\n",
"y_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"y_train = np.zeros((n_t, X0.shape[0], X0.shape[1])) # [n_t, b, c]\n",
"\n",
"# Fill with xt and ut\n",
"for i in range(n_t):\n",
" t = torch.ones(X0.shape[0])*t_levels[i] # current t\n",
" _, xt, ut = FM.sample_location_and_conditional_flow(torch.from_numpy(X0), torch.from_numpy(X1), t=t)\n",
" t = torch.ones(X0.shape[0]) * t_levels[i] # current t\n",
" _, xt, ut = FM.sample_location_and_conditional_flow(\n",
" torch.from_numpy(X0), torch.from_numpy(X1), t=t\n",
" )\n",
" X_train[i], y_train[i] = xt.numpy(), ut.numpy()"
]
},
@@ -270,9 +274,21 @@
"outputs": [],
"source": [
"# Function used for training one model\n",
"\n",
"\n",
"def train_parallel(X_train, y_train):\n",
" model = xgb.XGBRegressor(n_estimators=n_estimators, objective='reg:squarederror', eta=eta, max_depth=max_depth,\n",
" reg_lambda=reg_lambda, reg_alpha=reg_alpha, subsample=subsample, seed=666, tree_method=tree_method, device='cpu')\n",
" model = xgb.XGBRegressor(\n",
" n_estimators=n_estimators,\n",
" objective=\"reg:squarederror\",\n",
" eta=eta,\n",
" max_depth=max_depth,\n",
" reg_lambda=reg_lambda,\n",
" reg_alpha=reg_alpha,\n",
" subsample=subsample,\n",
" seed=666,\n",
" tree_method=tree_method,\n",
" device=\"cpu\",\n",
" )\n",
"\n",
" y_no_miss = ~np.isnan(y_train)\n",
" model.fit(X_train[y_no_miss, :], y_train[y_no_miss])\n",
@@ -304,12 +320,16 @@
"%%time\n",
"# Train all model(s); fast if you have a decent multi-core CPU, but extremely slow on Google Colab because it uses a weak 2-core CPU\n",
"\n",
"regr = Parallel(n_jobs=-1)( # using all cpus\n",
" delayed(train_parallel)(\n",
" X_train.reshape(n_t, b*duplicate_K, c)[i][mask_y[j], :],\n",
" y_train.reshape(n_t, b*duplicate_K, c)[i][mask_y[j], k]\n",
" ) for i in range(n_t) for j in y_uniques for k in range(c)\n",
" )\n",
"\n",
"regr = Parallel(n_jobs=-1)( # using all cpus\n",
" delayed(train_parallel)(\n",
" X_train.reshape(n_t, b * duplicate_K, c)[i][mask_y[j], :],\n",
" y_train.reshape(n_t, b * duplicate_K, c)[i][mask_y[j], k],\n",
" )\n",
" for i in range(n_t)\n",
" for j in y_uniques\n",
" for k in range(c)\n",
")\n",
"\n",
"# Replace fits with doubly loops to make things easier\n",
"regr_ = [[[None for k in range(c)] for i in range(n_t)] for j in y_uniques]\n",
@@ -339,7 +359,7 @@
},
"outputs": [],
"source": [
"batch_size = 150 # number of generated samples"
"batch_size = 150 # number of generated samples"
]
},
{
@@ -351,18 +371,20 @@
"outputs": [],
"source": [
"# Return the flow at time t using the XGBoost models\n",
"\n",
"\n",
"def my_model(t, xt, mask_y=None):\n",
" # xt is [b*c]\n",
" xt = xt.reshape(xt.shape[0] // c, c) # [b, c]\n",
" xt = xt.reshape(xt.shape[0] // c, c) # [b, c]\n",
"\n",
" # Output from the models\n",
" out = np.zeros(xt.shape) # [b, c]\n",
" i = int(round(t*(n_t-1)))\n",
" out = np.zeros(xt.shape) # [b, c]\n",
" i = int(round(t * (n_t - 1)))\n",
" for j, label in enumerate(y_uniques):\n",
" for k in range(c):\n",
" out[mask_y[label], k] = regr[j][i][k].predict(xt[mask_y[label], :])\n",
"\n",
" out = out.reshape(-1) # [b*c]\n",
" out = out.reshape(-1) # [b*c]\n",
" return out"
]
},
@@ -375,13 +397,15 @@
"outputs": [],
"source": [
"# Simple Euler ODE solver (nothing fancy)\n",
"\n",
"\n",
"def euler_solve(x0, my_model, N=100):\n",
" h = 1 / (N-1)\n",
" h = 1 / (N - 1)\n",
" x_fake = x0\n",
" t = 0\n",
" # from t=0 to t=1\n",
" for i in range(N-1):\n",
" x_fake = x_fake + h*my_model(t=t, xt=x_fake)\n",
" for i in range(N - 1):\n",
" x_fake = x_fake + h * my_model(t=t, xt=x_fake)\n",
" t = t + h\n",
" return x_fake"
]
@@ -399,23 +423,25 @@
"\n",
"# Generate random labels for the outcome\n",
"label_y_fake = y_uniques[np.argmax(np.random.multinomial(1, y_probs, size=x0.shape[0]), axis=1)]\n",
"mask_y_fake = {} # mask for which observations has a specific value of y\n",
"mask_y_fake = {} # mask for which observations has a specific value of y\n",
"for i in range(len(y_uniques)):\n",
" mask_y_fake[y_uniques[i]] = np.zeros(x0.shape[0], dtype=bool)\n",
" mask_y_fake[y_uniques[i]][label_y_fake == y_uniques[i]] = True\n",
"\n",
"# ODE solve\n",
"ode_solved = euler_solve(my_model=partial(my_model, mask_y=mask_y_fake), x0=x0.reshape(-1), N=n_t) # [t, b*c]\n",
"solution = ode_solved.reshape(batch_size, c) # [b, c]\n",
"ode_solved = euler_solve(\n",
" my_model=partial(my_model, mask_y=mask_y_fake), x0=x0.reshape(-1), N=n_t\n",
") # [t, b*c]\n",
"solution = ode_solved.reshape(batch_size, c) # [b, c]\n",
"\n",
"# invert the min-max normalization\n",
"solution = scaler.inverse_transform(solution)\n",
"\n",
"# clip to min/max values\n",
"small = (solution < X_min).astype(float)\n",
"solution = small*X_min + (1-small)*solution\n",
"solution = small * X_min + (1 - small) * solution\n",
"big = (solution > X_max).astype(float)\n",
"solution = big*X_max + (1-big)*solution\n",
"solution = big * X_max + (1 - big) * solution\n",
"\n",
"# Concatenate the y label\n",
"Xy_fake = np.concatenate((solution, np.expand_dims(label_y_fake, axis=1)), axis=1)"
@@ -462,7 +488,7 @@
}
],
"source": [
"Xy_true[0:10] # Real data"
"Xy_true[0:10] # Real data"
]
},
{
@@ -497,7 +523,7 @@
}
],
"source": [
"Xy_fake[0:10] # Flow generated data"
"Xy_fake[0:10] # Flow generated data"
]
},
{
@@ -526,13 +552,21 @@
"source": [
"_, (ax1, ax2) = plt.subplots(2)\n",
"# Real data\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:,-1])\n",
"ax1.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax1.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:, -1])\n",
"ax1.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax1.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")\n",
"# Fake data\n",
"scatter = ax2.scatter(Xy_fake[:, 0], Xy_fake[:, 1], c=Xy_fake[:,-1])\n",
"ax2.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax2.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")"
"scatter = ax2.scatter(Xy_fake[:, 0], Xy_fake[:, 1], c=Xy_fake[:, -1])\n",
"ax2.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax2.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")"
]
},
{
@@ -567,7 +601,19 @@
"source": [
"%%time\n",
"from ForestDiffusion import ForestDiffusionModel as ForestFlowModel\n",
"forest_model = ForestFlowModel(X, label_y=y, n_t=50, duplicate_K=100, bin_indexes=[], cat_indexes=[], int_indexes=[], diffusion_type='flow', n_jobs=-1, seed=1)\n",
"\n",
"forest_model = ForestFlowModel(\n",
" X,\n",
" label_y=y,\n",
" n_t=50,\n",
" duplicate_K=100,\n",
" bin_indexes=[],\n",
" cat_indexes=[],\n",
" int_indexes=[],\n",
" diffusion_type=\"flow\",\n",
" n_jobs=-1,\n",
" seed=1,\n",
")\n",
"Xy_fake_ = forest_model.generate(batch_size=X.shape[0])"
]
},
@@ -597,13 +643,21 @@
"source": [
"_, (ax1, ax2) = plt.subplots(2)\n",
"# Real data\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:,-1])\n",
"ax1.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax1.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")\n",
"scatter = ax1.scatter(Xy_true[:, 0], Xy_true[:, 1], c=Xy_true[:, -1])\n",
"ax1.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax1.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")\n",
"# Fake data\n",
"scatter = ax2.scatter(Xy_fake_[:, 0], Xy_fake_[:, 1], c=Xy_fake_[:,-1])\n",
"ax2.set(xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5))\n",
"_ = ax2.legend(scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\")"
"scatter = ax2.scatter(Xy_fake_[:, 0], Xy_fake_[:, 1], c=Xy_fake_[:, -1])\n",
"ax2.set(\n",
" xlabel=my_data.feature_names[0], ylabel=my_data.feature_names[1], xlim=(4, 8), ylim=(2, 4.5)\n",
")\n",
"_ = ax2.legend(\n",
" scatter.legend_elements()[0], my_data.target_names, loc=\"lower right\", title=\"Classes\"\n",
")"
]
},
{
2 changes: 1 addition & 1 deletion requirements.txt
@@ -18,4 +18,4 @@ pytest
# Forest-flow example
xgboost
scikit-learn
ForestDiffusion
4 changes: 1 addition & 3 deletions setup.py
@@ -36,7 +36,5 @@
long_description=readme,
long_description_content_type="text/markdown",
packages=find_packages(),
extras_require = {
'forest-flow': ['xgboost', 'scikit-learn', 'ForestDiffusion']
}
extras_require={"forest-flow": ["xgboost", "scikit-learn", "ForestDiffusion"]},
)
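For context, an optional dependency group declared through `extras_require` is installed by naming the extra at install time; assuming the distribution keeps the `torchcfm` name used elsewhere in this repository, that would be `pip install "torchcfm[forest-flow]"`.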