From d345e5c8d066d282b4532e77f7378fef5c13c174 Mon Sep 17 00:00:00 2001
From: 6Ulm <huytran82125@gmail.com>
Date: Mon, 14 Oct 2024 16:43:52 +0200
Subject: [PATCH] fix documentation

---
 README.md                |   4 +-
 ot/gromov/_unbalanced.py | 295 +++++++++++++++++++++------------------
 ot/gromov/_utils.py      |  36 +++--
 3 files changed, 191 insertions(+), 144 deletions(-)

diff --git a/README.md b/README.md
index c2ad764ed..fa9f5789e 100644
--- a/README.md
+++ b/README.md
@@ -379,4 +379,6 @@ distances between Gaussian distributions](https://hal.science/hal-03197398v2/fil
 & B. Thirion (2022). [Aligning individual brains with Fused Unbalanced Gromov-Wasserstein.](https://proceedings.neurips.cc/paper_files/paper/2022/file/8906cac4ca58dcaf17e97a0486ad57ca-Paper-Conference.pdf). Neural Information Processing Systems (NeurIPS).
 
 [71] H. Tran, H. Janati, N. Courty, R. Flamary, I. Redko, P. Demetci & R. Singh (2023). [Unbalanced Co-Optimal Transport](https://dl.acm.org/doi/10.1609/aaai.v37i8.26193). AAAI Conference on
-Artificial Intelligence.
\ No newline at end of file
+Artificial Intelligence.
+
+[72] Thibault Séjourné, François-Xavier Vialard, and Gabriel Peyré (2021). [The Unbalanced Gromov Wasserstein Distance: Conic Formulation and Relaxation](https://proceedings.neurips.cc/paper/2021/file/4990974d150d0de5e6e15a1454fe6b0f-Paper.pdf). Neural Information Processing Systems (NeurIPS).
\ No newline at end of file
diff --git a/ot/gromov/_unbalanced.py b/ot/gromov/_unbalanced.py
index fc4e9688b..cc7b9e53c 100644
--- a/ot/gromov/_unbalanced.py
+++ b/ot/gromov/_unbalanced.py
@@ -26,19 +26,21 @@ def fused_unbalanced_across_spaces_divergence(
 
     r"""Compute the fused unbalanced cross-spaces divergence between two matrices equipped
     with the distributions on rows and columns. We consider two cases of matrix:
+
     - (Squared) similarity matrix in Gromov-Wasserstein setting,
     whose rows and columns represent the samples.
+
     - Arbitrary-size matrix in Co-Optimal Transport setting,
     whose rows represent samples, and columns represent corresponding features/dimensions.
 
-    Return the sample and feature transport plans between
+    More precisely, this function returns the sample and feature transport plans between
     :math:`(\mathbf{X}, \mathbf{w}_{xs}, \mathbf{w}_{xf})` and
-    :math:`(\mathbf{Y}, \mathbf{w}_{ys}, \mathbf{w}_{yf})`.
-
-    The function solves the following problem using Block Coordinate Descent algorithm:
+    :math:`(\mathbf{Y}, \mathbf{w}_{ys}, \mathbf{w}_{yf})`,
+    by solving the following problem using Block Coordinate Descent algorithm:
 
     .. math::
-        \mathbf{Div} = \mathop{\arg \min}_{\mathbf{P}, \mathbf{Q}}
+
+        \mathop{\arg \min}_{\mathbf{P}, \mathbf{Q}}
         &\quad \sum_{i,j,k,l}
         (\mathbf{X}_{i,k} - \mathbf{Y}_{j,l})^2 \mathbf{P}_{i,j} \mathbf{Q}_{k,l} \\
         &+ \rho_s \mathbf{Div}(\mathbf{P}_{\# 1} \mathbf{Q}_{\# 1}^T | \mathbf{w}_{xs} \mathbf{w}_{ys}^T)
@@ -47,7 +49,7 @@ def fused_unbalanced_across_spaces_divergence(
         + \alpha_f \sum_{k, l} \mathbf{Q}_{k,l} \mathbf{M^{(f)}}_{k, l}
         + \mathbf{Reg}(\mathbf{P}, \mathbf{Q})
 
-    Where :
+    Where:
 
     - :math:`\mathbf{X}`: Source input (arbitrary-size) matrix
     - :math:`\mathbf{Y}`: Target input (arbitrary-size) matrix
@@ -59,20 +61,22 @@ def fused_unbalanced_across_spaces_divergence(
     - :math:`\mathbf{w}_{yf}`: Distribution of the features in the target space
     - :math:`\mathbf{Div}`: Either Kullback-Leibler divergence or half-squared L2 norm.
     - :math:`\mathbf{Reg}`: Regularizer for sample and feature couplings.
+
     We consider two types of regularizer:
         + Independent regularization used in unbalanced Co-Optimal Transport
+
         .. math::
             \mathbf{Reg}(\mathbf{P}, \mathbf{Q}) =
             \varepsilon_s \mathbf{Div}(\mathbf{P} | \mathbf{w}_{xs} \mathbf{w}_{ys}^T)
             + \varepsilon_f \mathbf{Div}(\mathbf{Q} | \mathbf{w}_{xf} \mathbf{w}_{yf}^T)
 
         + Joint regularization used in fused unbalanced Gromov-Wasserstein
+
         .. math::
             \mathbf{Reg}(\mathbf{P}, \mathbf{Q}) =
             \varepsilon \mathbf{Div}(\mathbf{P} \otimes \mathbf{Q} | (\mathbf{w}_{xs} \mathbf{w}_{ys}^T) \otimes (\mathbf{w}_{xf} \mathbf{w}_{yf}^T) )
 
-    .. note:: This function allows epsilon to be zero.
-              In that case, unbalanced_method must be either "mm" or "lbfgsb".
+    .. note:: This function allows `epsilon` to be zero. In that case, `unbalanced_solver` must be either "mm" or "lbfgsb".
 
     Parameters
     ----------
@@ -94,23 +98,29 @@ def fused_unbalanced_across_spaces_divergence(
         Uniform distribution by default.
     reg_marginals: float or indexable object of length 1 or 2
         Marginal relaxation terms for sample and feature couplings.
-        If reg_marginals is a scalar or an indexable object of length 1,
-        then the same reg_marginals is applied to both marginal relaxations.
+        If `reg_marginals` is a scalar or an indexable object of length 1,
+        then the same value is applied to both marginal relaxations.
     epsilon : scalar or indexable object of length 2, float or int, optional (default = 0)
         Regularization parameters for entropic approximation of sample and feature couplings.
-        Allow the case where epsilon contains 0. In that case, the MM solver is used by default
-        instead of Sinkhorn solver. If epsilon is scalar, then the same epsilon is applied to
+        Allow the case where `epsilon` contains 0. In that case, the MM solver is used by default
+        instead of Sinkhorn solver. If `epsilon` is scalar, then the same value is applied to
         both regularization of sample and feature couplings.
     reg_type: string, optional
-        reg_type = "joint": then use joint regularization for couplings.
-        reg_type = "indepedent": then use independent regularization for couplings.
+
+        - If `reg_type` = "joint": then use joint regularization for couplings.
+
+        - If `reg_type` = "independent": then use independent regularization for couplings.
     divergence : string, optional (default = "kl")
-        If divergence = "kl", then Div is the Kullback-Leibler divergence.
-        If divergence = "l2", then Div is the half squared Euclidean norm.
+
+        - If `divergence` = "kl", then Div is the Kullback-Leibler divergence.
+
+        - If `divergence` = "l2", then Div is the half squared Euclidean norm.
     unbalanced_solver : string, optional (default = "sinkhorn")
         Solver for the unbalanced OT subroutine.
-        If divergence = "kl", then unbalanced_solver can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
-        If divergence = "l2", then unbalanced_solver can be "mm", "lbfgsb"
+
+        - If `divergence` = "kl", then `unbalanced_solver` can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
+
+        - If `divergence` = "l2", then `unbalanced_solver` can be "mm", "lbfgsb"
     alpha : scalar or indexable object of length 2, float or int, optional (default = 0)
         Coeffficient parameter of linear terms with respect to the sample and feature couplings.
         If alpha is scalar, then the same alpha is applied to both linear terms.
@@ -140,8 +150,8 @@ def fused_unbalanced_across_spaces_divergence(
         Tolerance of unbalanced solver for each of the
         two unbalanced optimal transport problems in each BCD iteration.
     log : bool, optional (default = False)
-        If True then the cost and 4 dual vectors, including
-        2 from sample and 2 from feature couplings, are recorded.
+        If True then the cost and four dual vectors, including
+        two from sample and two from feature couplings, are recorded.
     verbose : bool, optional (default = False)
         If True then print the COOT cost at every multiplier of `eval_bcd`-th iteration.
 
@@ -153,7 +163,9 @@ def fused_unbalanced_across_spaces_divergence(
         Feature coupling matrix.
     log : dictionary, optional
         Returned if `log` is True. The keys are:
-            error : list of L1 norms between the current and previous sample coupling.
+
+            error : array-like, float
+                list of L1 norms between the current and previous sample coupling.
             duals_sample : (n_sample_x, n_sample_y) tuple, float
                 Pair of dual vectors when solving OT problem w.r.t the sample coupling.
             duals_feature : (n_feature_x, n_feature_y) tuple, float
@@ -397,15 +409,13 @@ def unbalanced_co_optimal_transport(
     r"""Compute the unbalanced Co-Optimal Transport between two Euclidean point clouds
     (represented as matrices whose rows are samples and columns are the features/dimensions).
 
-    Return the sample and feature transport plans between
+    More precisely, this function returns the sample and feature transport plans between
     :math:`(\mathbf{X}, \mathbf{w}_{xs}, \mathbf{w}_{xf})` and
-    :math:`(\mathbf{Y}, \mathbf{w}_{ys}, \mathbf{w}_{yf})`.
-
-    The function solves the following problem using Block Coordinate Descent algorithm:
+    :math:`(\mathbf{Y}, \mathbf{w}_{ys}, \mathbf{w}_{yf})`,
+    by solving the following problem using Block Coordinate Descent algorithm:
 
     .. math::
-        \mathbf{UCOOT} = \mathop{\arg \min}_{\mathbf{P}, \mathbf{Q}}
-        &\quad \sum_{i,j,k,l}
+        \mathop{\arg \min}_{\mathbf{P}, \mathbf{Q}} &\quad \sum_{i,j,k,l}
         (\mathbf{X}_{i,k} - \mathbf{Y}_{j,l})^2 \mathbf{P}_{i,j} \mathbf{Q}_{k,l} \\
         &+ \rho_s \mathbf{Div}(\mathbf{P}_{\# 1} \mathbf{Q}_{\# 1}^T | \mathbf{w}_{xs} \mathbf{w}_{ys}^T)
         + \rho_f \mathbf{Div}(\mathbf{P}_{\# 2} \mathbf{Q}_{\# 2}^T | \mathbf{w}_{xf} \mathbf{w}_{yf}^T) \\
@@ -414,7 +424,7 @@ def unbalanced_co_optimal_transport(
         &+ \varepsilon_s \mathbf{Div}(\mathbf{P} | \mathbf{w}_{xs} \mathbf{w}_{ys}^T)
         + \varepsilon_f \mathbf{Div}(\mathbf{Q} | \mathbf{w}_{xf} \mathbf{w}_{yf}^T)
 
-    Where :
+    Where:
 
     - :math:`\mathbf{X}`: Source input (arbitrary-size) matrix
     - :math:`\mathbf{Y}`: Target input (arbitrary-size) matrix
@@ -426,8 +436,7 @@ def unbalanced_co_optimal_transport(
     - :math:`\mathbf{w}_{yf}`: Distribution of the features in the target space
     - :math:`\mathbf{Div}`: Either Kullback-Leibler divergence or half-squared L2 norm.
 
-    .. note:: This function allows epsilon to be zero.
-              In that case, unbalanced_method must be either "mm" or "lbfgsb".
+    .. note:: This function allows `epsilon` to be zero. In that case, `unbalanced_solver` must be either "mm" or "lbfgsb".
 
     Parameters
     ----------
@@ -449,20 +458,24 @@ def unbalanced_co_optimal_transport(
         Uniform distribution by default.
     reg_marginals: float or indexable object of length 1 or 2
         Marginal relaxation terms for sample and feature couplings.
-        If reg_marginals is a scalar or an indexable object of length 1,
-        then the same reg_marginals is applied to both marginal relaxations.
+        If `reg_marginals` is a scalar or an indexable object of length 1,
+        then the same value is applied to both marginal relaxations.
     epsilon : scalar or indexable object of length 2, float or int, optional (default = 0)
         Regularization parameters for entropic approximation of sample and feature couplings.
-        Allow the case where epsilon contains 0. In that case, the MM solver is used by default
-        instead of Sinkhorn solver. If epsilon is scalar, then the same epsilon is applied to
+        Allow the case where `epsilon` contains 0. In that case, the MM solver is used by default
+        instead of Sinkhorn solver. If `epsilon` is scalar, then the same value is applied to
         both regularization of sample and feature couplings.
     divergence : string, optional (default = "kl")
-        If divergence = "kl", then Div is the Kullback-Leibler divergence.
-        If divergence = "l2", then Div is the half squared Euclidean norm.
+
+        - If `divergence` = "kl", then Div is the Kullback-Leibler divergence.
+
+        - If `divergence` = "l2", then Div is the half squared Euclidean norm.
     unbalanced_solver : string, optional (default = "sinkhorn")
         Solver for the unbalanced OT subroutine.
-        If divergence = "kl", then unbalanced_solver can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
-        If divergence = "l2", then unbalanced_solver can be "mm", "lbfgsb"
+
+        - If `divergence` = "kl", then `unbalanced_solver` can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
+
+        - If `divergence` = "l2", then `unbalanced_solver` can be "mm", "lbfgsb"
     alpha : scalar or indexable object of length 2, float or int, optional (default = 0)
         Coeffficient parameter of linear terms with respect to the sample and feature couplings.
         If alpha is scalar, then the same alpha is applied to both linear terms.
@@ -492,8 +505,8 @@ def unbalanced_co_optimal_transport(
         Tolerance of unbalanced solver for each of the
         two unbalanced optimal transport problems in each BCD iteration.
     log : bool, optional (default = False)
-        If True then the cost and 4 dual vectors, including
-        2 from sample and 2 from feature couplings, are recorded.
+        If True then the cost and four dual vectors, including
+        two from sample and two from feature couplings, are recorded.
     verbose : bool, optional (default = False)
         If True then print the COOT cost at every multiplier of `eval_bcd`-th iteration.
 
@@ -505,10 +518,12 @@ def unbalanced_co_optimal_transport(
         Feature coupling matrix.
     log : dictionary, optional
         Returned if `log` is True. The keys are:
-            error : list of L1 norms between the current and previous sample coupling.
-            duals_sample : (n_sample_x, n_sample_y) tuple, float
+
+            error : array-like, float
+                list of L1 norms between the current and previous sample coupling.
+            duals_sample : (n_sample_x, n_sample_y)-tuple, float
                 Pair of dual vectors when solving OT problem w.r.t the sample coupling.
-            duals_feature : (n_feature_x, n_feature_y) tuple, float
+            duals_feature : (n_feature_x, n_feature_y)-tuple, float
                 Pair of dual vectors when solving OT problem w.r.t the feature coupling.
             linear : float
                 Linear part of the cost.
@@ -517,8 +532,8 @@ def unbalanced_co_optimal_transport(
 
     References
     ----------
-    .. [70] H. Tran, H. Janati, N. Courty, R. Flamary, I. Redko, P. Demetci and R. Singh,
-    Unbalanced Co-Optimal Transport, AAAI Conference on Artificial Intelligence, 2023.
+    .. [71] Tran, H., Janati, H., Courty, N., Flamary, R., Redko, I., Demetci, P., & Singh, R.
+            Unbalanced Co-Optimal Transport. AAAI Conference on Artificial Intelligence, 2023.
     """
 
     return fused_unbalanced_across_spaces_divergence(
@@ -543,15 +558,13 @@ def unbalanced_co_optimal_transport2(
     r"""Compute the unbalanced Co-Optimal Transport between two Euclidean point clouds
     (represented as matrices whose rows are samples and columns are the features/dimensions).
 
-    Return the unbalanced Co-Optimal Transport cost between
+    More precisely, this function returns the unbalanced Co-Optimal Transport cost between
     :math:`(\mathbf{X}, \mathbf{w}_{xs}, \mathbf{w}_{xf})` and
-    :math:`(\mathbf{Y}, \mathbf{w}_{ys}, \mathbf{w}_{yf})`.
-
-    The function solves the following problem using Block Coordinate Descent algorithm:
+    :math:`(\mathbf{Y}, \mathbf{w}_{ys}, \mathbf{w}_{yf})`,
+    by solving the following problem using Block Coordinate Descent algorithm:
 
     .. math::
-        \mathbf{UCOOT} = \mathop{\arg \min}_{\mathbf{P}, \mathbf{Q}}
-        &\quad \sum_{i,j,k,l}
+        \mathop{\min}_{\mathbf{P}, \mathbf{Q}} &\quad \sum_{i,j,k,l}
         (\mathbf{X}_{i,k} - \mathbf{Y}_{j,l})^2 \mathbf{P}_{i,j} \mathbf{Q}_{k,l} \\
         &+ \rho_s \mathbf{Div}(\mathbf{P}_{\# 1} \mathbf{Q}_{\# 1}^T | \mathbf{w}_{xs} \mathbf{w}_{ys}^T)
         + \rho_f \mathbf{Div}(\mathbf{P}_{\# 2} \mathbf{Q}_{\# 2}^T | \mathbf{w}_{xf} \mathbf{w}_{yf}^T) \\
@@ -560,7 +573,7 @@ def unbalanced_co_optimal_transport2(
         &+ \varepsilon_s \mathbf{Div}(\mathbf{P} | \mathbf{w}_{xs} \mathbf{w}_{ys}^T)
         + \varepsilon_f \mathbf{Div}(\mathbf{Q} | \mathbf{w}_{xf} \mathbf{w}_{yf}^T)
 
-    Where :
+    Where:
 
     - :math:`\mathbf{X}`: Source input (arbitrary-size) matrix
     - :math:`\mathbf{Y}`: Target input (arbitrary-size) matrix
@@ -572,11 +585,8 @@ def unbalanced_co_optimal_transport2(
     - :math:`\mathbf{w}_{yf}`: Distribution of the features in the target space
     - :math:`\mathbf{Div}`: Either Kullback-Leibler divergence or half-squared L2 norm.
 
-    .. note:: This function allows epsilon to be zero.
-              In that case, unbalanced_method must be either "mm" or "lbfgsb".
-
-              The computation of gradients is only supported for KL divergence.
-              The case of half squared-L2 norm uses those of KL divergence.
+    .. note:: This function allows `epsilon` to be zero. In that case, `unbalanced_solver` must be either "mm" or "lbfgsb".
+            Also the computation of gradients is only supported for KL divergence. The case of half squared-L2 norm uses those of KL divergence.
 
     Parameters
     ----------
@@ -598,20 +608,24 @@ def unbalanced_co_optimal_transport2(
         Uniform distribution by default.
     reg_marginals: float or indexable object of length 1 or 2
         Marginal relaxation terms for sample and feature couplings.
-        If reg_marginals is a scalar or an indexable object of length 1,
-        then the same reg_marginals is applied to both marginal relaxations.
+        If `reg_marginals` is a scalar or an indexable object of length 1,
+        then the same value is applied to both marginal relaxations.
     epsilon : scalar or indexable object of length 2, float or int, optional (default = 0)
         Regularization parameters for entropic approximation of sample and feature couplings.
-        Allow the case where epsilon contains 0. In that case, the MM solver is used by default
-        instead of Sinkhorn solver. If epsilon is scalar, then the same epsilon is applied to
+        Allow the case where `epsilon` contains 0. In that case, the MM solver is used by default
+        instead of Sinkhorn solver. If `epsilon` is scalar, then the same value is applied to
         both regularization of sample and feature couplings.
     divergence : string, optional (default = "kl")
-        If divergence = "kl", then Div is the Kullback-Leibler divergence.
-        If divergence = "l2", then Div is the half squared Euclidean norm.
+
+        - If `divergence` = "kl", then Div is the Kullback-Leibler divergence.
+
+        - If `divergence` = "l2", then Div is the half squared Euclidean norm.
     unbalanced_solver : string, optional (default = "sinkhorn")
         Solver for the unbalanced OT subroutine.
-        If divergence = "kl", then unbalanced_solver can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
-        If divergence = "l2", then unbalanced_solver can be "mm", "lbfgsb"
+
+        - If `divergence` = "kl", then `unbalanced_solver` can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
+
+        - If `divergence` = "l2", then `unbalanced_solver` can be "mm", "lbfgsb"
     alpha : scalar or indexable object of length 2, float or int, optional (default = 0)
         Coeffficient parameter of linear terms with respect to the sample and feature couplings.
         If alpha is scalar, then the same alpha is applied to both linear terms.
@@ -641,8 +655,8 @@ def unbalanced_co_optimal_transport2(
         Tolerance of unbalanced solver for each of the
         two unbalanced optimal transport problems in each BCD iteration.
     log : bool, optional (default = False)
-        If True then the cost and 4 dual vectors, including
-        2 from sample and 2 from feature couplings, are recorded.
+        If True then the cost and four dual vectors, including
+        two from sample and two from feature couplings, are recorded.
     verbose : bool, optional (default = False)
         If True then print the COOT cost at every multiplier of `eval_bcd`-th iteration.
 
@@ -652,10 +666,12 @@ def unbalanced_co_optimal_transport2(
         UCOOT cost.
     log : dictionary, optional
         Returned if `log` is True. The keys are:
-            error : list of L1 norms between the current and previous sample coupling.
-            duals_sample : (n_sample_x, n_sample_y) tuple, float
+
+            error : array-like, float
+                list of L1 norms between the current and previous sample coupling.
+            duals_sample : (n_sample_x, n_sample_y)-tuple, float
                 Pair of dual vectors when solving OT problem w.r.t the sample coupling.
-            duals_feature : (n_feature_x, n_feature_y) tuple, float
+            duals_feature : (n_feature_x, n_feature_y)-tuple, float
                 Pair of dual vectors when solving OT problem w.r.t the feature coupling.
             linear : float
                 Linear part of UCOOT cost.
@@ -666,8 +682,8 @@ def unbalanced_co_optimal_transport2(
 
     References
     ----------
-    .. [70] H. Tran, H. Janati, N. Courty, R. Flamary, I. Redko, P. Demetci and R. Singh,
-    Unbalanced Co-Optimal Transport, AAAI Conference on Artificial Intelligence, 2023.
+    .. [71] Tran, H., Janati, H., Courty, N., Flamary, R., Redko, I., Demetci, P., & Singh, R.
+            Unbalanced Co-Optimal Transport. AAAI Conference on Artificial Intelligence, 2023.
     """
 
     if divergence != "kl":
@@ -747,20 +763,19 @@ def fused_unbalanced_gromov_wasserstein(
     r"""Compute the lower bound of the fused unbalanced Gromov-Wasserstein (FUGW) between two similarity matrices.
     In practice, this lower bound is used interchangeably with the true FUGW.
 
-    Return the transport plan between
-    :math:`(\mathbf{C^X}, \mathbf{w_X})` and :math:`(\mathbf{C^Y}, \mathbf{w_Y})`.
-
-    The function solves the following problem using Block Coordinate Descent algorithm:
+    More precisely, this function returns the transport plan between
+    :math:`(\mathbf{C^X}, \mathbf{w_X})` and :math:`(\mathbf{C^Y}, \mathbf{w_Y})`,
+    by solving the following problem using Block Coordinate Descent algorithm:
 
     .. math::
-        \mathbf{LB-FUGW} = \mathop{\arg \min}_{\mathbf{P}, \mathbf{Q}: mass(P) = mass(Q)}
-        &\quad \sum_{i,j,k,l} (\mathbf{C^X}_{i,k} - \mathbf{C^Y}_{j,l})^2 \mathbf{P}_{i,j} \mathbf{Q}_{k,l} \\
+        \mathop{\arg \min}_{\substack{\mathbf{P}, \mathbf{Q}: \\ mass(P) = mass(Q)}}
+        &\quad \sum_{i,j,k,l} (\mathbf{C^X}_{i,k} - \mathbf{C^Y}_{j,l})^2 \mathbf{P}_{i,j} \mathbf{Q}_{k,l}
+        + \frac{\alpha}{2} \sum_{i,j} (\mathbf{P}_{i,j} + \mathbf{Q}_{i,j}) \mathbf{M}_{i, j} \\
         &+ \rho_1 \mathbf{Div}(\mathbf{P}_{\# 1} \mathbf{Q}_{\# 1}^T | \mathbf{w_X} \mathbf{w_X}^T)
-        + \rho_2 \mathbf{Div}(\mathbf{P}_{\# 2} \mathbf{Q}_{\# 2}^T | \mathbf{w_Y} \mathbf{w_Y}^T)
-        &+ \alpha / 2 \sum_{i,j} (\mathbf{P}_{i,j} + \mathbf{Q}_{i,j}) \mathbf{M}_{i, j}
-        + \varepsilon \mathbf{Div}(\mathbf{P} \otimes \mathbf{Q} | (\mathbf{w_X} \mathbf{w_Y}^T) \otimes (\mathbf{w_X} \mathbf{w_Y}^T))
+        + \rho_2 \mathbf{Div}(\mathbf{P}_{\# 2} \mathbf{Q}_{\# 2}^T | \mathbf{w_Y} \mathbf{w_Y}^T) \\
+        &+ \varepsilon \mathbf{Div}(\mathbf{P} \otimes \mathbf{Q} | (\mathbf{w_X} \mathbf{w_Y}^T) \otimes (\mathbf{w_X} \mathbf{w_Y}^T))
 
-    Where :
+    Where:
 
     - :math:`\mathbf{C^X}`: Source similarity matrix
     - :math:`\mathbf{C^Y}`: Target similarity matrix
@@ -769,8 +784,7 @@ def fused_unbalanced_gromov_wasserstein(
     - :math:`\mathbf{w_Y}`: Distribution of the samples in the target space
     - :math:`\mathbf{Div}`: Either Kullback-Leibler divergence or half-squared L2 norm.
 
-    .. note:: This function allows epsilon to be zero.
-              In that case, unbalanced_method must be either "mm" or "lbfgsb".
+    .. note:: This function allows `epsilon` to be zero. In that case, `unbalanced_solver` must be either "mm" or "lbfgsb".
 
     Parameters
     ----------
@@ -786,27 +800,31 @@ def fused_unbalanced_gromov_wasserstein(
         Uniform distribution by default.
     reg_marginals: float or indexable object of length 1 or 2
         Marginal relaxation terms for sample and feature couplings.
-        If reg_marginals is a scalar or an indexable object of length 1,
-        then the same reg_marginals is applied to both marginal relaxations.
+        If `reg_marginals` is a scalar or an indexable object of length 1,
+        then the same value is applied to both marginal relaxations.
     epsilon : scalar, float or int, optional (default = 0)
         Regularization parameters for entropic approximation of sample and feature couplings.
-        Allow the case where epsilon contains 0. In that case, the MM solver is used by default
-        instead of Sinkhorn solver. If epsilon is scalar, then the same epsilon is applied to
+        Allow the case where `epsilon` contains 0. In that case, the MM solver is used by default
+        instead of Sinkhorn solver. If `epsilon` is scalar, then the same value is applied to
         both regularization of sample and feature couplings.
     divergence : string, optional (default = "kl")
-        If divergence = "kl", then Div is the Kullback-Leibler divergence.
-        If divergence = "l2", then Div is the half squared Euclidean norm.
+
+        - If `divergence` = "kl", then Div is the Kullback-Leibler divergence.
+
+        - If `divergence` = "l2", then Div is the half squared Euclidean norm.
     unbalanced_solver : string, optional (default = "sinkhorn")
         Solver for the unbalanced OT subroutine.
-        If divergence = "kl", then unbalanced_solver can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
-        If divergence = "l2", then unbalanced_solver can be "mm", "lbfgsb"
+
+        - If `divergence` = "kl", then `unbalanced_solver` can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
+
+        - If `divergence` = "l2", then `unbalanced_solver` can be "mm", "lbfgsb"
     alpha : scalar, float or int, optional (default = 0)
         Coeffficient parameter of linear terms with respect to the sample and feature couplings.
         If alpha is scalar, then the same alpha is applied to both linear terms.
     M : (n_sample_x, n_sample_y), float, optional (default = None)
         Sample matrix associated to the Wasserstein linear term on sample coupling.
     init_pi :(n_sample_x, n_sample_y) array-like, optional (default = None)
-        Initialization of sample coupling. By default = wx wy^T.
+        Initialization of sample coupling. By default = :math:`w_X w_Y^T`.
     init_duals : tuple of vectors ((n_sample_x, ), (n_sample_y, )), optional (default = None).
         Initialization of sample and feature dual vectors
         if using Sinkhorn algorithm. Zero vectors by default.
@@ -822,8 +840,8 @@ def fused_unbalanced_gromov_wasserstein(
         Tolerance of unbalanced solver for each of the
         two unbalanced optimal transport problems in each BCD iteration.
     log : bool, optional (default = False)
-        If True then the cost and 4 dual vectors, including
-        2 from sample and 2 from feature couplings, are recorded.
+        If True then the cost and four dual vectors, including
+        two from sample and two from feature couplings, are recorded.
     verbose : bool, optional (default = False)
         If True then print the COOT cost at every multiplier of `eval_bcd`-th iteration.
 
@@ -837,10 +855,12 @@ def fused_unbalanced_gromov_wasserstein(
         In practice, we usually ignore this output.
     log : dictionary, optional
         Returned if `log` is True. The keys are:
-            error : list of L1 norms between the current and previous sample couplings.
-            duals : (n_sample_x, n_sample_y) tuple, float
+
+            error : array-like, float
+                list of L1 norms between the current and previous sample couplings.
+            duals : (n_sample_x, n_sample_y)-tuple, float
                 Pair of dual vectors when solving OT problem w.r.t the sample coupling.
-            linear_cost : float
+            linear : float
                 Linear part of FUGW cost.
             fugw_cost : float
                 Total FUGW cost.
@@ -849,9 +869,13 @@ def fused_unbalanced_gromov_wasserstein(
 
     References
     ----------
-    .. [69] Thual, A., Tran, H., Zemskova, T., Courty, N., Flamary, R., Dehaene, S. & Thirion, B.,
-    Aligning individual brains with Fused Unbalanced Gromov-Wasserstein,
-    Advances in Neural Information Systems, 35 (2022).
+    .. [70] Thual, A., Tran, H., Zemskova, T., Courty, N., Flamary, R., Dehaene, S., & Thirion, B.
+            Aligning individual brains with Fused Unbalanced Gromov-Wasserstein.
+            Advances in Neural Information Processing Systems, 35 (2022).
+
+    .. [72] Thibault Séjourné, François-Xavier Vialard, & Gabriel Peyré.
+            The Unbalanced Gromov Wasserstein Distance: Conic Formulation and Relaxation.
+            Neural Information Processing Systems, 34 (2021).
     """
 
     alpha = (alpha / 2, alpha / 2)
@@ -890,20 +914,19 @@ def fused_unbalanced_gromov_wasserstein2(
     r"""Compute the lower bound of the fused unbalanced Gromov-Wasserstein (FUGW) between two similarity matrices.
     In practice, this lower bound is used interchangeably with the true FUGW.
 
-    Return the lower bound of the fused unbalanced Gromov-Wasserstein cost between
-    :math:`(\mathbf{C^X}, \mathbf{w_X})` and :math:`(\mathbf{C^Y}, \mathbf{w_Y})`.
-
-    The function solves the following problem using Block Coordinate Descent algorithm:
+    More precisely, this function returns the lower bound of the fused unbalanced Gromov-Wasserstein cost between
+    :math:`(\mathbf{C^X}, \mathbf{w_X})` and :math:`(\mathbf{C^Y}, \mathbf{w_Y})`,
+    by solving the following problem using Block Coordinate Descent algorithm:
 
     .. math::
-        \mathbf{LB-FUGW} = \mathop{\arg \min}_{\mathbf{P}, \mathbf{Q}: mass(P) = mass(Q)}
-        &\quad \sum_{i,j,k,l} (\mathbf{C^X}_{i,k} - \mathbf{C^Y}_{j,l})^2 \mathbf{P}_{i,j} \mathbf{Q}_{k,l} \\
+        \mathop{\min}_{\substack{\mathbf{P}, \mathbf{Q}: \\ mass(P) = mass(Q)}}
+        &\quad \sum_{i,j,k,l} (\mathbf{C^X}_{i,k} - \mathbf{C^Y}_{j,l})^2 \mathbf{P}_{i,j} \mathbf{Q}_{k,l}
+        + \frac{\alpha}{2} \sum_{i,j} (\mathbf{P}_{i,j} + \mathbf{Q}_{i,j}) \mathbf{M}_{i, j} \\
         &+ \rho_1 \mathbf{Div}(\mathbf{P}_{\# 1} \mathbf{Q}_{\# 1}^T | \mathbf{w_X} \mathbf{w_X}^T)
-        + \rho_2 \mathbf{Div}(\mathbf{P}_{\# 2} \mathbf{Q}_{\# 2}^T | \mathbf{w_Y} \mathbf{w_Y}^T)
-        &+ \alpha / 2 \sum_{i,j} (\mathbf{P}_{i,j} + \mathbf{Q}_{i,j}) \mathbf{M}_{i, j}
-        + \varepsilon \mathbf{Div}(\mathbf{P} \otimes \mathbf{Q} | (\mathbf{w_X} \mathbf{w_Y}^T) \otimes (\mathbf{w_X} \mathbf{w_Y}^T))
+        + \rho_2 \mathbf{Div}(\mathbf{P}_{\# 2} \mathbf{Q}_{\# 2}^T | \mathbf{w_Y} \mathbf{w_Y}^T) \\
+        &+ \varepsilon \mathbf{Div}(\mathbf{P} \otimes \mathbf{Q} | (\mathbf{w_X} \mathbf{w_Y}^T) \otimes (\mathbf{w_X} \mathbf{w_Y}^T))
 
-    Where :
+    Where:
 
     - :math:`\mathbf{C^X}`: Source similarity matrix
     - :math:`\mathbf{C^Y}`: Target similarity matrix
@@ -912,12 +935,8 @@ def fused_unbalanced_gromov_wasserstein2(
     - :math:`\mathbf{w_Y}`: Distribution of the samples in the target space
     - :math:`\mathbf{Div}`: Either Kullback-Leibler divergence or half-squared L2 norm.
 
-    .. note:: This function allows epsilon to be zero.
-              In that case, unbalanced_method must be either "mm" or "lbfgsb".
-
-              The computation of gradients is only supported for KL divergence,
-              but not for half squared-L2 norm.
-              In case of half squared-L2 norm, the calculation of KL divergence will be used.
+    .. note:: This function allows `epsilon` to be zero. In that case, `unbalanced_solver` must be either "mm" or "lbfgsb".
+            Also, the computation of gradients is only supported for KL divergence, but not for half squared-L2 norm. In case of half squared-L2 norm, the calculation of KL divergence will be used.
 
     Parameters
     ----------
@@ -933,27 +952,31 @@ def fused_unbalanced_gromov_wasserstein2(
         Uniform distribution by default.
     reg_marginals: float or indexable object of length 1 or 2
         Marginal relaxation terms for sample and feature couplings.
-        If reg_marginals is a scalar or an indexable object of length 1,
-        then the same reg_marginals is applied to both marginal relaxations.
+        If `reg_marginals` is a scalar or an indexable object of length 1,
+        then the same value is applied to both marginal relaxations.
     epsilon : scalar, float or int, optional (default = 0)
         Regularization parameters for entropic approximation of sample and feature couplings.
-        Allow the case where epsilon contains 0. In that case, the MM solver is used by default
-        instead of Sinkhorn solver. If epsilon is scalar, then the same epsilon is applied to
+        Allow the case where `epsilon` is 0. In that case, the MM solver is used by default
+        instead of Sinkhorn solver. If `epsilon` is scalar, then the same value is applied to
         both regularization of sample and feature couplings.
     divergence : string, optional (default = "kl")
-        If divergence = "kl", then Div is the Kullback-Leibler divergence.
-        If divergence = "l2", then Div is the half squared Euclidean norm.
+
+        - If `divergence` = "kl", then Div is the Kullback-Leibler divergence.
+
+        - If `divergence` = "l2", then Div is the half squared Euclidean norm.
     unbalanced_solver : string, optional (default = "sinkhorn")
         Solver for the unbalanced OT subroutine.
-        If divergence = "kl", then unbalanced_solver can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
-        If divergence = "l2", then unbalanced_solver can be "mm", "lbfgsb"
+
+        - If `divergence` = "kl", then `unbalanced_solver` can be: "sinkhorn", "sinkhorn_log", "mm", "lbfgsb"
+
+        - If `divergence` = "l2", then `unbalanced_solver` can be "mm", "lbfgsb"
     alpha : scalar, float or int, optional (default = 0)
         Coeffficient parameter of linear terms with respect to the sample and feature couplings.
         If alpha is scalar, then the same alpha is applied to both linear terms.
     M : (n_sample_x, n_sample_y), float, optional (default = None)
         Sample matrix associated to the Wasserstein linear term on sample coupling.
     init_pi :(n_sample_x, n_sample_y) array-like, optional (default = None)
-        Initialization of sample coupling. By default = wx wy^T.
+        Initialization of sample coupling. By default = :math:`w_X w_Y^T`.
     init_duals : tuple of vectors ((n_sample_x, ), (n_sample_y, )), optional (default = None).
         Initialization of sample and feature dual vectors
         if using Sinkhorn algorithm. Zero vectors by default.
@@ -969,8 +992,8 @@ def fused_unbalanced_gromov_wasserstein2(
         Tolerance of unbalanced solver for each of the
         two unbalanced optimal transport problems in each BCD iteration.
     log : bool, optional (default = False)
-        If True then the cost and 4 dual vectors, including
-        Two from sample and two from feature couplings, are recorded.
+        If True then the cost and four dual vectors, including
+        two from sample and two from feature couplings, are recorded.
     verbose : bool, optional (default = False)
         If True then print the COOT cost at every multiplier of `eval_bcd`-th iteration.
 
@@ -980,8 +1003,10 @@ def fused_unbalanced_gromov_wasserstein2(
         Total FUGW cost
     log : dictionary, optional
         Returned if `log` is True. The keys are:
-            error : list of L1 norms between the current and previous sample couplings.
-            duals : (n_sample_x, n_sample_y) tuple, float
+
+            error : array-like, float
+                list of L1 norms between the current and previous sample couplings.
+            duals : (n_sample_x, n_sample_y)-tuple, float
                 Pair of dual vectors when solving OT problem w.r.t the sample coupling.
             linear : float
                 Linear part of FUGW cost.
@@ -992,9 +1017,13 @@ def fused_unbalanced_gromov_wasserstein2(
 
     References
     ----------
-    .. [69] Thual, A., Tran, H., Zemskova, T., Courty, N., Flamary, R., Dehaene, S. & Thirion, B.,
-    Aligning individual brains with Fused Unbalanced Gromov-Wasserstein,
-    Advances in Neural Information Systems, 35 (2022).
+    .. [70] Thual, A., Tran, H., Zemskova, T., Courty, N., Flamary, R., Dehaene, S., & Thirion, B.
+            Aligning individual brains with Fused Unbalanced Gromov-Wasserstein.
+            Advances in Neural Information Processing Systems, 35 (2022).
+
+    .. [72] Thibault Séjourné, François-Xavier Vialard, & Gabriel Peyré.
+            The Unbalanced Gromov Wasserstein Distance: Conic Formulation and Relaxation.
+            Neural Information Processing Systems, 34 (2021).
     """
 
     if divergence != "kl":
diff --git a/ot/gromov/_utils.py b/ot/gromov/_utils.py
index 380d2ae76..fb07bb1ef 100644
--- a/ot/gromov/_utils.py
+++ b/ot/gromov/_utils.py
@@ -805,22 +805,24 @@ def update_barycenter_feature(
 ############################################################################
 
 def div_to_product(pi, a, b, pi1=None, pi2=None, divergence="kl", mass=True, nx=None):
-    r"""Calculate the Bregman divergence between an arbitrary measure and a product measure.
-    This implementation induces cheaper cost than the direct calculation.
+    r"""Fast computation of the Bregman divergence between an arbitrary measure and a product measure.
     Only support for Kullback-Leibler and half-squared L2 divergences.
 
-    For half-squared L2 divergence:
+    - For half-squared L2 divergence:
+
     .. math::
         \frac{1}{2} || \pi - a \otimes b ||^2
         = \frac{1}{2} \Big[ \sum_{i, j} \pi_{ij}^2 + (\sum_i a_i^2) ( \sum_j b_j^2) - 2 \sum_{i, j} a_i \pi_{ij} b_j \Big]
 
-    For Kullback-Leibler divergence:
+    - For Kullback-Leibler divergence:
+
     .. math::
         KL(\pi | a \otimes b)
-        = \langle \pi, \log \pi \rangle - \lange \pi_1, \log a \rangle
+        = \langle \pi, \log \pi \rangle - \langle \pi_1, \log a \rangle
         - \langle \pi_2, \log b \rangle - m(\pi) + m(a) m(b)
 
-    where:
+    where:
+
     - :math:`\pi` is the (`dim_a`, `dim_b`) transport plan
     - :math:`\pi_1` and :math:`\pi_2` are the marginal distributions
     - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions
@@ -879,21 +881,23 @@ def div_to_product(pi, a, b, pi1=None, pi2=None, divergence="kl", mass=True, nx=
 
 
 def div_between_product(mu, nu, alpha, beta, divergence, nx=None):
-    r"""Calculate the Bregmain divergence between two product measures.
-    This implementation induces cheaper cost than the direct calculation.
+    r"""Fast computation of the Bregman divergence between two product measures.
     Only support for Kullback-Leibler and half-squared L2 divergences.
 
     For half-squared L2 divergence:
+
     .. math::
         \frac{1}{2} || \mu \otimes \nu, \alpha \otimes \beta ||^2
         = \frac{1}{2} \Big[ ||\alpha||^2 ||\beta||^2 + ||\mu||^2 ||\nu||^2 - 2 \langle \alpha, \mu \rangle \langle \beta, \nu \rangle \Big]
 
     For Kullback-Leibler divergence:
+
     .. math::
         KL(\mu \otimes \nu, \alpha \otimes \beta)
         = m(\mu) * KL(\nu, \beta) + m(\nu) * KL(\mu, \alpha) + (m(\mu) - m(\alpha)) * (m(\nu) - m(\beta))
 
     where:
+
     - :math:`\mu` and :math:`\alpha` are two measures having the same shape.
     - :math:`\nu` and :math:`\beta` are two measures having the same shape.
     - :math:`m` denotes the mass of the measure
@@ -939,12 +943,15 @@ def uot_cost_matrix(data, pi, tuple_p, hyperparams, divergence, reg_type, nx=Non
     r"""The Block Coordinate Descent algorithm for FUGW and UCOOT
     requires solving an UOT problem in each iteration.
     In particular, we need to specify the following inputs:
+
     - Cost matrix
+
     - Hyperparameters (marginal-relaxations and regularization)
+
     - Reference measures in the marginal-relaxation and regularization terms
 
     This method returns the cost matrix.
-    The method `get_uot_parameters` returns the rest of the inputs.
+    The method :any:`ot.gromov.uot_parameters_and_measures` returns the rest of the inputs.
 
     Parameters
     ----------
@@ -962,7 +969,9 @@ def uot_cost_matrix(data, pi, tuple_p, hyperparams, divergence, reg_type, nx=Non
         Bregman divergence, either "kl" (Kullback-Leibler divergence) or "l2" (half-squared L2 divergence)
     reg_type : string,
         Type of regularization term in the fused unbalanced across-domain divergence
+
         - `reg_type = "joint"` corresponds to FUGW
+
         - `reg_type = "independent"` corresponds to UCOOT
     nx : backend, optional
         If let to its default value None, a backend test will be conducted.
@@ -1001,11 +1010,14 @@ def uot_parameters_and_measures(pi, tuple_weights, hyperparams, reg_type, diverg
     r"""The Block Coordinate Descent algorithm for FUGW and UCOOT
     requires solving an UOT problem in each iteration.
     In particular, we need to specify the following inputs:
+
     - Cost matrix
+
     - Hyperparameters (marginal-relaxations and regularization)
+
     - Reference measures in the marginal-relaxation and regularization terms
 
-    The method `local_cost` returns the cost matrix.
+    The method :any:`ot.gromov.uot_cost_matrix` returns the cost matrix.
     This method returns the rest of the inputs.
 
     Parameters
@@ -1020,7 +1032,9 @@ def uot_parameters_and_measures(pi, tuple_weights, hyperparams, reg_type, diverg
         in the fused unbalanced across-domain divergence
     reg_type : string,
         Type of regularization term in the fused unbalanced across-domain divergence
+
         - `reg_type = "joint"` corresponds to FUGW
+
         - `reg_type = "independent"` corresponds to UCOOT
     divergence : string, default = "kl"
         Bregman divergence, either "kl" (Kullback-Leibler divergence) or "l2" (half-squared L2 divergence)
@@ -1083,7 +1097,9 @@ def fused_unbalanced_across_spaces_cost(M_linear, data, tuple_pxy_samp, tuple_px
         Bregman divergence, either "kl" (Kullback-Leibler divergence) or "l2" (half-squared L2 divergence)
     reg_type : string,
         Type of regularization term in the fused unbalanced across-domain divergence
+
         - `reg_type = "joint"` corresponds to FUGW
+
         - `reg_type = "independent"` corresponds to UCOOT
     nx : backend, optional
         If let to its default value None, a backend test will be conducted.