
# -*- coding: utf-8 -*-
"""
Gromov-Wasserstein and Fused-Gromov-Wasserstein solvers
"""

# Author: Erwan Vautier <erwan.vautier@gmail.com>
#         Nicolas Courty <ncourty@irisa.fr>
#         Rémi Flamary <remi.flamary@unice.fr>
#         Titouan Vayer <titouan.vayer@irisa.fr>
#         Cédric Vincent-Cuaz <cedric.vincent-cuaz@inria.fr>
#
# License: MIT License

import numpy as np


from .bregman import sinkhorn
from .utils import dist, UndefinedParameter, list_to_array
from .optim import cg
from .lp import emd_1d, emd
from .utils import check_random_state, unif
from .backend import get_backend


[docs]def init_matrix(C1, C2, p, q, loss_fun='square_loss'): r"""Return loss matrices and tensors for Gromov-Wasserstein fast computation Returns the value of :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` with the selected loss function as the loss function of Gromow-Wasserstein discrepancy. The matrices are computed as described in Proposition 1 in :ref:`[12] <references-init-matrix>` Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - :math:`\mathbf{T}`: A coupling between those two spaces The square-loss function :math:`L(a, b) = |a - b|^2` is read as : .. math:: L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b) \mathrm{with} \ f_1(a) &= a^2 f_2(b) &= b^2 h_1(a) &= a h_2(b) &= 2b The kl-loss function :math:`L(a, b) = a \log\left(\frac{a}{b}\right) - a + b` is read as : .. math:: L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b) \mathrm{with} \ f_1(a) &= a \log(a) - a f_2(b) &= b h_1(a) &= a h_2(b) &= \log(b) Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Probability distribution in the source space q : array-like, shape (nt,) Probability distribution in the target space loss_fun : str, optional Name of loss function to use: either 'square_loss' or 'kl_loss' (default='square_loss') Returns ------- constC : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) hC2 : array-like, shape (nt, nt) :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6) .. _references-init-matrix: References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ C1, C2, p, q = list_to_array(C1, C2, p, q) nx = get_backend(C1, C2, p, q) if loss_fun == 'square_loss': def f1(a): return (a**2) def f2(b): return (b**2) def h1(a): return a def h2(b): return 2 * b elif loss_fun == 'kl_loss': def f1(a): return a * nx.log(a + 1e-15) - a def f2(b): return b def h1(a): return a def h2(b): return nx.log(b + 1e-15) constC1 = nx.dot( nx.dot(f1(C1), nx.reshape(p, (-1, 1))), nx.ones((1, len(q)), type_as=q) ) constC2 = nx.dot( nx.ones((len(p), 1), type_as=p), nx.dot(nx.reshape(q, (1, -1)), f2(C2).T) ) constC = constC1 + constC2 hC1 = h1(C1) hC2 = h2(C2) return constC, hC1, hC2
[docs]def tensor_product(constC, hC1, hC2, T): r"""Return the tensor for Gromov-Wasserstein fast computation The tensor is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-tensor-product>` Parameters ---------- constC : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) hC2 : array-like, shape (nt, nt) :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6) Returns ------- tens : array-like, shape (`ns`, `nt`) :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` tensor-matrix multiplication result .. _references-tensor-product: References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ constC, hC1, hC2, T = list_to_array(constC, hC1, hC2, T) nx = get_backend(constC, hC1, hC2, T) A = - nx.dot( nx.dot(hC1, T), hC2.T ) tens = constC + A # tens -= tens.min() return tens
[docs]def gwloss(constC, hC1, hC2, T): r"""Return the Loss for Gromov-Wasserstein The loss is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-gwloss>` Parameters ---------- constC : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) hC2 : array-like, shape (nt, nt) :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6) T : array-like, shape (ns, nt) Current value of transport matrix :math:`\mathbf{T}` Returns ------- loss : float Gromov Wasserstein loss .. _references-gwloss: References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ tens = tensor_product(constC, hC1, hC2, T) tens, T = list_to_array(tens, T) nx = get_backend(tens, T) return nx.sum(tens * T)
[docs]def gwggrad(constC, hC1, hC2, T): r"""Return the gradient for Gromov-Wasserstein The gradient is computed as described in Proposition 2 in :ref:`[12] <references-gwggrad>` Parameters ---------- constC : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) hC2 : array-like, shape (nt, nt) :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6) T : array-like, shape (ns, nt) Current value of transport matrix :math:`\mathbf{T}` Returns ------- grad : array-like, shape (`ns`, `nt`) Gromov Wasserstein gradient .. _references-gwggrad: References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ return 2 * tensor_product(constC, hC1, hC2, T) # [12] Prop. 2 misses a 2 factor
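For illustration, a small numerical sketch (not part of the module source, assuming the POT package ``ot`` is importable and using arbitrary random data) checking Proposition 1: the value returned by ``gwloss`` built from ``init_matrix`` matches the naive quadruple sum, provided the plan ``T`` has marginals ``p`` and ``q``::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    C1 = ot.dist(rng.randn(10, 2))          # (ns, ns) source cost matrix
    C2 = ot.dist(rng.randn(15, 2))          # (nt, nt) target cost matrix
    p, q = ot.unif(10), ot.unif(15)
    T = p[:, None] * q[None, :]             # feasible plan with marginals (p, q)

    constC, hC1, hC2 = ot.gromov.init_matrix(C1, C2, p, q, 'square_loss')
    fast = ot.gromov.gwloss(constC, hC1, hC2, T)

    # naive O(ns^2 nt^2) evaluation of sum_{ijkl} (C1[i,k] - C2[j,l])^2 T[i,j] T[k,l]
    L = (C1[:, None, :, None] - C2[None, :, None, :]) ** 2
    naive = np.einsum('ijkl,ij,kl->', L, T, T)
    np.testing.assert_allclose(fast, naive)

    grad = ot.gromov.gwggrad(constC, hC1, hC2, T)   # gradient used by the CG solver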
[docs]def update_square_loss(p, lambdas, T, Cs): r""" Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration Parameters ---------- p : array-like, shape (N,) Masses in the targeted barycenter. lambdas : list of float List of the `S` spaces' weights. T : list of S array-like of shape (ns, N) The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration. Cs : list of S array-like, shape (ns, ns) Metric cost matrices. Returns ------- C : array-like, shape (`N`, `N`) Updated :math:`\mathbf{C}` matrix. """ T = list_to_array(*T) Cs = list_to_array(*Cs) p = list_to_array(p) nx = get_backend(p, *T, *Cs) tmpsum = sum([ lambdas[s] * nx.dot( nx.dot(T[s].T, Cs[s]), T[s] ) for s in range(len(T)) ]) ppt = nx.outer(p, p) return tmpsum / ppt
[docs]def update_kl_loss(p, lambdas, T, Cs): r""" Updates :math:`\mathbf{C}` according to the KL Loss kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration Parameters ---------- p : array-like, shape (N,) Weights in the targeted barycenter. lambdas : list of float List of the `S` spaces' weights. T : list of S array-like of shape (ns, N) The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration. Cs : list of S array-like, shape (ns, ns) Metric cost matrices. Returns ------- C : array-like, shape (`N`, `N`) Updated :math:`\mathbf{C}` matrix. """ Cs = list_to_array(*Cs) T = list_to_array(*T) p = list_to_array(p) nx = get_backend(p, *T, *Cs) tmpsum = sum([ lambdas[s] * nx.dot( nx.dot(T[s].T, Cs[s]), T[s] ) for s in range(len(T)) ]) ppt = nx.outer(p, p) return nx.exp(tmpsum / ppt)
[docs]def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=False, G0=None, **kwargs): r""" Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices .. note:: This function is backend-compatible and will work on arrays from all compatible backends. But the algorithm uses the C++ CPU backend which can lead to copy overhead on GPU arrays. Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Distribution in the source space q : array-like, shape (nt,) Distribution in the target space loss_fun : str loss function used for the solver either 'square_loss' or 'kl_loss' max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0) verbose : bool, optional Print information along iterations log : bool, optional record log if True armijo : bool, optional If True the step of the line-search is found via an armijo research. Else closed form is used. If there are convergence issues use False. G0: array-like, shape (ns,nt), optional If None the initial transport plan of the solver is pq^T. Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. **kwargs : dict parameters can be directly passed to the ot.optim.cg solver Returns ------- T : array-like, shape (`ns`, `nt`) Coupling between the two spaces that minimizes: :math:`\sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}` log : dict Convergence information and loss. References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the metric approach to object matching. Foundations of computational mathematics 11.4 (2011): 417-487. 
""" p, q = list_to_array(p, q) p0, q0, C10, C20 = p, q, C1, C2 if G0 is None: nx = get_backend(p0, q0, C10, C20) else: G0_ = G0 nx = get_backend(p0, q0, C10, C20, G0_) p = nx.to_numpy(p) q = nx.to_numpy(q) C1 = nx.to_numpy(C10) C2 = nx.to_numpy(C20) if G0 is None: G0 = p[:, None] * q[None, :] else: G0 = nx.to_numpy(G0_) # Check marginals of G0 np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08) np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08) constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) def f(G): return gwloss(constC, hC1, hC2, G) def df(G): return gwggrad(constC, hC1, hC2, G) if log: res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs) log['gw_dist'] = nx.from_numpy(gwloss(constC, hC1, hC2, res), type_as=C10) log['u'] = nx.from_numpy(log['u'], type_as=C10) log['v'] = nx.from_numpy(log['v'], type_as=C10) return nx.from_numpy(res, type_as=C10), log else: return nx.from_numpy(cg(p, q, 0, 1, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=False, **kwargs), type_as=C10)
[docs]def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=False, G0=None, **kwargs): r""" Returns the gromov-wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices Note that when using backends, this loss function is differentiable wrt the marices and weights for quadratic loss using the gradients from [38]_. .. note:: This function is backend-compatible and will work on arrays from all compatible backends. But the algorithm uses the C++ CPU backend which can lead to copy overhead on GPU arrays. Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Distribution in the source space. q : array-like, shape (nt,) Distribution in the target space. loss_fun : str loss function used for the solver either 'square_loss' or 'kl_loss' max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0) verbose : bool, optional Print information along iterations log : bool, optional record log if True armijo : bool, optional If True the step of the line-search is found via an armijo research. Else closed form is used. If there are convergence issues use False. G0: array-like, shape (ns,nt), optional If None the initial transport plan of the solver is pq^T. Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. Returns ------- gw_dist : float Gromov-Wasserstein distance log : dict convergence information and Coupling marix References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the metric approach to object matching. Foundations of computational mathematics 11.4 (2011): 417-487. .. [38] C. Vincent-Cuaz, T. Vayer, R. Flamary, M. Corneli, N. Courty, Online Graph Dictionary Learning, International Conference on Machine Learning (ICML), 2021. 
""" p, q = list_to_array(p, q) p0, q0, C10, C20 = p, q, C1, C2 if G0 is None: nx = get_backend(p0, q0, C10, C20) else: G0_ = G0 nx = get_backend(p0, q0, C10, C20, G0_) p = nx.to_numpy(p) q = nx.to_numpy(q) C1 = nx.to_numpy(C10) C2 = nx.to_numpy(C20) constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) if G0 is None: G0 = p[:, None] * q[None, :] else: G0 = nx.to_numpy(G0_) # Check marginals of G0 np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08) np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08) def f(G): return gwloss(constC, hC1, hC2, G) def df(G): return gwggrad(constC, hC1, hC2, G) T, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs) T0 = nx.from_numpy(T, type_as=C10) log_gw['gw_dist'] = nx.from_numpy(gwloss(constC, hC1, hC2, T), type_as=C10) log_gw['u'] = nx.from_numpy(log_gw['u'], type_as=C10) log_gw['v'] = nx.from_numpy(log_gw['v'], type_as=C10) log_gw['T'] = T0 gw = log_gw['gw_dist'] if loss_fun == 'square_loss': gC1 = 2 * C1 * (p[:, None] * p[None, :]) - 2 * T.dot(C2).dot(T.T) gC2 = 2 * C2 * (q[:, None] * q[None, :]) - 2 * T.T.dot(C1).dot(T) gC1 = nx.from_numpy(gC1, type_as=C10) gC2 = nx.from_numpy(gC2, type_as=C10) gw = nx.set_gradients(gw, (p0, q0, C10, C20), (log_gw['u'] - nx.mean(log_gw['u']), log_gw['v'] - nx.mean(log_gw['v']), gC1, gC2)) if log: return gw, log_gw else: return gw
[docs]def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, G0=None, log=False, **kwargs): r""" Computes the FGW transport between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein>`) .. math:: \gamma = \mathop{\arg \min}_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q} \mathbf{\gamma} &\geq 0 where : - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1) - `L` is a loss function to account for the misfit between the similarity matrices .. note:: This function is backend-compatible and will work on arrays from all compatible backends. But the algorithm uses the C++ CPU backend which can lead to copy overhead on GPU arrays. The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein>` Parameters ---------- M : array-like, shape (ns, nt) Metric cost matrix between features across domains C1 : array-like, shape (ns, ns) Metric cost matrix representative of the structure in the source space C2 : array-like, shape (nt, nt) Metric cost matrix representative of the structure in the target space p : array-like, shape (ns,) Distribution in the source space q : array-like, shape (nt,) Distribution in the target space loss_fun : str, optional Loss function used for the solver alpha : float, optional Trade-off parameter (0 < alpha < 1) armijo : bool, optional If True the step of the line-search is found via an armijo research. Else closed form is used. If there are convergence issues use False. G0: array-like, shape (ns,nt), optional If None the initial transport plan of the solver is pq^T. Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. log : bool, optional record log if True **kwargs : dict parameters can be directly passed to the ot.optim.cg solver Returns ------- gamma : array-like, shape (`ns`, `nt`) Optimal transportation matrix for the given parameters. log : dict Log dictionary return only if log==True in parameters. .. _references-fused-gromov-wasserstein: References ---------- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas "Optimal Transport for structured data with application on graphs", International Conference on Machine Learning (ICML). 2019. 
""" p, q = list_to_array(p, q) p0, q0, C10, C20, M0 = p, q, C1, C2, M if G0 is None: nx = get_backend(p0, q0, C10, C20, M0) else: G0_ = G0 nx = get_backend(p0, q0, C10, C20, M0, G0_) p = nx.to_numpy(p) q = nx.to_numpy(q) C1 = nx.to_numpy(C10) C2 = nx.to_numpy(C20) M = nx.to_numpy(M0) if G0 is None: G0 = p[:, None] * q[None, :] else: G0 = nx.to_numpy(G0_) # Check marginals of G0 np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08) np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08) constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) def f(G): return gwloss(constC, hC1, hC2, G) def df(G): return gwggrad(constC, hC1, hC2, G) if log: res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs) fgw_dist = nx.from_numpy(log['loss'][-1], type_as=C10) log['fgw_dist'] = fgw_dist log['u'] = nx.from_numpy(log['u'], type_as=C10) log['v'] = nx.from_numpy(log['v'], type_as=C10) return nx.from_numpy(res, type_as=C10), log else: return nx.from_numpy(cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs), type_as=C10)
[docs]def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, G0=None, log=False, **kwargs): r""" Computes the FGW distance between two graphs see (see :ref:`[24] <references-fused-gromov-wasserstein2>`) .. math:: \min_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q} \mathbf{\gamma} &\geq 0 where : - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1) - `L` is a loss function to account for the misfit between the similarity matrices The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein2>` .. note:: This function is backend-compatible and will work on arrays from all compatible backends. But the algorithm uses the C++ CPU backend which can lead to copy overhead on GPU arrays. Note that when using backends, this loss function is differentiable wrt the marices and weights for quadratic loss using the gradients from [38]_. Parameters ---------- M : array-like, shape (ns, nt) Metric cost matrix between features across domains C1 : array-like, shape (ns, ns) Metric cost matrix representative of the structure in the source space. C2 : array-like, shape (nt, nt) Metric cost matrix representative of the structure in the target space. p : array-like, shape (ns,) Distribution in the source space. q : array-like, shape (nt,) Distribution in the target space. loss_fun : str, optional Loss function used for the solver. alpha : float, optional Trade-off parameter (0 < alpha < 1) armijo : bool, optional If True the step of the line-search is found via an armijo research. Else closed form is used. If there are convergence issues use False. G0: array-like, shape (ns,nt), optional If None the initial transport plan of the solver is pq^T. Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. log : bool, optional Record log if True. **kwargs : dict Parameters can be directly passed to the ot.optim.cg solver. Returns ------- fgw-distance : float Fused gromov wasserstein distance for the given parameters. log : dict Log dictionary return only if log==True in parameters. .. _references-fused-gromov-wasserstein2: References ---------- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas "Optimal Transport for structured data with application on graphs" International Conference on Machine Learning (ICML). 2019. .. [38] C. Vincent-Cuaz, T. Vayer, R. Flamary, M. Corneli, N. Courty, Online Graph Dictionary Learning, International Conference on Machine Learning (ICML), 2021. 
""" p, q = list_to_array(p, q) p0, q0, C10, C20, M0 = p, q, C1, C2, M if G0 is None: nx = get_backend(p0, q0, C10, C20, M0) else: G0_ = G0 nx = get_backend(p0, q0, C10, C20, M0, G0_) p = nx.to_numpy(p) q = nx.to_numpy(q) C1 = nx.to_numpy(C10) C2 = nx.to_numpy(C20) M = nx.to_numpy(M0) constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) if G0 is None: G0 = p[:, None] * q[None, :] else: G0 = nx.to_numpy(G0_) # Check marginals of G0 np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08) np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08) def f(G): return gwloss(constC, hC1, hC2, G) def df(G): return gwggrad(constC, hC1, hC2, G) T, log_fgw = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs) fgw_dist = nx.from_numpy(log_fgw['loss'][-1], type_as=C10) T0 = nx.from_numpy(T, type_as=C10) log_fgw['fgw_dist'] = fgw_dist log_fgw['u'] = nx.from_numpy(log_fgw['u'], type_as=C10) log_fgw['v'] = nx.from_numpy(log_fgw['v'], type_as=C10) log_fgw['T'] = T0 if loss_fun == 'square_loss': gC1 = 2 * C1 * (p[:, None] * p[None, :]) - 2 * T.dot(C2).dot(T.T) gC2 = 2 * C2 * (q[:, None] * q[None, :]) - 2 * T.T.dot(C1).dot(T) gC1 = nx.from_numpy(gC1, type_as=C10) gC2 = nx.from_numpy(gC2, type_as=C10) fgw_dist = nx.set_gradients(fgw_dist, (p0, q0, C10, C20, M0), (log_fgw['u'] - nx.mean(log_fgw['u']), log_fgw['v'] - nx.mean(log_fgw['v']), alpha * gC1, alpha * gC2, (1 - alpha) * T0)) if log: return fgw_dist, log_fgw else: return fgw_dist
[docs]def GW_distance_estimation(C1, C2, p, q, loss_fun, T, nb_samples_p=None, nb_samples_q=None, std=True, random_state=None): r""" Returns an approximation of the gromov-wasserstein cost between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` with a fixed transport plan :math:`\mathbf{T}`. The function gives an unbiased approximation of the following equation: .. math:: GW = \sum_{i,j,k,l} L(\mathbf{C_{1}}_{i,k}, \mathbf{C_{2}}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - `L` : Loss function to account for the misfit between the similarity matrices - :math:`\mathbf{T}`: Matrix with marginal :math:`\mathbf{p}` and :math:`\mathbf{q}` Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Distribution in the source space q : array-like, shape (nt,) Distribution in the target space loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}` Loss function used for the distance, the transport plan does not depend on the loss function T : csr or array-like, shape (ns, nt) Transport plan matrix, either a sparse csr or a dense matrix nb_samples_p : int, optional `nb_samples_p` is the number of samples (without replacement) along the first dimension of :math:`\mathbf{T}` nb_samples_q : int, optional `nb_samples_q` is the number of samples along the second dimension of :math:`\mathbf{T}`, for each sample along the first std : bool, optional Standard deviation associated with the prediction of the gromov-wasserstein cost random_state : int or RandomState instance, optional Fix the seed for reproducibility Returns ------- : float Gromov-wasserstein cost References ---------- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc "Sampled Gromov Wasserstein." Machine Learning Journal (MLJ). 2021. """ C1, C2, p, q = list_to_array(C1, C2, p, q) nx = get_backend(C1, C2, p, q) generator = check_random_state(random_state) len_p = p.shape[0] len_q = q.shape[0] # It is always better to sample from the biggest distribution first. if len_p < len_q: p, q = q, p len_p, len_q = len_q, len_p C1, C2 = C2, C1 T = T.T if nb_samples_p is None: if nx.issparse(T): # If T is sparse, it probably mean that PoGroW was used, thus the number of sample is reduced nb_samples_p = min(int(5 * (len_p * np.log(len_p)) ** 0.5), len_p) else: nb_samples_p = len_p else: # The number of sample along the first dimension is without replacement. nb_samples_p = min(nb_samples_p, len_p) if nb_samples_q is None: nb_samples_q = 1 if std: nb_samples_q = max(2, nb_samples_q) index_k = np.zeros((nb_samples_p, nb_samples_q), dtype=int) index_l = np.zeros((nb_samples_p, nb_samples_q), dtype=int) index_i = generator.choice( len_p, size=nb_samples_p, p=nx.to_numpy(p), replace=False ) index_j = generator.choice( len_p, size=nb_samples_p, p=nx.to_numpy(p), replace=False ) for i in range(nb_samples_p): if nx.issparse(T): T_indexi = nx.reshape(nx.todense(T[index_i[i], :]), (-1,)) T_indexj = nx.reshape(nx.todense(T[index_j[i], :]), (-1,)) else: T_indexi = T[index_i[i], :] T_indexj = T[index_j[i], :] # For each of the row sampled, the column is sampled. 
index_k[i] = generator.choice( len_q, size=nb_samples_q, p=nx.to_numpy(T_indexi / nx.sum(T_indexi)), replace=True ) index_l[i] = generator.choice( len_q, size=nb_samples_q, p=nx.to_numpy(T_indexj / nx.sum(T_indexj)), replace=True ) list_value_sample = nx.stack([ loss_fun( C1[np.ix_(index_i, index_j)], C2[np.ix_(index_k[:, n], index_l[:, n])] ) for n in range(nb_samples_q) ], axis=2) if std: std_value = nx.sum(nx.std(list_value_sample, axis=2) ** 2) ** 0.5 return nx.mean(list_value_sample), std_value / (nb_samples_p * nb_samples_p) else: return nx.mean(list_value_sample)
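An illustrative sketch, estimating the GW cost of a fixed feasible plan (here the independent coupling) with a pointwise loss callable; sample sizes below are arbitrary::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    C1, C2 = ot.dist(rng.randn(50, 2)), ot.dist(rng.randn(60, 2))
    p, q = ot.unif(50), ot.unif(60)
    T = p[:, None] * q[None, :]                   # any plan with marginals (p, q)

    def loss(x, y):                               # elementwise loss L(a, b)
        return np.abs(x - y)

    mean, std = ot.gromov.GW_distance_estimation(C1, C2, p, q, loss, T,
                                                 nb_samples_p=20, nb_samples_q=5,
                                                 random_state=0)
    print(mean, std)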
[docs]def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun, alpha=1, max_iter=100, threshold_plan=0, log=False, verbose=False, random_state=None): r""" Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a stochastic Frank-Wolfe. This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times PN^2)` time complexity with `P` the number of Sinkhorn iterations. The function solves the following optimization problem: .. math:: \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} \mathbf{T}^T \mathbf{1} &= \mathbf{q} \mathbf{T} &\geq 0 Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Distribution in the source space q : array-like, shape (nt,) Distribution in the target space loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}` Loss function used for the distance, the transport plan does not depend on the loss function alpha : float Step of the Frank-Wolfe algorithm, should be between 0 and 1 max_iter : int, optional Max number of iterations threshold_plan : float, optional Deleting very small values in the transport plan. If above zero, it violates the marginal constraints. verbose : bool, optional Print information along iterations log : bool, optional Gives the distance estimated and the standard deviation random_state : int or RandomState instance, optional Fix the seed for reproducibility Returns ------- T : array-like, shape (`ns`, `nt`) Optimal coupling between the two spaces References ---------- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc "Sampled Gromov Wasserstein." Machine Learning Journal (MLJ). 2021. """ C1, C2, p, q = list_to_array(C1, C2, p, q) nx = get_backend(C1, C2, p, q) len_p = p.shape[0] len_q = q.shape[0] generator = check_random_state(random_state) index = np.zeros(2, dtype=int) # Initialize with default marginal index[0] = generator.choice(len_p, size=1, p=nx.to_numpy(p)) index[1] = generator.choice(len_q, size=1, p=nx.to_numpy(q)) T = nx.tocsr(emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False)) best_gw_dist_estimated = np.inf for cpt in range(max_iter): index[0] = generator.choice(len_p, size=1, p=nx.to_numpy(p)) T_index0 = nx.reshape(nx.todense(T[index[0], :]), (-1,)) index[1] = generator.choice( len_q, size=1, p=nx.to_numpy(T_index0 / nx.sum(T_index0)) ) if alpha == 1: T = nx.tocsr( emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False) ) else: new_T = nx.tocsr( emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False) ) T = (1 - alpha) * T + alpha * new_T # To limit the number of non 0, the values below the threshold are set to 0. 
T = nx.eliminate_zeros(T, threshold=threshold_plan) if cpt % 10 == 0 or cpt == (max_iter - 1): gw_dist_estimated = GW_distance_estimation( C1=C1, C2=C2, loss_fun=loss_fun, p=p, q=q, T=T, std=False, random_state=generator ) if gw_dist_estimated < best_gw_dist_estimated: best_gw_dist_estimated = gw_dist_estimated best_T = nx.copy(T) if verbose: if cpt % 200 == 0: print('{:5s}|{:12s}'.format('It.', 'Best gw estimated') + '\n' + '-' * 19) print('{:5d}|{:8e}|'.format(cpt, best_gw_dist_estimated)) if log: log = {} log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation( C1=C1, C2=C2, loss_fun=loss_fun, p=p, q=q, T=best_T, random_state=generator ) return best_T, log return best_T
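Illustrative usage in the sampled-GW setting of [14], with a pointwise loss callable and arbitrary random data::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    C1, C2 = ot.dist(rng.randn(30, 2)), ot.dist(rng.randn(30, 2))
    p, q = ot.unif(30), ot.unif(30)

    def loss(x, y):
        return np.abs(x - y)

    T, log = ot.gromov.pointwise_gromov_wasserstein(C1, C2, p, q, loss,
                                                    max_iter=100, log=True, random_state=42)
    print(log['gw_dist_estimated'], log['gw_dist_std'])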
[docs]def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun, nb_samples_grad=100, epsilon=1, max_iter=500, log=False, verbose=False, random_state=None): r""" Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a 1-stochastic Frank-Wolfe. This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times N \log(N))` time complexity by relying on the 1D Optimal Transport solver. The function solves the following optimization problem: .. math:: \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} \mathbf{T}^T \mathbf{1} &= \mathbf{q} \mathbf{T} &\geq 0 Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Distribution in the source space q : array-like, shape (nt,) Distribution in the target space loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}` Loss function used for the distance, the transport plan does not depend on the loss function nb_samples_grad : int Number of samples to approximate the gradient epsilon : float Weight of the Kullback-Leibler regularization max_iter : int, optional Max number of iterations verbose : bool, optional Print information along iterations log : bool, optional Gives the distance estimated and the standard deviation random_state : int or RandomState instance, optional Fix the seed for reproducibility Returns ------- T : array-like, shape (`ns`, `nt`) Optimal coupling between the two spaces References ---------- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc "Sampled Gromov Wasserstein." Machine Learning Journal (MLJ). 2021. """ C1, C2, p, q = list_to_array(C1, C2, p, q) nx = get_backend(C1, C2, p, q) len_p = p.shape[0] len_q = q.shape[0] generator = check_random_state(random_state) # The most natural way to define nb_sample is with a simple integer. if isinstance(nb_samples_grad, int): if nb_samples_grad > len_p: # As the sampling along the first dimension is done without replacement, the rest is reported to the second # dimension. nb_samples_grad_p, nb_samples_grad_q = len_p, nb_samples_grad // len_p else: nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad, 1 else: nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad T = nx.outer(p, q) # continue_loop allows to stop the loop if there is several successive small modification of T. continue_loop = 0 # The gradient of GW is more complex if the two matrices are not symmetric. C_are_symmetric = nx.allclose(C1, C1.T, rtol=1e-10, atol=1e-10) and nx.allclose(C2, C2.T, rtol=1e-10, atol=1e-10) for cpt in range(max_iter): index0 = generator.choice( len_p, size=nb_samples_grad_p, p=nx.to_numpy(p), replace=False ) Lik = 0 for i, index0_i in enumerate(index0): index1 = generator.choice( len_q, size=nb_samples_grad_q, p=nx.to_numpy(T[index0_i, :] / nx.sum(T[index0_i, :])), replace=False ) # If the matrices C are not symmetric, the gradient has 2 terms, thus the term is chosen randomly. 
if (not C_are_symmetric) and generator.rand(1) > 0.5: Lik += nx.mean(loss_fun( C1[:, [index0[i]] * nb_samples_grad_q][:, None, :], C2[:, index1][None, :, :] ), axis=2) else: Lik += nx.mean(loss_fun( C1[[index0[i]] * nb_samples_grad_q, :][:, :, None], C2[index1, :][:, None, :] ), axis=0) max_Lik = nx.max(Lik) if max_Lik == 0: continue # This division by the max is here to facilitate the choice of epsilon. Lik /= max_Lik if epsilon > 0: # Set to infinity all the numbers below exp(-200) to avoid log of 0. log_T = nx.log(nx.clip(T, np.exp(-200), 1)) log_T = nx.where(log_T == -200, -np.inf, log_T) Lik = Lik - epsilon * log_T try: new_T = sinkhorn(a=p, b=q, M=Lik, reg=epsilon) except (RuntimeWarning, UserWarning): print("Warning catched in Sinkhorn: Return last stable T") break else: new_T = emd(a=p, b=q, M=Lik) change_T = nx.mean((T - new_T) ** 2) if change_T <= 10e-20: continue_loop += 1 if continue_loop > 100: # Number max of low modifications of T T = nx.copy(new_T) break else: continue_loop = 0 if verbose and cpt % 10 == 0: if cpt % 200 == 0: print('{:5s}|{:12s}'.format('It.', '||T_n - T_{n+1}||') + '\n' + '-' * 19) print('{:5d}|{:8e}|'.format(cpt, change_T)) T = nx.copy(new_T) if log: log = {} log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation( C1=C1, C2=C2, loss_fun=loss_fun, p=p, q=q, T=T, random_state=generator ) return T, log return T
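Illustrative usage of this 1-stochastic Frank-Wolfe variant; ``epsilon`` controls the entropic regularization of each inner step and the values below are arbitrary::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    C1, C2 = ot.dist(rng.randn(30, 2)), ot.dist(rng.randn(30, 2))
    p, q = ot.unif(30), ot.unif(30)

    def loss(x, y):
        return np.abs(x - y)

    T, log = ot.gromov.sampled_gromov_wasserstein(C1, C2, p, q, loss,
                                                  nb_samples_grad=50, epsilon=0.1,
                                                  max_iter=200, log=True, random_state=42)
    print(T.shape, log['gw_dist_estimated'])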
[docs]def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, max_iter=1000, tol=1e-9, verbose=False, log=False): r""" Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} \mathbf{T}^T \mathbf{1} &= \mathbf{q} \mathbf{T} &\geq 0 Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices - `H`: entropy Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Distribution in the source space q : array-like, shape (nt,) Distribution in the target space loss_fun : string Loss function used for the solver either 'square_loss' or 'kl_loss' epsilon : float Regularization term >0 max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0) verbose : bool, optional Print information along iterations log : bool, optional Record log if True. Returns ------- T : array-like, shape (`ns`, `nt`) Optimal coupling between the two spaces References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ C1, C2, p, q = list_to_array(C1, C2, p, q) nx = get_backend(C1, C2, p, q) T = nx.outer(p, q) constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) cpt = 0 err = 1 if log: log = {'err': []} while (err > tol and cpt < max_iter): Tprev = T # compute the gradient tens = gwggrad(constC, hC1, hC2, T) T = sinkhorn(p, q, tens, epsilon, method='sinkhorn') if cpt % 10 == 0: # we can speed up the process by checking for the error only all # the 10th iterations err = nx.norm(T - Tprev) if log: log['err'].append(err) if verbose: if cpt % 200 == 0: print('{:5s}|{:12s}'.format( 'It.', 'Err') + '\n' + '-' * 19) print('{:5d}|{:8e}|'.format(cpt, err)) cpt += 1 if log: log['gw_dist'] = gwloss(constC, hC1, hC2, T) return T, log else: return T
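Illustrative usage of the entropic solver on arbitrary random data; ``epsilon`` must be positive, and moderate values are chosen here for numerical stability::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    C1, C2 = ot.dist(rng.randn(20, 2)), ot.dist(rng.randn(30, 2))
    C1, C2 = C1 / C1.max(), C2 / C2.max()
    p, q = ot.unif(20), ot.unif(30)

    T, log = ot.gromov.entropic_gromov_wasserstein(C1, C2, p, q, 'square_loss',
                                                   epsilon=5e-3, log=True)
    print(T.shape, log['gw_dist'])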
[docs]def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, max_iter=1000, tol=1e-9, verbose=False, log=False): r""" Returns the entropic gromov-wasserstein discrepancy between the two measured similarity matrices :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space - :math:`\mathbf{C_2}`: Metric cost matrix in the target space - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices - `H`: entropy Parameters ---------- C1 : array-like, shape (ns, ns) Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space p : array-like, shape (ns,) Distribution in the source space q : array-like, shape (nt,) Distribution in the target space loss_fun : str Loss function used for the solver either 'square_loss' or 'kl_loss' epsilon : float Regularization term >0 max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0) verbose : bool, optional Print information along iterations log : bool, optional Record log if True. Returns ------- gw_dist : float Gromov-Wasserstein distance References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ gw, logv = entropic_gromov_wasserstein( C1, C2, p, q, loss_fun, epsilon, max_iter, tol, verbose, log=True) logv['T'] = gw if log: return logv['gw_dist'], logv else: return logv['gw_dist']
[docs]def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None): r""" Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}` The function solves the following optimization problem: .. math:: \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) Where : - :math:`\mathbf{C}_s`: metric cost matrix - :math:`\mathbf{p}_s`: distribution Parameters ---------- N : int Size of the targeted barycenter Cs : list of S array-like of shape (ns,ns) Metric cost matrices ps : list of S array-like of shape (ns,) Sample weights in the `S` spaces p : array-like, shape(N,) Weights in the targeted barycenter lambdas : list of float List of the `S` spaces' weights. loss_fun : callable Tensor-matrix multiplication function based on specific loss function. update : callable function(:math:`\mathbf{p}`, lambdas, :math:`\mathbf{T}`, :math:`\mathbf{Cs}`) that updates :math:`\mathbf{C}` according to a specific Kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration epsilon : float Regularization term >0 max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0) verbose : bool, optional Print information along iterations. log : bool, optional Record log if True. init_C : bool | array-like, shape (N, N) Random initial value for the :math:`\mathbf{C}` matrix provided by user. random_state : int or RandomState instance, optional Fix the seed for reproducibility Returns ------- C : array-like, shape (`N`, `N`) Similarity matrix in the barycenter space (permutated arbitrarily) log : dict Log dictionary of error during iterations. Return only if `log=True` in parameters. References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ Cs = list_to_array(*Cs) ps = list_to_array(*ps) p = list_to_array(p) nx = get_backend(*Cs, *ps, p) S = len(Cs) # Initialization of C : random SPD matrix (if not provided by user) if init_C is None: generator = check_random_state(random_state) xalea = generator.randn(N, 2) C = dist(xalea, xalea) C /= C.max() C = nx.from_numpy(C, type_as=p) else: C = init_C cpt = 0 err = 1 error = [] while (err > tol) and (cpt < max_iter): Cprev = C T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon, max_iter, 1e-4, verbose, log=False) for s in range(S)] if loss_fun == 'square_loss': C = update_square_loss(p, lambdas, T, Cs) elif loss_fun == 'kl_loss': C = update_kl_loss(p, lambdas, T, Cs) if cpt % 10 == 0: # we can speed up the process by checking for the error only all # the 10th iterations err = nx.norm(C - Cprev) error.append(err) if verbose: if cpt % 200 == 0: print('{:5s}|{:12s}'.format( 'It.', 'Err') + '\n' + '-' * 19) print('{:5d}|{:8e}|'.format(cpt, err)) cpt += 1 if log: return C, {"err": error} else: return C
[docs]def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None): r""" Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}` The function solves the following optimization problem with block coordinate descent: .. math:: \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) Where : - :math:`\mathbf{C}_s`: metric cost matrix - :math:`\mathbf{p}_s`: distribution Parameters ---------- N : int Size of the targeted barycenter Cs : list of S array-like of shape (ns, ns) Metric cost matrices ps : list of S array-like of shape (ns,) Sample weights in the `S` spaces p : array-like, shape (N,) Weights in the targeted barycenter lambdas : list of float List of the `S` spaces' weights loss_fun : callable tensor-matrix multiplication function based on specific loss function update : callable function(:math:`\mathbf{p}`, lambdas, :math:`\mathbf{T}`, :math:`\mathbf{Cs}`) that updates :math:`\mathbf{C}` according to a specific Kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0). verbose : bool, optional Print information along iterations. log : bool, optional Record log if True. init_C : bool | array-like, shape(N,N) Random initial value for the :math:`\mathbf{C}` matrix provided by user. random_state : int or RandomState instance, optional Fix the seed for reproducibility Returns ------- C : array-like, shape (`N`, `N`) Similarity matrix in the barycenter space (permutated arbitrarily) log : dict Log dictionary of error during iterations. Return only if `log=True` in parameters. References ---------- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. """ Cs = list_to_array(*Cs) ps = list_to_array(*ps) p = list_to_array(p) nx = get_backend(*Cs, *ps, p) S = len(Cs) # Initialization of C : random SPD matrix (if not provided by user) if init_C is None: generator = check_random_state(random_state) xalea = generator.randn(N, 2) C = dist(xalea, xalea) C /= C.max() C = nx.from_numpy(C, type_as=p) else: C = init_C cpt = 0 err = 1 error = [] while (err > tol and cpt < max_iter): Cprev = C T = [gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, numItermax=max_iter, stopThr=1e-5, verbose=verbose, log=False) for s in range(S)] if loss_fun == 'square_loss': C = update_square_loss(p, lambdas, T, Cs) elif loss_fun == 'kl_loss': C = update_kl_loss(p, lambdas, T, Cs) if cpt % 10 == 0: # we can speed up the process by checking for the error only all # the 10th iterations err = nx.norm(C - Cprev) error.append(err) if verbose: if cpt % 200 == 0: print('{:5s}|{:12s}'.format( 'It.', 'Err') + '\n' + '-' * 19) print('{:5d}|{:8e}|'.format(cpt, err)) cpt += 1 if log: return C, {"err": error} else: return C
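Illustrative usage: barycenter of three random structure matrices with uniform weights (the entropic variant above is called the same way with an extra ``epsilon`` argument)::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    sizes = [12, 15, 20]
    Cs = [ot.dist(rng.randn(n, 2)) for n in sizes]
    Cs = [C / C.max() for C in Cs]
    ps = [ot.unif(n) for n in sizes]
    N = 10                                        # size of the barycenter
    p = ot.unif(N)
    lambdas = [1 / 3] * 3

    C = ot.gromov.gromov_barycenters(N, Cs, ps, p, lambdas, 'square_loss',
                                     max_iter=100, tol=1e-5, random_state=0)
    print(C.shape)                                # (10, 10), defined up to permutation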
[docs]def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False, p=None, loss_fun='square_loss', max_iter=100, tol=1e-9, verbose=False, log=False, init_C=None, init_X=None, random_state=None): r"""Compute the fgw barycenter as presented eq (5) in :ref:`[24] <references-fgw-barycenters>` Parameters ---------- N : int Desired number of samples of the target barycenter Ys: list of array-like, each element has shape (ns,d) Features of all samples Cs : list of array-like, each element has shape (ns,ns) Structure matrices of all samples ps : list of array-like, each element has shape (ns,) Masses of all samples. lambdas : list of float List of the `S` spaces' weights alpha : float Alpha parameter for the fgw distance fixed_structure : bool Whether to fix the structure of the barycenter during the updates fixed_features : bool Whether to fix the feature of the barycenter during the updates loss_fun : str Loss function used for the solver either 'square_loss' or 'kl_loss' max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0). verbose : bool, optional Print information along iterations. log : bool, optional Record log if True. init_C : array-like, shape (N,N), optional Initialization for the barycenters' structure matrix. If not set a random init is used. init_X : array-like, shape (N,d), optional Initialization for the barycenters' features. If not set a random init is used. random_state : int or RandomState instance, optional Fix the seed for reproducibility Returns ------- X : array-like, shape (`N`, `d`) Barycenters' features C : array-like, shape (`N`, `N`) Barycenters' structure matrix log : dict Only returned when log=True. It contains the keys: - :math:`\mathbf{T}`: list of (`N`, `ns`) transport matrices - :math:`(\mathbf{M}_s)_s`: all distance matrices between the feature of the barycenter and the other features :math:`(dist(\mathbf{X}, \mathbf{Y}_s))_s` shape (`N`, `ns`) .. _references-fgw-barycenters: References ---------- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas "Optimal Transport for structured data with application on graphs" International Conference on Machine Learning (ICML). 2019. 
""" Cs = list_to_array(*Cs) ps = list_to_array(*ps) Ys = list_to_array(*Ys) p = list_to_array(p) nx = get_backend(*Cs, *Ys, *ps) S = len(Cs) d = Ys[0].shape[1] # dimension on the node features if p is None: p = nx.ones(N, type_as=Cs[0]) / N if fixed_structure: if init_C is None: raise UndefinedParameter('If C is fixed it must be initialized') else: C = init_C else: if init_C is None: generator = check_random_state(random_state) xalea = generator.randn(N, 2) C = dist(xalea, xalea) C = nx.from_numpy(C, type_as=ps[0]) else: C = init_C if fixed_features: if init_X is None: raise UndefinedParameter('If X is fixed it must be initialized') else: X = init_X else: if init_X is None: X = nx.zeros((N, d), type_as=ps[0]) else: X = init_X T = [nx.outer(p, q) for q in ps] Ms = [dist(X, Ys[s]) for s in range(len(Ys))] cpt = 0 err_feature = 1 err_structure = 1 if log: log_ = {} log_['err_feature'] = [] log_['err_structure'] = [] log_['Ts_iter'] = [] while ((err_feature > tol or err_structure > tol) and cpt < max_iter): Cprev = C Xprev = X if not fixed_features: Ys_temp = [y.T for y in Ys] X = update_feature_matrix(lambdas, Ys_temp, T, p).T Ms = [dist(X, Ys[s]) for s in range(len(Ys))] if not fixed_structure: if loss_fun == 'square_loss': T_temp = [t.T for t in T] C = update_structure_matrix(p, lambdas, T_temp, Cs) T = [fused_gromov_wasserstein(Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha, numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)] # T is N,ns err_feature = nx.norm(X - nx.reshape(Xprev, (N, d))) err_structure = nx.norm(C - Cprev) if log: log_['err_feature'].append(err_feature) log_['err_structure'].append(err_structure) log_['Ts_iter'].append(T) if verbose: if cpt % 200 == 0: print('{:5s}|{:12s}'.format( 'It.', 'Err') + '\n' + '-' * 19) print('{:5d}|{:8e}|'.format(cpt, err_structure)) print('{:5d}|{:8e}|'.format(cpt, err_feature)) cpt += 1 if log: log_['T'] = T # from target to Ys log_['p'] = p log_['Ms'] = Ms if log: return X, C, log_ else: return X, C
[docs]def update_structure_matrix(p, lambdas, T, Cs): r"""Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration Parameters ---------- p : array-like, shape (N,) Masses in the targeted barycenter. lambdas : list of float List of the `S` spaces' weights. T : list of S array-like of shape (ns, N) The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration. Cs : list of S array-like, shape (ns, ns) Metric cost matrices. Returns ------- C : array-like, shape (`N`, `N`) Updated :math:`\mathbf{C}` matrix. """ p = list_to_array(p) T = list_to_array(*T) Cs = list_to_array(*Cs) nx = get_backend(*Cs, *T, p) tmpsum = sum([ lambdas[s] * nx.dot( nx.dot(T[s].T, Cs[s]), T[s] ) for s in range(len(T)) ]) ppt = nx.outer(p, p) return tmpsum / ppt
[docs]def update_feature_matrix(lambdas, Ys, Ts, p): r"""Updates the feature with respect to the `S` :math:`\mathbf{T}_s` couplings. See "Solving the barycenter problem with Block Coordinate Descent (BCD)" in :ref:`[24] <references-update-feature-matrix>` calculated at each iteration Parameters ---------- p : array-like, shape (N,) masses in the targeted barycenter lambdas : list of float List of the `S` spaces' weights Ts : list of S array-like, shape (ns,N) The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration Ys : list of S array-like, shape (d,ns) The features. Returns ------- X : array-like, shape (`d`, `N`) .. _references-update-feature-matrix: References ---------- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas "Optimal Transport for structured data with application on graphs" International Conference on Machine Learning (ICML). 2019. """ p = list_to_array(p) Ts = list_to_array(*Ts) Ys = list_to_array(*Ys) nx = get_backend(*Ys, *Ts, p) p = 1. / p tmpsum = sum([ lambdas[s] * nx.dot(Ys[s], Ts[s].T) * p[None, :] for s in range(len(Ts)) ]) return tmpsum
[docs]def gromov_wasserstein_dictionary_learning(Cs, D, nt, reg=0., ps=None, q=None, epochs=20, batch_size=32, learning_rate=1., Cdict_init=None, projection='nonnegative_symmetric', use_log=True, tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, use_adam_optimizer=True, verbose=False, **kwargs): r""" Infer Gromov-Wasserstein linear dictionary :math:`\{ (\mathbf{C_{dict}[d]}, q) \}_{d \in [D]}` from the list of structures :math:`\{ (\mathbf{C_s},\mathbf{p_s}) \}_s` .. math:: \min_{\mathbf{C_{dict}}, \{\mathbf{w_s} \}_{s \leq S}} \sum_{s=1}^S GW_2(\mathbf{C_s}, \sum_{d=1}^D w_{s,d}\mathbf{C_{dict}[d]}, \mathbf{p_s}, \mathbf{q}) - reg\| \mathbf{w_s} \|_2^2 such that, :math:`\forall s \leq S` : - :math:`\mathbf{w_s}^\top \mathbf{1}_D = 1` - :math:`\mathbf{w_s} \geq \mathbf{0}_D` Where : - :math:`\forall s \leq S, \mathbf{C_s}` is a (ns,ns) pairwise similarity matrix of variable size ns. - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrix of fixed size nt. - :math:`\forall s \leq S, \mathbf{p_s}` is the source distribution corresponding to :math:`\mathbf{C_s}` - :math:`\mathbf{q}` is the target distribution assigned to every structures in the embedding space. - reg is the regularization coefficient. The stochastic algorithm used for estimating the graph dictionary atoms as proposed in [38] Parameters ---------- Cs : list of S symmetric array-like, shape (ns, ns) List of Metric/Graph cost matrices of variable size (ns, ns). D: int Number of dictionary atoms to learn nt: int Number of samples within each dictionary atoms reg : float, optional Coefficient of the negative quadratic regularization used to promote sparsity of w. The default is 0. ps : list of S array-like, shape (ns,), optional Distribution in each source space C of Cs. Default is None and corresponds to uniform distibutions. q : array-like, shape (nt,), optional Distribution in the embedding space whose structure will be learned. Default is None and corresponds to uniform distributions. epochs: int, optional Number of epochs used to learn the dictionary. Default is 32. batch_size: int, optional Batch size for each stochastic gradient update of the dictionary. Set to the dataset size if the provided batch_size is higher than the dataset size. Default is 32. learning_rate: float, optional Learning rate used for the stochastic gradient descent. Default is 1. Cdict_init: list of D array-like with shape (nt, nt), optional Used to initialize the dictionary. If set to None (Default), the dictionary will be initialized randomly. Else Cdict must have shape (D, nt, nt) i.e match provided shape features. projection: str , optional If 'nonnegative' and/or 'symmetric' is in projection, the corresponding projection will be performed at each stochastic update of the dictionary Else the set of atoms is :math:`R^{nt * nt}`. Default is 'nonnegative_symmetric' log: bool, optional If set to True, losses evolution by batches and epochs are tracked. Default is False. use_adam_optimizer: bool, optional If set to True, adam optimizer with default settings is used as adaptative learning rate strategy. Else perform SGD with fixed learning rate. Default is True. tol_outer : float, optional Solver precision for the BCD algorithm, measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`. tol_inner : float, optional Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport, measured by absolute relative error on consecutive losses. 
Default is :math:`10^{-5}`. max_iter_outer : int, optional Maximum number of iterations for the BCD. Default is 20. max_iter_inner : int, optional Maximum number of iterations for the Conjugate Gradient. Default is 200. verbose : bool, optional Print the reconstruction loss every epoch. Default is False. Returns ------- Cdict_best_state : D array-like, shape (D,nt,nt) Metric/Graph cost matrices composing the dictionary. The dictionary leading to the best loss over an epoch is saved and returned. log: dict If use_log is True, contains loss evolutions by batches and epochs. References ------- ..[38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty. "Online Graph Dictionary Learning" International Conference on Machine Learning (ICML). 2021. """ # Handle backend of non-optional arguments Cs0 = Cs nx = get_backend(*Cs0) Cs = [nx.to_numpy(C) for C in Cs0] dataset_size = len(Cs) # Handle backend of optional arguments if ps is None: ps = [unif(C.shape[0]) for C in Cs] else: ps = [nx.to_numpy(p) for p in ps] if q is None: q = unif(nt) else: q = nx.to_numpy(q) if Cdict_init is None: # Initialize randomly structures of dictionary atoms based on samples dataset_means = [C.mean() for C in Cs] Cdict = np.random.normal(loc=np.mean(dataset_means), scale=np.std(dataset_means), size=(D, nt, nt)) else: Cdict = nx.to_numpy(Cdict_init).copy() assert Cdict.shape == (D, nt, nt) if 'symmetric' in projection: Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1))) if 'nonnegative' in projection: Cdict[Cdict < 0.] = 0 if use_adam_optimizer: adam_moments = _initialize_adam_optimizer(Cdict) log = {'loss_batches': [], 'loss_epochs': []} const_q = q[:, None] * q[None, :] Cdict_best_state = Cdict.copy() loss_best_state = np.inf if batch_size > dataset_size: batch_size = dataset_size iter_by_epoch = dataset_size // batch_size + int((dataset_size % batch_size) > 0) for epoch in range(epochs): cumulated_loss_over_epoch = 0. for _ in range(iter_by_epoch): # batch sampling batch = np.random.choice(range(dataset_size), size=batch_size, replace=False) cumulated_loss_over_batch = 0. unmixings = np.zeros((batch_size, D)) Cs_embedded = np.zeros((batch_size, nt, nt)) Ts = [None] * batch_size for batch_idx, C_idx in enumerate(batch): # BCD solver for Gromov-Wassersteisn linear unmixing used independently on each structure of the sampled batch unmixings[batch_idx], Cs_embedded[batch_idx], Ts[batch_idx], current_loss = gromov_wasserstein_linear_unmixing( Cs[C_idx], Cdict, reg=reg, p=ps[C_idx], q=q, tol_outer=tol_outer, tol_inner=tol_inner, max_iter_outer=max_iter_outer, max_iter_inner=max_iter_inner ) cumulated_loss_over_batch += current_loss cumulated_loss_over_epoch += cumulated_loss_over_batch if use_log: log['loss_batches'].append(cumulated_loss_over_batch) # Stochastic projected gradient step over dictionary atoms grad_Cdict = np.zeros_like(Cdict) for batch_idx, C_idx in enumerate(batch): shared_term_structures = Cs_embedded[batch_idx] * const_q - (Cs[C_idx].dot(Ts[batch_idx])).T.dot(Ts[batch_idx]) grad_Cdict += unmixings[batch_idx][:, None, None] * shared_term_structures[None, :, :] grad_Cdict *= 2 / batch_size if use_adam_optimizer: Cdict, adam_moments = _adam_stochastic_updates(Cdict, grad_Cdict, learning_rate, adam_moments) else: Cdict -= learning_rate * grad_Cdict if 'symmetric' in projection: Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1))) if 'nonnegative' in projection: Cdict[Cdict < 0.] = 0. 
if use_log: log['loss_epochs'].append(cumulated_loss_over_epoch) if loss_best_state > cumulated_loss_over_epoch: loss_best_state = cumulated_loss_over_epoch Cdict_best_state = Cdict.copy() if verbose: print('--- epoch =', epoch, ' cumulated reconstruction error: ', cumulated_loss_over_epoch) return nx.from_numpy(Cdict_best_state), log
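# Illustrative usage sketch (not part of ot.gromov): learning a small dictionary from toy
# symmetric cost matrices. The helper name, the random data and all hyperparameter values
# below are assumptions made only for this example.
def _example_gw_dictionary_learning():
    rng = np.random.RandomState(0)
    Cs = []
    for ns in [10, 12, 15, 11]:
        X = rng.rand(ns, ns)
        Cs.append(0.5 * (X + X.T))  # symmetric (ns, ns) structure matrices of variable size
    # Learn D=2 atoms of nt=6 nodes each; log['loss_epochs'] tracks the reconstruction error
    Cdict, log = gromov_wasserstein_dictionary_learning(
        Cs, D=2, nt=6, reg=0.01, epochs=5, batch_size=2, learning_rate=0.1, use_log=True)
    return Cdict, log  # Cdict has shape (2, 6, 6)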
def _initialize_adam_optimizer(variable): # Initialization for our numpy implementation of adam optimizer atoms_adam_m = np.zeros_like(variable) # Initialize first moment tensor atoms_adam_v = np.zeros_like(variable) # Initialize second moment tensor atoms_adam_count = 1 return {'mean': atoms_adam_m, 'var': atoms_adam_v, 'count': atoms_adam_count} def _adam_stochastic_updates(variable, grad, learning_rate, adam_moments, beta_1=0.9, beta_2=0.99, eps=1e-09): adam_moments['mean'] = beta_1 * adam_moments['mean'] + (1 - beta_1) * grad adam_moments['var'] = beta_2 * adam_moments['var'] + (1 - beta_2) * (grad**2) unbiased_m = adam_moments['mean'] / (1 - beta_1**adam_moments['count']) unbiased_v = adam_moments['var'] / (1 - beta_2**adam_moments['count']) variable -= learning_rate * unbiased_m / (np.sqrt(unbiased_v) + eps) adam_moments['count'] += 1 return variable, adam_moments
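# Minimal sketch of how the two Adam helpers above combine, on a toy quadratic problem.
# The target matrix, step count and learning rate are arbitrary choices made for the example.
def _example_adam_updates():
    target = np.ones((3, 3))
    x = np.zeros((3, 3))
    moments = _initialize_adam_optimizer(x)
    for _ in range(200):
        grad = 2 * (x - target)  # gradient of ||x - target||_F^2
        x, moments = _adam_stochastic_updates(x, grad, learning_rate=0.1, adam_moments=moments)
    return x  # expected to end up close to the all-ones target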
[docs]def gromov_wasserstein_linear_unmixing(C, Cdict, reg=0., p=None, q=None, tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, **kwargs): r""" Returns the Gromov-Wasserstein linear unmixing of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary :math:`\{ (\mathbf{C_{dict}[d]}, \mathbf{q}) \}_{d \in [D]}`. .. math:: \min_{ \mathbf{w}} GW_2(\mathbf{C}, \sum_{d=1}^D w_d\mathbf{C_{dict}[d]}, \mathbf{p}, \mathbf{q}) - reg \| \mathbf{w} \|_2^2 such that: - :math:`\mathbf{w}^\top \mathbf{1}_D = 1` - :math:`\mathbf{w} \geq \mathbf{0}_D` Where : - :math:`\mathbf{C}` is the (ns,ns) pairwise similarity matrix. - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of size nt. - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights. - reg is the regularization coefficient. The algorithm used for solving the problem is a Block Coordinate Descent as discussed in [38], algorithm 1. Parameters ---------- C : array-like, shape (ns, ns) Metric/Graph cost matrix. Cdict : D array-like, shape (D,nt,nt) Metric/Graph cost matrices composing the dictionary on which to embed C. reg : float, optional. Coefficient of the negative quadratic regularization used to promote sparsity of w. Default is 0. p : array-like, shape (ns,), optional Distribution in the source space C. Default is None and corresponds to uniform distribution. q : array-like, shape (nt,), optional Distribution in the space depicted by the dictionary. Default is None and corresponds to uniform distribution. tol_outer : float, optional Solver precision for the BCD algorithm. tol_inner : float, optional Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport. Default is :math:`10^{-5}`. max_iter_outer : int, optional Maximum number of iterations for the BCD. Default is 20. max_iter_inner : int, optional Maximum number of iterations for the Conjugate Gradient. Default is 200. Returns ------- w: array-like, shape (D,) gromov-wasserstein linear unmixing of :math:`(\mathbf{C},\mathbf{p})` onto the span of the dictionary. Cembedded: array-like, shape (nt,nt) embedded structure of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary, :math:`\sum_d w_d\mathbf{C_{dict}[d]}`. T: array-like (ns, nt) Gromov-Wasserstein transport plan between :math:`(\mathbf{C},\mathbf{p})` and :math:`(\sum_d w_d\mathbf{C_{dict}[d]}, \mathbf{q})` current_loss: float reconstruction error References ------- ..[38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty. "Online Graph Dictionary Learning" International Conference on Machine Learning (ICML). 2021. """ C0, Cdict0 = C, Cdict nx = get_backend(C0, Cdict0) C = nx.to_numpy(C0) Cdict = nx.to_numpy(Cdict0) if p is None: p = unif(C.shape[0]) else: p = nx.to_numpy(p) if q is None: q = unif(Cdict.shape[-1]) else: q = nx.to_numpy(q) T = p[:, None] * q[None, :] D = len(Cdict) w = unif(D) # Initialize uniformly the unmixing w Cembedded = np.sum(w[:, None, None] * Cdict, axis=0) const_q = q[:, None] * q[None, :] # Trackers for BCD convergence convergence_criterion = np.inf current_loss = 10**15 outer_count = 0 while (convergence_criterion > tol_outer) and (outer_count < max_iter_outer): previous_loss = current_loss # 1. 
Solve GW transport between (C,p) and (\sum_d Cdictionary[d],q) fixing the unmixing w T, log = gromov_wasserstein(C1=C, C2=Cembedded, p=p, q=q, loss_fun='square_loss', G0=T, log=True, armijo=False, **kwargs) current_loss = log['gw_dist'] if reg != 0: current_loss -= reg * np.sum(w**2) # 2. Solve linear unmixing problem over w with a fixed transport plan T w, Cembedded, current_loss = _cg_gromov_wasserstein_unmixing( C=C, Cdict=Cdict, Cembedded=Cembedded, w=w, const_q=const_q, T=T, starting_loss=current_loss, reg=reg, tol=tol_inner, max_iter=max_iter_inner, **kwargs ) if previous_loss != 0: convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss) else: # handle numerical issues around 0 convergence_criterion = abs(previous_loss - current_loss) / 10**(-15) outer_count += 1 return nx.from_numpy(w), nx.from_numpy(Cembedded), nx.from_numpy(T), nx.from_numpy(current_loss)
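# Illustrative usage sketch (assumed toy data, not from the documentation): unmixing a single
# structure onto a random 3-atom dictionary of 6-node atoms.
def _example_gw_linear_unmixing():
    rng = np.random.RandomState(1)
    X = rng.rand(8, 8)
    C = 0.5 * (X + X.T)  # (ns, ns) input structure
    Cdict = rng.rand(3, 6, 6)
    Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1)))  # symmetric atoms
    w, Cembedded, T, loss = gromov_wasserstein_linear_unmixing(C, Cdict, reg=0.01)
    # w lies on the simplex, Cembedded = sum_d w[d] * Cdict[d], T is the (8, 6) transport plan
    return w, Cembedded, T, loss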
def _cg_gromov_wasserstein_unmixing(C, Cdict, Cembedded, w, const_q, T, starting_loss, reg=0., tol=10**(-5), max_iter=200, **kwargs): r""" Returns, for a fixed admissible transport plan, the linear unmixing w minimizing the Gromov-Wasserstein cost between :math:`(\mathbf{C},\mathbf{p})` and :math:`(\sum_d w_d \mathbf{C_{dict}[d]}, \mathbf{q})` .. math:: \min_{\mathbf{w}} \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D w_d C_{dict}[d]_{k,l} )^2 T_{i,k}T_{j,l} - reg \| \mathbf{w} \|_2^2 Such that: - :math:`\mathbf{w}^\top \mathbf{1}_D = 1` - :math:`\mathbf{w} \geq \mathbf{0}_D` Where : - :math:`\mathbf{C}` is the (ns,ns) pairwise similarity matrix. - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of nt points. - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights. - :math:`\mathbf{w}` is the linear unmixing of :math:`(\mathbf{C}, \mathbf{p})` onto :math:`(\sum_d w_d \mathbf{Cdict[d]}, \mathbf{q})`. - :math:`\mathbf{T}` is the optimal transport plan conditioned by the current state of :math:`\mathbf{w}`. - reg is the regularization coefficient. The algorithm used for solving the problem is a Conditional Gradient Descent as discussed in [38]. Parameters ---------- C : array-like, shape (ns, ns) Metric/Graph cost matrix. Cdict : list of D array-like, shape (nt,nt) Metric/Graph cost matrices composing the dictionary on which to embed C. Each matrix in the dictionary must have the same size (nt,nt). Cembedded: array-like, shape (nt,nt) Embedded structure :math:`(\sum_d w_d Cdict[d],q)` of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary. Used to avoid redundant computations. w: array-like, shape (D,) Linear unmixing of the input structure onto the dictionary const_q: array-like, shape (nt,nt) product matrix :math:`\mathbf{q}\mathbf{q}^\top` where q is the target space distribution. Used to avoid redundant computations. T: array-like, shape (ns,nt) fixed transport plan between the input structure and its representation in the dictionary. reg : float, optional. Coefficient of the negative quadratic regularization used to promote sparsity of w. Default is 0. Returns ------- w: ndarray (D,) optimal unmixing of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary span given OT starting from previously optimal unmixing.
""" convergence_criterion = np.inf current_loss = starting_loss count = 0 const_TCT = np.transpose(C.dot(T)).dot(T) while (convergence_criterion > tol) and (count < max_iter): previous_loss = current_loss # 1) Compute gradient at current point w grad_w = 2 * np.sum(Cdict * (Cembedded[None, :, :] * const_q[None, :, :] - const_TCT[None, :, :]), axis=(1, 2)) grad_w -= 2 * reg * w # 2) Conditional gradient direction finding: x= \argmin_x x^T.grad_w min_ = np.min(grad_w) x = (grad_w == min_).astype(np.float64) x /= np.sum(x) # 3) Line-search step: solve \argmin_{\gamma \in [0,1]} a*gamma^2 + b*gamma + c gamma, a, b, Cembedded_diff = _linesearch_gromov_wasserstein_unmixing(w, grad_w, x, Cdict, Cembedded, const_q, const_TCT, reg) # 4) Updates: w <-- (1-gamma)*w + gamma*x w += gamma * (x - w) Cembedded += gamma * Cembedded_diff current_loss += a * (gamma**2) + b * gamma if previous_loss != 0: # not that the loss can be negative if reg >0 convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss) else: # handle numerical issues around 0 convergence_criterion = abs(previous_loss - current_loss) / 10**(-15) count += 1 return w, Cembedded, current_loss def _linesearch_gromov_wasserstein_unmixing(w, grad_w, x, Cdict, Cembedded, const_q, const_TCT, reg, **kwargs): r""" Compute optimal steps for the line search problem of Gromov-Wasserstein linear unmixing .. math:: \min_{\gamma \in [0,1]} \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D z_d(\gamma)C_{dict}[d]_{k,l} )^2 T_{i,k}T_{j,l} - reg\| \mathbf{z}(\gamma) \|_2^2 Such that: - :math:`\mathbf{z}(\gamma) = (1- \gamma)\mathbf{w} + \gamma \mathbf{x}` Parameters ---------- w : array-like, shape (D,) Unmixing. grad_w : array-like, shape (D, D) Gradient of the reconstruction loss with respect to w. x: array-like, shape (D,) Conditional gradient direction. Cdict : list of D array-like, shape (nt,nt) Metric/Graph cost matrices composing the dictionary on which to embed C. Each matrix in the dictionary must have the same size (nt,nt). Cembedded: array-like, shape (nt,nt) Embedded structure :math:`(\sum_d w_dCdict[d],q)` of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary. Used to avoid redundant computations. const_q: array-like, shape (nt,nt) product matrix :math:`\mathbf{q}\mathbf{q}^\top` where q is the target space distribution. Used to avoid redundant computations. const_TCT: array-like, shape (nt, nt) :math:`\mathbf{T}^\top \mathbf{C}^\top \mathbf{T}`. Used to avoid redundant computations. Returns ------- gamma: float Optimal value for the line-search step a: float Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma +c` of the reconstruction loss b: float Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma +c` of the reconstruction loss Cembedded_diff: numpy array, shape (nt, nt) Difference between models evaluated in :math:`\mathbf{w}` and in :math:`\mathbf{w}`. reg : float, optional. Coefficient of the negative quadratic regularization used to promote sparsity of :math:`\mathbf{w}`. 
""" # 3) Line-search step: solve \argmin_{\gamma \in [0,1]} a*gamma^2 + b*gamma + c Cembedded_x = np.sum(x[:, None, None] * Cdict, axis=0) Cembedded_diff = Cembedded_x - Cembedded trace_diffx = np.sum(Cembedded_diff * Cembedded_x * const_q) trace_diffw = np.sum(Cembedded_diff * Cembedded * const_q) a = trace_diffx - trace_diffw b = 2 * (trace_diffw - np.sum(Cembedded_diff * const_TCT)) if reg != 0: a -= reg * np.sum((x - w)**2) b -= 2 * reg * np.sum(w * (x - w)) if a > 0: gamma = min(1, max(0, - b / (2 * a))) elif a + b < 0: gamma = 1 else: gamma = 0 return gamma, a, b, Cembedded_diff
[docs]def fused_gromov_wasserstein_dictionary_learning(Cs, Ys, D, nt, alpha, reg=0., ps=None, q=None, epochs=20, batch_size=32, learning_rate_C=1., learning_rate_Y=1., Cdict_init=None, Ydict_init=None, projection='nonnegative_symmetric', use_log=False, tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, use_adam_optimizer=True, verbose=False, **kwargs): r""" Infer Fused Gromov-Wasserstein linear dictionary :math:`\{ (\mathbf{C_{dict}[d]}, \mathbf{Y_{dict}[d]}, \mathbf{q}) \}_{d \in [D]}` from the list of S attributed structures :math:`\{ (\mathbf{C_s}, \mathbf{Y_s},\mathbf{p_s}) \}_s` .. math:: \min_{\mathbf{C_{dict}},\mathbf{Y_{dict}}, \{\mathbf{w_s}\}_{s}} \sum_{s=1}^S FGW_{2,\alpha}(\mathbf{C_s}, \mathbf{Y_s}, \sum_{d=1}^D w_{s,d}\mathbf{C_{dict}[d]},\sum_{d=1}^D w_{s,d}\mathbf{Y_{dict}[d]}, \mathbf{p_s}, \mathbf{q}) \\ - reg\| \mathbf{w_s} \|_2^2 Such that :math:`\forall s \leq S` : - :math:`\mathbf{w_s}^\top \mathbf{1}_D = 1` - :math:`\mathbf{w_s} \geq \mathbf{0}_D` Where : - :math:`\forall s \leq S, \mathbf{C_s}` is a (ns,ns) pairwise similarity matrix of variable size ns. - :math:`\forall s \leq S, \mathbf{Y_s}` is a (ns,d) feature matrix of variable size ns and fixed dimension d. - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of fixed size nt. - :math:`\mathbf{Y_{dict}}` is a (D, nt, d) tensor of D feature matrices of fixed size nt and fixed dimension d. - :math:`\forall s \leq S, \mathbf{p_s}` is the source distribution corresponding to :math:`\mathbf{C_s}` - :math:`\mathbf{q}` is the target distribution assigned to every structure in the embedding space. - :math:`\alpha` is the trade-off parameter of Fused Gromov-Wasserstein - reg is the regularization coefficient. The stochastic algorithm used for estimating the attributed graph dictionary atoms is the one proposed in [38]. Parameters ---------- Cs : list of S symmetric array-like, shape (ns, ns) List of Metric/Graph cost matrices of variable size (ns,ns). Ys : list of S array-like, shape (ns, d) List of feature matrices of variable size (ns,d) with d fixed. D: int Number of dictionary atoms to learn nt: int Number of samples within each dictionary atom alpha : float Trade-off parameter of Fused Gromov-Wasserstein reg : float, optional Coefficient of the negative quadratic regularization used to promote sparsity of w. The default is 0. ps : list of S array-like, shape (ns,), optional Distribution in each source space C of Cs. Default is None and corresponds to uniform distributions. q : array-like, shape (nt,), optional Distribution in the embedding space whose structure will be learned. Default is None and corresponds to uniform distributions. epochs: int, optional Number of epochs used to learn the dictionary. Default is 20. batch_size: int, optional Batch size for each stochastic gradient update of the dictionary. Set to the dataset size if the provided batch_size is higher than the dataset size. Default is 32. learning_rate_C: float, optional Learning rate used for the stochastic gradient descent on Cdict. Default is 1. learning_rate_Y: float, optional Learning rate used for the stochastic gradient descent on Ydict. Default is 1. Cdict_init: list of D array-like with shape (nt, nt), optional Used to initialize the dictionary structures Cdict. If set to None (Default), the dictionary will be initialized randomly. Else Cdict must have shape (D, nt, nt) i.e. match the provided shape.
Ydict_init: list of D array-like with shape (nt, d), optional Used to initialize the dictionary features Ydict. If set to None, the dictionary features will be initialized randomly. Else Ydict must have shape (D, nt, d) where d is the feature dimension of the inputs Ys and also match the provided shape. projection: str, optional If 'nonnegative' and/or 'symmetric' is in projection, the corresponding projection will be performed at each stochastic update of the dictionary. Else the set of atoms is :math:`R^{nt * nt}`. Default is 'nonnegative_symmetric' use_log: bool, optional If set to True, loss evolution by batches and epochs is tracked. Default is False. use_adam_optimizer: bool, optional If set to True, adam optimizer with default settings is used as adaptive learning rate strategy. Else SGD with fixed learning rate is performed. Default is True. tol_outer : float, optional Solver precision for the BCD algorithm, measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`. tol_inner : float, optional Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport, measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`. max_iter_outer : int, optional Maximum number of iterations for the BCD. Default is 20. max_iter_inner : int, optional Maximum number of iterations for the Conjugate Gradient. Default is 200. verbose : bool, optional Print the reconstruction loss every epoch. Default is False. Returns ------- Cdict_best_state : D array-like, shape (D,nt,nt) Metric/Graph cost matrices composing the dictionary. The dictionary leading to the best loss over an epoch is saved and returned. Ydict_best_state : D array-like, shape (D,nt,d) Feature matrices composing the dictionary. The dictionary leading to the best loss over an epoch is saved and returned. log: dict If use_log is True, contains loss evolutions by batches and epochs. References ------- .. [38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty. "Online Graph Dictionary Learning" International Conference on Machine Learning (ICML). 2021. """ Cs0, Ys0 = Cs, Ys nx = get_backend(*Cs0, *Ys0) Cs = [nx.to_numpy(C) for C in Cs0] Ys = [nx.to_numpy(Y) for Y in Ys0] d = Ys[0].shape[-1] dataset_size = len(Cs) if ps is None: ps = [unif(C.shape[0]) for C in Cs] else: ps = [nx.to_numpy(p) for p in ps] if q is None: q = unif(nt) else: q = nx.to_numpy(q) if Cdict_init is None: # Initialize randomly structures of dictionary atoms based on samples dataset_means = [C.mean() for C in Cs] Cdict = np.random.normal(loc=np.mean(dataset_means), scale=np.std(dataset_means), size=(D, nt, nt)) else: Cdict = nx.to_numpy(Cdict_init).copy() assert Cdict.shape == (D, nt, nt) if Ydict_init is None: # Initialize randomly features of dictionary atoms based on samples distribution by feature component dataset_feature_means = np.stack([F.mean(axis=0) for F in Ys]) Ydict = np.random.normal(loc=dataset_feature_means.mean(axis=0), scale=dataset_feature_means.std(axis=0), size=(D, nt, d)) else: Ydict = nx.to_numpy(Ydict_init).copy() assert Ydict.shape == (D, nt, d) if 'symmetric' in projection: Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1))) if 'nonnegative' in projection: Cdict[Cdict < 0.] = 0.
if use_adam_optimizer: adam_moments_C = _initialize_adam_optimizer(Cdict) adam_moments_Y = _initialize_adam_optimizer(Ydict) log = {'loss_batches': [], 'loss_epochs': []} const_q = q[:, None] * q[None, :] diag_q = np.diag(q) Cdict_best_state = Cdict.copy() Ydict_best_state = Ydict.copy() loss_best_state = np.inf if batch_size > dataset_size: batch_size = dataset_size iter_by_epoch = dataset_size // batch_size + int((dataset_size % batch_size) > 0) for epoch in range(epochs): cumulated_loss_over_epoch = 0. for _ in range(iter_by_epoch): # Batch iterations batch = np.random.choice(range(dataset_size), size=batch_size, replace=False) cumulated_loss_over_batch = 0. unmixings = np.zeros((batch_size, D)) Cs_embedded = np.zeros((batch_size, nt, nt)) Ys_embedded = np.zeros((batch_size, nt, d)) Ts = [None] * batch_size for batch_idx, C_idx in enumerate(batch): # BCD solver for Gromov-Wassersteisn linear unmixing used independently on each structure of the sampled batch unmixings[batch_idx], Cs_embedded[batch_idx], Ys_embedded[batch_idx], Ts[batch_idx], current_loss = fused_gromov_wasserstein_linear_unmixing( Cs[C_idx], Ys[C_idx], Cdict, Ydict, alpha, reg=reg, p=ps[C_idx], q=q, tol_outer=tol_outer, tol_inner=tol_inner, max_iter_outer=max_iter_outer, max_iter_inner=max_iter_inner ) cumulated_loss_over_batch += current_loss cumulated_loss_over_epoch += cumulated_loss_over_batch if use_log: log['loss_batches'].append(cumulated_loss_over_batch) # Stochastic projected gradient step over dictionary atoms grad_Cdict = np.zeros_like(Cdict) grad_Ydict = np.zeros_like(Ydict) for batch_idx, C_idx in enumerate(batch): shared_term_structures = Cs_embedded[batch_idx] * const_q - (Cs[C_idx].dot(Ts[batch_idx])).T.dot(Ts[batch_idx]) shared_term_features = diag_q.dot(Ys_embedded[batch_idx]) - Ts[batch_idx].T.dot(Ys[C_idx]) grad_Cdict += alpha * unmixings[batch_idx][:, None, None] * shared_term_structures[None, :, :] grad_Ydict += (1 - alpha) * unmixings[batch_idx][:, None, None] * shared_term_features[None, :, :] grad_Cdict *= 2 / batch_size grad_Ydict *= 2 / batch_size if use_adam_optimizer: Cdict, adam_moments_C = _adam_stochastic_updates(Cdict, grad_Cdict, learning_rate_C, adam_moments_C) Ydict, adam_moments_Y = _adam_stochastic_updates(Ydict, grad_Ydict, learning_rate_Y, adam_moments_Y) else: Cdict -= learning_rate_C * grad_Cdict Ydict -= learning_rate_Y * grad_Ydict if 'symmetric' in projection: Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1))) if 'nonnegative' in projection: Cdict[Cdict < 0.] = 0. if use_log: log['loss_epochs'].append(cumulated_loss_over_epoch) if loss_best_state > cumulated_loss_over_epoch: loss_best_state = cumulated_loss_over_epoch Cdict_best_state = Cdict.copy() Ydict_best_state = Ydict.copy() if verbose: print('--- epoch: ', epoch, ' cumulated reconstruction error: ', cumulated_loss_over_epoch) return nx.from_numpy(Cdict_best_state), nx.from_numpy(Ydict_best_state), log
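# Illustrative usage sketch (not part of ot.gromov): learning a small attributed dictionary
# from toy graphs with 4-dimensional node features. Helper name, data and hyperparameters
# are assumptions made only for this example.
def _example_fgw_dictionary_learning():
    rng = np.random.RandomState(2)
    Cs, Ys = [], []
    for ns in [10, 12, 9]:
        X = rng.rand(ns, ns)
        Cs.append(0.5 * (X + X.T))   # symmetric (ns, ns) structures
        Ys.append(rng.rand(ns, 4))   # (ns, 4) node features
    Cdict, Ydict, log = fused_gromov_wasserstein_dictionary_learning(
        Cs, Ys, D=2, nt=6, alpha=0.5, reg=0.01, epochs=5, batch_size=2, use_log=True)
    return Cdict, Ydict, log  # Cdict: (2, 6, 6) structures, Ydict: (2, 6, 4) features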
[docs]def fused_gromov_wasserstein_linear_unmixing(C, Y, Cdict, Ydict, alpha, reg=0., p=None, q=None, tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, **kwargs): r""" Returns the Fused Gromov-Wasserstein linear unmixing of :math:`(\mathbf{C},\mathbf{Y},\mathbf{p})` onto the attributed dictionary atoms :math:`\{ (\mathbf{C_{dict}[d]},\mathbf{Y_{dict}[d]}, \mathbf{q}) \}_{d \in [D]}` .. math:: \min_{\mathbf{w}} FGW_{2,\alpha}(\mathbf{C},\mathbf{Y}, \sum_{d=1}^D w_d\mathbf{C_{dict}[d]},\sum_{d=1}^D w_d\mathbf{Y_{dict}[d]}, \mathbf{p}, \mathbf{q}) - reg \| \mathbf{w} \|_2^2 such that: - :math:`\mathbf{w}^\top \mathbf{1}_D = 1` - :math:`\mathbf{w} \geq \mathbf{0}_D` Where : - :math:`\mathbf{C}` is a (ns,ns) pairwise similarity matrix of variable size ns. - :math:`\mathbf{Y}` is a (ns,d) feature matrix of variable size ns and fixed dimension d. - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of fixed size nt. - :math:`\mathbf{Y_{dict}}` is a (D, nt, d) tensor of D feature matrices of fixed size nt and fixed dimension d. - :math:`\mathbf{p}` is the source distribution corresponding to :math:`\mathbf{C}` - :math:`\mathbf{q}` is the target distribution assigned to every structure in the embedding space. - :math:`\alpha` is the trade-off parameter of Fused Gromov-Wasserstein - reg is the regularization coefficient. The algorithm used for solving the problem is a Block Coordinate Descent as discussed in [38], algorithm 6. Parameters ---------- C : array-like, shape (ns, ns) Metric/Graph cost matrix. Y : array-like, shape (ns, d) Feature matrix. Cdict : D array-like, shape (D,nt,nt) Metric/Graph cost matrices composing the dictionary on which to embed (C,Y). Ydict : D array-like, shape (D,nt,d) Feature matrices composing the dictionary on which to embed (C,Y). alpha: float, Trade-off parameter of Fused Gromov-Wasserstein. reg : float, optional Coefficient of the negative quadratic regularization used to promote sparsity of w. The default is 0. p : array-like, shape (ns,), optional Distribution in the source space C. Default is None and corresponds to uniform distribution. q : array-like, shape (nt,), optional Distribution in the space depicted by the dictionary. Default is None and corresponds to uniform distribution. tol_outer : float, optional Solver precision for the BCD algorithm. Default is :math:`10^{-5}`. tol_inner : float, optional Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport. Default is :math:`10^{-5}`. max_iter_outer : int, optional Maximum number of iterations for the BCD. Default is 20. max_iter_inner : int, optional Maximum number of iterations for the Conjugate Gradient. Default is 200. Returns ------- w: array-like, shape (D,) fused gromov-wasserstein linear unmixing of (C,Y,p) onto the span of the dictionary. Cembedded: array-like, shape (nt,nt) embedded structure of :math:`(\mathbf{C},\mathbf{Y}, \mathbf{p})` onto the dictionary, :math:`\sum_d w_d\mathbf{C_{dict}[d]}`. Yembedded: array-like, shape (nt,d) embedded features of :math:`(\mathbf{C},\mathbf{Y}, \mathbf{p})` onto the dictionary, :math:`\sum_d w_d\mathbf{Y_{dict}[d]}`. T: array-like (ns,nt) Fused Gromov-Wasserstein transport plan between :math:`(\mathbf{C},\mathbf{Y},\mathbf{p})` and :math:`(\sum_d w_d\mathbf{C_{dict}[d]}, \sum_d w_d\mathbf{Y_{dict}[d]},\mathbf{q})`. current_loss: float reconstruction error References ------- .. [38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty.
"Online Graph Dictionary Learning" International Conference on Machine Learning (ICML). 2021. """ C0, Y0, Cdict0, Ydict0 = C, Y, Cdict, Ydict nx = get_backend(C0, Y0, Cdict0, Ydict0) C = nx.to_numpy(C0) Y = nx.to_numpy(Y0) Cdict = nx.to_numpy(Cdict0) Ydict = nx.to_numpy(Ydict0) if p is None: p = unif(C.shape[0]) else: p = nx.to_numpy(p) if q is None: q = unif(Cdict.shape[-1]) else: q = nx.to_numpy(q) T = p[:, None] * q[None, :] D = len(Cdict) d = Y.shape[-1] w = unif(D) # Initialize with uniform weights ns = C.shape[-1] nt = Cdict.shape[-1] # modeling (C,Y) Cembedded = np.sum(w[:, None, None] * Cdict, axis=0) Yembedded = np.sum(w[:, None, None] * Ydict, axis=0) # constants depending on q const_q = q[:, None] * q[None, :] diag_q = np.diag(q) # Trackers for BCD convergence convergence_criterion = np.inf current_loss = 10**15 outer_count = 0 Ys_constM = (Y**2).dot(np.ones((d, nt))) # constant in computing euclidean pairwise feature matrix while (convergence_criterion > tol_outer) and (outer_count < max_iter_outer): previous_loss = current_loss # 1. Solve GW transport between (C,p) and (\sum_d Cdictionary[d],q) fixing the unmixing w Yt_varM = (np.ones((ns, d))).dot((Yembedded**2).T) M = Ys_constM + Yt_varM - 2 * Y.dot(Yembedded.T) # euclidean distance matrix between features T, log = fused_gromov_wasserstein(M, C, Cembedded, p, q, loss_fun='square_loss', alpha=alpha, armijo=False, G0=T, log=True) current_loss = log['fgw_dist'] if reg != 0: current_loss -= reg * np.sum(w**2) # 2. Solve linear unmixing problem over w with a fixed transport plan T w, Cembedded, Yembedded, current_loss = _cg_fused_gromov_wasserstein_unmixing(C, Y, Cdict, Ydict, Cembedded, Yembedded, w, T, p, q, const_q, diag_q, current_loss, alpha, reg, tol=tol_inner, max_iter=max_iter_inner, **kwargs) if previous_loss != 0: convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss) else: convergence_criterion = abs(previous_loss - current_loss) / 10**(-12) outer_count += 1 return nx.from_numpy(w), nx.from_numpy(Cembedded), nx.from_numpy(Yembedded), nx.from_numpy(T), nx.from_numpy(current_loss)
def _cg_fused_gromov_wasserstein_unmixing(C, Y, Cdict, Ydict, Cembedded, Yembedded, w, T, p, q, const_q, diag_q, starting_loss, alpha, reg, tol=10**(-6), max_iter=200, **kwargs): r""" Returns, for a fixed admissible transport plan, the optimal linear unmixing :math:`\mathbf{w}` minimizing the Fused Gromov-Wasserstein cost between :math:`(\mathbf{C},\mathbf{Y},\mathbf{p})` and :math:`(\sum_d w_d \mathbf{C_{dict}[d]},\sum_d w_d \mathbf{Y_{dict}[d]}, \mathbf{q})` .. math:: \min_{\mathbf{w}} \alpha \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D w_d C_{dict}[d]_{k,l} )^2 T_{i,k}T_{j,l} \\+ (1-\alpha) \sum_{ij} \| \mathbf{Y_i} - \sum_d w_d \mathbf{Y_{dict}[d]_j} \|_2^2 T_{ij} - reg \| \mathbf{w} \|_2^2 Such that : - :math:`\mathbf{w}^\top \mathbf{1}_D = 1` - :math:`\mathbf{w} \geq \mathbf{0}_D` Where : - :math:`\mathbf{C}` is a (ns,ns) pairwise similarity matrix of variable size ns. - :math:`\mathbf{Y}` is a (ns,d) feature matrix of variable size ns and fixed dimension d. - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of fixed size nt. - :math:`\mathbf{Y_{dict}}` is a (D, nt, d) tensor of D feature matrices of fixed size nt and fixed dimension d. - :math:`\mathbf{p}` is the source distribution corresponding to :math:`\mathbf{C}` - :math:`\mathbf{q}` is the target distribution assigned to every structure in the embedding space. - :math:`\mathbf{T}` is the optimal transport plan conditioned by the previous state of :math:`\mathbf{w}` - :math:`\alpha` is the trade-off parameter of Fused Gromov-Wasserstein - reg is the regularization coefficient. The algorithm used for solving the problem is a Conditional Gradient Descent as discussed in [38], algorithm 7. Parameters ---------- C : array-like, shape (ns, ns) Metric/Graph cost matrix. Y : array-like, shape (ns, d) Feature matrix. Cdict : list of D array-like, shape (nt,nt) Metric/Graph cost matrices composing the dictionary on which to embed (C,Y). Each matrix in the dictionary must have the same size (nt,nt). Ydict : list of D array-like, shape (nt,d) Feature matrices composing the dictionary on which to embed (C,Y). Each matrix in the dictionary must have the same size (nt,d). Cembedded: array-like, shape (nt,nt) Embedded structure of (C,Y) onto the dictionary Yembedded: array-like, shape (nt,d) Embedded features of (C,Y) onto the dictionary w: array-like, shape (D,) Linear unmixing of (C,Y) onto (Cdict,Ydict) const_q: array-like, shape (nt,nt) product matrix :math:`\mathbf{qq}^\top` where :math:`\mathbf{q}` is the target space distribution. diag_q: array-like, shape (nt,nt) diagonal matrix with values of q on the diagonal. T: array-like, shape (ns,nt) fixed transport plan between (C,Y) and its model p : array-like, shape (ns,) Distribution in the source space (C,Y). q : array-like, shape (nt,) Distribution in the embedding space depicted by the dictionary. alpha: float, Trade-off parameter of Fused Gromov-Wasserstein. reg : float, optional Coefficient of the negative quadratic regularization used to promote sparsity of w. Returns ------- w: ndarray (D,) linear unmixing of :math:`(\mathbf{C},\mathbf{Y},\mathbf{p})` onto the span of :math:`(C_{dict},Y_{dict})` given OT corresponding to previous unmixing.
""" convergence_criterion = np.inf current_loss = starting_loss count = 0 const_TCT = np.transpose(C.dot(T)).dot(T) ones_ns_d = np.ones(Y.shape) while (convergence_criterion > tol) and (count < max_iter): previous_loss = current_loss # 1) Compute gradient at current point w # structure grad_w = alpha * np.sum(Cdict * (Cembedded[None, :, :] * const_q[None, :, :] - const_TCT[None, :, :]), axis=(1, 2)) # feature grad_w += (1 - alpha) * np.sum(Ydict * (diag_q.dot(Yembedded)[None, :, :] - T.T.dot(Y)[None, :, :]), axis=(1, 2)) grad_w -= reg * w grad_w *= 2 # 2) Conditional gradient direction finding: x= \argmin_x x^T.grad_w min_ = np.min(grad_w) x = (grad_w == min_).astype(np.float64) x /= np.sum(x) # 3) Line-search step: solve \argmin_{\gamma \in [0,1]} a*gamma^2 + b*gamma + c gamma, a, b, Cembedded_diff, Yembedded_diff = _linesearch_fused_gromov_wasserstein_unmixing(w, grad_w, x, Y, Cdict, Ydict, Cembedded, Yembedded, T, const_q, const_TCT, ones_ns_d, alpha, reg) # 4) Updates: w <-- (1-gamma)*w + gamma*x w += gamma * (x - w) Cembedded += gamma * Cembedded_diff Yembedded += gamma * Yembedded_diff current_loss += a * (gamma**2) + b * gamma if previous_loss != 0: convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss) else: convergence_criterion = abs(previous_loss - current_loss) / 10**(-12) count += 1 return w, Cembedded, Yembedded, current_loss def _linesearch_fused_gromov_wasserstein_unmixing(w, grad_w, x, Y, Cdict, Ydict, Cembedded, Yembedded, T, const_q, const_TCT, ones_ns_d, alpha, reg, **kwargs): r""" Compute optimal steps for the line search problem of Fused Gromov-Wasserstein linear unmixing .. math:: \min_{\gamma \in [0,1]} \alpha \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D z_d(\gamma)C_{dict}[d]_{k,l} )^2 T_{i,k}T_{j,l} \\ + (1-\alpha) \sum_{ij} \| \mathbf{Y_i} - \sum_d z_d(\gamma) \mathbf{Y_{dict}[d]_j} \|_2^2 - reg\| \mathbf{z}(\gamma) \|_2^2 Such that : - :math:`\mathbf{z}(\gamma) = (1- \gamma)\mathbf{w} + \gamma \mathbf{x}` Parameters ---------- w : array-like, shape (D,) Unmixing. grad_w : array-like, shape (D, D) Gradient of the reconstruction loss with respect to w. x: array-like, shape (D,) Conditional gradient direction. Y: arrat-like, shape (ns,d) Feature matrix of the input space Cdict : list of D array-like, shape (nt, nt) Metric/Graph cost matrices composing the dictionary on which to embed (C,Y). Each matrix in the dictionary must have the same size (nt,nt). Ydict : list of D array-like, shape (nt, d) Feature matrices composing the dictionary on which to embed (C,Y). Each matrix in the dictionary must have the same size (nt,d). Cembedded: array-like, shape (nt, nt) Embedded structure of (C,Y) onto the dictionary Yembedded: array-like, shape (nt, d) Embedded features of (C,Y) onto the dictionary T: array-like, shape (ns, nt) Fixed transport plan between (C,Y) and its current model. const_q: array-like, shape (nt,nt) product matrix :math:`\mathbf{q}\mathbf{q}^\top` where q is the target space distribution. Used to avoid redundant computations. const_TCT: array-like, shape (nt, nt) :math:`\mathbf{T}^\top \mathbf{C}^\top \mathbf{T}`. Used to avoid redundant computations. ones_ns_d: array-like, shape (ns, d) :math:`\mathbf{1}_{ ns \times d}`. Used to avoid redundant computations. alpha: float, Trade-off parameter of Fused Gromov-Wasserstein. reg : float, optional Coefficient of the negative quadratic regularization used to promote sparsity of w. 
Returns ------- gamma: float Optimal value for the line-search step a: float Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma +c` of the reconstruction loss b: float Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma +c` of the reconstruction loss Cembedded_diff: numpy array, shape (nt, nt) Difference between structure matrices of the models evaluated in :math:`\mathbf{x}` and in :math:`\mathbf{w}`. Yembedded_diff: numpy array, shape (nt, d) Difference between feature matrices of the models evaluated in :math:`\mathbf{x}` and in :math:`\mathbf{w}`. """ # polynomial coefficients from the quadratic objective (with respect to w) on structures Cembedded_x = np.sum(x[:, None, None] * Cdict, axis=0) Cembedded_diff = Cembedded_x - Cembedded trace_diffx = np.sum(Cembedded_diff * Cembedded_x * const_q) trace_diffw = np.sum(Cembedded_diff * Cembedded * const_q) # Constant factors appearing in the factorization a*gamma^2 + b*gamma + c of the Gromov-Wasserstein reconstruction loss a_gw = trace_diffx - trace_diffw b_gw = 2 * (trace_diffw - np.sum(Cembedded_diff * const_TCT)) # polynomial coefficients from the quadratic objective (with respect to w) on features Yembedded_x = np.sum(x[:, None, None] * Ydict, axis=0) Yembedded_diff = Yembedded_x - Yembedded # Constant factors appearing in the factorization a*gamma^2 + b*gamma + c of the feature reconstruction loss a_w = np.sum(ones_ns_d.dot((Yembedded_diff**2).T) * T) b_w = 2 * np.sum(T * (ones_ns_d.dot((Yembedded * Yembedded_diff).T) - Y.dot(Yembedded_diff.T))) a = alpha * a_gw + (1 - alpha) * a_w b = alpha * b_gw + (1 - alpha) * b_w if reg != 0: a -= reg * np.sum((x - w)**2) b -= 2 * reg * np.sum(w * (x - w)) if a > 0: gamma = min(1, max(0, -b / (2 * a))) elif a + b < 0: gamma = 1 else: gamma = 0 return gamma, a, b, Cembedded_diff, Yembedded_diff
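# Sanity-check sketch (toy shapes assumed, not part of the library): the feature part of the
# FGW objective evaluated at z(gamma) = (1 - gamma) * w + gamma * x is quadratic in gamma, and
# its coefficients match a_w and b_w computed as above. The helper name is hypothetical.
def _check_feature_linesearch_coefficients():
    rng = np.random.RandomState(4)
    ns, nt, d, D = 5, 4, 3, 2
    Y = rng.rand(ns, d)
    Ydict = rng.rand(D, nt, d)
    T = np.full((ns, nt), 1. / (ns * nt))  # any admissible plan works here
    w = np.array([0.7, 0.3])
    x = np.array([0., 1.])
    ones_ns_d = np.ones((ns, d))

    def feature_cost(z):
        Yemb = np.sum(z[:, None, None] * Ydict, axis=0)
        M = (Y ** 2).sum(1)[:, None] + (Yemb ** 2).sum(1)[None, :] - 2 * Y.dot(Yemb.T)
        return np.sum(M * T)  # sum_ij ||Y_i - Yemb_j||^2 T_ij

    Yembedded = np.sum(w[:, None, None] * Ydict, axis=0)
    Yembedded_diff = np.sum(x[:, None, None] * Ydict, axis=0) - Yembedded
    a_w = np.sum(ones_ns_d.dot((Yembedded_diff ** 2).T) * T)
    b_w = 2 * np.sum(T * (ones_ns_d.dot((Yembedded * Yembedded_diff).T) - Y.dot(Yembedded_diff.T)))
    gamma = 0.3
    assert np.allclose(feature_cost(w + gamma * (x - w)) - feature_cost(w),
                       a_w * gamma ** 2 + b_w * gamma)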