Source code for ot.gromov._lowrank

"""
Low rank Gromov-Wasserstein solver
"""

# Author: Laurène David <laurene.david@ip-paris.fr>
#
# License: MIT License

import warnings
from ..utils import unif, get_lowrank_lazytensor
from ..backend import get_backend
from ..lowrank import compute_lr_sqeuclidean_matrix, _init_lr_sinkhorn, _LR_Dysktra


def _flat_product_operator(X, nx=None):
    r"""
    Implementation of the flattened outer-product operator.

    This function is used in low rank Gromov-Wasserstein to compute the low rank decomposition of
    a cost matrix's squared Hadamard product (see page 6 of [67]).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_col)
        Input matrix for the operator

    nx : backend, optional. Default is None.
        POT backend. If None, the backend is inferred from X.

    Returns
    -------
    X_flat : array-like, shape (n_samples, n_col**2)
        Matrix with the flattened outer-product operator applied to each row

    References
    ----------
    .. [67] Scetbon, M., Peyré, G. & Cuturi, M. (2022).
        "Linear-Time Gromov Wasserstein Distances using Low Rank Couplings and Costs".
        In International Conference on Machine Learning (ICML), 2022.

    """

    if nx is None:
        nx = get_backend(X)

    n = X.shape[0]
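    # Stack vec(x_i x_i^T) as columns, then transpose so that row i of
    # X_flat is the flattened outer product of row i of X.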
    x1 = X[0, :][:, None]
    X_flat = nx.dot(x1, x1.T).flatten()[:, None]

    for i in range(1, n):
        x = X[i, :][:, None]
        x_out = nx.dot(x, x.T).flatten()[:, None]
        X_flat = nx.concatenate((X_flat, x_out), axis=1)

    X_flat = X_flat.T

    return X_flat
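
The identity behind this operator: if a cost matrix factors as M = A1 A2^T, then its
squared Hadamard product factors as M ∘ M = F(A1) F(A2)^T, where F maps each row x to
the flattened outer product vec(x x^T). A minimal standalone NumPy check of this
identity (illustrative only, not part of the module; the operator is re-implemented
locally instead of importing POT):

import numpy as np

def flat_product(X):
    # Row i of the output is vec(x_i x_i^T), of length d**2
    return np.stack([np.outer(x, x).ravel() for x in X])

rng = np.random.default_rng(0)
A1 = rng.standard_normal((5, 3))
A2 = rng.standard_normal((6, 3))

M = A1 @ A2.T                                # low-rank cost matrix
lhs = M**2                                   # squared Hadamard product
rhs = flat_product(A1) @ flat_product(A2).T  # low-rank factorization of M**2
assert np.allclose(lhs, rhs)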


def lowrank_gromov_wasserstein_samples(
    X_s,
    X_t,
    a=None,
    b=None,
    reg=0,
    rank=None,
    alpha=1e-10,
    gamma_init="rescale",
    rescale_cost=True,
    cost_factorized_Xs=None,
    cost_factorized_Xt=None,
    stopThr=1e-4,
    numItermax=1000,
    stopThr_dykstra=1e-3,
    numItermax_dykstra=10000,
    seed_init=49,
    warn=True,
    warn_dykstra=False,
    log=False,
):
    r"""
    Solve the entropic regularization Gromov-Wasserstein transport problem
    under low-nonnegative rank constraints on the couplings and cost matrices.

    Squared Euclidean distance matrices are considered for the source and target distributions.

    The function solves the following optimization problem:

    .. math::
        \mathop{\min_{(Q,R,g) \in \mathcal{C}(a,b,r)}} \mathcal{Q}_{A,B}(Q \mathrm{diag}(1/g) R^T) -
        \epsilon \cdot H((Q,R,g))

    where :

    - :math:`A` is the (`dim_a`, `dim_a`) square pairwise cost matrix of the source domain.
    - :math:`B` is the (`dim_b`, `dim_b`) square pairwise cost matrix of the target domain.
    - :math:`\mathcal{Q}_{A,B}` is the quadratic objective function of the Gromov-Wasserstein plan.
    - :math:`Q` and :math:`R` are the low-rank matrix decomposition of the Gromov-Wasserstein plan.
    - :math:`g` is the weight vector for the low-rank decomposition of the Gromov-Wasserstein plan.
    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (histograms, both summing to 1).
    - :math:`r` is the rank of the Gromov-Wasserstein plan.
    - :math:`\mathcal{C}(a,b,r)` is the set of low-rank couplings of the OT problem.
    - :math:`H((Q,R,g))` is the sum of the entropies of :math:`Q`, :math:`R` and :math:`g`.

    Parameters
    ----------
    X_s : array-like, shape (n_samples_a, dim_Xs)
        Samples in the source domain
    X_t : array-like, shape (n_samples_b, dim_Xt)
        Samples in the target domain
    a : array-like, shape (n_samples_a,), optional
        Sample weights in the source domain.
        If left to its default value None, the uniform distribution is taken.
    b : array-like, shape (n_samples_b,), optional
        Sample weights in the target domain.
        If left to its default value None, the uniform distribution is taken.
    reg : float, optional
        Regularization term >=0
    rank : int, optional. Default is None. (>0)
        Nonnegative rank of the OT plan. If None, min(ns, nt) is considered.
    alpha : float, optional. Default is 1e-10. (>0 and <1/r)
        Lower bound for the weight vector g.
    rescale_cost : bool, optional. Default is True
        Rescale the low rank factorization of the sqeuclidean cost matrix
    seed_init : int, optional. Default is 49. (>0)
        Random state for the 'random' initialization of the low rank couplings
    gamma_init : str, optional. Default is "rescale".
        Initialization strategy for gamma: 'rescale' or 'theory'.
        Gamma is a constant that scales the convergence criterion of the Mirror Descent
        optimization scheme used to compute the low-rank couplings (Q, R and g).
    numItermax : int, optional. Default is 1000.
        Max number of iterations for Low Rank GW
    stopThr : float, optional. Default is 1e-4.
        Stop threshold on error (>0) for Low Rank GW.
        The error is the sum of the Kullback-Leibler divergences computed for each
        low rank coupling (Q, R and g) and scaled using gamma.
    numItermax_dykstra : int, optional. Default is 10000.
        Max number of iterations for the Dykstra algorithm
    stopThr_dykstra : float, optional. Default is 1e-3.
        Stop threshold on error (>0) in Dykstra
    cost_factorized_Xs : tuple, optional. Default is None
        Tuple with two pre-computed low rank decompositions (A1, A2) of the source cost
        matrix. Both matrices should have a shape of (n_samples_a, dim_Xs + 2).
        If None, the low rank cost matrices will be computed as sqeuclidean cost matrices.
    cost_factorized_Xt : tuple, optional. Default is None
        Tuple with two pre-computed low rank decompositions (B1, B2) of the target cost
        matrix. Both matrices should have a shape of (n_samples_b, dim_Xt + 2).
        If None, the low rank cost matrices will be computed as sqeuclidean cost matrices.
    warn : bool, optional
        if True, raises a warning if the low rank GW algorithm doesn't converge.
    warn_dykstra : bool, optional
        if True, raises a warning if the Dykstra algorithm doesn't converge.
    log : bool, optional
        record log if True

    Returns
    -------
    Q : array-like, shape (n_samples_a, r)
        First low-rank matrix decomposition of the OT plan
    R : array-like, shape (n_samples_b, r)
        Second low-rank matrix decomposition of the OT plan
    g : array-like, shape (r, )
        Weight vector for the low-rank decomposition of the OT plan
    log : dict (lazy_plan, value and value_quad)
        log dictionary returned only if log==True in parameters

    References
    ----------
    .. [67] Scetbon, M., Peyré, G. & Cuturi, M. (2022).
        "Linear-Time Gromov Wasserstein Distances using Low Rank Couplings and Costs".
        In International Conference on Machine Learning (ICML), 2022.
    """

    # POT backend
    nx = get_backend(X_s, X_t)
    ns, nt = X_s.shape[0], X_t.shape[0]

    # Initialize weights a, b
    if a is None:
        a = unif(ns, type_as=X_s)
    if b is None:
        b = unif(nt, type_as=X_t)

    # Compute rank (see Section 3.1, def 1)
    if rank is None:
        r = min(ns, nt)
    else:
        r = min(ns, nt, rank)

    if r <= 0:
        raise ValueError("The rank parameter must be a positive integer")

    # Dykstra won't converge if 1/rank < alpha (see Section 3.2)
    if 1 / r < alpha:
        raise ValueError(
            "alpha ({a}) should be smaller than 1/rank ({r}) for the Dykstra "
            "algorithm to converge.".format(a=alpha, r=1 / r)
        )

    if cost_factorized_Xs is not None:
        A1, A2 = cost_factorized_Xs
    else:
        A1, A2 = compute_lr_sqeuclidean_matrix(X_s, X_s, rescale_cost, nx=nx)

    if cost_factorized_Xt is not None:
        B1, B2 = cost_factorized_Xt
    else:
        B1, B2 = compute_lr_sqeuclidean_matrix(X_t, X_t, rescale_cost, nx=nx)

    # Initial values for LR couplings (Q, R, g) with LOT
    Q, R, g = _init_lr_sinkhorn(
        X_s, X_t, a, b, r, init="random", random_state=seed_init, reg_init=None, nx=nx
    )

    # Gamma initialization
    if gamma_init == "theory":
        L = (27 * nx.norm(A1) * nx.norm(A2)) / alpha**4
        gamma = 1 / (2 * L)

    if gamma_init not in ["rescale", "theory"]:
        raise NotImplementedError(
            'Not implemented gamma_init="{}"'.format(gamma_init)
        )

    # initial value of error
    err = 1

    for ii in range(numItermax):
        Q_prev = Q
        R_prev = R
        g_prev = g

        if err > stopThr:
            # Compute cost matrices
            C1 = nx.dot(A2.T, Q * (1 / g)[None, :])
            C1 = -4 * nx.dot(A1, C1)
            C2 = nx.dot(R.T, B1)
            C2 = nx.dot(C2, B2.T)
            diag_g = (1 / g)[None, :]

            # Compute C*R dot using the lr decomposition of C
            CR = nx.dot(C2, R)
            CR = nx.dot(C1, CR)
            CR_g = CR * diag_g

            # Compute C.T * Q using the lr decomposition of C
            CQ = nx.dot(C1.T, Q)
            CQ = nx.dot(C2.T, CQ)
            CQ_g = CQ * diag_g

            # Compute omega
            omega = nx.diag(nx.dot(Q.T, CR))

            # Rescale gamma at each iteration
            if gamma_init == "rescale":
                norm_1 = nx.max(nx.abs(CR_g + reg * nx.log(Q))) ** 2
                norm_2 = nx.max(nx.abs(CQ_g + reg * nx.log(R))) ** 2
                norm_3 = nx.max(nx.abs(-omega * (diag_g**2))) ** 2
                gamma = 10 / max(norm_1, norm_2, norm_3)

            K1 = nx.exp(-gamma * CR_g - ((gamma * reg) - 1) * nx.log(Q))
            K2 = nx.exp(-gamma * CQ_g - ((gamma * reg) - 1) * nx.log(R))
            K3 = nx.exp((gamma * omega / (g**2)) - (gamma * reg - 1) * nx.log(g))

            # Update couplings with LR Dykstra algorithm
            Q, R, g = _LR_Dysktra(
                K1,
                K2,
                K3,
                a,
                b,
                alpha,
                stopThr_dykstra,
                numItermax_dykstra,
                warn_dykstra,
                nx,
            )

            # Update error with Kullback-Leibler divergence
            err_1 = ((1 / gamma) ** 2) * (nx.kl_div(Q, Q_prev) + nx.kl_div(Q_prev, Q))
            err_2 = ((1 / gamma) ** 2) * (nx.kl_div(R, R_prev) + nx.kl_div(R_prev, R))
            err_3 = ((1 / gamma) ** 2) * (nx.kl_div(g, g_prev) + nx.kl_div(g_prev, g))
            err = err_1 + err_2 + err_3

            # fix divide by zero
            Q = Q + 1e-16
            R = R + 1e-16
            g = g + 1e-16

        else:
            break

    else:
        if warn:
            warnings.warn(
                "Low Rank GW did not converge. You might want to "
                "increase the number of iterations `numItermax`."
            )

    # Update low rank costs
    C1 = nx.dot(A2.T, Q * (1 / g)[None, :])
    C1 = -4 * nx.dot(A1, C1)
    C2 = nx.dot(R.T, B1)
    C2 = nx.dot(C2, B2.T)

    # Compute lazy plan (using LazyTensor class)
    lazy_plan = get_lowrank_lazytensor(Q, R, 1 / g)

    # Compute value_quad
    A1_, A2_ = _flat_product_operator(A1, nx), _flat_product_operator(A2, nx)
    B1_, B2_ = _flat_product_operator(B1, nx), _flat_product_operator(B2, nx)

    x_ = nx.dot(A1_, nx.dot(A2_.T, a))
    y_ = nx.dot(B1_, nx.dot(B2_.T, b))
    c1 = nx.dot(x_, a) + nx.dot(y_, b)

    # Recompute diag_g from the final g (the value left over from the loop
    # predates the last Dykstra update of g)
    diag_g = (1 / g)[None, :]

    G = nx.dot(C1, nx.dot(C2, R))
    G = nx.dot(Q.T, G * diag_g)
    value_quad = c1 + nx.trace(G) / 2

    if reg != 0:
        reg_Q = nx.sum(Q * nx.log(Q + 1e-16))  # entropy for Q
        reg_g = nx.sum(g * nx.log(g + 1e-16))  # entropy for g
        reg_R = nx.sum(R * nx.log(R + 1e-16))  # entropy for R
        value = value_quad + reg * (reg_Q + reg_g + reg_R)
    else:
        value = value_quad

    if log:
        dict_log = dict()
        dict_log["value"] = value
        dict_log["value_quad"] = value_quad
        dict_log["lazy_plan"] = lazy_plan

        return Q, R, g, dict_log

    return Q, R, g
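
For context, a short usage sketch (assuming a recent POT installation where this
solver is exposed as ot.gromov.lowrank_gromov_wasserstein_samples; the sample arrays
are illustrative):

import numpy as np
import ot

rng = np.random.default_rng(42)
X_s = rng.standard_normal((100, 2))  # source samples
X_t = rng.standard_normal((150, 3))  # target samples (dimensions may differ)

Q, R, g, log = ot.gromov.lowrank_gromov_wasserstein_samples(
    X_s, X_t, rank=10, reg=0, log=True
)

# The solver never materializes the full (100, 150) plan; when it is small
# enough to fit in memory it can be recovered from the low-rank factors:
P = np.dot(Q * (1 / g)[None, :], R.T)
print(P.shape, log["value"])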