# -*- coding: utf-8 -*-
"""
Partial (Fused) Gromov-Wasserstein solvers.
"""
# Author: Laetitia Chapel <laetitia.chapel@irisa.fr>
# Cédric Vincent-Cuaz <cedvincentcuaz@gmail.com>
# Yikun Bai < yikun.bai@vanderbilt.edu >
#
# License: MIT License
from ..utils import list_to_array, unif
from ..backend import get_backend, NumpyBackend
from ..partial import entropic_partial_wasserstein
from ._utils import _transform_matrix, gwloss, gwggrad
from ..optim import partial_cg, solve_1d_linesearch_quad
import numpy as np
import warnings
[docs]
def partial_gromov_wasserstein(
C1,
C2,
p=None,
q=None,
m=None,
loss_fun="square_loss",
nb_dummies=1,
G0=None,
thres=1,
numItermax=1e4,
tol=1e-8,
symmetric=None,
warn=True,
log=False,
verbose=False,
**kwargs,
):
r"""
Returns the Partial Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})`
and :math:`(\mathbf{C_2}, \mathbf{q})`.
The function solves the following optimization problem using Conditional Gradient:
.. math::
\mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
\mathbf{T}^T \mathbf{1} &= \mathbf{q}
\mathbf{T} &\geq 0
\mathbf{1}^T \mathbf{T}^T \mathbf{1} = m &\leq \min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}
where :
- :math:`\mathbf{C_1}`: Metric cost matrix in the source space.
- :math:`\mathbf{C_2}`: Metric cost matrix in the target space.
- :math:`\mathbf{p}`: Distribution in the source space.
- :math:`\mathbf{q}`: Distribution in the target space.
- `m` is the amount of mass to be transported
- `L`: Loss function to account for the misfit between the similarity matrices.
The formulation of the problem has been proposed in
:ref:`[29] <references-partial-gromov-wasserstein>`
.. note:: This function is backend-compatible and will work on arrays
from all compatible backends. But the algorithm uses the C++ CPU backend
which can lead to copy overhead on GPU arrays.
.. note:: All computations in the conjugate gradient solver are done with
numpy to limit memory overhead.
.. note:: This function will cast the computed transport plan to the data
type of the provided input :math:`\mathbf{C}_1`. Casting to an integer
tensor might result in a loss of precision. If this behaviour is
unwanted, please make sure to provide a floating point input.
Parameters
----------
C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
C2 : array-like, shape (nt, nt)
Metric costfr matrix in the target space
p : array-like, shape (ns,), optional
Distribution in the source space.
If let to its default value None, uniform distribution is taken.
q : array-like, shape (nt,), optional
Distribution in the target space.
If let to its default value None, uniform distribution is taken.
m : float, optional
Amount of mass to be transported
(default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`)
loss_fun : str, optional
Loss function used for the solver either 'square_loss' or 'kl_loss'.
nb_dummies : int, optional
Number of dummy points to add (avoid instabilities in the EMD solver)
G0 : array-like, shape (ns, nt), optional
Initialization of the transportation matrix
thres : float, optional
quantile of the gradient matrix to populate the cost matrix when 0
(default: 1)
numItermax : int, optional
Max number of iterations
tol : float, optional
tolerance for stopping iterations
symmetric : bool, optional
Either C1 and C2 are to be assumed symmetric or not.
If let to its default None value, a symmetry test will be conducted.
Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric).
warn: bool, optional.
Whether to raise a warning when EMD did not converge.
log : bool, optional
return log if True
verbose : bool, optional
Print information along iterations
**kwargs : dict
parameters can be directly passed to the emd solver
Returns
-------
T : array-like, shape (`ns`, `nt`)
Optimal transport matrix between the two spaces.
log : dict
Convergence information and loss.
Examples
--------
>>> from ot.gromov import partial_gromov_wasserstein
>>> import scipy as sp
>>> a = np.array([0.25] * 4)
>>> b = np.array([0.25] * 4)
>>> x = np.array([1,2,100,200]).reshape((-1,1))
>>> y = np.array([3,2,98,199]).reshape((-1,1))
>>> C1 = sp.spatial.distance.cdist(x, x)
>>> C2 = sp.spatial.distance.cdist(y, y)
>>> np.round(partial_gromov_wasserstein(C1, C2, a, b),2)
array([[0. , 0.25, 0. , 0. ],
[0.25, 0. , 0. , 0. ],
[0. , 0. , 0.25, 0. ],
[0. , 0. , 0. , 0.25]])
>>> np.round(partial_gromov_wasserstein(C1, C2, a, b, m=0.25),2)
array([[0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. ],
[0. , 0. , 0.25, 0. ],
[0. , 0. , 0. , 0. ]])
.. _references-partial-gromov-wasserstein:
References
----------
.. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal
Transport with Applications on Positive-Unlabeled Learning".
NeurIPS.
"""
arr = [C1, C2]
if p is not None:
arr.append(list_to_array(p))
else:
p = unif(C1.shape[0], type_as=C1)
if q is not None:
arr.append(list_to_array(q))
else:
q = unif(C2.shape[0], type_as=C1)
if G0 is not None:
G0_ = G0
arr.append(G0)
nx = get_backend(*arr)
p0, q0, C10, C20 = p, q, C1, C2
p = nx.to_numpy(p0)
q = nx.to_numpy(q0)
C1 = nx.to_numpy(C10)
C2 = nx.to_numpy(C20)
if symmetric is None:
symmetric = np.allclose(C1, C1.T, atol=1e-10) and np.allclose(
C2, C2.T, atol=1e-10
)
if m is None:
m = min(np.sum(p), np.sum(q))
elif m < 0:
raise ValueError("Problem infeasible. Parameter m should be greater" " than 0.")
elif m > min(np.sum(p), np.sum(q)):
raise ValueError(
"Problem infeasible. Parameter m should lower or"
" equal than min(|a|_1, |b|_1)."
)
if G0 is None:
G0 = (
np.outer(p, q) * m / (np.sum(p) * np.sum(q))
) # make sure |G0|=m, G01_m\leq p, G0.T1_n\leq q.
else:
G0 = nx.to_numpy(G0_)
# Check marginals of G0
assert np.all(G0.sum(1) <= p)
assert np.all(G0.sum(0) <= q)
q_extended = np.append(q, [(np.sum(p) - m) / nb_dummies] * nb_dummies)
p_extended = np.append(p, [(np.sum(q) - m) / nb_dummies] * nb_dummies)
# cg for GW is implemented using numpy on CPU
np_ = NumpyBackend()
fC1, fC2, hC1, hC2 = _transform_matrix(C1, C2, loss_fun, np_)
fC2t = fC2.T
if not symmetric:
fC1t, hC1t, hC2t = fC1.T, hC1.T, hC2.T
ones_p = np_.ones(p.shape[0], type_as=p)
ones_q = np_.ones(q.shape[0], type_as=q)
def f(G):
pG = G.sum(1)
qG = G.sum(0)
constC1 = np.outer(np.dot(fC1, pG), ones_q)
constC2 = np.outer(ones_p, np.dot(qG, fC2t))
return gwloss(constC1 + constC2, hC1, hC2, G, np_)
if symmetric:
def df(G):
pG = G.sum(1)
qG = G.sum(0)
constC1 = np.outer(np.dot(fC1, pG), ones_q)
constC2 = np.outer(ones_p, np.dot(qG, fC2t))
return gwggrad(constC1 + constC2, hC1, hC2, G, np_)
else:
def df(G):
pG = G.sum(1)
qG = G.sum(0)
constC1 = np.outer(np.dot(fC1, pG), ones_q)
constC2 = np.outer(ones_p, np.dot(qG, fC2t))
constC1t = np.outer(np.dot(fC1t, pG), ones_q)
constC2t = np.outer(ones_p, np.dot(qG, fC2))
return 0.5 * (
gwggrad(constC1 + constC2, hC1, hC2, G, np_)
+ gwggrad(constC1t + constC2t, hC1t, hC2t, G, np_)
)
def line_search(cost, G, deltaG, Mi, cost_G, df_G, **kwargs):
df_Gc = df(deltaG + G)
return solve_partial_gromov_linesearch(
G, deltaG, cost_G, df_G, df_Gc, M=0.0, reg=1.0, nx=np_, **kwargs
)
if not nx.is_floating_point(C10):
warnings.warn(
"Input structure matrix consists of integers. The transport plan will be "
"casted accordingly, possibly resulting in a loss of precision. "
"If this behaviour is unwanted, please make sure your input "
"structure matrix consists of floating point elements.",
stacklevel=2,
)
if log:
res, log = partial_cg(
p,
q,
p_extended,
q_extended,
0.0,
1.0,
f,
df,
G0,
line_search,
log=True,
numItermax=numItermax,
stopThr=tol,
stopThr2=0.0,
warn=warn,
**kwargs,
)
log["partial_gw_dist"] = nx.from_numpy(log["loss"][-1], type_as=C10)
return nx.from_numpy(res, type_as=C10), log
else:
return nx.from_numpy(
partial_cg(
p,
q,
p_extended,
q_extended,
0.0,
1.0,
f,
df,
G0,
line_search,
log=False,
numItermax=numItermax,
stopThr=tol,
stopThr2=0.0,
**kwargs,
),
type_as=C10,
)
[docs]
def partial_gromov_wasserstein2(
C1,
C2,
p=None,
q=None,
m=None,
loss_fun="square_loss",
nb_dummies=1,
G0=None,
thres=1,
numItermax=1e4,
tol=1e-7,
symmetric=None,
warn=False,
log=False,
verbose=False,
**kwargs,
):
r"""
Returns the Partial Gromov-Wasserstein discrepancy between
:math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`.
The function solves the following optimization problem using Conditional Gradient:
.. math::
\mathbf{PGW} = \mathop{\min}_\mathbf{T} \quad \sum_{i,j,k,l}
L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
\mathbf{T}^T \mathbf{1} &= \mathbf{q}
\mathbf{T} &\geq 0
\mathbf{1}^T \mathbf{T}^T \mathbf{1} = m &\leq \min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}
where :
- :math:`\mathbf{C_1}`: Metric cost matrix in the source space.
- :math:`\mathbf{C_2}`: Metric cost matrix in the target space.
- :math:`\mathbf{p}`: Distribution in the source space.
- :math:`\mathbf{q}`: Distribution in the target space.
- `m` is the amount of mass to be transported
- `L`: Loss function to account for the misfit between the similarity matrices.
The formulation of the problem has been proposed in
:ref:`[29] <references-partial-gromov-wasserstein2>`
Note that when using backends, this loss function is differentiable wrt the
matrices (C1, C2).
.. note:: This function is backend-compatible and will work on arrays
from all compatible backends. But the algorithm uses the C++ CPU backend
which can lead to copy overhead on GPU arrays.
.. note:: All computations in the conjugate gradient solver are done with
numpy to limit memory overhead.
.. note:: This function will cast the computed transport plan to the data
type of the provided input :math:`\mathbf{C}_1`. Casting to an integer
tensor might result in a loss of precision. If this behaviour is
unwanted, please make sure to provide a floating point input.
Parameters
----------
C1 : ndarray, shape (ns, ns)
Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
Metric cost matrix in the target space
p : ndarray, shape (ns,)
Distribution in the source space
q : ndarray, shape (nt,)
Distribution in the target space
m : float, optional
Amount of mass to be transported
(default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`)
loss_fun : str, optional
Loss function used for the solver either 'square_loss' or 'kl_loss'.
nb_dummies : int, optional
Number of dummy points to add (avoid instabilities in the EMD solver)
G0 : ndarray, shape (ns, nt), optional
Initialization of the transportation matrix
thres : float, optional
quantile of the gradient matrix to populate the cost matrix when 0
(default: 1)
numItermax : int, optional
Max number of iterations
tol : float, optional
tolerance for stopping iterations
symmetric : bool, optional
Either C1 and C2 are to be assumed symmetric or not.
If let to its default None value, a symmetry test will be conducted.
Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric).
warn: bool, optional.
Whether to raise a warning when EMD did not converge.
log : bool, optional
return log if True
verbose : bool, optional
Print information along iterations
**kwargs : dict
parameters can be directly passed to the emd solver
.. warning::
When dealing with a large number of points, the EMD solver may face
some instabilities, especially when the mass associated to the dummy
point is large. To avoid them, increase the number of dummy points
(allows a smoother repartition of the mass over the points).
Returns
-------
partial_gw_dist : float
partial GW discrepancy
log : dict
log dictionary returned only if `log` is `True`
Examples
--------
>>> from ot.gromov import partial_gromov_wasserstein2
>>> import scipy as sp
>>> a = np.array([0.25] * 4)
>>> b = np.array([0.25] * 4)
>>> x = np.array([1,2,100,200]).reshape((-1,1))
>>> y = np.array([3,2,98,199]).reshape((-1,1))
>>> C1 = sp.spatial.distance.cdist(x, x)
>>> C2 = sp.spatial.distance.cdist(y, y)
>>> np.round(partial_gromov_wasserstein2(C1, C2, a, b),2)
3.38
>>> np.round(partial_gromov_wasserstein2(C1, C2, a, b, m=0.25),2)
0.0
.. _references-partial-gromov-wasserstein2:
References
----------
.. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal
Transport with Applications on Positive-Unlabeled Learning".
NeurIPS.
"""
# simple get_backend as the full one will be handled in gromov_wasserstein
nx = get_backend(C1, C2)
# init marginals if set as None
if p is None:
p = unif(C1.shape[0], type_as=C1)
if q is None:
q = unif(C2.shape[0], type_as=C1)
T, log_pgw = partial_gromov_wasserstein(
C1,
C2,
p,
q,
m,
loss_fun,
nb_dummies,
G0,
thres,
numItermax,
tol,
symmetric,
warn,
True,
verbose,
**kwargs,
)
log_pgw["T"] = T
pgw = log_pgw["partial_gw_dist"]
if loss_fun == "square_loss":
gC1 = 2 * C1 * nx.outer(p, p) - 2 * nx.dot(T, nx.dot(C2, T.T))
gC2 = 2 * C2 * nx.outer(q, q) - 2 * nx.dot(T.T, nx.dot(C1, T))
elif loss_fun == "kl_loss":
gC1 = nx.log(C1 + 1e-15) * nx.outer(p, p) - nx.dot(
T, nx.dot(nx.log(C2 + 1e-15), T.T)
)
gC2 = -nx.dot(T.T, nx.dot(C1, T)) / (C2 + 1e-15) + nx.outer(q, q)
pgw = nx.set_gradients(pgw, (C1, C2), (gC1, gC2))
if log:
return pgw, log_pgw
else:
return pgw
[docs]
def partial_fused_gromov_wasserstein(
M,
C1,
C2,
p=None,
q=None,
m=None,
loss_fun="square_loss",
alpha=0.5,
nb_dummies=1,
G0=None,
thres=1,
numItermax=1e4,
tol=1e-8,
symmetric=None,
warn=True,
log=False,
verbose=False,
**kwargs,
):
r"""
Returns the Partial Fused Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{F_1}, \mathbf{p})`
and :math:`(\mathbf{C_2}, \mathbf{F_2}, \mathbf{q})`, with pairwise
distance matrix :math:`\mathbf{M}` between node feature matrices.
The function solves the following optimization problem using Conditional Gradient:
.. math::
\mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
\alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
\mathbf{T}^T \mathbf{1} &= \mathbf{q}
\mathbf{T} &\geq 0
\mathbf{1}^T \mathbf{T}^T \mathbf{1} = m &\leq \min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}
where :
- :math:`\mathbf{M}`: metric cost matrix between features across domains
- :math:`\mathbf{C_1}`: Metric cost matrix in the source space.
- :math:`\mathbf{C_2}`: Metric cost matrix in the target space.
- :math:`\mathbf{p}`: Distribution in the source space.
- :math:`\mathbf{q}`: Distribution in the target space.
- `m` is the amount of mass to be transported
- `L`: Loss function to account for the misfit between the similarity matrices.
The formulation of the problem has been proposed in
:ref:`[29] <references-partial-gromov-wasserstein>`
.. note:: This function is backend-compatible and will work on arrays
from all compatible backends. But the algorithm uses the C++ CPU backend
which can lead to copy overhead on GPU arrays.
.. note:: All computations in the conjugate gradient solver are done with
numpy to limit memory overhead.
.. note:: This function will cast the computed transport plan to the data
type of the provided input :math:`\mathbf{C}_1`. Casting to an integer
tensor might result in a loss of precision. If this behaviour is
unwanted, please make sure to provide a floating point input.
Parameters
----------
M : array-like, shape (ns, nt)
Metric cost matrix between features across domains
C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
C2 : array-like, shape (nt, nt)
Metric costfr matrix in the target space
p : array-like, shape (ns,), optional
Distribution in the source space.
If let to its default value None, uniform distribution is taken.
q : array-like, shape (nt,), optional
Distribution in the target space.
If let to its default value None, uniform distribution is taken.
m : float, optional
Amount of mass to be transported
(default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`)
loss_fun : str, optional
Loss function used for the solver either 'square_loss' or 'kl_loss'.
alpha : float, optional
Trade-off parameter (0 < alpha < 1)
nb_dummies : int, optional
Number of dummy points to add (avoid instabilities in the EMD solver)
G0 : array-like, shape (ns, nt), optional
Initialization of the transportation matrix
thres : float, optional
quantile of the gradient matrix to populate the cost matrix when 0
(default: 1)
numItermax : int, optional
Max number of iterations
tol : float, optional
tolerance for stopping iterations
symmetric : bool, optional
Either C1 and C2 are to be assumed symmetric or not.
If let to its default None value, a symmetry test will be conducted.
Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric).
warn: bool, optional.
Whether to raise a warning when EMD did not converge.
log : bool, optional
return log if True
verbose : bool, optional
Print information along iterations
**kwargs : dict
parameters can be directly passed to the emd solver
Returns
-------
T : array-like, shape (`ns`, `nt`)
Optimal transport matrix between the two spaces.
log : dict
Convergence information and loss.
.. _references-partial-gromov-wasserstein:
References
----------
.. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal
Transport with Applications on Positive-Unlabeled Learning".
NeurIPS.
.. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas "Optimal Transport for structured data with
application on graphs", International Conference on Machine Learning
(ICML). 2019.
"""
arr = [M, C1, C2]
if p is not None:
arr.append(list_to_array(p))
else:
p = unif(C1.shape[0], type_as=C1)
if q is not None:
arr.append(list_to_array(q))
else:
q = unif(C2.shape[0], type_as=C1)
if G0 is not None:
G0_ = G0
arr.append(G0)
nx = get_backend(*arr)
p0, q0, M0, C10, C20, alpha0 = p, q, M, C1, C2, alpha
p = nx.to_numpy(p0)
q = nx.to_numpy(q0)
M = nx.to_numpy(M0)
C1 = nx.to_numpy(C10)
C2 = nx.to_numpy(C20)
alpha = nx.to_numpy(alpha0)
if symmetric is None:
symmetric = np.allclose(C1, C1.T, atol=1e-10) and np.allclose(
C2, C2.T, atol=1e-10
)
if m is None:
m = min(np.sum(p), np.sum(q))
elif m < 0:
raise ValueError("Problem infeasible. Parameter m should be greater" " than 0.")
elif m > min(np.sum(p), np.sum(q)):
raise ValueError(
"Problem infeasible. Parameter m should lower or"
" equal than min(|a|_1, |b|_1)."
)
if G0 is None:
G0 = (
np.outer(p, q) * m / (np.sum(p) * np.sum(q))
) # make sure |G0|=m, G01_m\leq p, G0.T1_n\leq q.
else:
G0 = nx.to_numpy(G0_)
# Check marginals of G0
assert np.all(G0.sum(1) <= p)
assert np.all(G0.sum(0) <= q)
q_extended = np.append(q, [(np.sum(p) - m) / nb_dummies] * nb_dummies)
p_extended = np.append(p, [(np.sum(q) - m) / nb_dummies] * nb_dummies)
# cg for GW is implemented using numpy on CPU
np_ = NumpyBackend()
fC1, fC2, hC1, hC2 = _transform_matrix(C1, C2, loss_fun, np_)
fC2t = fC2.T
if not symmetric:
fC1t, hC1t, hC2t = fC1.T, hC1.T, hC2.T
ones_p = np_.ones(p.shape[0], type_as=p)
ones_q = np_.ones(q.shape[0], type_as=q)
def f(G):
pG = G.sum(1)
qG = G.sum(0)
constC1 = np.outer(np.dot(fC1, pG), ones_q)
constC2 = np.outer(ones_p, np.dot(qG, fC2t))
return gwloss(constC1 + constC2, hC1, hC2, G, np_)
if symmetric:
def df(G):
pG = G.sum(1)
qG = G.sum(0)
constC1 = np.outer(np.dot(fC1, pG), ones_q)
constC2 = np.outer(ones_p, np.dot(qG, fC2t))
return gwggrad(constC1 + constC2, hC1, hC2, G, np_)
else:
def df(G):
pG = G.sum(1)
qG = G.sum(0)
constC1 = np.outer(np.dot(fC1, pG), ones_q)
constC2 = np.outer(ones_p, np.dot(qG, fC2t))
constC1t = np.outer(np.dot(fC1t, pG), ones_q)
constC2t = np.outer(ones_p, np.dot(qG, fC2))
return 0.5 * (
gwggrad(constC1 + constC2, hC1, hC2, G, np_)
+ gwggrad(constC1t + constC2t, hC1t, hC2t, G, np_)
)
def line_search(cost, G, deltaG, Mi, cost_G, df_G, **kwargs):
df_Gc = df(deltaG + G)
return solve_partial_gromov_linesearch(
G,
deltaG,
cost_G,
df_G,
df_Gc,
M=(1 - alpha) * M,
reg=alpha,
nx=np_,
**kwargs,
)
if not nx.is_floating_point(C10):
warnings.warn(
"Input structure matrix consists of integers. The transport plan will be "
"casted accordingly, possibly resulting in a loss of precision. "
"If this behaviour is unwanted, please make sure your input "
"structure matrix consists of floating point elements.",
stacklevel=2,
)
if log:
res, log = partial_cg(
p,
q,
p_extended,
q_extended,
(1 - alpha) * M,
alpha,
f,
df,
G0,
line_search,
log=True,
numItermax=numItermax,
stopThr=tol,
stopThr2=0.0,
warn=warn,
**kwargs,
)
log["partial_fgw_dist"] = nx.from_numpy(log["loss"][-1], type_as=C10)
return nx.from_numpy(res, type_as=C10), log
else:
return nx.from_numpy(
partial_cg(
p,
q,
p_extended,
q_extended,
(1 - alpha) * M,
alpha,
f,
df,
G0,
line_search,
log=False,
numItermax=numItermax,
stopThr=tol,
stopThr2=0.0,
**kwargs,
),
type_as=C10,
)
[docs]
def partial_fused_gromov_wasserstein2(
M,
C1,
C2,
p=None,
q=None,
m=None,
loss_fun="square_loss",
alpha=0.5,
nb_dummies=1,
G0=None,
thres=1,
numItermax=1e4,
tol=1e-7,
symmetric=None,
warn=False,
log=False,
verbose=False,
**kwargs,
):
r"""
Returns the Partial Fused Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{F_1}, \mathbf{p})`
and :math:`(\mathbf{C_2}, \mathbf{F_2}, \mathbf{q})`, with pairwise
distance matrix :math:`\mathbf{M}` between node feature matrices.
The function solves the following optimization problem using Conditional Gradient:
.. math::
\mathbf{PFGW}_{\alpha} = \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
\alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
\mathbf{T}^T \mathbf{1} &= \mathbf{q}
\mathbf{T} &\geq 0
\mathbf{1}^T \mathbf{T}^T \mathbf{1} = m &\leq \min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}
where :
- :math:`\mathbf{M}`: metric cost matrix between features across domains
- :math:`\mathbf{C_1}`: Metric cost matrix in the source space.
- :math:`\mathbf{C_2}`: Metric cost matrix in the target space.
- :math:`\mathbf{p}`: Distribution in the source space.
- :math:`\mathbf{q}`: Distribution in the target space.
- `m` is the amount of mass to be transported
- `L`: Loss function to account for the misfit between the similarity matrices.
The formulation of the problem has been proposed in
:ref:`[29] <references-partial-gromov-wasserstein2>`
Note that when using backends, this loss function is differentiable wrt the
matrices (M, C1, C2).
.. note:: This function is backend-compatible and will work on arrays
from all compatible backends. But the algorithm uses the C++ CPU backend
which can lead to copy overhead on GPU arrays.
.. note:: All computations in the conjugate gradient solver are done with
numpy to limit memory overhead.
.. note:: This function will cast the computed transport plan to the data
type of the provided input :math:`\mathbf{C}_1`. Casting to an integer
tensor might result in a loss of precision. If this behaviour is
unwanted, please make sure to provide a floating point input.
Parameters
----------
M : array-like, shape (ns, nt)
Metric cost matrix between features across domains
C1 : ndarray, shape (ns, ns)
Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
Metric cost matrix in the target space
p : ndarray, shape (ns,)
Distribution in the source space
q : ndarray, shape (nt,)
Distribution in the target space
m : float, optional
Amount of mass to be transported
(default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`)
loss_fun : str, optional
Loss function used for the solver either 'square_loss' or 'kl_loss'.
alpha : float, optional
Trade-off parameter (0 < alpha < 1)
nb_dummies : int, optional
Number of dummy points to add (avoid instabilities in the EMD solver)
G0 : ndarray, shape (ns, nt), optional
Initialization of the transportation matrix
thres : float, optional
quantile of the gradient matrix to populate the cost matrix when 0
(default: 1)
numItermax : int, optional
Max number of iterations
tol : float, optional
tolerance for stopping iterations
symmetric : bool, optional
Either C1 and C2 are to be assumed symmetric or not.
If let to its default None value, a symmetry test will be conducted.
Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric).
warn: bool, optional.
Whether to raise a warning when EMD did not converge.
log : bool, optional
return log if True
verbose : bool, optional
Print information along iterations
**kwargs : dict
parameters can be directly passed to the emd solver
.. warning::
When dealing with a large number of points, the EMD solver may face
some instabilities, especially when the mass associated to the dummy
point is large. To avoid them, increase the number of dummy points
(allows a smoother repartition of the mass over the points).
Returns
-------
partial_fgw_dist : float
partial FGW discrepancy
log : dict
log dictionary returned only if `log` is `True`
.. _references-partial-gromov-wasserstein2:
References
----------
.. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal
Transport with Applications on Positive-Unlabeled Learning".
NeurIPS.
.. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas "Optimal Transport for structured data with
application on graphs", International Conference on Machine Learning
(ICML). 2019.
"""
# simple get_backend as the full one will be handled in gromov_wasserstein
nx = get_backend(M, C1, C2)
# init marginals if set as None
if p is None:
p = unif(C1.shape[0], type_as=C1)
if q is None:
q = unif(C2.shape[0], type_as=C1)
T, log_pfgw = partial_fused_gromov_wasserstein(
M,
C1,
C2,
p,
q,
m,
loss_fun,
alpha,
nb_dummies,
G0,
thres,
numItermax,
tol,
symmetric,
warn,
True,
verbose,
**kwargs,
)
log_pfgw["T"] = T
pfgw = log_pfgw["partial_fgw_dist"]
# compute separate terms for gradients and log
lin_term = nx.sum(T * M)
log_pfgw["quad_loss"] = pfgw - (1 - alpha) * lin_term
log_pfgw["lin_loss"] = lin_term * (1 - alpha)
pgw_term = log_pfgw["quad_loss"] / alpha
if loss_fun == "square_loss":
gC1 = 2 * C1 * nx.outer(p, p) - 2 * nx.dot(T, nx.dot(C2, T.T))
gC2 = 2 * C2 * nx.outer(q, q) - 2 * nx.dot(T.T, nx.dot(C1, T))
elif loss_fun == "kl_loss":
gC1 = nx.log(C1 + 1e-15) * nx.outer(p, p) - nx.dot(
T, nx.dot(nx.log(C2 + 1e-15), T.T)
)
gC2 = -nx.dot(T.T, nx.dot(C1, T)) / (C2 + 1e-15) + nx.outer(q, q)
if isinstance(alpha, int) or isinstance(alpha, float):
pfgw = nx.set_gradients(
pfgw, (M, C1, C2), ((1 - alpha) * T, alpha * gC1, alpha * gC2)
)
else:
pfgw = nx.set_gradients(
pfgw,
(M, C1, C2, alpha),
((1 - alpha) * T, alpha * gC1, alpha * gC2, pgw_term - lin_term),
)
if log:
return pfgw, log_pfgw
else:
return pfgw
[docs]
def solve_partial_gromov_linesearch(
G,
deltaG,
cost_G,
df_G,
df_Gc,
M,
reg,
alpha_min=None,
alpha_max=None,
nx=None,
**kwargs,
):
"""
Solve the linesearch in the FW iterations of partial (F)GW following eq.5 of :ref:`[29]`.
Parameters
----------
G : array-like, shape(ns,nt)
The transport map at a given iteration of the FW
deltaG : array-like (ns,nt)
Difference between the optimal map `Gc` found by linearization in the
FW algorithm and the value at a given iteration
cost_G : float
Value of the cost at `G`
df_G : array-like (ns,nt)
Gradient of the GW cost at `G`
df_Gc : array-like (ns,nt)
Gradient of the GW cost at `Gc`
M : array-like (ns,nt)
Cost matrix between the features.
reg : float
Regularization parameter.
alpha_min : float, optional
Minimum value for alpha
alpha_max : float, optional
Maximum value for alpha
nx : backend, optional
If let to its default value None, a backend test will be conducted.
Returns
-------
alpha : float
The optimal step size of the FW
fc : int
nb of function call. Useless here
cost_G : float
The value of the cost for the next iteration
df_G : array-like (ns,nt)
Updated gradient of the GW cost
References
----------
.. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal
Transport with Applications on Positive-Unlabeled Learning".
NeurIPS.
"""
if nx is None:
if isinstance(M, int) or isinstance(M, float):
nx = get_backend(G, deltaG, df_G, df_Gc)
else:
nx = get_backend(G, deltaG, df_G, df_Gc, M)
df_deltaG = df_Gc - df_G
cost_deltaG = 0.5 * nx.sum(df_deltaG * deltaG)
a = reg * cost_deltaG
# formula to check for partial FGW
b = nx.sum(M * deltaG) + reg * nx.sum(df_G * deltaG)
alpha = solve_1d_linesearch_quad(a, b)
if alpha_min is not None or alpha_max is not None:
alpha = np.clip(alpha, alpha_min, alpha_max)
# the new cost is deduced from the line search quadratic function
cost_G = cost_G + a * (alpha**2) + b * alpha
# update the gradient for next cg iteration
df_G = df_G + alpha * df_deltaG
return alpha, 1, cost_G, df_G
[docs]
def entropic_partial_gromov_wasserstein(
C1,
C2,
p=None,
q=None,
reg=1.0,
m=None,
loss_fun="square_loss",
G0=None,
numItermax=1000,
tol=1e-7,
symmetric=None,
log=False,
verbose=False,
):
r"""
Returns the partial Gromov-Wasserstein transport between
:math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
\gamma = \mathop{\arg \min}_{\gamma} \quad \sum_{i,j,k,l}
L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l})\cdot
\gamma_{i,j}\cdot\gamma_{k,l} + \mathrm{reg} \cdot\Omega(\gamma)
.. math::
s.t. \ \gamma &\geq 0
\gamma \mathbf{1} &\leq \mathbf{a}
\gamma^T \mathbf{1} &\leq \mathbf{b}
\mathbf{1}^T \gamma^T \mathbf{1} = m
&\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\}
where :
- :math:`\mathbf{C_1}` is the metric cost matrix in the source space
- :math:`\mathbf{C_2}` is the metric cost matrix in the target space
- :math:`\mathbf{p}` and :math:`\mathbf{q}` are the sample weights
- `L`: quadratic loss function
- :math:`\Omega` is the entropic regularization term,
:math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
- `m` is the amount of mass to be transported
The formulation of the GW problem has been proposed in
:ref:`[12] <references-entropic-partial-gromov-wasserstein>` and the
partial GW in :ref:`[29] <references-entropic-partial-gromov-wasserstein>`
Parameters
----------
C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
C2 : array-like, shape (nt, nt)
Metric cost matrix in the target space
p : array-like, shape (ns,), optional
Distribution in the source space.
If let to its default value None, uniform distribution is taken.
q : array-like, shape (nt,), optional
Distribution in the target space.
If let to its default value None, uniform distribution is taken.
reg: float, optional. Default is 1.
entropic regularization parameter
m : float, optional
Amount of mass to be transported (default:
:math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`)
loss_fun : str, optional
Loss function used for the solver either 'square_loss' or 'kl_loss'.
G0 : array-like, shape (ns, nt), optional
Initialization of the transportation matrix
numItermax : int, optional
Max number of iterations
tol : float, optional
Stop threshold on error (>0)
symmetric : bool, optional
Either C1 and C2 are to be assumed symmetric or not.
If let to its default None value, a symmetry test will be conducted.
Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric).
log : bool, optional
return log if True
verbose : bool, optional
Print information along iterations
Examples
--------
>>> from ot.gromov import entropic_partial_gromov_wasserstein
>>> import scipy as sp
>>> a = np.array([0.25] * 4)
>>> b = np.array([0.25] * 4)
>>> x = np.array([1,2,100,200]).reshape((-1,1))
>>> y = np.array([3,2,98,199]).reshape((-1,1))
>>> C1 = sp.spatial.distance.cdist(x, x)
>>> C2 = sp.spatial.distance.cdist(y, y)
>>> np.round(entropic_partial_gromov_wasserstein(C1, C2, a, b, 1e2), 2)
array([[0.12, 0.13, 0. , 0. ],
[0.13, 0.12, 0. , 0. ],
[0. , 0. , 0.25, 0. ],
[0. , 0. , 0. , 0.25]])
>>> np.round(entropic_partial_gromov_wasserstein(C1, C2, a, b, 1e2,0.25), 2)
array([[0.02, 0.03, 0. , 0.03],
[0.03, 0.03, 0. , 0.03],
[0. , 0. , 0.03, 0. ],
[0.02, 0.02, 0. , 0.03]])
Returns
-------
:math: `gamma` : (dim_a, dim_b) ndarray
Optimal transportation matrix for the given parameters
log : dict
log dictionary returned only if `log` is `True`
.. _references-entropic-partial-gromov-wasserstein:
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
.. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal
Transport with Applications on Positive-Unlabeled Learning".
NeurIPS.
See Also
--------
ot.partial.partial_gromov_wasserstein: exact Partial Gromov-Wasserstein
"""
arr = [C1, C2, G0]
if p is not None:
p = list_to_array(p)
arr.append(p)
if q is not None:
q = list_to_array(q)
arr.append(q)
nx = get_backend(*arr)
if p is None:
p = nx.ones(C1.shape[0], type_as=C1) / C1.shape[0]
if q is None:
q = nx.ones(C2.shape[0], type_as=C2) / C2.shape[0]
if m is None:
m = min(nx.sum(p), nx.sum(q))
elif m < 0:
raise ValueError("Problem infeasible. Parameter m should be greater" " than 0.")
elif m > min(nx.sum(p), nx.sum(q)):
raise ValueError(
"Problem infeasible. Parameter m should lower or"
" equal than min(|a|_1, |b|_1)."
)
if G0 is None:
G0 = (
nx.outer(p, q) * m / (nx.sum(p) * nx.sum(q))
) # make sure |G0|=m, G01_m\leq p, G0.T1_n\leq q.
else:
# Check marginals of G0
assert nx.any(nx.sum(G0, 1) <= p)
assert nx.any(nx.sum(G0, 0) <= q)
if symmetric is None:
symmetric = np.allclose(C1, C1.T, atol=1e-10) and np.allclose(
C2, C2.T, atol=1e-10
)
# Setup gradient computation
fC1, fC2, hC1, hC2 = _transform_matrix(C1, C2, loss_fun, nx)
fC2t = fC2.T
if not symmetric:
fC1t, hC1t, hC2t = fC1.T, hC1.T, hC2.T
ones_p = nx.ones(p.shape[0], type_as=p)
ones_q = nx.ones(q.shape[0], type_as=q)
def f(G):
pG = nx.sum(G, 1)
qG = nx.sum(G, 0)
constC1 = nx.outer(nx.dot(fC1, pG), ones_q)
constC2 = nx.outer(ones_p, nx.dot(qG, fC2t))
return gwloss(constC1 + constC2, hC1, hC2, G, nx)
if symmetric:
def df(G):
pG = nx.sum(G, 1)
qG = nx.sum(G, 0)
constC1 = nx.outer(nx.dot(fC1, pG), ones_q)
constC2 = nx.outer(ones_p, nx.dot(qG, fC2t))
return gwggrad(constC1 + constC2, hC1, hC2, G, nx)
else:
def df(G):
pG = nx.sum(G, 1)
qG = nx.sum(G, 0)
constC1 = nx.outer(nx.dot(fC1, pG), ones_q)
constC2 = nx.outer(ones_p, nx.dot(qG, fC2t))
constC1t = nx.outer(nx.dot(fC1t, pG), ones_q)
constC2t = nx.outer(ones_p, nx.dot(qG, fC2))
return 0.5 * (
gwggrad(constC1 + constC2, hC1, hC2, G, nx)
+ gwggrad(constC1t + constC2t, hC1t, hC2t, G, nx)
)
cpt = 0
err = 1
loge = {"err": []}
while err > tol and cpt < numItermax:
Gprev = G0
M_entr = df(G0)
G0 = entropic_partial_wasserstein(p, q, M_entr, reg, m)
if cpt % 10 == 0: # to speed up the computations
err = np.linalg.norm(G0 - Gprev)
if log:
loge["err"].append(err)
if verbose:
if cpt % 200 == 0:
print(
"{:5s}|{:12s}|{:12s}".format("It.", "Err", "Loss")
+ "\n"
+ "-" * 31
)
print("{:5d}|{:8e}|{:8e}".format(cpt, err, f(G0)))
cpt += 1
if log:
loge["partial_gw_dist"] = f(G0)
return G0, loge
else:
return G0
[docs]
def entropic_partial_gromov_wasserstein2(
C1,
C2,
p=None,
q=None,
reg=1.0,
m=None,
loss_fun="square_loss",
G0=None,
numItermax=1000,
tol=1e-7,
symmetric=None,
log=False,
verbose=False,
):
r"""
Returns the partial Gromov-Wasserstein discrepancy between
:math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
PGW = \min_{\gamma} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k},
\mathbf{C_2}_{j,l})\cdot
\gamma_{i,j}\cdot\gamma_{k,l} + \mathrm{reg} \cdot\Omega(\gamma)
.. math::
s.t. \ \gamma &\geq 0
\gamma \mathbf{1} &\leq \mathbf{a}
\gamma^T \mathbf{1} &\leq \mathbf{b}
\mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\}
where :
- :math:`\mathbf{C_1}` is the metric cost matrix in the source space
- :math:`\mathbf{C_2}` is the metric cost matrix in the target space
- :math:`\mathbf{p}` and :math:`\mathbf{q}` are the sample weights
- `L`: Loss function to account for the misfit between the similarity matrices.
- :math:`\Omega` is the entropic regularization term,
:math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
- `m` is the amount of mass to be transported
The formulation of the GW problem has been proposed in
:ref:`[12] <references-entropic-partial-gromov-wasserstein2>` and the
partial GW in :ref:`[29] <references-entropic-partial-gromov-wasserstein2>`
Parameters
----------
C1 : ndarray, shape (ns, ns)
Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
Metric cost matrix in the target space
p : array-like, shape (ns,), optional
Distribution in the source space.
If let to its default value None, uniform distribution is taken.
q : array-like, shape (nt,), optional
Distribution in the target space.
If let to its default value None, uniform distribution is taken.
reg: float
entropic regularization parameter
m : float, optional
Amount of mass to be transported (default:
:math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`)
loss_fun : str, optional
Loss function used for the solver either 'square_loss' or 'kl_loss'.
G0 : ndarray, shape (ns, nt), optional
Initialization of the transportation matrix
numItermax : int, optional
Max number of iterations
tol : float, optional
Stop threshold on error (>0)
symmetric : bool, optional
Either C1 and C2 are to be assumed symmetric or not.
If let to its default None value, a symmetry test will be conducted.
Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric).
log : bool, optional
return log if True
verbose : bool, optional
Print information along iterations
Returns
-------
partial_gw_dist: float
Partial Gromov-Wasserstein distance
log : dict
log dictionary returned only if `log` is `True`
Examples
--------
>>> from ot.gromov import entropic_partial_gromov_wasserstein2
>>> import scipy as sp
>>> a = np.array([0.25] * 4)
>>> b = np.array([0.25] * 4)
>>> x = np.array([1,2,100,200]).reshape((-1,1))
>>> y = np.array([3,2,98,199]).reshape((-1,1))
>>> C1 = sp.spatial.distance.cdist(x, x)
>>> C2 = sp.spatial.distance.cdist(y, y)
>>> np.round(entropic_partial_gromov_wasserstein2(C1, C2, a, b, 1e2), 2)
3.75
.. _references-entropic-partial-gromov-wasserstein2:
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
.. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal
Transport with Applications on Positive-Unlabeled Learning".
NeurIPS.
"""
partial_gw, log_gw = entropic_partial_gromov_wasserstein(
C1, C2, p, q, reg, m, loss_fun, G0, numItermax, tol, symmetric, True, verbose
)
log_gw["T"] = partial_gw
if log:
return log_gw["partial_gw_dist"], log_gw
else:
return log_gw["partial_gw_dist"]