From 98529fdd0db5777f3aa6a8c6c4a9dcdbae02a1d4 Mon Sep 17 00:00:00 2001 From: Gonzague de Carpentier Date: Mon, 12 Sep 2022 15:54:49 +0200 Subject: [PATCH 1/3] correct ot.gromov.init_matrix and ot.gromov.gwggrad to support asymmetric cost matrices --- ot/gromov.py | 72 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/ot/gromov.py b/ot/gromov.py index bc1c8e593..9502c4aa6 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -79,8 +79,12 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'): Returns ------- - constC : array-like, shape (ns, nt) + constC1 : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) + constC2 : array-like, shape (ns, nt) + Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2. + This second matrix is useful for the computation of the gradient in the case of asymmetric C1 and C2, as explained + in :ref:`[*] ` in subsection 5.1. hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) hC2 : array-like, shape (nt, nt) @@ -90,9 +94,14 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'): .. _references-init-matrix: References ---------- - .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, + .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. + + .. [*] Dan Meller and Gonzague de Carpentier, + "Mutilingual alignment of word embedding spaces", Subsection 5.1 + Research project at École Polytechnique. 2021. + http://dx.doi.org/10.13140/RG.2.2.21701.37609. """ C1, C2, p, q = list_to_array(C1, C2, p, q) @@ -123,19 +132,28 @@ def h1(a): def h2(b): return nx.log(b + 1e-15) - constC1 = nx.dot( + constC1a = nx.dot( nx.dot(f1(C1), nx.reshape(p, (-1, 1))), nx.ones((1, len(q)), type_as=q) ) - constC2 = nx.dot( + constC1b = nx.dot( nx.ones((len(p), 1), type_as=p), nx.dot(nx.reshape(q, (1, -1)), f2(C2).T) ) - constC = constC1 + constC2 + constC2a = nx.dot( + nx.dot(f1(C1).T, nx.reshape(p, (-1, 1))), + nx.ones((1, len(q)), type_as=q) + ) + constC2b = nx.dot( + nx.ones((len(p), 1), type_as=p), + nx.dot(nx.reshape(q, (1, -1)), f2(C2)) + ) + constC1 = constC1a + constC1b + constC2 = constC2a + constC2b hC1 = h1(C1) hC2 = h2(C2) - return constC, hC1, hC2 + return constC1, constC2, hC1, hC2 def tensor_product(constC, hC1, hC2, T): @@ -216,15 +234,19 @@ def gwloss(constC, hC1, hC2, T): return nx.sum(tens * T) -def gwggrad(constC, hC1, hC2, T): +def gwggrad(constC1, constC2, hC1, hC2, T): r"""Return the gradient for Gromov-Wasserstein - The gradient is computed as described in Proposition 2 in :ref:`[12] ` + The gradient is computed as described in Proposition 2 in :ref:`[12] ` and in :ref:`[*] `. Parameters ---------- - constC : array-like, shape (ns, nt) + constC1 : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) + constC2 : array-like, shape (ns, nt) + Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2. + This second matrix is useful in the case of asymmetric C1 and C2, as explained + in :ref:`[*] ` in subsection 5.1. hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) hC2 : array-like, shape (nt, nt) @@ -241,13 +263,19 @@ def gwggrad(constC, hC1, hC2, T): .. _references-gwggrad: References ---------- - .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, + .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. + + .. [*] Dan Meller and Gonzague de Carpentier, + "Mutilingual alignment of word embedding spaces", Subsection 5.1 + Research project at École Polytechnique. 2021. + http://dx.doi.org/10.13140/RG.2.2.21701.37609. """ - return 2 * tensor_product(constC, hC1, hC2, - T) # [12] Prop. 2 misses a 2 factor + # [12] Prop. 2 misses a second term in the gradient (explained in [*]). + # In the case of symmetric matrices, the 2 terms are equal so Prop. 2 misses only a 2 factor. + return tensor_product(constC1, hC1, hC2, T) + tensor_product(constC2, hC1.T, hC2.T, T) def update_square_loss(p, lambdas, T, Cs): @@ -415,13 +443,13 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=F np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08) np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08) - constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) + constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) def f(G): return gwloss(constC, hC1, hC2, G) def df(G): - return gwggrad(constC, hC1, hC2, G) + return gwggrad(constC, constC2, hC1, hC2, G) if log: res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs) @@ -521,7 +549,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', log=False, armijo= C1 = nx.to_numpy(C10) C2 = nx.to_numpy(C20) - constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) + constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) if G0 is None: G0 = p[:, None] * q[None, :] @@ -535,7 +563,7 @@ def f(G): return gwloss(constC, hC1, hC2, G) def df(G): - return gwggrad(constC, hC1, hC2, G) + return gwggrad(constC, constC2, hC1, hC2, G) T, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs) @@ -653,13 +681,13 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08) np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08) - constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) + constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) def f(G): return gwloss(constC, hC1, hC2, G) def df(G): - return gwggrad(constC, hC1, hC2, G) + return gwggrad(constC, constC2, hC1, hC2, G) if log: res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs) @@ -764,7 +792,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 C2 = nx.to_numpy(C20) M = nx.to_numpy(M0) - constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) + constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) if G0 is None: G0 = p[:, None] * q[None, :] @@ -778,7 +806,7 @@ def f(G): return gwloss(constC, hC1, hC2, G) def df(G): - return gwggrad(constC, hC1, hC2, G) + return gwggrad(constC, constC2, hC1, hC2, G) T, log_fgw = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs) @@ -1280,7 +1308,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, T = nx.outer(p, q) - constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) + constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun) cpt = 0 err = 1 @@ -1293,7 +1321,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, Tprev = T # compute the gradient - tens = gwggrad(constC, hC1, hC2, T) + tens = gwggrad(constC, constC2, hC1, hC2, T) T = sinkhorn(p, q, tens, epsilon, method='sinkhorn') From c68550f0d0981541d8d12d14d00b106731171485 Mon Sep 17 00:00:00 2001 From: Gonzague de Carpentier Date: Mon, 12 Sep 2022 19:40:34 +0200 Subject: [PATCH 2/3] removed trailing whitespaces to pass PEP8 check --- ot/gromov.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ot/gromov.py b/ot/gromov.py index 9502c4aa6..03bfa9bf0 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -83,7 +83,7 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'): Constant :math:`\mathbf{C}` matrix in Eq. (6) constC2 : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2. - This second matrix is useful for the computation of the gradient in the case of asymmetric C1 and C2, as explained + This second matrix is useful for the computation of the gradient in the case of asymmetric C1 and C2, as explained in :ref:`[*] ` in subsection 5.1. hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) @@ -97,7 +97,7 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'): .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. - + .. [*] Dan Meller and Gonzague de Carpentier, "Mutilingual alignment of word embedding spaces", Subsection 5.1 Research project at École Polytechnique. 2021. @@ -245,7 +245,7 @@ def gwggrad(constC1, constC2, hC1, hC2, T): Constant :math:`\mathbf{C}` matrix in Eq. (6) constC2 : array-like, shape (ns, nt) Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2. - This second matrix is useful in the case of asymmetric C1 and C2, as explained + This second matrix is useful in the case of asymmetric C1 and C2, as explained in :ref:`[*] ` in subsection 5.1. hC1 : array-like, shape (ns, ns) :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6) @@ -266,16 +266,16 @@ def gwggrad(constC1, constC2, hC1, hC2, T): .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. - + .. [*] Dan Meller and Gonzague de Carpentier, "Mutilingual alignment of word embedding spaces", Subsection 5.1 Research project at École Polytechnique. 2021. http://dx.doi.org/10.13140/RG.2.2.21701.37609. """ - # [12] Prop. 2 misses a second term in the gradient (explained in [*]). + # [12] Prop. 2 misses a second term in the gradient (explained in [*]). # In the case of symmetric matrices, the 2 terms are equal so Prop. 2 misses only a 2 factor. - return tensor_product(constC1, hC1, hC2, T) + tensor_product(constC2, hC1.T, hC2.T, T) + return tensor_product(constC1, hC1, hC2, T) + tensor_product(constC2, hC1.T, hC2.T, T) def update_square_loss(p, lambdas, T, Cs): From 1eb9aabfcebeac7730e730e06624931311dd5b1a Mon Sep 17 00:00:00 2001 From: decarpentierg <82534773+decarpentierg@users.noreply.github.com> Date: Tue, 13 Sep 2022 19:31:30 +0200 Subject: [PATCH 3/3] update RELEASE.md --- RELEASES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASES.md b/RELEASES.md index 8b4f0de9f..478eb4791 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -6,6 +6,7 @@ - Added Generalized Wasserstein Barycenter solver + example (PR #372), fixed graphical details on the example (PR #376) - Added Free Support Sinkhorn Barycenter + example (PR #387) +- Generalization of Gromov-Wasserstein to asymmetric cost matrices (PR #399) #### Closed issues