From 98529fdd0db5777f3aa6a8c6c4a9dcdbae02a1d4 Mon Sep 17 00:00:00 2001
From: Gonzague de Carpentier <decarpentierg@gmail.com>
Date: Mon, 12 Sep 2022 15:54:49 +0200
Subject: [PATCH 1/3] correct ot.gromov.init_matrix and ot.gromov.gwggrad to
 support asymmetric cost matrices

---
 ot/gromov.py | 72 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/ot/gromov.py b/ot/gromov.py
index bc1c8e593..9502c4aa6 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -79,8 +79,12 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
 
     Returns
     -------
-    constC : array-like, shape (ns, nt)
+    constC1 : array-like, shape (ns, nt)
         Constant :math:`\mathbf{C}` matrix in Eq. (6)
+    constC2 : array-like, shape (ns, nt)
+        Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2.
+        This second matrix is useful for the computation of the gradient in the case of asymmetric C1 and C2, as explained 
+        in :ref:`[*] <references-init-matrix>` in subsection 5.1.
     hC1 : array-like, shape (ns, ns)
         :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
     hC2 : array-like, shape (nt, nt)
@@ -90,9 +94,14 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
     .. _references-init-matrix:
     References
     ----------
-    .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
+    ..  [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
         "Gromov-Wasserstein averaging of kernel and distance matrices."
         International Conference on Machine Learning (ICML). 2016.
+    
+    ..  [*] Dan Meller and Gonzague de Carpentier,
+        "Mutilingual alignment of word embedding spaces", Subsection 5.1
+        Research project at École Polytechnique. 2021.
+        http://dx.doi.org/10.13140/RG.2.2.21701.37609.
 
     """
     C1, C2, p, q = list_to_array(C1, C2, p, q)
@@ -123,19 +132,28 @@ def h1(a):
         def h2(b):
             return nx.log(b + 1e-15)
 
-    constC1 = nx.dot(
+    constC1a = nx.dot(
         nx.dot(f1(C1), nx.reshape(p, (-1, 1))),
         nx.ones((1, len(q)), type_as=q)
     )
-    constC2 = nx.dot(
+    constC1b = nx.dot(
         nx.ones((len(p), 1), type_as=p),
         nx.dot(nx.reshape(q, (1, -1)), f2(C2).T)
     )
-    constC = constC1 + constC2
+    constC2a = nx.dot(
+        nx.dot(f1(C1).T, nx.reshape(p, (-1, 1))),
+        nx.ones((1, len(q)), type_as=q)
+    )
+    constC2b = nx.dot(
+        nx.ones((len(p), 1), type_as=p),
+        nx.dot(nx.reshape(q, (1, -1)), f2(C2))
+    )
+    constC1 = constC1a + constC1b
+    constC2 = constC2a + constC2b
     hC1 = h1(C1)
     hC2 = h2(C2)
 
-    return constC, hC1, hC2
+    return constC1, constC2, hC1, hC2
 
 
 def tensor_product(constC, hC1, hC2, T):
@@ -216,15 +234,19 @@ def gwloss(constC, hC1, hC2, T):
     return nx.sum(tens * T)
 
 
-def gwggrad(constC, hC1, hC2, T):
+def gwggrad(constC1, constC2, hC1, hC2, T):
     r"""Return the gradient for Gromov-Wasserstein
 
-    The gradient is computed as described in Proposition 2 in :ref:`[12] <references-gwggrad>`
+    The gradient is computed as described in Proposition 2 in :ref:`[12] <references-gwggrad>` and in :ref:`[*] <references-gwggrad>`.
 
     Parameters
     ----------
-    constC : array-like, shape (ns, nt)
+    constC1 : array-like, shape (ns, nt)
         Constant :math:`\mathbf{C}` matrix in Eq. (6)
+    constC2 : array-like, shape (ns, nt)
+        Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2.
+        This second matrix is useful in the case of asymmetric C1 and C2, as explained 
+        in :ref:`[*] <references-init-matrix>` in subsection 5.1.
     hC1 : array-like, shape (ns, ns)
         :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
     hC2 : array-like, shape (nt, nt)
@@ -241,13 +263,19 @@ def gwggrad(constC, hC1, hC2, T):
     .. _references-gwggrad:
     References
     ----------
-    .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
+    ..  [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
         "Gromov-Wasserstein averaging of kernel and distance matrices."
         International Conference on Machine Learning (ICML). 2016.
+    
+    ..  [*] Dan Meller and Gonzague de Carpentier,
+        "Mutilingual alignment of word embedding spaces", Subsection 5.1
+        Research project at École Polytechnique. 2021.
+        http://dx.doi.org/10.13140/RG.2.2.21701.37609.
 
     """
-    return 2 * tensor_product(constC, hC1, hC2,
-                              T)  # [12] Prop. 2 misses a 2 factor
+    # [12] Prop. 2 misses a second term in the gradient (explained in [*]). 
+    # In the case of symmetric matrices, the 2 terms are equal so Prop. 2 misses only a 2 factor.
+    return tensor_product(constC1, hC1, hC2, T) + tensor_product(constC2, hC1.T, hC2.T, T)  
 
 
 def update_square_loss(p, lambdas, T, Cs):
@@ -415,13 +443,13 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=F
         np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08)
         np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08)
 
-    constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+    constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
 
     def f(G):
         return gwloss(constC, hC1, hC2, G)
 
     def df(G):
-        return gwggrad(constC, hC1, hC2, G)
+        return gwggrad(constC, constC2, hC1, hC2, G)
 
     if log:
         res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
@@ -521,7 +549,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=
     C1 = nx.to_numpy(C10)
     C2 = nx.to_numpy(C20)
 
-    constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+    constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
 
     if G0 is None:
         G0 = p[:, None] * q[None, :]
@@ -535,7 +563,7 @@ def f(G):
         return gwloss(constC, hC1, hC2, G)
 
     def df(G):
-        return gwggrad(constC, hC1, hC2, G)
+        return gwggrad(constC, constC2, hC1, hC2, G)
 
     T, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
 
@@ -653,13 +681,13 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
         np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08)
         np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08)
 
-    constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+    constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
 
     def f(G):
         return gwloss(constC, hC1, hC2, G)
 
     def df(G):
-        return gwggrad(constC, hC1, hC2, G)
+        return gwggrad(constC, constC2, hC1, hC2, G)
 
     if log:
         res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
@@ -764,7 +792,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5
     C2 = nx.to_numpy(C20)
     M = nx.to_numpy(M0)
 
-    constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+    constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
 
     if G0 is None:
         G0 = p[:, None] * q[None, :]
@@ -778,7 +806,7 @@ def f(G):
         return gwloss(constC, hC1, hC2, G)
 
     def df(G):
-        return gwggrad(constC, hC1, hC2, G)
+        return gwggrad(constC, constC2, hC1, hC2, G)
 
     T, log_fgw = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
 
@@ -1280,7 +1308,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
 
     T = nx.outer(p, q)
 
-    constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+    constC, constC2, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
 
     cpt = 0
     err = 1
@@ -1293,7 +1321,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
         Tprev = T
 
         # compute the gradient
-        tens = gwggrad(constC, hC1, hC2, T)
+        tens = gwggrad(constC, constC2, hC1, hC2, T)
 
         T = sinkhorn(p, q, tens, epsilon, method='sinkhorn')
 

From c68550f0d0981541d8d12d14d00b106731171485 Mon Sep 17 00:00:00 2001
From: Gonzague de Carpentier <decarpentierg@gmail.com>
Date: Mon, 12 Sep 2022 19:40:34 +0200
Subject: [PATCH 2/3] removed trailing whitespaces to pass PEP8 check

---
 ot/gromov.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ot/gromov.py b/ot/gromov.py
index 9502c4aa6..03bfa9bf0 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -83,7 +83,7 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
         Constant :math:`\mathbf{C}` matrix in Eq. (6)
     constC2 : array-like, shape (ns, nt)
         Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2.
-        This second matrix is useful for the computation of the gradient in the case of asymmetric C1 and C2, as explained 
+        This second matrix is useful for the computation of the gradient in the case of asymmetric C1 and C2, as explained
         in :ref:`[*] <references-init-matrix>` in subsection 5.1.
     hC1 : array-like, shape (ns, ns)
         :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
@@ -97,7 +97,7 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
     ..  [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
         "Gromov-Wasserstein averaging of kernel and distance matrices."
         International Conference on Machine Learning (ICML). 2016.
-    
+
     ..  [*] Dan Meller and Gonzague de Carpentier,
         "Mutilingual alignment of word embedding spaces", Subsection 5.1
         Research project at École Polytechnique. 2021.
@@ -245,7 +245,7 @@ def gwggrad(constC1, constC2, hC1, hC2, T):
         Constant :math:`\mathbf{C}` matrix in Eq. (6)
     constC2 : array-like, shape (ns, nt)
         Constant :math:`\mathbf{C}` matrix in Eq. (6) but computed with C1.T and C2.T instead of C1 and C2.
-        This second matrix is useful in the case of asymmetric C1 and C2, as explained 
+        This second matrix is useful in the case of asymmetric C1 and C2, as explained
         in :ref:`[*] <references-init-matrix>` in subsection 5.1.
     hC1 : array-like, shape (ns, ns)
         :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
@@ -266,16 +266,16 @@ def gwggrad(constC1, constC2, hC1, hC2, T):
     ..  [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
         "Gromov-Wasserstein averaging of kernel and distance matrices."
         International Conference on Machine Learning (ICML). 2016.
-    
+
     ..  [*] Dan Meller and Gonzague de Carpentier,
         "Mutilingual alignment of word embedding spaces", Subsection 5.1
         Research project at École Polytechnique. 2021.
         http://dx.doi.org/10.13140/RG.2.2.21701.37609.
 
     """
-    # [12] Prop. 2 misses a second term in the gradient (explained in [*]). 
+    # [12] Prop. 2 misses a second term in the gradient (explained in [*]).
     # In the case of symmetric matrices, the 2 terms are equal so Prop. 2 misses only a 2 factor.
-    return tensor_product(constC1, hC1, hC2, T) + tensor_product(constC2, hC1.T, hC2.T, T)  
+    return tensor_product(constC1, hC1, hC2, T) + tensor_product(constC2, hC1.T, hC2.T, T)
 
 
 def update_square_loss(p, lambdas, T, Cs):

From 1eb9aabfcebeac7730e730e06624931311dd5b1a Mon Sep 17 00:00:00 2001
From: decarpentierg <82534773+decarpentierg@users.noreply.github.com>
Date: Tue, 13 Sep 2022 19:31:30 +0200
Subject: [PATCH 3/3] update RELEASE.md

---
 RELEASES.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/RELEASES.md b/RELEASES.md
index 8b4f0de9f..478eb4791 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -6,6 +6,7 @@
 
 - Added Generalized Wasserstein Barycenter solver + example (PR #372), fixed graphical details on the example (PR #376)
 - Added Free Support Sinkhorn Barycenter + example (PR #387)
+- Generalization of Gromov-Wasserstein to asymmetric cost matrices (PR #399)
 
 #### Closed issues