(Author: Jean Feydy - July 2022)
Un-ordered point clouds are everywhere in applied mathematics. We use them to describe:
In order to measure reconstruction errors or perform statistics, researchers have defined many distance functions between point clouds in dimension $D$. We often call them "loss functions" in machine learning. You may already be familiar with:
There are good reasons to study these formulas. But beware: even though these quantities all satisfy the axioms of a distance (symmetry, positivity, triangle inequality...), they are not at all equivalent to each other. This notebook illustrates this counter-intuitive fact on a toy example in dimension $D = 2$.
If you would like to dive deeper into this topic, a good starting point is Chapter 3 of my PhD thesis - Geometric data analysis, beyond convolutions. It contains detailed answers to the questions that are discussed in this tutorial.
You may also be interested in the GeomLoss and KeOps libraries, which provide fast and scalable implementations of the quantities that we are going to define, both on CPU and GPU.
First, we import the standard Python libraries for array computations and plotting. Their conventions are fairly similar to those of e.g. Matlab and Julia.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
We also rely on the PyTorch library, which provides seamless automatic differentiation and GPU support.
import torch
# Shall we use the GPU?
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
# PyTorch CPU/GPU Tensor to NumPy array conversion:
numpy = lambda x: x.detach().cpu().numpy()
To keep things simple and visually assess the performance of our methods, we work with point clouds $X$ and $Y$ that we sample on the unit square:
\begin{align} X &~=~ (x_1, \dots, x_N)~\in~\mathbb{R}^{N\times 2}, & Y &~=~ (y_1, \dots, y_N)~\in~\mathbb{R}^{N\times 2}. \end{align}
from sampling import draw_samples
# Let's keep things fast by default. Feel free to increase!
N = 1000 # if use_cuda else 200
# X and Y are sampled from two densities, encoded as png images:
x_i = draw_samples("data/density_a.png", N=N, device=device)
y_j = draw_samples("data/density_b.png", N=N, device=device)
We are going to study algorithms that register the moving "source" points $x_i$ onto the fixed "target" points $y_j$. In order to track this movement, we assign a rainbow color scheme to the points $x_i$ based on their starting locations.
# Arbitrary color scheme for the x_i's:
colors_i = (10 * x_i[:, 0]).cos() * (10 * x_i[:, 1]).cos()
colors_i = numpy(colors_i)
# Just a plain blue color for the y_j's:
colors_j = [[0.55, 0.55, 0.95]]
Here is what our point clouds look like:
from sampling import draw_figure
draw_figure(
x_i=x_i,
colors_i=colors_i,
y_j=y_j,
colors_j=colors_j,
)
In order to highlight the properties of a "Loss" or "reconstruction error" between the point clouds $X$ and $Y$, we will track the behaviour of the points $x_i$ as we minimize the function:
\begin{align} \text{Loss}\big(x_1, \dots, x_N~;~ y_1, \dots, y_N\big) \end{align}
by gradient descent with respect to the moving points $x_i$. We use a fixed step size ("learning rate"):
\begin{align} \Delta t ~=~ \text{0.05} \end{align}
and compute the sequence of iterates:
\begin{align} x_i(t=0) ~&=~ x_i~, & x_i(t+\Delta t) ~&=~ x_i(t)-\Delta t\cdot N \cdot \nabla_{x_i} \text{Loss}\big(x_1(t), \dots, x_N(t)~;~ y_1, \dots, y_N\big)~. \end{align}
def gradient_flow(x_i, y_j, loss, dt=0.05):
"""
Flows along the gradient of the cost function using a simple Euler scheme.
We rely on the PyTorch automatic differentiation engine to compute the
gradient of the loss function.
Parameters
----------
x_i : (N,2) torch tensor
samples of the moving source measure
y_j : (N,2) torch tensor
samples of the fixed target measure
loss : (x_i, y_j) -> torch float number,
real-valued function
dt : float, default = .05
time step, i.e. learning rate
"""
# Parameters for the gradient descent:
Nsteps = int(5 / dt) + 1
display_its = [int(t / dt) for t in [0, 0.25, 0.50, 1.0, 2.0, 5.0]]
# Make sure that we won't modify the input measures
x_i, y_j = x_i.clone(), y_j.clone()
# We're going to perform gradient descent on loss(x_i; y_j)
# wrt. the positions x_i of the points that make up the source:
x_i.requires_grad = True
plt.figure(figsize=(12, 8))
k = 1
for i in range(Nsteps): # Euler scheme ===============
# Compute cost and gradient
l_xy = loss(x_i, y_j)
[g] = torch.autograd.grad(l_xy, [x_i])
if i in display_its: # display
ax = plt.subplot(2, 3, k)
ax.set_title(f"t = {dt*i:1.2f}")
k = k + 1
draw_figure(
x_i=x_i,
colors_i=colors_i,
grad_i=g * N,
y_j=y_j,
colors_j=colors_j,
ax=ax,
)
# in-place modification of the tensor's values
x_i.data -= dt * N * g
We understand this evolution as an ideal, model-free registration or learning problem where a source distribution $X(t)$ is iteratively fitted towards a target (empirical) distribution $Y$.
The main purpose of this tutorial is to illustrate - and discuss! - what happens to the trajectories $x_i(t)$ when we choose one loss function over another.
As a first example, let us consider a squared Euclidean norm between our two point clouds - whose ordering has been defined at random:
\begin{align} \text{Loss}\big(x_1, \dots, x_N~;~ y_1, \dots, y_N\big) ~=~ \frac{1}{2N} \sum_{i=1}^N \|x_i - y_i\|^2~. \end{align}
This corresponds to a "Hookean energy" where a small spring is attached between each of the $N$ pairs of points $(x_i, y_i)$.
def Euclidean_loss(x_i, y_j):
"""
Simplistic squared Euclidean distance between sampled point clouds,
assuming a pairwise correspondence between x_i[k] and y_j[k].
"""
return 0.5 * ((x_i - y_j) ** 2).sum(1).mean()
gradient_flow(x_i, y_j, Euclidean_loss)
Question 1: What do you think about this trajectory? Is it well-defined? What do you think about the applicability of this first method?
In order to define trajectories that do not rely on a random coupling between the points $x_i$ and $y_j$, a first idea is to associate each point $x_i$ to its closest neighbor among the $y_j$'s. This corresponds to using the reconstruction error:
\begin{align} \text{Loss}\big(x_1, \dots, x_N~;~ y_1, \dots, y_N\big) ~=~ \frac{1}{2N} \sum_{i=1}^N \min_{1 \leq j \leq N} \|x_i - y_j\|^2~. \end{align}
Unlike the Euclidean norm that we discussed earlier, this function is well-defined between un-ordered point clouds. It is closely related to the Hausdorff distance and is a core component of the Iterative Closest Point method for rigid pose estimation. Let's see how it behaves when we apply no regularization to the movement of the points $x_i$:
def squared_distances(x_i, y_j):
"""Returns the (N,N) matrix of squared distances between the points x_i and y_j.
Note that this is a *very* naive Python code, written for pedagogical purposes.
Please see www.kernel-operations.io for a scalable and (much) faster implementation.
"""
diff_xy = x_i.view(N, 1, 2) - y_j.view(1, N, 2) # (N,N,D)
return (diff_xy**2).sum(dim=2) # (N,N)
def ICP_loss(x_i, y_j):
D_xy = squared_distances(x_i, y_j)
dists_i = D_xy.min(dim=1).values
return 0.5 * dists_i.mean()
gradient_flow(x_i, y_j, ICP_loss)
Question 2: What do you think about this trajectory?
We can also try to symmetrize the function above and work with:
def ICP_loss_symmetric(x_i, y_j):
D_xy = squared_distances(x_i, y_j) # (N,N)
dists_i = D_xy.min(dim=1).values # (N,)
dists_j = D_xy.min(dim=0).values # (N,)
return 0.25 * (dists_i.mean() + dists_j.mean())
gradient_flow(x_i, y_j, ICP_loss_symmetric)
Question 3: What do you think about this trajectory? In which cases do you think that we could use this method?
Note: The functions above may seem overly simplistic and out-of-touch with real-world methods. But surprisingly, these trajectories appear as soon as we consider Gaussian Mixture Models and maximize their log-likelihoods. In many respects, the maximum likelihood method behaves as a soft nearest neighbor projection. Using such a formula without understanding it (and adding a proper regularization) can be a recipe for disaster.
This surprising fact is discussed in Section 3.2.2 of Geometric data analysis, beyond convolutions.
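To make this connection concrete, here is a minimal sketch (not part of the original notebook) of such a maximum-likelihood loss: we model the target as a uniform mixture of isotropic Gaussians of deviation sigma centered on the $y_j$'s, and penalize the negative log-likelihood of the $x_i$'s. The name GMM_loss and the $\sigma^2$ rescaling are illustrative choices; as $\sigma \to 0$, this loss converges to the ICP_loss defined above.
def GMM_loss(sigma=0.1):
    """Negative log-likelihood of the x_i's under a uniform mixture of
    isotropic Gaussians of deviation sigma centered on the y_j's.
    This is a "soft" version of ICP_loss: as sigma -> 0, the soft-min
    below converges to the nearest-neighbor projection.
    (Illustrative sketch; the name and the sigma^2 rescaling are ours.)
    """
    def loss(x_i, y_j):
        D_xy = squared_distances(x_i, y_j)  # (N,N)
        # Log of the mixture density at each point x_i, up to an additive constant:
        log_lik_i = (-D_xy / (2 * sigma**2)).logsumexp(dim=1) - np.log(N)  # (N,)
        # Rescale by sigma^2 so that -sigma^2 * logsumexp(...) ~ min_j ||x_i - y_j||^2 / 2:
        return -(sigma**2) * log_lik_i.mean()
    return loss
gradient_flow(x_i, y_j, GMM_loss(sigma=0.1))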
The iterative closest point method relied on a nearest neighbor projection. An alternative, popular method is to rely instead on convolutions.
We choose a blurring function $g:\mathbb{R}^2\rightarrow\mathbb{R}$ and consider the blurred functions:
\begin{align} (g\star X)(z) &= \frac{1}{N} \sum_{i=1}^N g(z - x_i) & & \text{and} & (g\star Y)(z) &= \frac{1}{N} \sum_{j=1}^N g(z - y_j)~. \end{align}
We can quantify the distance between these two functions defined on $\mathbb{R}^2$ by using, for instance, a squared $L^2$ norm:
\begin{align} \text{Loss}(X, Y) ~&=~ \tfrac{1}{2} \| g\star X - g\star Y\|^2_2 \\ ~&=~ \tfrac{1}{2} \iint_{z\in\mathbb{R}^2} (g\star X)^2(z) - 2 \cdot (g\star X)(z) \cdot (g\star Y)(z) + (g\star Y)^2(z)~\text{d}z~. \end{align}
Then, if we define $k = \tilde{g}\star g$, where $\tilde{g} = g \circ (x\mapsto -x)$ is the mirrored blurring function, we can show that:
\begin{align} \text{Loss}(X, Y) ~=~ \frac{1}{2 N^2} \Big[ \sum_{i,j=1}^N k(x_i,x_j) ~-~ 2 \sum_{i,j=1}^N k(x_i,y_j) ~+~ \sum_{i,j=1}^N k(y_i,y_j) \Big]~. \end{align}
We can understand this formula as the energy of a system of charged particles where the points $x_i$ carry a charge of $+1/N$, the points $y_j$ carry a charge of $-1/N$, and $k(x,y)$ is the interaction potential between two particles located at $x$ and $y$.
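If you would like to check this identity, note that (assuming that $g$ is square-integrable, so that all the integrals above are well-defined) the cross term expands as:
\begin{align} \iint_{z\in\mathbb{R}^2} (g\star X)(z)\cdot (g\star Y)(z)~\text{d}z ~=~ \frac{1}{N^2} \sum_{i,j=1}^N \iint_{z\in\mathbb{R}^2} g(z - x_i)\, g(z - y_j)~\text{d}z ~=~ \frac{1}{N^2} \sum_{i,j=1}^N (\tilde{g}\star g)(x_i - y_j)~, \end{align}
and the two squared terms follow in the same way. With a slight abuse of notation, we write $k(x_i, y_j)$ for $(\tilde{g}\star g)(x_i - y_j)$.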
The kernel function $k$ is at the heart of this method. In practice, one typically forgets about $g$ and implements the squared kernel norm as:
def kernel_loss(kernel, sigma=1.0):
def loss(x_i, y_j):
K_xy = kernel(x_i, y_j, sigma) # (N,N)
K_xx = kernel(x_i, x_i, sigma) # (N,N)
K_yy = kernel(y_j, y_j, sigma) # (N,N)
return (1 / (2 * N**2)) * (K_xx.sum() - 2 * K_xy.sum() + K_yy.sum())
return loss
A first choice is the Gaussian kernel of deviation $\sigma > 0$:
\begin{align} k(x,y) ~=~ \exp\big(-\|x-y\|^2 \,/\, 2\sigma^2\big)~. \end{align}
def gaussian_kernel(x_i, y_j, sigma=1.0):
D_xy = squared_distances(x_i / sigma, y_j / sigma) # (N,N)
return (-D_xy / 2).exp()
gradient_flow(x_i, y_j, kernel_loss(gaussian_kernel, sigma=1.0))
gradient_flow(x_i, y_j, kernel_loss(gaussian_kernel, sigma=0.5))
gradient_flow(x_i, y_j, kernel_loss(gaussian_kernel, sigma=0.1))
Another popular alternative is the exponential kernel:
\begin{align} k(x,y) ~=~ \exp\big(-\|x-y\| \,/\, \sigma \big)~. \end{align}
def exponential_kernel(x_i, y_j, sigma=1.0):
D_xy = squared_distances(x_i / sigma, y_j / sigma) # (N,N)
return (-(D_xy + 0.00001).sqrt()).exp()
gradient_flow(x_i, y_j, kernel_loss(exponential_kernel, sigma=0.2))
And finally, a robust baseline is provided by the Energy Distance kernel:
\begin{align} k(x,y) ~=~ - \| x - y\|~. \end{align}
def distance_kernel(x_i, y_j, sigma=1.0):
D_xy = squared_distances(x_i / sigma, y_j / sigma) # (N,N)
return -(D_xy + 0.00001).sqrt()
gradient_flow(x_i, y_j, kernel_loss(distance_kernel, sigma=1.0))
Question 4: Compare the behaviours of these kernel norms for different formulas and scales.
How does the smoothness of the kernel impact the result?
What about the size of its support?
Can you explain why some points "flee" the target instead of converging towards it?
These kernel norms have a long and rich history in mathematics, engineering and imaging sciences. They are known as Sobolev norms in functional analysis, Maximum Mean Discrepancies in statistics, etc. To go further on this topic, you may start with Section 3.2.3 of Geometric data analysis, beyond convolutions. It provides an overview of the main interpretations of kernel norms and many illustrations.
Finally, a third family of distances between point clouds relies on (generalized) sorting: the optimal transport problem. We define the OT cost between $X$ and $Y$ as:
\begin{align} \text{Loss}\big(x_1, \dots, x_N~;~ y_1, \dots, y_N\big) ~=~ \frac{1}{2N} \min_{\text{permutations}~\sigma} \sum_{i=1}^N \|x_i - y_{\sigma(i)}\|^2~. \end{align}
Chapter 3 of Geometric data analysis, beyond convolutions provides an accessible introduction to the topic, with numerous illustrations. The GeomLoss library also provides fast and scalable solvers for this optimization problem.
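For moderate values of $N$, this combinatorial definition can also be evaluated directly with the Hungarian algorithm, e.g. via SciPy's linear_sum_assignment. The sketch below is an illustrative addition (it assumes that SciPy is installed, and its $O(N^3)$ cost makes it impractical for large point clouds):
from scipy.optimize import linear_sum_assignment

def exact_OT_loss(x_i, y_j):
    """Exact optimal transport loss between two point clouds of equal size.
    The optimal permutation is computed on detached costs with the Hungarian
    algorithm; the returned value remains differentiable wrt. the x_i's.
    (Sketch for comparison purposes - O(N^3), unlike the Sinkhorn loop below.)
    """
    C = numpy(squared_distances(x_i, y_j))  # (N,N) cost matrix, detached
    rows, cols = linear_sum_assignment(C)   # optimal pairing x_i <-> y_{sigma(i)}
    rows = torch.as_tensor(rows, device=x_i.device)
    cols = torch.as_tensor(cols, device=x_i.device)
    return 0.5 * ((x_i[rows] - y_j[cols]) ** 2).sum(1).mean()

gradient_flow(x_i, y_j, exact_OT_loss)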
But in this tutorial, we can implement a basic yet robust solver in a few lines of code with:
def softmin(eps, C, G):
    # Log-domain soft-minimum of (C[i,j] - eps * G[j]) over the index j:
    # softmin(eps, C, G)[i] = -eps * log sum_j exp(G[j] - C[i,j] / eps).
    return -eps * (G.view(1, N) - C.view(N, N) / eps).logsumexp(dim=1)
def OT_loss(blur=0.05, n_iter=20):
def ot_loss(x_i, y_j):
# The (N,N) cost matrices:
C_xy = squared_distances(x_i, y_j) / 2
C_yx = C_xy.t()
C_xx = squared_distances(x_i, x_i) / 2
C_yy = squared_distances(y_j, y_j) / 2
# Simulated annealing heuristic (aka. epsilon-scaling in operations research):
# let the temperature decrease across iterations, from 1 (=diameter) to the target value
scales = torch.from_numpy(np.geomspace(1., blur, n_iter)).to(device=device, dtype=torch.float32)
# Initialize the dual potentials:
uni = - np.log(N) * torch.ones(N, device=device, dtype=torch.float32)
g_xy = softmin(1., C_yx, uni)
f_yx = softmin(1., C_xy, uni)
g_yy = softmin(1., C_yy, uni)
f_xx = softmin(1., C_xx, uni)
# Symmetric Sinkhorn loop, in the log-domain, with annealing:
for scale in scales:
eps = scale ** 2
ft_yx = softmin(eps, C_xy, uni + g_xy / eps) # Y -> X
gt_xy = softmin(eps, C_yx, uni + f_yx / eps) # X -> Y
ft_xx = softmin(eps, C_xx, uni + f_xx / eps) # X -> X
gt_yy = softmin(eps, C_yy, uni + g_yy / eps) # Y -> Y
# Symmetric updates:
f_yx, g_xy = 0.5 * (f_yx + ft_yx), 0.5 * (g_xy + gt_xy) # OT(X,Y)
f_xx, g_yy = 0.5 * (f_xx + ft_xx), 0.5 * (g_yy + gt_yy) # OT(X,X), OT(Y,Y)
# Final iteration to bypass the PyTorch back-propagation:
eps = blur ** 2
f_yx, g_xy = (
softmin(eps, C_xy, (uni + g_xy / eps).detach()),
softmin(eps, C_yx, (uni + f_yx / eps).detach()),
)
f_xx = softmin(eps, C_xx, (uni + f_xx / eps).detach())
g_yy = softmin(eps, C_yy, (uni + g_yy / eps).detach())
# Return the debiased dual cost:
return (f_yx - f_xx).mean() + (g_xy - g_yy).mean()
return ot_loss
gradient_flow(x_i, y_j, OT_loss(blur=0.01))
gradient_flow(x_i, y_j, OT_loss(blur=0.2))
Question 5: Try using the code above with different values of the blur parameter, which acts as a regularizer.
How does this method compare to the other loss functions?
Question 6: Re-run these experiments using different samples (you just have to load a different ".png" image at the start of the script).
For instance, try to play around with distributions that have different connected components (= "modes").
What do you think?
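For instance, here is a minimal way to re-run the last experiment with the source and target densities swapped. This sketch only re-uses the two .png files loaded at the start of this notebook; any other density image of your own would work the same way:
# Swap the two densities that were loaded at the top of the notebook:
x_i = draw_samples("data/density_b.png", N=N, device=device)
y_j = draw_samples("data/density_a.png", N=N, device=device)

# Recompute the rainbow colors of the (new) source points:
colors_i = numpy((10 * x_i[:, 0]).cos() * (10 * x_i[:, 1]).cos())

gradient_flow(x_i, y_j, OT_loss(blur=0.05))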
In this notebook, we presented three major families of "distances" between point clouds: nearest-neighbor projections, kernel norms (Maximum Mean Discrepancies) and optimal transport costs.
Surprisingly, these methods induced dramatically different trajectories for the moving points $x_i$.
So which formula should we use in practical applications?
This is the main question: it can only be answered with respect to specific applications and datasets.
Loss functions and distances are very much a part of your model: they inject prior assumptions into your results and are certainly worth a few hours of discussion!