From 849a8d8bfb1de2bd92efc32ffc81f1b27719d269 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 27 Oct 2017 13:35:40 +0200 Subject: [PATCH 01/79] first commit --- doc/modules/classes.rst | 1 + sklearn/neighbors/__init__.py | 4 +- sklearn/neighbors/nca.py | 524 ++++++++++++++++++++++++++++ sklearn/neighbors/tests/test_nca.py | 253 ++++++++++++++ 4 files changed, 781 insertions(+), 1 deletion(-) create mode 100644 sklearn/neighbors/nca.py create mode 100644 sklearn/neighbors/tests/test_nca.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index d3fd6d4e4479d..fbb773e8024f0 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1113,6 +1113,7 @@ Model validation neighbors.RadiusNeighborsRegressor neighbors.NearestCentroid neighbors.NearestNeighbors + neighbors.NeighborhoodComponentAnalysis .. autosummary:: :toctree: generated/ diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 852b0a5fe32f6..8e211ef9ec448 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -14,6 +14,7 @@ from .kde import KernelDensity from .approximate import LSHForest from .lof import LocalOutlierFactor +from .nca import NeighborhoodComponentAnalysis __all__ = ['BallTree', 'DistanceMetric', @@ -28,4 +29,5 @@ 'radius_neighbors_graph', 'KernelDensity', 'LSHForest', - 'LocalOutlierFactor'] + 'LocalOutlierFactor', + 'NeighborhoodComponentAnalysis'] diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py new file mode 100644 index 0000000000000..a3d35e4dd0a2d --- /dev/null +++ b/sklearn/neighbors/nca.py @@ -0,0 +1,524 @@ +# coding: utf-8 +""" +Neighborhood Component Analysis +""" + +# License: BSD 3 Clause + +from __future__ import print_function + +import numpy as np +import sys +import time +from scipy.misc import logsumexp +from scipy.optimize import minimize + +from ..base import BaseEstimator, TransformerMixin +from ..preprocessing import LabelEncoder +from ..decomposition import PCA +from ..utils.multiclass import check_classification_targets +from ..utils.random import check_random_state +from ..utils.validation import check_is_fitted, check_array, check_X_y +from ..externals.six import integer_types + + +class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): + """Neighborhood Component Analysis + + Parameters + ---------- + n_features_out: int, optional (default=None) + Preferred dimensionality of the embedding. + + init : string or numpy array, optional (default='pca') + Initialization of the linear transformation. Possible options are + 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + pca: + ``n_features_out`` many principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + + identity: + If ``n_features_out`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_features_out`` rows. + + random: + The initial transformation will be a random array of shape + (n_features_out, n_features). Each value is sampled from the + standard normal distribution. + + numpy array: + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_features_out`` is not None, n_features_a must match it. + + max_iter : int, optional (default=50) + Maximum number of iterations in the optimization. 
+ + tol : float, optional (default=1e-5) + Convergence tolerance for the optimization. + + callback : callable, optional (default=None) + If not None, this function is called after every iteration of the + optimizer, taking as arguments the current solution (transformation) + and the number of iterations. This might be useful in case one wants + to examine or store the transformation found after each iteration. + + store_opt_result : bool, optional (default=False) + If True, the :class:`scipy.optimize.OptimizeResult` object returned by + :meth:`minimize` of `scipy.optimize` will be stored as attribute + ``opt_result_``. + + verbose : int, optional (default=0) + If 0, no progress messages will be printed. + If 1, progress messages will be printed to stdout. + If > 1, progress messages will be printed and the ``iprint`` + parameter of :meth:`_minimize_lbfgsb` of `scipy.optimize` will be set + to ``verbose - 2``. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. + + Attributes + ---------- + transformation_ : array, shape (n_features_out, n_features) + The linear transformation learned during fitting. + + n_iter_ : int + Counts the number of iterations performed by the optimizer. + + opt_result_ : scipy.optimize.OptimizeResult (optional) + A dictionary of information representing the optimization result. + This is stored only if ``store_opt_result`` was True. + + Examples + -------- + >>> from sklearn.neighbors.nca import NeighborhoodComponentAnalysis + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... stratify=y, test_size=0.7, random_state=42) + >>> nca = NeighborhoodComponentAnalysis(None,random_state=42) + >>> nca.fit(X_train, y_train) # doctest: +ELLIPSIS + NeighborhoodComponentAnalysis(...) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(X_train, y_train) # doctest: +ELLIPSIS + KNeighborsClassifier(...) + >>> print(knn.score(X_test, y_test)) + 0.933333333333 + >>> knn.fit(nca.transform(X_train), y_train) # doctest: +ELLIPSIS + KNeighborsClassifier(...) + >>> print(knn.score(nca.transform(X_test), y_test)) + 0.961904761905 + + Notes + ----- + Neighborhood Component Analysis (NCA) is a machine learning algorithm for + metric learning. It learns a linear transformation of the space in a + supervised fashion to improve the classification accuracy of a + stochastic nearest neighbors rule in this new space. + + .. warning:: + + As NCA is optimizing a non-convex objective function, it will + likely end up in a local optimum. Several runs with independent random + init might be necessary to get a good convergence. + + References + ---------- + .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. + "Neighbourhood Components Analysis". Advances in Neural Information + Processing Systems. 17, 513-520, 2005. 
+ http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf + """ + + def __init__(self, n_features_out=None, init='identity', max_iter=50, + tol=1e-5, callback=None, store_opt_result=False, verbose=0, + random_state=None): + + # Parameters + self.n_features_out = n_features_out + self.init = init + self.max_iter = max_iter + self.tol = tol + self.callback = callback + self.store_opt_result = store_opt_result + self.verbose = verbose + self.random_state = random_state + + def fit(self, X, y): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The training samples. + + y : array-like, shape (n_samples,) + The corresponding training labels. + + Returns + ------- + self : object + returns a trained NeighborhoodComponentAnalysis model. + """ + + # Verify inputs X and y and NCA parameters, and transform a copy if + # needed + X_valid, y_valid, init = self._validate_params(X, y) + + # Initialize the random generator + self.random_state_ = check_random_state(self.random_state) + + # Measure the total training time + t_train = time.time() + + # Compute arrays that stay fixed during optimization: + # mask for fast lookup of same-class samples + masks = _make_masks(y_valid) + # pairwise differences + diffs = X_valid[:, np.newaxis] - X_valid[np.newaxis] + + # Initialize the transformation + transformation = self._initialize(X_valid, init) + + # Create a dictionary of parameters to be passed to the optimizer + disp = self.verbose - 2 if self.verbose > 1 else -1 + optimizer_params = {'method': 'L-BFGS-B', + 'fun': self._loss_grad_lbfgs, + 'args': (X_valid, y_valid, diffs, masks), + 'jac': True, + 'x0': transformation, + 'tol': self.tol, + 'options': dict(maxiter=self.max_iter, disp=disp), + 'callback': self._callback + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + # Reshape the solution found by the optimizer + self.transformation_ = opt_result.x.reshape(-1, X_valid.shape[1]) + + # Stop timer + t_train = time.time() - t_train + if self.verbose: + print('[{}] Training took {:8.2f}s.'.format( + self.__class__.__name__, t_train)) + + # Optionally store information returned by the optimizer + if self.store_opt_result: + self.opt_result_ = opt_result + + return self + + def transform(self, X): + """Applies the learned transformation to the given data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Data samples. + + Returns + ------- + X_embedded: array, shape (n_samples, n_features_out) + The data samples transformed. + + Raises + ------ + NotFittedError + If :meth:`fit` has not been called before. + """ + + check_is_fitted(self, ['transformation_']) + X = check_array(X) + + return np.dot(X, self.transformation_.T) + + def _validate_params(self, X, y): + """Validate parameters as soon as :meth:`fit` is called. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The training samples. + + y : array-like, shape (n_samples,) + The corresponding training labels. + + Returns + ------- + X_valid : array, shape (n_samples, n_features) + The validated training samples. + + y_valid : array, shape (n_samples,) + The validated training labels, encoded to be integers in + the range(0, n_classes). + + init : string or numpy array of shape (n_features_a, n_features_b) + The validated initialization of the linear transformation. + + Raises + ------- + TypeError + If a parameter is not an instance of the desired type. 
+ + ValueError + If a parameter's value violates its legal value range or if the + combination of two or more given parameters is incompatible. + """ + + # Validate the inputs X and y, and converts y to numerical classes. + X_valid, y_valid = check_X_y(X, y) + check_classification_targets(y_valid) + y_valid = LabelEncoder().fit_transform(y_valid) + + # Check the preferred embedding dimensionality + if self.n_features_out is not None: + _check_scalar(self.n_features_out, 'n_features_out', + integer_types, 1) + + if self.n_features_out > X.shape[1]: + raise ValueError('The preferred embedding dimensionality ' + '`n_features_out` ({}) cannot be greater ' + 'than the given data dimensionality ({})!' + .format(self.n_features_out, X.shape[1])) + + _check_scalar(self.max_iter, 'max_iter', integer_types, 1) + _check_scalar(self.tol, 'tol', float, 0.) + _check_scalar(self.verbose, 'verbose', integer_types, 0) + + if self.callback is not None: + if not callable(self.callback): + raise ValueError('`callback` is not callable.') + + # Check how the linear transformation should be initialized + init = self.init + + if isinstance(init, np.ndarray): + init = check_array(init) + + # Assert that init.shape[1] = X.shape[1] + if init.shape[1] != X_valid.shape[1]: + raise ValueError( + 'The input dimensionality ({}) of the given ' + 'linear transformation `init` must match the ' + 'dimensionality of the given inputs `X` ({}).' + .format(init.shape[1], X_valid.shape[1])) + + # Assert that init.shape[0] <= init.shape[1] + if init.shape[0] > init.shape[1]: + raise ValueError( + 'The output dimensionality ({}) of the given ' + 'linear transformation `init` cannot be ' + 'greater than its input dimensionality ({}).' + .format(init.shape[0], init.shape[1])) + + if self.n_features_out is not None: + # Assert that self.n_features_out = init.shape[0] + if self.n_features_out != init.shape[0]: + raise ValueError( + 'The preferred embedding dimensionality ' + '`n_features_out` ({}) does not match ' + 'the output dimensionality of the given ' + 'linear transformation `init` ({})!' + .format(self.n_features_out, + init.shape[0])) + + elif init in ['pca', 'identity', 'random']: + pass + else: + raise ValueError( + "`init` must be 'pca', 'identity', 'random' or a numpy " + "array of shape (n_features_out, n_features).") + + return X_valid, y_valid, init + + def _initialize(self, X, init): + """Initialize the transformation. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Data samples. + + init : string or numpy array of shape (n_features_a, n_features_b) + The validated initialization of the linear transformation. + + Returns + ------- + transformation : array, shape (n_features_out, n_features) + The initialized linear transformation. + + """ + + transformation = init + + if isinstance(init, np.ndarray): + pass + else: + n_features_out = self.n_features_out or X.shape[1] + if init == 'identity': + transformation = np.eye(n_features_out, X.shape[1]) + elif init == 'random': + transformation = self.random_state_.randn(n_features_out, + X.shape[1]) + elif init == 'pca': + pca = PCA(n_components=n_features_out, + random_state=self.random_state_) + t_pca = time.time() + if self.verbose: + print('Finding principal components... ', end='') + sys.stdout.flush() + + pca.fit(X) + if self.verbose: + print('done in {:5.2f}s'.format(time.time() - t_pca)) + + transformation = pca.components_ + return transformation + + def _callback(self, transformation): + """Called after each iteration of the optimizer. 
+ + Parameters + ---------- + transformation : array, shape(n_features_out, n_features) + The solution computed by the optimizer in this iteration. + """ + if self.callback is not None: + self.callback(transformation, self.n_iter_) + + self.n_iter_ += 1 + + def _loss_grad_lbfgs(self, transformation, X, y, diffs, + masks): + """Compute the loss and the loss gradient w.r.t. ``transformation``. + + Parameters + ---------- + transformation : array, shape (n_features_out, n_features) + The linear transformation on which to compute loss and evaluate + gradient + X : array, shape (n_samples, n_features) + The training samples. + + y : array, shape (n_samples,) + The corresponding training labels. + + diffs : array, shape (n_samples, n_samples, n_features) + Pairwise differences between training samples. + + masks : array, shape (n_samples, n_classes) + One-hot encoding of y. + + Returns + ------- + loss : float + The loss computed for the given transformation. + + gradient : array, shape (n_features_out * n_features,) + The new (flattened) gradient of the loss. + """ + + transformation = transformation.reshape(-1, X.shape[1]) + loss = 0 + gradient = np.zeros(transformation.shape) + X_embedded = transformation.dot(X.T).T + + # for every sample, compute its contribution to loss and gradient + for i in range(X.shape[0]): + diff_embedded = X_embedded[i] - X_embedded + sum_of_squares = np.einsum('ij,ij->i', diff_embedded, + diff_embedded) + sum_of_squares[i] = np.inf + soft = np.exp(-sum_of_squares - logsumexp(-sum_of_squares)) + ci = masks[:, y[i]] + p_i_j = soft[ci] + not_ci = np.logical_not(ci) + diff_ci = diffs[i, ci, :] # n_samples * n_features + diff_not_ci = diffs[i, not_ci, :] + sum_ci = diff_ci.T.dot( + (p_i_j[:, np.newaxis] * diff_embedded[ci, :])) + sum_not_ci = diff_not_ci.T.dot((soft[not_ci][:, np.newaxis] * + diff_embedded[not_ci, :])) + p_i = np.sum(p_i_j) + gradient += 2 * (p_i * (sum_ci.T + sum_not_ci.T) - sum_ci.T) + loss += p_i + return - loss, - gradient.ravel() + + +########################## +# Some helper functions # +######################### + + +def _check_scalar(x, name, target_type, min_val=None, max_val=None): + """Validate scalar parameters type and value. + + Parameters + ---------- + x : object + The scalar parameter to validate. + + name : str + The name of the parameter to be printed in error messages. + + target_type : type or tuple + Acceptable data types for the parameter. + + min_val : float or int, optional (default=None) + The minimum value value the parameter can take. If None (default) it + is implied that the parameter does not have a lower bound. + + max_val: float or int, optional (default=None) + The maximum valid value the parameter can take. If None (default) it + is implied that the parameter does not have an upper bound. + + Raises + ------- + TypeError + If the parameter's type does not match the desired type. + + ValueError + If the parameter's value violates the given bounds. + """ + + if not isinstance(x, target_type): + raise TypeError('`{}` must be an instance of {}, not {}.' + .format(name, target_type, type(x))) + + if min_val is not None and x < min_val: + raise ValueError('`{}`= {}, must be >= {}.'.format(name, x, min_val)) + + if max_val is not None and x > max_val: + raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) + + +def _make_masks(y): + """Create one-hot encoding of vector ``y``. + + Parameters + ---------- + y : array, shape (n_samples,) + Data samples labels. 
+ + Returns + ------- + masks: array, shape (n_samples, n_classes) + One-hot encoding of ``y``. + """ + + n = y.shape[0] + masks = np.zeros((n, y.max() + 1)) + masks[np.arange(n), y] = [1] + return masks.astype(bool) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py new file mode 100644 index 0000000000000..fccc046e51892 --- /dev/null +++ b/sklearn/neighbors/tests/test_nca.py @@ -0,0 +1,253 @@ +import numpy as np +from sklearn.utils import check_random_state +from sklearn.utils.testing import assert_raises, assert_equal +from sklearn.datasets import load_iris, make_classification +from sklearn.model_selection import train_test_split +from sklearn.neighbors.nca import NeighborhoodComponentAnalysis, _make_masks +from sklearn.metrics import pairwise_distances + + +rng = check_random_state(0) +# load and shuffle iris dataset +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris_data = iris.data[perm] +iris_target = iris.target[perm] +EPS = np.finfo(float).eps + + +def test_finite_differences(): + r"""Test gradient of loss function + + Test if the gradient is correct by computing the relative difference + between the projected gradient PG: + + .. math:: + + PG = \mathbf d^{\top} \cdot \nabla + \mathcal L(\mathbf x) + + and the finite differences FD: + + .. math:: + + FD = \frac{\mathcal L(\mathbf x + \epsilon \mathbf d) - + \mathcal L(\mathbf x - \epsilon \mathbf d)}{2 \epsilon} + + + where :math:`d` is a random direction (random vector of shape `n_features`, + and norm 1), :math:`\epsilon` is a very small number, :math:`\mathcal L` is + the loss function and :math:`\nabla \mathcal L` is its gradient. This + relative difference should be zero: + + .. math :: + + \frac{|PG -FD|}{|PG|} = 0 + + + """ + # Initialize `transformation`, `X` and `y` and `NCA` + random_state = check_random_state(0) + n_features = 10 + num_dims = 2 + n_samples = 100 + n_labels = 3 + y = random_state.randint(0, n_labels, (n_samples)) + point = random_state.randn(num_dims, n_features) + X = random_state.randn(n_samples, n_features) + nca = NeighborhoodComponentAnalysis(None, init=point) + + X, y, init = nca._validate_params(X, y) + masks = _make_masks(y) + diffs = X[:, np.newaxis] - X[np.newaxis] + + point = nca._initialize(X, init) + # compute the gradient at `point` + _, gradient = nca._loss_grad_lbfgs(point, X, y, diffs, + masks) + + # create a random direction of norm 1 + random_direction = random_state.randn(*point.shape) + random_direction /= np.linalg.norm(random_direction) + + # computes projected gradient + projected_gradient = random_direction.ravel().dot( + gradient.ravel()) + + # compute finite differences + eps = 1e-5 + right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, + X, y, diffs, masks) + left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, + X, y, diffs, masks) + finite_differences = 1/(2*eps) * (right_loss - left_loss) + + # compute relative error + relative_error = np.abs(finite_differences - projected_gradient) / \ + np.abs(projected_gradient) + np.testing.assert_almost_equal(relative_error, 0.) + + +def test_simple_example(): + """Test on a simple example. + + Puts four points in the input space where the opposite labels points are + next to each other. After transform the same labels points should be next + to each other. 
+ + """ + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + nca = NeighborhoodComponentAnalysis(n_features_out=2, init='identity', + random_state=42) + nca.fit(X, y) + X_transformed = nca.transform(X) + np.testing.assert_equal(pairwise_distances(X_transformed).argsort()[:, 1], + np.array([2, 3, 0, 1])) + + +def test_params_validation(): + # Test that invalid parameters raise value error + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + NCA = NeighborhoodComponentAnalysis + + # TypeError + assert_raises(TypeError, NCA(max_iter='21').fit, X, y) + assert_raises(TypeError, NCA(verbose='true').fit, X, y) + assert_raises(TypeError, NCA(tol=1).fit, X, y) + assert_raises(TypeError, NCA(n_features_out='invalid').fit, + X, y) + + # ValueError + assert_raises(ValueError, NCA(init=1).fit, X, y) + assert_raises(ValueError, NCA(max_iter=-1).fit, X, y) + + fit_func = NCA(init=np.random.rand(5, 3)).fit + assert_raises(ValueError, fit_func, X, y) + assert_raises(ValueError, NCA(n_features_out=10).fit, X, y) + + +def test_transformation_dimensions(): + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + + # Fail if transformation input dimension does not match inputs dimensions + transformation = np.array([[1, 2], [3, 4]]) + assert_raises(ValueError, + NeighborhoodComponentAnalysis(None, init=transformation).fit, + X, y) + + # Fail if transformation output dimension is larger than + # transformation input dimension + transformation = np.array([[1, 2], [3, 4], [5, 6]]) + # len(transformation) > len(transformation[0]) + assert_raises(ValueError, + NeighborhoodComponentAnalysis(None, init=transformation).fit, + X, y) + + # Pass otherwise + transformation = np.arange(9).reshape(3, 3) + NeighborhoodComponentAnalysis(None, init=transformation).fit(X, y) + + +def test_n_features_out(): + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + + transformation = np.array([[1, 2, 3], [4, 5, 6]]) + + # n_features_out = X.shape[1] != transformation.shape[0] + nca = NeighborhoodComponentAnalysis(n_features_out=3, init=transformation) + assert_raises(ValueError, nca.fit, X, y) + + # n_features_out > X.shape[1] + nca = NeighborhoodComponentAnalysis(n_features_out=5, init=transformation) + assert_raises(ValueError, nca.fit, X, y) + + # n_features_out < X.shape[1] + nca = NeighborhoodComponentAnalysis(n_features_out=2, init='identity') + nca.fit(X, y) + + +def test_init_transformation(): + X, y = make_classification(n_samples=30, n_features=5, + n_redundant=0, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y) + + # Start learning from scratch + nca = NeighborhoodComponentAnalysis(None, init='identity') + nca.fit(X_train, y_train) + + # Initialize with random + nca_random = NeighborhoodComponentAnalysis(None, init='random') + nca_random.fit(X_train, y_train) + + # Initialize with PCA + nca_pca = NeighborhoodComponentAnalysis(None, init='pca') + nca_pca.fit(X_train, y_train) + + init = np.random.rand(X.shape[1], X.shape[1]) + nca = NeighborhoodComponentAnalysis(None, init=init) + nca.fit(X_train, y_train) + + # init.shape[1] must match X.shape[1] + init = np.random.rand(X.shape[1], X.shape[1] + 1) + nca = NeighborhoodComponentAnalysis(None, init=init) + assert_raises(ValueError, nca.fit, X_train, y_train) + + # init.shape[0] must be <= init.shape[1] + init = np.random.rand(X.shape[1] + 1, X.shape[1]) + nca = NeighborhoodComponentAnalysis(None, init=init) + assert_raises(ValueError, nca.fit, X_train, y_train) + + # init.shape[0] must match n_features_out + init = 
np.random.rand(X.shape[1], X.shape[1]) + nca = NeighborhoodComponentAnalysis(n_features_out=X.shape[1] - 2, + init=init) + assert_raises(ValueError, nca.fit, X_train, y_train) + + +def test_verbose(): + nca = NeighborhoodComponentAnalysis(None, verbose=1) + nca.fit(iris_data, iris_target) + + +def test_callable(): + X = iris_data + y = iris_target + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) + + nca = NeighborhoodComponentAnalysis(None, callback='my_cb') + assert_raises(ValueError, nca.fit, X_train, y_train) + + max_iter = 10 + + def my_cb(transformation, n_iter): + rem_iter = max_iter - n_iter + print('{} iterations remaining...'.format(rem_iter)) + + nca = NeighborhoodComponentAnalysis(None, max_iter=max_iter, + callback=my_cb, verbose=1) + nca.fit(X_train, y_train) + + +def test_terminate_early(): + X = iris_data + y = iris_target + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) + + nca = NeighborhoodComponentAnalysis(None, max_iter=5) + nca.fit(X_train, y_train) + + +def test_store_opt_result(): + X = iris_data + y = iris_target + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) + + nca = NeighborhoodComponentAnalysis(None, max_iter=5, + store_opt_result=True) + nca.fit(X_train, y_train) + transformation = nca.opt_result_.x + assert_equal(transformation.size, X.shape[1]**2) From 04222de8910d817d6e0d4d7ca28a1e66dc73ce12 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 27 Oct 2017 16:05:25 +0200 Subject: [PATCH 02/79] minor corrections in docstring --- sklearn/neighbors/nca.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index a3d35e4dd0a2d..a825a1ab44de4 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -27,7 +27,7 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): Parameters ---------- - n_features_out: int, optional (default=None) + n_features_out : int, optional (default=None) Preferred dimensionality of the embedding. init : string or numpy array, optional (default='pca') @@ -87,10 +87,10 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): Attributes ---------- transformation_ : array, shape (n_features_out, n_features) - The linear transformation learned during fitting. + The linear transformation learned during fitting. n_iter_ : int - Counts the number of iterations performed by the optimizer. + Counts the number of iterations performed by the optimizer. opt_result_ : scipy.optimize.OptimizeResult (optional) A dictionary of information representing the optimization result. @@ -121,9 +121,9 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): Notes ----- Neighborhood Component Analysis (NCA) is a machine learning algorithm for - metric learning. It learns a linear transformation of the space in a - supervised fashion to improve the classification accuracy of a - stochastic nearest neighbors rule in this new space. + metric learning. It learns a linear transformation in a supervised fashion + to improve the classification accuracy of a stochastic nearest neighbors + rule in the new space. .. 
warning:: From 34c5457017ae2192e0499915a3fd6b89ff940794 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 27 Oct 2017 17:01:24 +0200 Subject: [PATCH 03/79] remove comment --- sklearn/neighbors/nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index a825a1ab44de4..bd1b4721a8280 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -445,7 +445,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, ci = masks[:, y[i]] p_i_j = soft[ci] not_ci = np.logical_not(ci) - diff_ci = diffs[i, ci, :] # n_samples * n_features + diff_ci = diffs[i, ci, :] diff_not_ci = diffs[i, not_ci, :] sum_ci = diff_ci.T.dot( (p_i_j[:, np.newaxis] * diff_embedded[ci, :])) From 89f68ee054203741cbf97900927261f283a4bd3e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 30 Oct 2017 10:33:41 +0100 Subject: [PATCH 04/79] Add verbose during iterations --- sklearn/neighbors/nca.py | 21 +++++++++++++++++++++ sklearn/neighbors/tests/test_nca.py | 1 + 2 files changed, 22 insertions(+) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index bd1b4721a8280..50ac51fd654de 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -430,6 +430,19 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, The new (flattened) gradient of the loss. """ + if self.n_iter_ == 0: + self.n_iter_ += 1 + if self.verbose: + header_fields = ['Iteration', 'Objective Value', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{}]'.format(cls_name)) + print('[{}] {}\n[{}] {}'.format(cls_name, header, + cls_name, '-' * len(header))) + + t_funcall = time.time() + transformation = transformation.reshape(-1, X.shape[1]) loss = 0 gradient = np.zeros(transformation.shape) @@ -454,6 +467,14 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, p_i = np.sum(p_i_j) gradient += 2 * (p_i * (sum_ci.T + sum_not_ci.T) - sum_ci.T) loss += p_i + + if self.verbose: + t_funcall = time.time() - t_funcall + values_fmt = '[{}] {:>10} {:>20.6e} {:>10.2f}' + print(values_fmt.format(self.__class__.__name__, self.n_iter_, + loss, t_funcall)) + sys.stdout.flush() + return - loss, - gradient.ravel() diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index fccc046e51892..a60e4399fe62b 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -60,6 +60,7 @@ def test_finite_differences(): X, y, init = nca._validate_params(X, y) masks = _make_masks(y) diffs = X[:, np.newaxis] - X[np.newaxis] + nca.n_iter_ = 0 point = nca._initialize(X, init) # compute the gradient at `point` From 42e078a9f0c102cf22fbf3a687ec358ac642c8e8 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 31 Oct 2017 10:22:49 +0100 Subject: [PATCH 05/79] Update code according to code review: https://github.com/wdevazelhes/scikit-learn/pull/1#pullrequestreview-72533389 --- doc/modules/classes.rst | 2 +- sklearn/neighbors/__init__.py | 4 +- sklearn/neighbors/nca.py | 77 +++++++++++--------- sklearn/neighbors/tests/test_nca.py | 106 +++++++++++++++++++++------- 4 files changed, 125 insertions(+), 64 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index fbb773e8024f0..17f2704e0829b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1113,7 +1113,7 @@ Model validation neighbors.RadiusNeighborsRegressor neighbors.NearestCentroid neighbors.NearestNeighbors 
- neighbors.NeighborhoodComponentAnalysis + neighbors.NeighborhoodComponentsAnalysis .. autosummary:: :toctree: generated/ diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 8e211ef9ec448..367928fad5b5a 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -14,7 +14,7 @@ from .kde import KernelDensity from .approximate import LSHForest from .lof import LocalOutlierFactor -from .nca import NeighborhoodComponentAnalysis +from .nca import NeighborhoodComponentsAnalysis __all__ = ['BallTree', 'DistanceMetric', @@ -30,4 +30,4 @@ 'KernelDensity', 'LSHForest', 'LocalOutlierFactor', - 'NeighborhoodComponentAnalysis'] + 'NeighborhoodComponentsAnalysis'] diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 50ac51fd654de..1c755427c918c 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -12,6 +12,7 @@ import time from scipy.misc import logsumexp from scipy.optimize import minimize +from sklearn.preprocessing import OneHotEncoder from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder @@ -22,8 +23,8 @@ from ..externals.six import integer_types -class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): - """Neighborhood Component Analysis +class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): + """Neighborhood Components Analysis Parameters ---------- @@ -98,16 +99,16 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): Examples -------- - >>> from sklearn.neighbors.nca import NeighborhoodComponentAnalysis + >>> from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> X, y = load_iris(return_X_y=True) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... stratify=y, test_size=0.7, random_state=42) - >>> nca = NeighborhoodComponentAnalysis(None,random_state=42) + >>> nca = NeighborhoodComponentsAnalysis(None,random_state=42) >>> nca.fit(X_train, y_train) # doctest: +ELLIPSIS - NeighborhoodComponentAnalysis(...) + NeighborhoodComponentsAnalysis(...) >>> knn = KNeighborsClassifier(n_neighbors=3) >>> knn.fit(X_train, y_train) # doctest: +ELLIPSIS KNeighborsClassifier(...) @@ -123,13 +124,7 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): Neighborhood Component Analysis (NCA) is a machine learning algorithm for metric learning. It learns a linear transformation in a supervised fashion to improve the classification accuracy of a stochastic nearest neighbors - rule in the new space. - - .. warning:: - - As NCA is optimizing a non-convex objective function, it will - likely end up in a local optimum. Several runs with independent random - init might be necessary to get a good convergence. + rule in the transformed space. References ---------- @@ -137,9 +132,13 @@ class NeighborhoodComponentAnalysis(BaseEstimator, TransformerMixin): "Neighbourhood Components Analysis". Advances in Neural Information Processing Systems. 17, 513-520, 2005. http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf + + .. 
[2] Wikipedia entry on Neighborhood Components Analysis + https://en.wikipedia.org/wiki/Neighbourhood_components_analysis + """ - def __init__(self, n_features_out=None, init='identity', max_iter=50, + def __init__(self, n_features_out=None, init='pca', max_iter=50, tol=1e-5, callback=None, store_opt_result=False, verbose=0, random_state=None): @@ -167,7 +166,7 @@ def fit(self, X, y): Returns ------- self : object - returns a trained NeighborhoodComponentAnalysis model. + returns a trained NeighborhoodComponentsAnalysis model. """ # Verify inputs X and y and NCA parameters, and transform a copy if @@ -182,7 +181,8 @@ def fit(self, X, y): # Compute arrays that stay fixed during optimization: # mask for fast lookup of same-class samples - masks = _make_masks(y_valid) + masks = OneHotEncoder(sparse=False, + dtype=bool).fit_transform(y_valid[:, np.newaxis]) # pairwise differences diffs = X_valid[:, np.newaxis] - X_valid[np.newaxis] @@ -193,7 +193,7 @@ def fit(self, X, y): disp = self.verbose - 2 if self.verbose > 1 else -1 optimizer_params = {'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, - 'args': (X_valid, y_valid, diffs, masks), + 'args': (X_valid, y_valid, diffs, masks, -1.0), 'jac': True, 'x0': transformation, 'tol': self.tol, @@ -401,7 +401,7 @@ def _callback(self, transformation): self.n_iter_ += 1 def _loss_grad_lbfgs(self, transformation, X, y, diffs, - masks): + masks, sign=1.0): """Compute the loss and the loss gradient w.r.t. ``transformation``. Parameters @@ -448,23 +448,29 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, gradient = np.zeros(transformation.shape) X_embedded = transformation.dot(X.T).T - # for every sample, compute its contribution to loss and gradient + # for every sample x_i, compute its contribution to loss and gradient for i in range(X.shape[0]): + # compute distances to x_i in embedded space diff_embedded = X_embedded[i] - X_embedded - sum_of_squares = np.einsum('ij,ij->i', diff_embedded, - diff_embedded) - sum_of_squares[i] = np.inf - soft = np.exp(-sum_of_squares - logsumexp(-sum_of_squares)) - ci = masks[:, y[i]] - p_i_j = soft[ci] - not_ci = np.logical_not(ci) + dist_embedded = np.einsum('ij,ij->i', diff_embedded, + diff_embedded) + dist_embedded[i] = np.inf + + # compute exponentiated distances (use the log-sum-exp trick to + # avoid numerical instabilities + exp_dist_embedded = np.exp(-dist_embedded - + logsumexp(-dist_embedded)) + ci = masks[:, y[i]] # samples that are in the same class as x_i + p_i_j = exp_dist_embedded[ci] diff_ci = diffs[i, ci, :] - diff_not_ci = diffs[i, not_ci, :] + diff_not_ci = diffs[i, ~ci, :] sum_ci = diff_ci.T.dot( (p_i_j[:, np.newaxis] * diff_embedded[ci, :])) - sum_not_ci = diff_not_ci.T.dot((soft[not_ci][:, np.newaxis] * - diff_embedded[not_ci, :])) - p_i = np.sum(p_i_j) + sum_not_ci = diff_not_ci.T.dot((exp_dist_embedded[~ci][:, + np.newaxis] * + diff_embedded[~ci, :])) + p_i = np.sum(p_i_j) # probability of x_i to be correctly + # classified gradient += 2 * (p_i * (sum_ci.T + sum_not_ci.T) - sum_ci.T) loss += p_i @@ -475,7 +481,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, loss, t_funcall)) sys.stdout.flush() - return - loss, - gradient.ravel() + return sign * loss, sign * gradient.ravel() ########################## @@ -538,8 +544,9 @@ def _make_masks(y): masks: array, shape (n_samples, n_classes) One-hot encoding of ``y``. 
""" - - n = y.shape[0] - masks = np.zeros((n, y.max() + 1)) - masks[np.arange(n), y] = [1] - return masks.astype(bool) + masks = OneHotEncoder(sparse=False, dtype=bool).fit_transform(y[:, + np.newaxis]) + # n = y.shape[0] + # masks = np.zeros((n, y.max() + 1), dtype=bool) + # masks[np.arange(n), y] = [True] + return masks diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index a60e4399fe62b..5adcdd3404830 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,9 +1,10 @@ import numpy as np +from numpy.testing import assert_array_equal from sklearn.utils import check_random_state from sklearn.utils.testing import assert_raises, assert_equal from sklearn.datasets import load_iris, make_classification from sklearn.model_selection import train_test_split -from sklearn.neighbors.nca import NeighborhoodComponentAnalysis, _make_masks +from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis, _make_masks from sklearn.metrics import pairwise_distances @@ -55,7 +56,7 @@ def test_finite_differences(): y = random_state.randint(0, n_labels, (n_samples)) point = random_state.randn(num_dims, n_features) X = random_state.randn(n_samples, n_features) - nca = NeighborhoodComponentAnalysis(None, init=point) + nca = NeighborhoodComponentsAnalysis(None, init=point) X, y, init = nca._validate_params(X, y) masks = _make_masks(y) @@ -99,8 +100,8 @@ def test_simple_example(): """ X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) y = np.array([1, 0, 1, 0]) - nca = NeighborhoodComponentAnalysis(n_features_out=2, init='identity', - random_state=42) + nca = NeighborhoodComponentsAnalysis(n_features_out=2, init='identity', + random_state=42) nca.fit(X, y) X_transformed = nca.transform(X) np.testing.assert_equal(pairwise_distances(X_transformed).argsort()[:, 1], @@ -111,7 +112,7 @@ def test_params_validation(): # Test that invalid parameters raise value error X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] - NCA = NeighborhoodComponentAnalysis + NCA = NeighborhoodComponentsAnalysis # TypeError assert_raises(TypeError, NCA(max_iter='21').fit, X, y) @@ -136,7 +137,8 @@ def test_transformation_dimensions(): # Fail if transformation input dimension does not match inputs dimensions transformation = np.array([[1, 2], [3, 4]]) assert_raises(ValueError, - NeighborhoodComponentAnalysis(None, init=transformation).fit, + NeighborhoodComponentsAnalysis(None, + init=transformation).fit, X, y) # Fail if transformation output dimension is larger than @@ -144,12 +146,13 @@ def test_transformation_dimensions(): transformation = np.array([[1, 2], [3, 4], [5, 6]]) # len(transformation) > len(transformation[0]) assert_raises(ValueError, - NeighborhoodComponentAnalysis(None, init=transformation).fit, + NeighborhoodComponentsAnalysis(None, + init=transformation).fit, X, y) # Pass otherwise transformation = np.arange(9).reshape(3, 3) - NeighborhoodComponentAnalysis(None, init=transformation).fit(X, y) + NeighborhoodComponentsAnalysis(None, init=transformation).fit(X, y) def test_n_features_out(): @@ -159,15 +162,15 @@ def test_n_features_out(): transformation = np.array([[1, 2, 3], [4, 5, 6]]) # n_features_out = X.shape[1] != transformation.shape[0] - nca = NeighborhoodComponentAnalysis(n_features_out=3, init=transformation) + nca = NeighborhoodComponentsAnalysis(n_features_out=3, init=transformation) assert_raises(ValueError, nca.fit, X, y) # n_features_out > X.shape[1] - nca = NeighborhoodComponentAnalysis(n_features_out=5, init=transformation) + nca = 
NeighborhoodComponentsAnalysis(n_features_out=5, init=transformation) assert_raises(ValueError, nca.fit, X, y) # n_features_out < X.shape[1] - nca = NeighborhoodComponentAnalysis(n_features_out=2, init='identity') + nca = NeighborhoodComponentsAnalysis(n_features_out=2, init='identity') nca.fit(X, y) @@ -177,49 +180,100 @@ def test_init_transformation(): X_train, X_test, y_train, y_test = train_test_split(X, y) # Start learning from scratch - nca = NeighborhoodComponentAnalysis(None, init='identity') + nca = NeighborhoodComponentsAnalysis(None, init='identity') nca.fit(X_train, y_train) # Initialize with random - nca_random = NeighborhoodComponentAnalysis(None, init='random') + nca_random = NeighborhoodComponentsAnalysis(None, init='random') nca_random.fit(X_train, y_train) # Initialize with PCA - nca_pca = NeighborhoodComponentAnalysis(None, init='pca') + nca_pca = NeighborhoodComponentsAnalysis(None, init='pca') nca_pca.fit(X_train, y_train) init = np.random.rand(X.shape[1], X.shape[1]) - nca = NeighborhoodComponentAnalysis(None, init=init) + nca = NeighborhoodComponentsAnalysis(None, init=init) nca.fit(X_train, y_train) # init.shape[1] must match X.shape[1] init = np.random.rand(X.shape[1], X.shape[1] + 1) - nca = NeighborhoodComponentAnalysis(None, init=init) + nca = NeighborhoodComponentsAnalysis(None, init=init) assert_raises(ValueError, nca.fit, X_train, y_train) # init.shape[0] must be <= init.shape[1] init = np.random.rand(X.shape[1] + 1, X.shape[1]) - nca = NeighborhoodComponentAnalysis(None, init=init) + nca = NeighborhoodComponentsAnalysis(None, init=init) assert_raises(ValueError, nca.fit, X_train, y_train) # init.shape[0] must match n_features_out init = np.random.rand(X.shape[1], X.shape[1]) - nca = NeighborhoodComponentAnalysis(n_features_out=X.shape[1] - 2, - init=init) + nca = NeighborhoodComponentsAnalysis(n_features_out=X.shape[1] - 2, + init=init) assert_raises(ValueError, nca.fit, X_train, y_train) def test_verbose(): - nca = NeighborhoodComponentAnalysis(None, verbose=1) + nca = NeighborhoodComponentsAnalysis(None, verbose=1) nca.fit(iris_data, iris_target) +def test_singleton_class(): + X = iris_data + y = iris_target + X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y) + + # one singleton class + singleton_class = 1 + ind_singleton, = np.where(y_tr == singleton_class) + y_tr[ind_singleton] = 2 + y_tr[ind_singleton[0]] = singleton_class + + nca = NeighborhoodComponentsAnalysis(max_iter=30) + nca.fit(X_tr, y_tr) + + # One non-singleton class + X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y) + ind_1, = np.where(y_tr == 1) + ind_2, = np.where(y_tr == 2) + y_tr[ind_1] = 0 + y_tr[ind_1[0]] = 1 + y_tr[ind_2] = 0 + y_tr[ind_2[0]] = 2 + + nca = NeighborhoodComponentsAnalysis(max_iter=30) + nca.fit(X_tr, y_tr) + + # Only singleton classes + X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y) + ind_0, = np.where(y_tr == 0) + ind_1, = np.where(y_tr == 1) + ind_2, = np.where(y_tr == 2) + X_tr = X_tr[[ind_0[0], ind_1[0], ind_2[0]]] + y_tr = y_tr[[ind_0[0], ind_1[0], ind_2[0]]] + + nca = NeighborhoodComponentsAnalysis(init='identity', max_iter=30) + nca.fit(X_tr, y_tr) + assert_array_equal(X, nca.transform(X)) + + +def test_one_class(): + X = iris_data[iris_target == 0] + y = iris_target[iris_target == 0] + X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3) + + nca = NeighborhoodComponentsAnalysis(max_iter=30, + n_features_out=X.shape[1], + init='identity') + nca.fit(X_tr, y_tr) + 
assert_array_equal(X, nca.transform(X)) + + def test_callable(): X = iris_data y = iris_target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) - nca = NeighborhoodComponentAnalysis(None, callback='my_cb') + nca = NeighborhoodComponentsAnalysis(None, callback='my_cb') assert_raises(ValueError, nca.fit, X_train, y_train) max_iter = 10 @@ -228,8 +282,8 @@ def my_cb(transformation, n_iter): rem_iter = max_iter - n_iter print('{} iterations remaining...'.format(rem_iter)) - nca = NeighborhoodComponentAnalysis(None, max_iter=max_iter, - callback=my_cb, verbose=1) + nca = NeighborhoodComponentsAnalysis(None, max_iter=max_iter, + callback=my_cb, verbose=1) nca.fit(X_train, y_train) @@ -238,7 +292,7 @@ def test_terminate_early(): y = iris_target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) - nca = NeighborhoodComponentAnalysis(None, max_iter=5) + nca = NeighborhoodComponentsAnalysis(None, max_iter=5) nca.fit(X_train, y_train) @@ -247,8 +301,8 @@ def test_store_opt_result(): y = iris_target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) - nca = NeighborhoodComponentAnalysis(None, max_iter=5, - store_opt_result=True) + nca = NeighborhoodComponentsAnalysis(None, max_iter=5, + store_opt_result=True) nca.fit(X_train, y_train) transformation = nca.opt_result_.x assert_equal(transformation.size, X.shape[1]**2) From 4c7c0d418048b8d8b5534c9406e4e54a5a3d8644 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 31 Oct 2017 10:28:59 +0100 Subject: [PATCH 06/79] Remove _make_masks and use OneHotEncoder instead --- sklearn/neighbors/nca.py | 21 --------------------- sklearn/neighbors/tests/test_nca.py | 6 ++++-- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 1c755427c918c..eb85dc0e540c8 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -529,24 +529,3 @@ def _check_scalar(x, name, target_type, min_val=None, max_val=None): if max_val is not None and x > max_val: raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) - - -def _make_masks(y): - """Create one-hot encoding of vector ``y``. - - Parameters - ---------- - y : array, shape (n_samples,) - Data samples labels. - - Returns - ------- - masks: array, shape (n_samples, n_classes) - One-hot encoding of ``y``. 
- """ - masks = OneHotEncoder(sparse=False, dtype=bool).fit_transform(y[:, - np.newaxis]) - # n = y.shape[0] - # masks = np.zeros((n, y.max() + 1), dtype=bool) - # masks[np.arange(n), y] = [True] - return masks diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 5adcdd3404830..3999a7dff93cc 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,10 +1,11 @@ import numpy as np from numpy.testing import assert_array_equal +from sklearn.preprocessing import OneHotEncoder from sklearn.utils import check_random_state from sklearn.utils.testing import assert_raises, assert_equal from sklearn.datasets import load_iris, make_classification from sklearn.model_selection import train_test_split -from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis, _make_masks +from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -59,7 +60,8 @@ def test_finite_differences(): nca = NeighborhoodComponentsAnalysis(None, init=point) X, y, init = nca._validate_params(X, y) - masks = _make_masks(y) + masks = OneHotEncoder(sparse=False, + dtype=bool).fit_transform(y[:, np.newaxis]) diffs = X[:, np.newaxis] - X[np.newaxis] nca.n_iter_ = 0 From 4c81a16af637fdafa0689d457cafe02c48cb6cdc Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 31 Oct 2017 10:42:23 +0100 Subject: [PATCH 07/79] precise that distances are squared --- sklearn/neighbors/nca.py | 2 +- sklearn/neighbors/tests/test_nca.py | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index eb85dc0e540c8..806a954a2ba02 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -450,7 +450,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, # for every sample x_i, compute its contribution to loss and gradient for i in range(X.shape[0]): - # compute distances to x_i in embedded space + # compute squared distances to x_i in embedded space diff_embedded = X_embedded[i] - X_embedded dist_embedded = np.einsum('ij,ij->i', diff_embedded, diff_embedded) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 3999a7dff93cc..7473392da7660 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -50,14 +50,10 @@ def test_finite_differences(): """ # Initialize `transformation`, `X` and `y` and `NCA` random_state = check_random_state(0) - n_features = 10 - num_dims = 2 - n_samples = 100 - n_labels = 3 - y = random_state.randint(0, n_labels, (n_samples)) - point = random_state.randn(num_dims, n_features) - X = random_state.randn(n_samples, n_features) - nca = NeighborhoodComponentsAnalysis(None, init=point) + X, y = make_classification(random_state=random_state) + point = random_state.randn(random_state.randint(1, X.shape[1] + 1), + X.shape[1]) + nca = NeighborhoodComponentsAnalysis(init=point) X, y, init = nca._validate_params(X, y) masks = OneHotEncoder(sparse=False, From 824e940627129f0289efb4e9fe66a929d2a8bbc8 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 31 Oct 2017 15:12:10 +0100 Subject: [PATCH 08/79] remove useless None --- sklearn/neighbors/nca.py | 2 +- sklearn/neighbors/tests/test_nca.py | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 806a954a2ba02..4179faa9f6315 100644 --- a/sklearn/neighbors/nca.py +++ 
b/sklearn/neighbors/nca.py @@ -106,7 +106,7 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): >>> X, y = load_iris(return_X_y=True) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... stratify=y, test_size=0.7, random_state=42) - >>> nca = NeighborhoodComponentsAnalysis(None,random_state=42) + >>> nca = NeighborhoodComponentsAnalysis(random_state=42) >>> nca.fit(X_train, y_train) # doctest: +ELLIPSIS NeighborhoodComponentsAnalysis(...) >>> knn = KNeighborsClassifier(n_neighbors=3) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 7473392da7660..0e2f573d84ec2 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -135,7 +135,7 @@ def test_transformation_dimensions(): # Fail if transformation input dimension does not match inputs dimensions transformation = np.array([[1, 2], [3, 4]]) assert_raises(ValueError, - NeighborhoodComponentsAnalysis(None, + NeighborhoodComponentsAnalysis( init=transformation).fit, X, y) @@ -144,13 +144,13 @@ def test_transformation_dimensions(): transformation = np.array([[1, 2], [3, 4], [5, 6]]) # len(transformation) > len(transformation[0]) assert_raises(ValueError, - NeighborhoodComponentsAnalysis(None, + NeighborhoodComponentsAnalysis( init=transformation).fit, X, y) # Pass otherwise transformation = np.arange(9).reshape(3, 3) - NeighborhoodComponentsAnalysis(None, init=transformation).fit(X, y) + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) def test_n_features_out(): @@ -178,29 +178,29 @@ def test_init_transformation(): X_train, X_test, y_train, y_test = train_test_split(X, y) # Start learning from scratch - nca = NeighborhoodComponentsAnalysis(None, init='identity') + nca = NeighborhoodComponentsAnalysis(init='identity') nca.fit(X_train, y_train) # Initialize with random - nca_random = NeighborhoodComponentsAnalysis(None, init='random') + nca_random = NeighborhoodComponentsAnalysis(init='random') nca_random.fit(X_train, y_train) # Initialize with PCA - nca_pca = NeighborhoodComponentsAnalysis(None, init='pca') + nca_pca = NeighborhoodComponentsAnalysis(init='pca') nca_pca.fit(X_train, y_train) init = np.random.rand(X.shape[1], X.shape[1]) - nca = NeighborhoodComponentsAnalysis(None, init=init) + nca = NeighborhoodComponentsAnalysis(init=init) nca.fit(X_train, y_train) # init.shape[1] must match X.shape[1] init = np.random.rand(X.shape[1], X.shape[1] + 1) - nca = NeighborhoodComponentsAnalysis(None, init=init) + nca = NeighborhoodComponentsAnalysis(init=init) assert_raises(ValueError, nca.fit, X_train, y_train) # init.shape[0] must be <= init.shape[1] init = np.random.rand(X.shape[1] + 1, X.shape[1]) - nca = NeighborhoodComponentsAnalysis(None, init=init) + nca = NeighborhoodComponentsAnalysis(init=init) assert_raises(ValueError, nca.fit, X_train, y_train) # init.shape[0] must match n_features_out @@ -211,7 +211,7 @@ def test_init_transformation(): def test_verbose(): - nca = NeighborhoodComponentsAnalysis(None, verbose=1) + nca = NeighborhoodComponentsAnalysis(verbose=1) nca.fit(iris_data, iris_target) @@ -271,7 +271,7 @@ def test_callable(): y = iris_target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) - nca = NeighborhoodComponentsAnalysis(None, callback='my_cb') + nca = NeighborhoodComponentsAnalysis(callback='my_cb') assert_raises(ValueError, nca.fit, X_train, y_train) max_iter = 10 @@ -280,7 +280,7 @@ def my_cb(transformation, n_iter): rem_iter = max_iter - n_iter print('{} iterations 
remaining...'.format(rem_iter)) - nca = NeighborhoodComponentsAnalysis(None, max_iter=max_iter, + nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1) nca.fit(X_train, y_train) @@ -290,7 +290,7 @@ def test_terminate_early(): y = iris_target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) - nca = NeighborhoodComponentsAnalysis(None, max_iter=5) + nca = NeighborhoodComponentsAnalysis(max_iter=5) nca.fit(X_train, y_train) @@ -299,7 +299,7 @@ def test_store_opt_result(): y = iris_target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) - nca = NeighborhoodComponentsAnalysis(None, max_iter=5, + nca = NeighborhoodComponentsAnalysis(max_iter=5, store_opt_result=True) nca.fit(X_train, y_train) transformation = nca.opt_result_.x From d4294ac74977d4cbaabc8b85125912fe1400b292 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 31 Oct 2017 16:15:20 +0100 Subject: [PATCH 09/79] simplify tests --- sklearn/neighbors/tests/test_nca.py | 129 ++++++++++++---------------- 1 file changed, 55 insertions(+), 74 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 0e2f573d84ec2..7f3edaa9beee2 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -4,7 +4,6 @@ from sklearn.utils import check_random_state from sklearn.utils.testing import assert_raises, assert_equal from sklearn.datasets import load_iris, make_classification -from sklearn.model_selection import train_test_split from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -18,6 +17,24 @@ EPS = np.finfo(float).eps +def test_simple_example(): + """Test on a simple example. + + Puts four points in the input space where the opposite labels points are + next to each other. After transform the same labels points should be next + to each other. + + """ + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + nca = NeighborhoodComponentsAnalysis(n_features_out=2, init='identity', + random_state=42) + nca.fit(X, y) + Xansformed = nca.transform(X) + np.testing.assert_equal(pairwise_distances(Xansformed).argsort()[:, 1], + np.array([2, 3, 0, 1])) + + def test_finite_differences(): r"""Test gradient of loss function @@ -49,10 +66,9 @@ def test_finite_differences(): """ # Initialize `transformation`, `X` and `y` and `NCA` - random_state = check_random_state(0) - X, y = make_classification(random_state=random_state) - point = random_state.randn(random_state.randint(1, X.shape[1] + 1), - X.shape[1]) + X = iris_data + y = iris_target + point = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=point) X, y, init = nca._validate_params(X, y) @@ -67,7 +83,7 @@ def test_finite_differences(): masks) # create a random direction of norm 1 - random_direction = random_state.randn(*point.shape) + random_direction = rng.randn(*point.shape) random_direction /= np.linalg.norm(random_direction) # computes projected gradient @@ -88,24 +104,6 @@ def test_finite_differences(): np.testing.assert_almost_equal(relative_error, 0.) -def test_simple_example(): - """Test on a simple example. - - Puts four points in the input space where the opposite labels points are - next to each other. After transform the same labels points should be next - to each other. 
- - """ - X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) - y = np.array([1, 0, 1, 0]) - nca = NeighborhoodComponentsAnalysis(n_features_out=2, init='identity', - random_state=42) - nca.fit(X, y) - X_transformed = nca.transform(X) - np.testing.assert_equal(pairwise_distances(X_transformed).argsort()[:, 1], - np.array([2, 3, 0, 1])) - - def test_params_validation(): # Test that invalid parameters raise value error X = np.arange(12).reshape(4, 3) @@ -116,8 +114,7 @@ def test_params_validation(): assert_raises(TypeError, NCA(max_iter='21').fit, X, y) assert_raises(TypeError, NCA(verbose='true').fit, X, y) assert_raises(TypeError, NCA(tol=1).fit, X, y) - assert_raises(TypeError, NCA(n_features_out='invalid').fit, - X, y) + assert_raises(TypeError, NCA(n_features_out='invalid').fit, X, y) # ValueError assert_raises(ValueError, NCA(init=1).fit, X, y) @@ -135,8 +132,7 @@ def test_transformation_dimensions(): # Fail if transformation input dimension does not match inputs dimensions transformation = np.array([[1, 2], [3, 4]]) assert_raises(ValueError, - NeighborhoodComponentsAnalysis( - init=transformation).fit, + NeighborhoodComponentsAnalysis(init=transformation).fit, X, y) # Fail if transformation output dimension is larger than @@ -144,8 +140,7 @@ def test_transformation_dimensions(): transformation = np.array([[1, 2], [3, 4], [5, 6]]) # len(transformation) > len(transformation[0]) assert_raises(ValueError, - NeighborhoodComponentsAnalysis( - init=transformation).fit, + NeighborhoodComponentsAnalysis(init=transformation).fit, X, y) # Pass otherwise @@ -175,104 +170,99 @@ def test_n_features_out(): def test_init_transformation(): X, y = make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y) # Start learning from scratch nca = NeighborhoodComponentsAnalysis(init='identity') - nca.fit(X_train, y_train) + nca.fit(X, y) # Initialize with random nca_random = NeighborhoodComponentsAnalysis(init='random') - nca_random.fit(X_train, y_train) + nca_random.fit(X, y) # Initialize with PCA nca_pca = NeighborhoodComponentsAnalysis(init='pca') - nca_pca.fit(X_train, y_train) + nca_pca.fit(X, y) init = np.random.rand(X.shape[1], X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) - nca.fit(X_train, y_train) + nca.fit(X, y) # init.shape[1] must match X.shape[1] init = np.random.rand(X.shape[1], X.shape[1] + 1) nca = NeighborhoodComponentsAnalysis(init=init) - assert_raises(ValueError, nca.fit, X_train, y_train) + assert_raises(ValueError, nca.fit, X, y) # init.shape[0] must be <= init.shape[1] init = np.random.rand(X.shape[1] + 1, X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) - assert_raises(ValueError, nca.fit, X_train, y_train) + assert_raises(ValueError, nca.fit, X, y) # init.shape[0] must match n_features_out init = np.random.rand(X.shape[1], X.shape[1]) nca = NeighborhoodComponentsAnalysis(n_features_out=X.shape[1] - 2, init=init) - assert_raises(ValueError, nca.fit, X_train, y_train) + assert_raises(ValueError, nca.fit, X, y) def test_verbose(): nca = NeighborhoodComponentsAnalysis(verbose=1) nca.fit(iris_data, iris_target) + # TODO: rather assert that some message is printed def test_singleton_class(): X = iris_data y = iris_target - X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y) # one singleton class singleton_class = 1 - ind_singleton, = np.where(y_tr == singleton_class) - y_tr[ind_singleton] = 2 - y_tr[ind_singleton[0]] = singleton_class + ind_singleton, = np.where(y == 
singleton_class) + y[ind_singleton] = 2 + y[ind_singleton[0]] = singleton_class nca = NeighborhoodComponentsAnalysis(max_iter=30) - nca.fit(X_tr, y_tr) + nca.fit(X, y) # One non-singleton class - X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y) - ind_1, = np.where(y_tr == 1) - ind_2, = np.where(y_tr == 2) - y_tr[ind_1] = 0 - y_tr[ind_1[0]] = 1 - y_tr[ind_2] = 0 - y_tr[ind_2[0]] = 2 + ind_1, = np.where(y == 1) + ind_2, = np.where(y == 2) + y[ind_1] = 0 + y[ind_1[0]] = 1 + y[ind_2] = 0 + y[ind_2[0]] = 2 nca = NeighborhoodComponentsAnalysis(max_iter=30) - nca.fit(X_tr, y_tr) + nca.fit(X, y) # Only singleton classes - X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y) - ind_0, = np.where(y_tr == 0) - ind_1, = np.where(y_tr == 1) - ind_2, = np.where(y_tr == 2) - X_tr = X_tr[[ind_0[0], ind_1[0], ind_2[0]]] - y_tr = y_tr[[ind_0[0], ind_1[0], ind_2[0]]] + ind_0, = np.where(y == 0) + ind_1, = np.where(y == 1) + ind_2, = np.where(y == 2) + X = X[[ind_0[0], ind_1[0], ind_2[0]]] + y = y[[ind_0[0], ind_1[0], ind_2[0]]] nca = NeighborhoodComponentsAnalysis(init='identity', max_iter=30) - nca.fit(X_tr, y_tr) + nca.fit(X, y) assert_array_equal(X, nca.transform(X)) def test_one_class(): X = iris_data[iris_target == 0] y = iris_target[iris_target == 0] - X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3) nca = NeighborhoodComponentsAnalysis(max_iter=30, n_features_out=X.shape[1], init='identity') - nca.fit(X_tr, y_tr) + nca.fit(X, y) assert_array_equal(X, nca.transform(X)) def test_callable(): X = iris_data y = iris_target - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) nca = NeighborhoodComponentsAnalysis(callback='my_cb') - assert_raises(ValueError, nca.fit, X_train, y_train) + assert_raises(ValueError, nca.fit, X, y) max_iter = 10 @@ -282,25 +272,16 @@ def my_cb(transformation, n_iter): nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1) - nca.fit(X_train, y_train) - - -def test_terminate_early(): - X = iris_data - y = iris_target - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) - - nca = NeighborhoodComponentsAnalysis(max_iter=5) - nca.fit(X_train, y_train) + nca.fit(X, y) + # TODO: rather assert that message is printed def test_store_opt_result(): X = iris_data y = iris_target - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) nca = NeighborhoodComponentsAnalysis(max_iter=5, store_opt_result=True) - nca.fit(X_train, y_train) + nca.fit(X, y) transformation = nca.opt_result_.x assert_equal(transformation.size, X.shape[1]**2) From 296e295e011d92bb83f34f3e8e88f1d0f2e34a22 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 2 Nov 2017 16:13:41 +0100 Subject: [PATCH 10/79] ensure min samples = 2 to make check_fit2d_1sample pass --- sklearn/neighbors/nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 4179faa9f6315..0e4edd56ec473 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -278,7 +278,7 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. 
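# Illustrative aside, not part of the patch: with ``ensure_min_samples=2``, as
# introduced just below, fitting on a single sample is rejected during input
# validation, which is what the ``check_fit2d_1sample`` estimator check named
# in the commit message exercises.
import numpy as np
from sklearn.utils.validation import check_X_y
try:
    check_X_y(np.array([[0., 1., 2.]]), np.array([0]), ensure_min_samples=2)
except ValueError as exc:
    print(exc)  # reports that at least 2 samples are required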
- X_valid, y_valid = check_X_y(X, y) + X_valid, y_valid = check_X_y(X, y, ensure_min_samples=2) check_classification_targets(y_valid) y_valid = LabelEncoder().fit_transform(y_valid) From 616f9a23639f65700cc958cea53c7c8e566b3435 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 7 Nov 2017 16:47:25 +0100 Subject: [PATCH 11/79] Do not precompute pairwise differences Indeed, they do not add a significative speedup but have a high memory cost. --- sklearn/neighbors/nca.py | 15 +++++---------- sklearn/neighbors/tests/test_nca.py | 12 +++++------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 0e4edd56ec473..7a6a3b1da3715 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -183,8 +183,6 @@ def fit(self, X, y): # mask for fast lookup of same-class samples masks = OneHotEncoder(sparse=False, dtype=bool).fit_transform(y_valid[:, np.newaxis]) - # pairwise differences - diffs = X_valid[:, np.newaxis] - X_valid[np.newaxis] # Initialize the transformation transformation = self._initialize(X_valid, init) @@ -193,7 +191,7 @@ def fit(self, X, y): disp = self.verbose - 2 if self.verbose > 1 else -1 optimizer_params = {'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, - 'args': (X_valid, y_valid, diffs, masks, -1.0), + 'args': (X_valid, y_valid, masks, -1.0), 'jac': True, 'x0': transformation, 'tol': self.tol, @@ -400,8 +398,7 @@ def _callback(self, transformation): self.n_iter_ += 1 - def _loss_grad_lbfgs(self, transformation, X, y, diffs, - masks, sign=1.0): + def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): """Compute the loss and the loss gradient w.r.t. ``transformation``. Parameters @@ -415,9 +412,6 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, y : array, shape (n_samples,) The corresponding training labels. - diffs : array, shape (n_samples, n_samples, n_features) - Pairwise differences between training samples. - masks : array, shape (n_samples, n_classes) One-hot encoding of y. 
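# A rough, illustrative sketch (not part of the patch) of the memory trade-off
# behind this commit; the sample and feature counts are arbitrary assumptions,
# not values used anywhere in the estimator.
import numpy as np
n_samples, n_features = 2000, 64
X_demo = np.random.randn(n_samples, n_features)
# Precomputing every pairwise difference needs an
# (n_samples, n_samples, n_features) tensor: 2000 * 2000 * 64 * 8 bytes,
# i.e. roughly 2 GB of memory.
# Recomputing the differences of a single sample inside the loop, as the hunk
# below does, only ever needs an (n_samples, n_features) array (~1 MB here).
diffs_i = X_demo[0, :] - X_demo   # shape (2000, 64)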
@@ -462,8 +456,9 @@ def _loss_grad_lbfgs(self, transformation, X, y, diffs, logsumexp(-dist_embedded)) ci = masks[:, y[i]] # samples that are in the same class as x_i p_i_j = exp_dist_embedded[ci] - diff_ci = diffs[i, ci, :] - diff_not_ci = diffs[i, ~ci, :] + diffs = X[i, :] - X + diff_ci = diffs[ci, :] + diff_not_ci = diffs[~ci, :] sum_ci = diff_ci.T.dot( (p_i_j[:, np.newaxis] * diff_embedded[ci, :])) sum_not_ci = diff_not_ci.T.dot((exp_dist_embedded[~ci][:, diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 7f3edaa9beee2..1f0bd3d338cde 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -74,13 +74,11 @@ def test_finite_differences(): X, y, init = nca._validate_params(X, y) masks = OneHotEncoder(sparse=False, dtype=bool).fit_transform(y[:, np.newaxis]) - diffs = X[:, np.newaxis] - X[np.newaxis] nca.n_iter_ = 0 point = nca._initialize(X, init) # compute the gradient at `point` - _, gradient = nca._loss_grad_lbfgs(point, X, y, diffs, - masks) + _, gradient = nca._loss_grad_lbfgs(point, X, y, masks) # create a random direction of norm 1 random_direction = rng.randn(*point.shape) @@ -92,10 +90,10 @@ def test_finite_differences(): # compute finite differences eps = 1e-5 - right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, - X, y, diffs, masks) - left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, - X, y, diffs, masks) + right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, X, y, + masks) + left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, X, y, + masks) finite_differences = 1/(2*eps) * (right_loss - left_loss) # compute relative error From 12cf3a9061e67a32ba57f7e42d489ca70ea77f6a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 14 Nov 2017 14:43:03 +0100 Subject: [PATCH 12/79] add example --- examples/neighbors/plot_nca_dim_reduction.py | 102 +++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 examples/neighbors/plot_nca_dim_reduction.py diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py new file mode 100644 index 0000000000000..47f3374bf66d0 --- /dev/null +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -0,0 +1,102 @@ +""" +============================================================== +Dimensionality Reduction with Neighborhood Components Analysis +============================================================== + +Sample usage of Neighborhood Components Analysis for dimensionality reduction. + +This example compares different (linear) dimensionality reduction methods +applied on the Digits data set. The data set contains images of digits from +0 to 9 with approximately 180 samples of each class. Each image is of +dimension 8x8 = 64, and is reduced to a two-dimensional data point. + +Principal Component Analysis (PCA) applied to this data identifies the +combination of attributes (principal components, or directions in the +feature space) that account for the most variance in the data. Here we +plot the different samples on the 2 first principal components. + +Linear Discriminant Analysis (LDA) tries to identify attributes that +account for the most variance *between classes*. In particular, +LDA, in contrast to PCA, is a supervised method, using known class labels. + +Neighborhood Components Analysis (NCA) tries to find a feature space such +that a stochastic nearest neighbor algorithm will give the best accuracy. +Like LDA, it is a supervised method. 
+ +One can see that NCA enforces a clustering of the data that is visually +meaningful even after the large dimensionality reduction. +""" + +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt +from sklearn import datasets +from sklearn.model_selection import train_test_split +from sklearn.decomposition import PCA +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.neighbors import KNeighborsClassifier, \ + NeighborhoodComponentsAnalysis +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +print(__doc__) + +n_neighbors = 3 +random_state = 0 + +# Load Digits dataset +digits = datasets.load_digits() +X, y = digits.data, digits.target + +# Split into train/test +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=0.5, stratify=y, + random_state=random_state) + +dim = len(X[0]) +n_classes = len(np.unique(y)) + +# Reduce dimension to 2 with PCA +pca = make_pipeline(StandardScaler(), + PCA(n_components=2, random_state=random_state)) + +# Reduce dimension to 2 with LinearDiscriminantAnalysis +lda = make_pipeline(StandardScaler(), + LinearDiscriminantAnalysis(n_components=2)) + +# Reduce dimension to 2 with NeighborhoodComponentAnalysis +nca = make_pipeline(StandardScaler(), + NeighborhoodComponentsAnalysis(n_features_out=2, + verbose=1, + random_state=random_state)) + +# Use a nearest neighbor classifier to evaluate the methods +knn = KNeighborsClassifier(n_neighbors=n_neighbors) + +# Make a list of the methods to be compared +dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)] + +plt.figure() +for i, (name, model) in enumerate(dim_reduction_methods): + plt.subplot(1, 3, i + 1) + + # Fit the method's model + model.fit(X_train, y_train) + + # Fit a nearest neighbor classifier on the embedded training set + knn.fit(model.transform(X_train), y_train) + + # Compute the nearest neighbor accuracy on the embedded test set + acc_knn = knn.score(model.transform(X_test), y_test) + + # Embed the data set in 2 dimensions using the fitted model + X_embedded = model.transform(X) + + # Plot the embedding and show the evaluation score + plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y) + plt.title("{}, KNN (k={})".format(name, n_neighbors)) + plt.text(0.9, 0.1, '{:.2f}'.format(acc_knn), size=15, + ha='center', va='center', transform=plt.gca().transAxes) + +plt.show() From 7b37e8d97abb27650876bf1e9319b78cf51eb576 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 14 Nov 2017 14:48:22 +0100 Subject: [PATCH 13/79] reorganize transposes --- sklearn/neighbors/nca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 7a6a3b1da3715..d9929ffb0ee17 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -440,7 +440,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): transformation = transformation.reshape(-1, X.shape[1]) loss = 0 gradient = np.zeros(transformation.shape) - X_embedded = transformation.dot(X.T).T + X_embedded = np.dot(X, transformation.T) # for every sample x_i, compute its contribution to loss and gradient for i in range(X.shape[0]): @@ -466,7 +466,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): diff_embedded[~ci, :])) p_i = np.sum(p_i_j) # probability of x_i to be correctly # classified - gradient += 2 * (p_i * (sum_ci.T + sum_not_ci.T) - sum_ci.T) + gradient += 2 * (p_i * (sum_ci + sum_not_ci) - sum_ci).T loss += p_i 
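# Quick numerical check, a standalone sketch rather than part of the patch, of
# the two identities relied on by this commit and by the following "simplify
# gradient" commit; the array shapes are arbitrary assumptions.
import numpy as np
rng = np.random.RandomState(0)
L_mat = rng.randn(2, 4)      # plays the role of ``transformation``
X_demo = rng.randn(5, 4)     # a handful of illustrative samples
# (L X^T)^T == X L^T, so the embedding needs no transposes of X:
np.testing.assert_allclose(L_mat.dot(X_demo.T).T,
                           np.dot(X_demo, L_mat.T), atol=1e-12)
# p_i * (a + b) - a == p_i * b + (p_i - 1) * a, the simplification applied next:
a, b, p_i = rng.randn(4, 2), rng.randn(4, 2), 0.7
np.testing.assert_allclose(p_i * (a + b) - a,
                           p_i * b + (p_i - 1) * a, atol=1e-12)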
if self.verbose: From 48cab1105dd9c3c11ccec547195898b43538b50e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 14 Nov 2017 15:07:15 +0100 Subject: [PATCH 14/79] simplify gradient --- sklearn/neighbors/nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index d9929ffb0ee17..207a56c27c0bb 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -466,7 +466,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): diff_embedded[~ci, :])) p_i = np.sum(p_i_j) # probability of x_i to be correctly # classified - gradient += 2 * (p_i * (sum_ci + sum_not_ci) - sum_ci).T + gradient += 2 * (p_i * sum_not_ci + (p_i - 1) * sum_ci).T loss += p_i if self.verbose: From 47928aa4759e177f4ce385575f64f2e77fdac8bd Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 22 Nov 2017 11:55:32 +0100 Subject: [PATCH 15/79] Fixes according to code review --- examples/neighbors/plot_nca_dim_reduction.py | 7 +++---- sklearn/neighbors/nca.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index 47f3374bf66d0..2c31c6155a2b9 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -95,8 +95,7 @@ # Plot the embedding and show the evaluation score plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y) - plt.title("{}, KNN (k={})".format(name, n_neighbors)) - plt.text(0.9, 0.1, '{:.2f}'.format(acc_knn), size=15, - ha='center', va='center', transform=plt.gca().transAxes) - + plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, + n_neighbors, + acc_knn)) plt.show() diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 207a56c27c0bb..9d213555f82a6 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -12,8 +12,7 @@ import time from scipy.misc import logsumexp from scipy.optimize import minimize -from sklearn.preprocessing import OneHotEncoder - +from ..preprocessing import OneHotEncoder from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder from ..decomposition import PCA @@ -406,6 +405,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): transformation : array, shape (n_features_out, n_features) The linear transformation on which to compute loss and evaluate gradient + X : array, shape (n_samples, n_features) The training samples. From 4612e5f049d5f327b8c97b58aa9df6103c6ed796 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 13 Dec 2017 13:38:13 +0100 Subject: [PATCH 16/79] Retrieving LMNN documentation in order to adapt it to NCA --- doc/modules/neighbors.rst | 226 ++++++++++++++++++ examples/neighbors/plot_nca_classification.py | 87 +++++++ examples/neighbors/plot_nca_dim_reduction.py | 7 +- examples/neighbors/plot_nca_illustration.py | 101 ++++++++ .../neighbors/plot_nca_illustration_bis.py | 79 ++++++ 5 files changed, 497 insertions(+), 3 deletions(-) create mode 100644 examples/neighbors/plot_nca_classification.py create mode 100644 examples/neighbors/plot_nca_illustration.py create mode 100644 examples/neighbors/plot_nca_illustration_bis.py diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index b023178e46f8d..119b590276f14 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -514,3 +514,229 @@ the model from 0.81 to 0.82. 
* :ref:`sphx_glr_auto_examples_neighbors_plot_nearest_centroid.py`: an example of classification using nearest centroid with different shrink thresholds. + + + +.. _nca: + +Neighborhood Components Analysis +================================ + +.. sectionauthor:: William de Vazelhes + +Neighborhood Components Analysis (NCA, +:class:`NeighborhoodComponentAnalysis`) is +a distance metric learning algorithm which aims to improve the accuracy of +nearest neighbors classification compared to the standard Euclidean distance. + +.. |nca_illustration_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_illustration_001.png + :target: ../auto_examples/neighbors/plot_nca_illustration.html + :scale: 50 + +.. |nca_illustration_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_illustration_002.png + :target: ../auto_examples/neighbors/plot_nca_illustration.html + :scale: 50 + +.. centered:: |nca_illustration_1| |nca_illustration_2| + +The algorithm directly maximizes a stochastic variant of the +leave-one-out k-nearest neighbors (KNN) score on the training set. It can also +learn a low-dimensional linear embedding of labeled data that can be +used for data visualization and fast classification. Unlike other methods, +our classification model is non-parametric, making no assumptions about the +shape of the class distributions or the boundaries between them. The +performance of the method is demonstrated on several data sets, both for metric +learning and linear dimensionality reduction. +In the above figure, we consider some points from a randomly generated +dataset. We focus on the stochastic KNN classification of point n°3. In the +original +space, it has +many stochastic neighbors, the thickness of the bond representing the +softmax distance to it hence their weight in the prediction of its class. +However, in the embedding space, the only non-negligible stochastic +neighbors are from the same class as sample 3, guaranteeing that it will be +well classified. + + + +Classification +-------------- + +Combined with a nearest neighbors classifier (:class:`KNeighborsClassifier`), +this method is attractive for classification because it can naturally +handle multi-class problems without any increase in the model size, and only +a single parameter (``n_neighbors``) has to be selected by the user before +training. + +Neighborhood Components Analysis classification has been shown to work well in +practice for data sets of varying size and difficulty. In contrast to +related methods such as Linear Discriminant Analysis, NCA does not make any +assumptions about the class distributions. The nearest neighbor classification +can naturally produce highly irregular decision boundaries. + +To use this model for classification, one needs to combine a +:class:`NeighborhoodComponentsAnalysis` +instance that learns the optimal transformation with a :class:`KNeighborsClassifier` +instance that performs the classification in the embedded space. Here is an +example using the two classes: + + >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... 
stratify=y, test_size=0.7, random_state=42) + >>> nca = NeighborhoodComponentsAnalysis(random_state=42) + >>> nca.fit(X_train, y_train) # doctest: +ELLIPSIS + NeighborhoodComponentsAnalysis(...) + >>> # Apply the learned transformation when using KNeighborsClassifier + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(nca.transform(X_train), y_train) # doctest: +ELLIPSIS + KNeighborsClassifier(...) + >>> print(knn.score(nca.transform(X_test), y_test)) + 0.961904761905 + +Alternatively, one can create a :class:`sklearn.pipeline.Pipeline` instance +that automatically applies the transformation when fitting or predicting: + + >>> from sklearn.pipeline import Pipeline + >>> nca = NeighborhoodComponentsAnalysis(random_state=42) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> nca_pipe = Pipeline([('nca', nca), ('knn', knn)]) + >>> nca_pipe.fit(X_train, y_train) # doctest: +ELLIPSIS + Pipeline(...) + >>> print(nca_pipe.score(X_test, y_test)) + 0.961904761905 + +.. |nca_classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_classification_001.png + :target: ../auto_examples/neighbors/plot_nca_classification.html + :scale: 50 + +.. |nca_classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_classification_002.png + :target: ../auto_examples/neighbors/plot_nca_classification.html + :scale: 50 + +.. centered:: |nca_classification_1| |nca_classification_2| + + +The plot shows decision boundaries for nearest neighbor classification and +large margin nearest neighbor classification. + + +Dimensionality reduction +------------------------ + +:class:`NeighborhoodComponentsAnalysis` can be used to perform supervised +dimensionality reduction. The input data are projected onto a linear subspace +consisting of the directions which minimize the NCA objective. The desired +dimensionality can be set using the parameter ``n_features_out``. +For instance, the following shows a comparison of dimensionality reduction +with Principal Component Analysis (:class:`sklearn.decomposition.PCA`), +Linear Discriminant Analysis (:class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) +and Neighborhood Component Analysis (:class:`NeighborhoodComponentsAnalysis`) +on the Digits dataset, a dataset with size +:math:`n_{samples} = 1797` and :math:`n_{features} = 64`. +The data set is splitted in a training and test set of equal size. What is +more, a :class:`sklearn.preprocessing.StandardScaler` fitted on the training +set and +transforms +the data from both sets. For evaluation the 3-nearest neighbor classification +accuracy is +computed on the +2-dimensional embedding found by each method. Each data sample belongs to one +of 10 classes. + +.. |nca_dim_reduction_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_dim_reduction_001.png + :target: ../auto_examples/neighbors/plot_nca_dim_reduction.html + :width: 32% + +.. |nca_dim_reduction_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_dim_reduction_002.png + :target: ../auto_examples/neighbors/plot_nca_dim_reduction.html + :width: 32% + +.. |nca_dim_reduction_3| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_dim_reduction_003.png + :target: ../auto_examples/neighbors/plot_nca_dim_reduction.html + :width: 32% + +.. centered:: |nca_dim_reduction_1| |nca_dim_reduction_2| |nca_dim_reduction_3| + + +Mathematical formulation +------------------------ + +NCA learns a linear transformation matrix :math:`L` of +size ``(n_features_out, n_features)``. 
NCA maximises in average the +probability :math:`p_i` of sample :math:`i` being +classified as :math:`C_i`, where :math:`p_i` is a weighted sum of all other +samples of +class :math:`C_i`, with a weighting related to their distance to :math:`i`. + +The contribution of sample :math:`i` to the cost function is therefore the +following (it is the probability of sample :math:`i` to be classify as +:math:`C`): + +.. math:: + + p_{i}=\sum\nolimits_{j \in C_i}{p_{i j}} + +where :math:`C_i` is the set of points in the same class as sample :math:`i`, +and :math:`p_{i j}` is the softmax over Euclidean distances in the +transformed space: + +.. math:: + + p_{i j} = \frac{\exp(-||L x_i - L x_j||^2)}{\sum\nolimits_{k \ne + i} {\exp{-(||L x_i - L x_k||^2)}}} , p_{i i} = 0 + + +Mahalanobis distance +^^^^^^^^^^^^^^^^^^^^ + +NCA can be seen as learning a (squared) Mahalanobis distance metric: + +.. math:: + + || L(x_i - x_j)||^2 = (x_i - x_j)^TM(x_i - x_j), + +where :math:`M = L^T L` is a symmetric positive semi-definite matrix of size +``(n_features_out, n_features_out)``. + + +Implementation +-------------- + +This implementation follows what is explained in the paper. For the +optimisation method, +currently it uses scipy's l-bfgs-b optimization method with a full gradient +computation at each iteration, to avoid to tune the +learning rate and provide a stable learning. + +See the examples below and the doc string of +:meth:`NeighborhoodComponentsAnalysis.fit` +for further information. + +Complexity +---------- + +All pairwise differences are needed to compute the cost function, at each +iteration, so the complexity is :math:`O(d*n^2*i)` with :math:`d` the +dimension of the input space, :math:`n` the number of samples, and :math:`i` + the number of iterations. + + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_classification.py` + * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_dim_reduction.py` + + +.. topic:: References: + + * | `"Neighbourhood Components Analysis". Advances in Neural Information" + `_, + | J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov, Advances in + | Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. + + * `Wikipedia entry on Neighborhood Components Analysis + `_ diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py new file mode 100644 index 0000000000000..d84d96ad9314a --- /dev/null +++ b/examples/neighbors/plot_nca_classification.py @@ -0,0 +1,87 @@ +""" +============================================================================ +Comparing Nearest Neighbors and Neighborhood Components Analysis +============================================================================ + +An example comparing nearest neighbors classification with and without +Neighborhood Components Analysis. + +It will plot the decision boundaries for each class determined by a simple +Nearest Neighbors classifier against the decision boundaries determined by a +Neighborhood Components Analysis classifier. The latter aims to find a distance +metric that maximizes the nearest neighbor classification accuracy on a given +training set. 
+""" + +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap +from sklearn import datasets +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.neighbors import KNeighborsClassifier, \ + NeighborhoodComponentsAnalysis +from sklearn.pipeline import Pipeline + + +print(__doc__) + +n_neighbors = 1 + +dataset = datasets.load_iris() +X, y = dataset.data, dataset.target + +# we only take the first two features. We could avoid this ugly +# slicing by using a two-dim datasets +X = X[:, [0, 2]] + +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, stratify=y, test_size=0.7, random_state=42) + +h = .01 # step size in the mesh + +# Create color maps +cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) +cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) + +names = ['K-Nearest Neighbors', 'Neighborhood Components Analysis'] + +classifiers = [Pipeline([('scaler', StandardScaler()), + ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) + ]), + Pipeline([('scaler', StandardScaler()), + ('nca', NeighborhoodComponentsAnalysis()), + ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) + ]) + ] + +x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 +y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + +for name, clf in zip(names, classifiers): + + clf.fit(X_train, y_train) + score = clf.score(X_test, y_test) + + # Plot the decision boundary. For that, we will assign a color to each + # point in the mesh [x_min, x_max]x[y_min, y_max]. + Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + plt.figure() + plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8) + + # Plot also the training and testing points + plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20) + plt.xlim(xx.min(), xx.max()) + plt.ylim(yy.min(), yy.max()) + plt.title("{} (k = {})".format(name, n_neighbors)) + plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15, + ha='center', va='center', transform=plt.gca().transAxes) + +plt.show() diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index 2c31c6155a2b9..acb5e7c5b03fe 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -77,9 +77,10 @@ # Make a list of the methods to be compared dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)] -plt.figure() +# plt.figure() for i, (name, model) in enumerate(dim_reduction_methods): - plt.subplot(1, 3, i + 1) + plt.figure() + # plt.subplot(1, 3, i + 1, aspect=1) # Fit the method's model model.fit(X_train, y_train) @@ -94,7 +95,7 @@ X_embedded = model.transform(X) # Plot the embedding and show the evaluation score - plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y) + plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1') plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, n_neighbors, acc_knn)) diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py new file mode 100644 index 0000000000000..1eca8e0bfcbec --- /dev/null +++ b/examples/neighbors/plot_nca_illustration.py @@ -0,0 +1,101 @@ +""" +========================================== +Large Margin Nearest Neighbor Illustration +========================================== + +An example 
illustrating the goal of learning a distance metric that maximizes +the nearest neighbors classification accuracy. The example is solely for +illustration purposes. Please refer to the :ref:`User Guide ` for +more information. +""" + +# Author: John Chiotellis +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.datasets import make_classification +from sklearn.neighbors import NeighborhoodComponentsAnalysis +from matplotlib import cm +from scipy.misc import logsumexp + +from sklearn.preprocessing import OneHotEncoder + +print(__doc__) + +n_neighbors = 1 +random_state = 0 + +# Create a tiny data set of 9 samples from 3 classes +X, y = make_classification(n_samples=9, n_features=2, n_informative=2, + n_redundant=0, n_classes=3, n_clusters_per_class=1, + class_sep=1.0, random_state=random_state) + +# Plot the points in the original space +plt.figure() +ax = plt.gca() + +# Draw the graph nodes +ax.scatter(X[:, 0], X[:, 1], s=300, c=y, cmap='tab10', alpha=0.4) +for i in range(X.shape[0]): + ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center') + + + +def p_i(X, i): + diff_embedded = X[i] - X + dist_embedded = np.einsum('ij,ij->i', diff_embedded, + diff_embedded) + dist_embedded[i] = np.inf + + # compute exponentiated distances (use the log-sum-exp trick to + # avoid numerical instabilities + exp_dist_embedded = np.exp(-dist_embedded - + logsumexp(-dist_embedded)) + return exp_dist_embedded + + +masks = OneHotEncoder(sparse=False, + dtype=bool).fit_transform(y[:, np.newaxis]) + +# for i, pt_i in enumerate(X): # or maybe select only one point +i=3 +def relate_point(X, i, ax): + pt_i = X[i] + for j, pt_j in enumerate(X): + thickness = p_i(X, i) + if i != j: + line = ([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]]) + ax.plot(*line, c=cm.tab10(y[j]), + linewidth=5*thickness[j]) + +relate_point(X, i, ax) +ax.set_title("Original points") +ax.axes.get_xaxis().set_visible(False) +ax.axes.get_yaxis().set_visible(False) + +# Learn an embedding with LargeMarginNearestNeighbor +nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state) +nca = nca.fit(X, y) + +# Plot the points after transformation with LargeMarginNearestNeighbor +plt.figure() +ax2 = plt.gca() + +# Get the embedding and find the new nearest neighbors +X_embedded = nca.transform(X) + +ax2.scatter(X_embedded[:, 0], X_embedded[:, 1], s=300, c=y, cmap='tab10', + alpha=0.4) + +relate_point(X_embedded, i, ax2) + +for i in range(len(X)): + ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i), + va='center', ha='center') + +# Make axes equal so that boundaries are displayed correctly as circles +ax2.set_title("NCA embedding") +ax2.axes.get_xaxis().set_visible(False) +ax2.axes.get_yaxis().set_visible(False) +plt.show() \ No newline at end of file diff --git a/examples/neighbors/plot_nca_illustration_bis.py b/examples/neighbors/plot_nca_illustration_bis.py new file mode 100644 index 0000000000000..96356c0eccf4e --- /dev/null +++ b/examples/neighbors/plot_nca_illustration_bis.py @@ -0,0 +1,79 @@ +""" +========================================== +Large Margin Nearest Neighbor Illustration +========================================== + +An example illustrating the goal of learning a distance metric that maximizes +the nearest neighbors classification accuracy. The example is solely for +illustration purposes. Please refer to the :ref:`User Guide ` for +more information. 
+""" + +# Author: John Chiotellis +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap + +from sklearn.datasets import make_classification +from sklearn.neighbors import KNeighborsClassifier, \ + NeighborhoodComponentsAnalysis + +from sklearn.pipeline import Pipeline + +print(__doc__) + +n_neighbors = 1 +random_state = 0 + +cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) +cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) + +# Create a tiny data set of 9 samples from 3 classes +X, y = make_classification(n_samples=9, n_features=2, n_informative=2, + n_redundant=0, n_classes=3, n_clusters_per_class=1, + class_sep=1.0, random_state=random_state) + +X = np.vstack([np.hstack([np.linspace(0, 1, 5)[:, np.newaxis], + np.linspace(0, 1, 5)[:, np.newaxis] + 0.3]), + np.hstack([np.linspace(0, 1, 3)[:, np.newaxis], + np.linspace(0, 1, 3)[:, np.newaxis]])]) +y = np.hstack([np.zeros(5), np.ones(3)]) + +n_neighbors = 1 + +x_min, x_max = X[:, 0].min() - 0.2, X[:, 0].max() + 0.2 +y_min, y_max = X[:, 1].min() - 0.2, X[:, 1].max() + 0.2 +xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.005), + np.arange(y_min, y_max, 0.005)) +grid = np.c_[xx.ravel(), yy.ravel()] + +# plot decision boundary with knn in the input space +knn = KNeighborsClassifier(n_neighbors) +knn.fit(X, y) + +plt.figure() +Z = knn.predict(grid).reshape(xx.shape) +plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8) +plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=20, edgecolor='k') + +# plot decision boundary with NCA in the input space +nca = NeighborhoodComponentsAnalysis(verbose=1) +X_embedded = nca.fit_transform(X, y) + +plt.figure() +grid_e = nca.transform(grid) +knn.fit(X_embedded, y) +Z = knn.predict(grid_e).reshape(xx.shape) +plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8) +plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=20, edgecolor='k') + +# # plot decision boundary with nca in the embedding space +# plt.figure() +# Z = knn.predict(grid_e).reshape(xx.shape) +# plt.pcolormesh(grid_e[:, 0].reshape(xx.shape), grid_e[:, 1].reshape(xx.shape), +# Z, cmap=cmap_light, alpha=.8) +# plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, cmap=cmap_bold, s=20, \ +# edgecolor='k') +# plt.show() From 27ab46bb8505ae66ed87624343b4600c8e6fcd0a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Sat, 30 Dec 2017 00:29:39 +0100 Subject: [PATCH 17/79] Adapt documentation to Neighborhood Components Analysis --- doc/modules/neighbors.rst | 121 +++++++++--------- examples/neighbors/plot_nca_classification.py | 4 +- examples/neighbors/plot_nca_dim_reduction.py | 1 - examples/neighbors/plot_nca_illustration.py | 19 +-- .../neighbors/plot_nca_illustration_bis.py | 79 ------------ 5 files changed, 70 insertions(+), 154 deletions(-) delete mode 100644 examples/neighbors/plot_nca_illustration_bis.py diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 119b590276f14..8c34df65fc352 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -524,10 +524,13 @@ Neighborhood Components Analysis .. 
sectionauthor:: William de Vazelhes -Neighborhood Components Analysis (NCA, -:class:`NeighborhoodComponentAnalysis`) is -a distance metric learning algorithm which aims to improve the accuracy of +Neighborhood Components Analysis (NCA, :class:`NeighborhoodComponentsAnalysis` ) +is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard Euclidean distance. +The algorithm directly maximizes a stochastic variant of the +leave-one-out k-nearest neighbors (KNN) score on the training set. It can also +learn a low-dimensional linear embedding of labeled data that can be +used for data visualization and fast classification. .. |nca_illustration_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_illustration_001.png :target: ../auto_examples/neighbors/plot_nca_illustration.html @@ -539,23 +542,15 @@ nearest neighbors classification compared to the standard Euclidean distance. .. centered:: |nca_illustration_1| |nca_illustration_2| -The algorithm directly maximizes a stochastic variant of the -leave-one-out k-nearest neighbors (KNN) score on the training set. It can also -learn a low-dimensional linear embedding of labeled data that can be -used for data visualization and fast classification. Unlike other methods, -our classification model is non-parametric, making no assumptions about the -shape of the class distributions or the boundaries between them. The -performance of the method is demonstrated on several data sets, both for metric -learning and linear dimensionality reduction. -In the above figure, we consider some points from a randomly generated -dataset. We focus on the stochastic KNN classification of point n°3. In the -original -space, it has -many stochastic neighbors, the thickness of the bond representing the -softmax distance to it hence their weight in the prediction of its class. -However, in the embedding space, the only non-negligible stochastic -neighbors are from the same class as sample 3, guaranteeing that it will be -well classified. + +In the above figure, we consider some points from a randomly generated dataset. +We focus on the stochastic KNN classification of point n°3, the thickness of a +bond representing a softmax distance hence the weight of the neighbor vote in +the classification. In the original space, sample 3 has many stochastic +neighbors from various classes, so the right class is not very likely. However, +in the embedding space, the only non-negligible stochastic neighbors are from +the same class as sample 3, guaranteeing that the latter will be well +classified. @@ -564,8 +559,8 @@ Classification Combined with a nearest neighbors classifier (:class:`KNeighborsClassifier`), this method is attractive for classification because it can naturally -handle multi-class problems without any increase in the model size, and only -a single parameter (``n_neighbors``) has to be selected by the user before +handle multi-class problems without any increase in the model size, and no +additional parameter than that of KNN has to be selected by the user before training. Neighborhood Components Analysis classification has been shown to work well in @@ -575,10 +570,9 @@ assumptions about the class distributions. The nearest neighbor classification can naturally produce highly irregular decision boundaries. 
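The stochastic neighbor weights mentioned above can be sketched in a few lines
of NumPy. The helper below is purely illustrative: its name and its
``X_embedded`` argument (holding already-transformed training points) are
assumptions made for this snippet, not part of the estimator's API::

    import numpy as np
    from scipy.misc import logsumexp

    def soft_neighbor_weights(X_embedded, i):
        # squared Euclidean distances from sample i to every sample
        diff = X_embedded[i] - X_embedded
        dist = np.einsum('ij,ij->i', diff, diff)
        dist[i] = np.inf  # a sample never votes for itself
        # softmax over negative distances, stabilized with log-sum-exp
        return np.exp(-dist - logsumexp(-dist))

Summing these weights over the samples sharing the label of sample ``i`` gives
the probability of ``i`` being correctly classified, which is the quantity NCA
maximizes.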
To use this model for classification, one needs to combine a -:class:`NeighborhoodComponentsAnalysis` -instance that learns the optimal transformation with a :class:`KNeighborsClassifier` -instance that performs the classification in the embedded space. Here is an -example using the two classes: +:class:`NeighborhoodComponentsAnalysis` instance that learns the optimal +transformation with a :class:`KNeighborsClassifier` instance that performs the +classification in the embedded space. Here is an example using the two classes: >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis >>> from sklearn.neighbors import KNeighborsClassifier @@ -619,9 +613,9 @@ that automatically applies the transformation when fitting or predicting: .. centered:: |nca_classification_1| |nca_classification_2| - -The plot shows decision boundaries for nearest neighbor classification and -large margin nearest neighbor classification. +The plot shows decision boundaries for Nearest Neighbor Classification and +Neighborhood Components Analysis classification, when training and scoring +on only two features, for visualisation purpose. Dimensionality reduction @@ -630,20 +624,16 @@ Dimensionality reduction :class:`NeighborhoodComponentsAnalysis` can be used to perform supervised dimensionality reduction. The input data are projected onto a linear subspace consisting of the directions which minimize the NCA objective. The desired -dimensionality can be set using the parameter ``n_features_out``. -For instance, the following shows a comparison of dimensionality reduction -with Principal Component Analysis (:class:`sklearn.decomposition.PCA`), -Linear Discriminant Analysis (:class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) +dimensionality can be set using the parameter ``n_features_out``. For instance, +the following shows a comparison of dimensionality reduction with Principal +Component Analysis (:class:`sklearn.decomposition.PCA`), Linear Discriminant +Analysis (:class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) and Neighborhood Component Analysis (:class:`NeighborhoodComponentsAnalysis`) -on the Digits dataset, a dataset with size -:math:`n_{samples} = 1797` and :math:`n_{features} = 64`. -The data set is splitted in a training and test set of equal size. What is -more, a :class:`sklearn.preprocessing.StandardScaler` fitted on the training -set and -transforms -the data from both sets. For evaluation the 3-nearest neighbor classification -accuracy is -computed on the +on the Digits dataset, a dataset with size :math:`n_{samples} = 1797` and +:math:`n_{features} = 64`. The data set is splitted in a training and test set +of equal size. What is more, a :class:`sklearn.preprocessing.StandardScaler` +fitted on the training set and transforms the data from both sets. For +evaluation the 3-nearest neighbor classification accuracy is computed on the 2-dimensional embedding found by each method. Each data sample belongs to one of 10 classes. @@ -666,15 +656,9 @@ Mathematical formulation ------------------------ NCA learns a linear transformation matrix :math:`L` of -size ``(n_features_out, n_features)``. NCA maximises in average the +size ``(n_features_out, n_features)``, which maximises in average the probability :math:`p_i` of sample :math:`i` being -classified as :math:`C_i`, where :math:`p_i` is a weighted sum of all other -samples of -class :math:`C_i`, with a weighting related to their distance to :math:`i`. 
- -The contribution of sample :math:`i` to the cost function is therefore the -following (it is the probability of sample :math:`i` to be classify as -:math:`C`): +classified as :math:`C_i`, defined by: .. math:: @@ -700,29 +684,40 @@ NCA can be seen as learning a (squared) Mahalanobis distance metric: || L(x_i - x_j)||^2 = (x_i - x_j)^TM(x_i - x_j), where :math:`M = L^T L` is a symmetric positive semi-definite matrix of size -``(n_features_out, n_features_out)``. +``(n_features, n_features)``. Implementation -------------- This implementation follows what is explained in the paper. For the -optimisation method, -currently it uses scipy's l-bfgs-b optimization method with a full gradient -computation at each iteration, to avoid to tune the -learning rate and provide a stable learning. +optimisation method, it currently uses scipy's l-bfgs-b with a full gradient +computation at each iteration, to avoid to tune the learning rate and provide a +stable learning. See the examples below and the doc string of -:meth:`NeighborhoodComponentsAnalysis.fit` -for further information. +:meth:`NeighborhoodComponentsAnalysis.fit` for further information. Complexity ---------- -All pairwise differences are needed to compute the cost function, at each -iteration, so the complexity is :math:`O(d*n^2*i)` with :math:`d` the -dimension of the input space, :math:`n` the number of samples, and :math:`i` - the number of iterations. +Training +^^^^^^^^ +First, time complexity depends on the number of iterations done. Besides, +currently the algorithm has to compute, for each sample, its contribution to +the cost and the gradient. The more complex operation in this computation are +the dot products between differences in the input space and differences in the +embedded space, which has complexity ``n_features_out * n_features * +n_samples``. Therefore time complexity is ``O[n_iterations * n_samples^2 * +n_features * n_features_out]`` In addition, the biggest matrix in memory has +size ``max(n_features * n_features_out, n_features * n_samples, +n_features_out * n_samples)``. + +Transform +^^^^^^^^^ +Here the ``transform`` operation returns :math:`LX^T`, therefore its time +complexity equals ``n_features_out * n_features * n_samples_test``. There is no +added space complexity in the operation. .. topic:: Examples: @@ -738,5 +733,5 @@ dimension of the input space, :math:`n` the number of samples, and :math:`i` | J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov, Advances in | Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. - * `Wikipedia entry on Neighborhood Components Analysis - `_ + * `Wikipedia entry on Neighborhood Components Analysis + `_ diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py index d84d96ad9314a..e70b17f77274b 100644 --- a/examples/neighbors/plot_nca_classification.py +++ b/examples/neighbors/plot_nca_classification.py @@ -33,7 +33,7 @@ dataset = datasets.load_iris() X, y = dataset.data, dataset.target -# we only take the first two features. We could avoid this ugly +# we only take two features. 
We could avoid this ugly # slicing by using a two-dim datasets X = X[:, [0, 2]] @@ -46,7 +46,7 @@ cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) -names = ['K-Nearest Neighbors', 'Neighborhood Components Analysis'] +names = ['KNN', 'NCA, KNN'] classifiers = [Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index acb5e7c5b03fe..ababdc849225c 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -68,7 +68,6 @@ # Reduce dimension to 2 with NeighborhoodComponentAnalysis nca = make_pipeline(StandardScaler(), NeighborhoodComponentsAnalysis(n_features_out=2, - verbose=1, random_state=random_state)) # Use a nearest neighbor classifier to evaluate the methods diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 1eca8e0bfcbec..354f0b185f61c 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -1,15 +1,14 @@ """ -========================================== -Large Margin Nearest Neighbor Illustration -========================================== +============================================= +Neighborhood Components Analysis Illustration +============================================= An example illustrating the goal of learning a distance metric that maximizes the nearest neighbors classification accuracy. The example is solely for -illustration purposes. Please refer to the :ref:`User Guide ` for +illustration purposes. Please refer to the :ref:`User Guide ` for more information. """ -# Author: John Chiotellis # License: BSD 3 clause import numpy as np @@ -58,8 +57,6 @@ def p_i(X, i): masks = OneHotEncoder(sparse=False, dtype=bool).fit_transform(y[:, np.newaxis]) -# for i, pt_i in enumerate(X): # or maybe select only one point -i=3 def relate_point(X, i, ax): pt_i = X[i] for j, pt_j in enumerate(X): @@ -69,16 +66,20 @@ def relate_point(X, i, ax): ax.plot(*line, c=cm.tab10(y[j]), linewidth=5*thickness[j]) +# we consider only point n°3 +i=3 + +# Plot bonds linked to sample i in the original space relate_point(X, i, ax) ax.set_title("Original points") ax.axes.get_xaxis().set_visible(False) ax.axes.get_yaxis().set_visible(False) -# Learn an embedding with LargeMarginNearestNeighbor +# Learn an embedding with NeighborhoodComponentsAnalysis nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state) nca = nca.fit(X, y) -# Plot the points after transformation with LargeMarginNearestNeighbor +# Plot the points after transformation with NeighborhoodComponentsAnalysis plt.figure() ax2 = plt.gca() diff --git a/examples/neighbors/plot_nca_illustration_bis.py b/examples/neighbors/plot_nca_illustration_bis.py deleted file mode 100644 index 96356c0eccf4e..0000000000000 --- a/examples/neighbors/plot_nca_illustration_bis.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -========================================== -Large Margin Nearest Neighbor Illustration -========================================== - -An example illustrating the goal of learning a distance metric that maximizes -the nearest neighbors classification accuracy. The example is solely for -illustration purposes. Please refer to the :ref:`User Guide ` for -more information. 
-""" - -# Author: John Chiotellis -# License: BSD 3 clause - -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.colors import ListedColormap - -from sklearn.datasets import make_classification -from sklearn.neighbors import KNeighborsClassifier, \ - NeighborhoodComponentsAnalysis - -from sklearn.pipeline import Pipeline - -print(__doc__) - -n_neighbors = 1 -random_state = 0 - -cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) -cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) - -# Create a tiny data set of 9 samples from 3 classes -X, y = make_classification(n_samples=9, n_features=2, n_informative=2, - n_redundant=0, n_classes=3, n_clusters_per_class=1, - class_sep=1.0, random_state=random_state) - -X = np.vstack([np.hstack([np.linspace(0, 1, 5)[:, np.newaxis], - np.linspace(0, 1, 5)[:, np.newaxis] + 0.3]), - np.hstack([np.linspace(0, 1, 3)[:, np.newaxis], - np.linspace(0, 1, 3)[:, np.newaxis]])]) -y = np.hstack([np.zeros(5), np.ones(3)]) - -n_neighbors = 1 - -x_min, x_max = X[:, 0].min() - 0.2, X[:, 0].max() + 0.2 -y_min, y_max = X[:, 1].min() - 0.2, X[:, 1].max() + 0.2 -xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.005), - np.arange(y_min, y_max, 0.005)) -grid = np.c_[xx.ravel(), yy.ravel()] - -# plot decision boundary with knn in the input space -knn = KNeighborsClassifier(n_neighbors) -knn.fit(X, y) - -plt.figure() -Z = knn.predict(grid).reshape(xx.shape) -plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8) -plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=20, edgecolor='k') - -# plot decision boundary with NCA in the input space -nca = NeighborhoodComponentsAnalysis(verbose=1) -X_embedded = nca.fit_transform(X, y) - -plt.figure() -grid_e = nca.transform(grid) -knn.fit(X_embedded, y) -Z = knn.predict(grid_e).reshape(xx.shape) -plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8) -plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=20, edgecolor='k') - -# # plot decision boundary with nca in the embedding space -# plt.figure() -# Z = knn.predict(grid_e).reshape(xx.shape) -# plt.pcolormesh(grid_e[:, 0].reshape(xx.shape), grid_e[:, 1].reshape(xx.shape), -# Z, cmap=cmap_light, alpha=.8) -# plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, cmap=cmap_bold, s=20, \ -# edgecolor='k') -# plt.show() From 44e19d6321f383a5949fc6c9bfd21030833a317a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 3 Jan 2018 11:23:21 +0100 Subject: [PATCH 18/79] fix pep8 errors --- examples/neighbors/plot_nca_classification.py | 10 +++++----- examples/neighbors/plot_nca_illustration.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py index e70b17f77274b..4cb8f225e8c00 100644 --- a/examples/neighbors/plot_nca_classification.py +++ b/examples/neighbors/plot_nca_classification.py @@ -49,12 +49,12 @@ names = ['KNN', 'NCA, KNN'] classifiers = [Pipeline([('scaler', StandardScaler()), - ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) - ]), + ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) + ]), Pipeline([('scaler', StandardScaler()), - ('nca', NeighborhoodComponentsAnalysis()), - ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) - ]) + ('nca', NeighborhoodComponentsAnalysis()), + ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) + ]) ] x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 
354f0b185f61c..b3a5269441222 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -40,7 +40,6 @@ ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center') - def p_i(X, i): diff_embedded = X[i] - X dist_embedded = np.einsum('ij,ij->i', diff_embedded, @@ -57,6 +56,7 @@ def p_i(X, i): masks = OneHotEncoder(sparse=False, dtype=bool).fit_transform(y[:, np.newaxis]) + def relate_point(X, i, ax): pt_i = X[i] for j, pt_j in enumerate(X): @@ -67,7 +67,7 @@ def relate_point(X, i, ax): linewidth=5*thickness[j]) # we consider only point n°3 -i=3 +i = 3 # Plot bonds linked to sample i in the original space relate_point(X, i, ax) @@ -99,4 +99,4 @@ def relate_point(X, i, ax): ax2.set_title("NCA embedding") ax2.axes.get_xaxis().set_visible(False) ax2.axes.get_yaxis().set_visible(False) -plt.show() \ No newline at end of file +plt.show() From dcb1a8a55217f33316df9bf871cde5cee396f8aa Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 3 Jan 2018 13:09:02 +0100 Subject: [PATCH 19/79] fix flake8 error --- examples/neighbors/plot_nca_illustration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index b3a5269441222..9582220797db1 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -66,6 +66,7 @@ def relate_point(X, i, ax): ax.plot(*line, c=cm.tab10(y[j]), linewidth=5*thickness[j]) + # we consider only point n°3 i = 3 From 6ba16926408ce82cf03c18a5e1727d51fac2c5d2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 3 Jan 2018 14:44:53 +0100 Subject: [PATCH 20/79] fix encoding error --- examples/neighbors/plot_nca_illustration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 9582220797db1..a0f7733c2b8fa 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -67,7 +67,7 @@ def relate_point(X, i, ax): linewidth=5*thickness[j]) -# we consider only point n°3 +# we consider only point 3 i = 3 # Plot bonds linked to sample i in the original space From 03b126b0cacc1a002a4c447312b15a2d2f9d25a5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 15 Jan 2018 14:24:02 +0100 Subject: [PATCH 21/79] changes according to review https://github.com/scikit-learn/scikit-learn/pull/10058#pullrequestreview-87420798 --- doc/modules/neighbors.rst | 107 ++++++++++---------- examples/neighbors/plot_nca_illustration.py | 9 +- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 8c34df65fc352..5f729aad84fa7 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -524,13 +524,13 @@ Neighborhood Components Analysis .. sectionauthor:: William de Vazelhes -Neighborhood Components Analysis (NCA, :class:`NeighborhoodComponentsAnalysis` ) +Neighborhood Components Analysis (NCA, :class:`NeighborhoodComponentsAnalysis`) is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard Euclidean distance. The algorithm directly maximizes a stochastic variant of the leave-one-out k-nearest neighbors (KNN) score on the training set. It can also -learn a low-dimensional linear embedding of labeled data that can be -used for data visualization and fast classification. 
+learn a low-dimensional linear embedding of data that can be used for +data visualization and fast classification. .. |nca_illustration_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_illustration_001.png :target: ../auto_examples/neighbors/plot_nca_illustration.html @@ -543,14 +543,14 @@ used for data visualization and fast classification. .. centered:: |nca_illustration_1| |nca_illustration_2| -In the above figure, we consider some points from a randomly generated dataset. -We focus on the stochastic KNN classification of point n°3, the thickness of a -bond representing a softmax distance hence the weight of the neighbor vote in -the classification. In the original space, sample 3 has many stochastic -neighbors from various classes, so the right class is not very likely. However, -in the embedding space, the only non-negligible stochastic neighbors are from -the same class as sample 3, guaranteeing that the latter will be well -classified. +In the above illustrating figure, we consider some points from a randomly +generated dataset. We focus on the stochastic KNN classification of point n°3, +the thickness of a bond representing a softmax distance hence the weight of the +neighbor vote in the classification. In the original space, sample 3 has many +stochastic neighbors from various classes, so the right class is not very +likely. However, in the embedding space learned by NCA, the only non-negligible +stochastic neighbors are from the same class as sample 3, guaranteeing that the +latter will be well classified. @@ -558,21 +558,21 @@ Classification -------------- Combined with a nearest neighbors classifier (:class:`KNeighborsClassifier`), -this method is attractive for classification because it can naturally -handle multi-class problems without any increase in the model size, and no -additional parameter than that of KNN has to be selected by the user before -training. +NCA is attractive for classification because it can naturally handle +multi-class problems without any increase in the model size, and does not +introduce additional parameters that require fine-tuning by the user. -Neighborhood Components Analysis classification has been shown to work well in -practice for data sets of varying size and difficulty. In contrast to -related methods such as Linear Discriminant Analysis, NCA does not make any -assumptions about the class distributions. The nearest neighbor classification -can naturally produce highly irregular decision boundaries. +NCA classification has been shown to work well in practice for data sets of +varying size and difficulty. In contrast to related methods such as Linear +Discriminant Analysis, NCA does not make any assumptions about the class +distributions. The nearest neighbor classification can naturally produce highly +irregular decision boundaries. To use this model for classification, one needs to combine a :class:`NeighborhoodComponentsAnalysis` instance that learns the optimal transformation with a :class:`KNeighborsClassifier` instance that performs the -classification in the embedded space. Here is an example using the two classes: +classification in the embedding space. Here is an example using the two +classes: >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis >>> from sklearn.neighbors import KNeighborsClassifier @@ -614,28 +614,26 @@ that automatically applies the transformation when fitting or predicting: .. 
centered:: |nca_classification_1| |nca_classification_2| The plot shows decision boundaries for Nearest Neighbor Classification and -Neighborhood Components Analysis classification, when training and scoring -on only two features, for visualisation purpose. +Neighborhood Components Analysis classification on the iris dataset, when +training and scoring on only two features, for visualisation purpose. Dimensionality reduction ------------------------ -:class:`NeighborhoodComponentsAnalysis` can be used to perform supervised -dimensionality reduction. The input data are projected onto a linear subspace -consisting of the directions which minimize the NCA objective. The desired -dimensionality can be set using the parameter ``n_features_out``. For instance, -the following shows a comparison of dimensionality reduction with Principal -Component Analysis (:class:`sklearn.decomposition.PCA`), Linear Discriminant -Analysis (:class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) -and Neighborhood Component Analysis (:class:`NeighborhoodComponentsAnalysis`) -on the Digits dataset, a dataset with size :math:`n_{samples} = 1797` and -:math:`n_{features} = 64`. The data set is splitted in a training and test set -of equal size. What is more, a :class:`sklearn.preprocessing.StandardScaler` -fitted on the training set and transforms the data from both sets. For -evaluation the 3-nearest neighbor classification accuracy is computed on the -2-dimensional embedding found by each method. Each data sample belongs to one -of 10 classes. +NCA can be used to perform supervised dimensionality reduction. The input data +are projected onto a linear subspace consisting of the directions which +minimize the NCA objective. The desired dimensionality can be set using the +parameter ``n_features_out``. For instance, the following figure shows a +comparison of dimensionality reduction with Principal Component Analysis +(:class:`sklearn.decomposition.PCA`), Linear Discriminant Analysis +(:class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) and +Neighborhood Component Analysis (:class:`NeighborhoodComponentsAnalysis`) on +the Digits dataset, a dataset with size :math:`n_{samples} = 1797` and +:math:`n_{features} = 64`. The data set is split into a training and a test set +of equal size, then standardized. For evaluation the 3-nearest neighbor +classification accuracy is computed on the 2-dimensional embedding found by +each method. Each data sample belongs to one of 10 classes. .. |nca_dim_reduction_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_dim_reduction_001.png :target: ../auto_examples/neighbors/plot_nca_dim_reduction.html @@ -655,23 +653,30 @@ of 10 classes. Mathematical formulation ------------------------ -NCA learns a linear transformation matrix :math:`L` of -size ``(n_features_out, n_features)``, which maximises in average the -probability :math:`p_i` of sample :math:`i` being -classified as :math:`C_i`, defined by: +The goal of NCA is to learn an optimal linear transformation matrix :math:`L^*` +of size ``(n_features_out, n_features)``, which maximises in average the +probability :math:`p_i` of sample :math:`i` being correctly classified, i.e.: .. math:: - p_{i}=\sum\nolimits_{j \in C_i}{p_{i j}} + L^*= \max\limits_{L} \sum\limits_{i=0}^{N - 1} p_{i} + +with :math:`N` = ``n_samples`` and :math:`p_i` the probability of sample +:math:`i` being correctly classified according to a stochastic nearest +neighbors rule in the learned embedded space: + +.. 
math:: + + p_{i}=\sum\limits_{j \in C_i}{p_{i j}} where :math:`C_i` is the set of points in the same class as sample :math:`i`, -and :math:`p_{i j}` is the softmax over Euclidean distances in the -transformed space: +and :math:`p_{i j}` is the softmax over Euclidean distances in the embedded +space: .. math:: - p_{i j} = \frac{\exp(-||L x_i - L x_j||^2)}{\sum\nolimits_{k \ne - i} {\exp{-(||L x_i - L x_k||^2)}}} , p_{i i} = 0 + p_{i j} = \frac{\exp(-||L x_i - L x_j||^2)}{\sum\limits_{k \ne + i} {\exp{-(||L x_i - L x_k||^2)}}} , \quad p_{i i} = 0 Mahalanobis distance @@ -690,12 +695,12 @@ where :math:`M = L^T L` is a symmetric positive semi-definite matrix of size Implementation -------------- -This implementation follows what is explained in the paper. For the +This implementation follows what is explained in the original paper. For the optimisation method, it currently uses scipy's l-bfgs-b with a full gradient -computation at each iteration, to avoid to tune the learning rate and provide a +computation at each iteration, to avoid to tune the learning rate and provide stable learning. -See the examples below and the doc string of +See the examples below and the docstring of :meth:`NeighborhoodComponentsAnalysis.fit` for further information. Complexity @@ -705,7 +710,7 @@ Training ^^^^^^^^ First, time complexity depends on the number of iterations done. Besides, currently the algorithm has to compute, for each sample, its contribution to -the cost and the gradient. The more complex operation in this computation are +the cost and the gradient. The dominating terms in this computation are the dot products between differences in the input space and differences in the embedded space, which has complexity ``n_features_out * n_features * n_samples``. Therefore time complexity is ``O[n_iterations * n_samples^2 * diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index a0f7733c2b8fa..254d61b924690 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -35,9 +35,9 @@ ax = plt.gca() # Draw the graph nodes -ax.scatter(X[:, 0], X[:, 1], s=300, c=y, cmap='tab10', alpha=0.4) for i in range(X.shape[0]): ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center') + ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[i]), alpha=0.4) def p_i(X, i): @@ -63,7 +63,7 @@ def relate_point(X, i, ax): thickness = p_i(X, i) if i != j: line = ([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]]) - ax.plot(*line, c=cm.tab10(y[j]), + ax.plot(*line, c=cm.Set1(y[j]), linewidth=5*thickness[j]) @@ -87,14 +87,13 @@ def relate_point(X, i, ax): # Get the embedding and find the new nearest neighbors X_embedded = nca.transform(X) -ax2.scatter(X_embedded[:, 0], X_embedded[:, 1], s=300, c=y, cmap='tab10', - alpha=0.4) - relate_point(X_embedded, i, ax2) for i in range(len(X)): ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i), va='center', ha='center') + ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[i]), + alpha=0.4) # Make axes equal so that boundaries are displayed correctly as circles ax2.set_title("NCA embedding") From 8b5646c0552f74a492c4070aec91b0354f8a39f4 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 15 Jan 2018 16:00:12 +0100 Subject: [PATCH 22/79] correct objective function doc --- doc/modules/neighbors.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 5f729aad84fa7..e9954e6045714 100644 --- 
a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -653,13 +653,13 @@ each method. Each data sample belongs to one of 10 classes. Mathematical formulation ------------------------ -The goal of NCA is to learn an optimal linear transformation matrix :math:`L^*` -of size ``(n_features_out, n_features)``, which maximises in average the -probability :math:`p_i` of sample :math:`i` being correctly classified, i.e.: +The goal of NCA is to learn an optimal linear transformation matrix of size +``(n_features_out, n_features)``, which maximises in average the probability +:math:`p_i` of sample :math:`i` being correctly classified, i.e.: .. math:: - L^*= \max\limits_{L} \sum\limits_{i=0}^{N - 1} p_{i} + \underset{L}{\arg\max} \sum\limits_{i=0}^{N - 1} p_{i} with :math:`N` = ``n_samples`` and :math:`p_i` the probability of sample :math:`i` being correctly classified according to a stochastic nearest @@ -700,7 +700,7 @@ optimisation method, it currently uses scipy's l-bfgs-b with a full gradient computation at each iteration, to avoid to tune the learning rate and provide stable learning. -See the examples below and the docstring of +See the examples below and the doc string of :meth:`NeighborhoodComponentsAnalysis.fit` for further information. Complexity From 9a09e29cfd2bb6ad24bdfae653cdc689060db658 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 5 Jun 2018 16:13:47 +0200 Subject: [PATCH 23/79] Add batch computations of loss and gradient. --- sklearn/neighbors/nca.py | 60 ++++++++++++----------------- sklearn/neighbors/tests/test_nca.py | 10 ++--- 2 files changed, 28 insertions(+), 42 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 9d213555f82a6..bb0be0152e82e 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -10,8 +10,9 @@ import numpy as np import sys import time -from scipy.misc import logsumexp +from scipy.special import logsumexp from scipy.optimize import minimize +from ..metrics import pairwise_distances from ..preprocessing import OneHotEncoder from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder @@ -190,7 +191,7 @@ def fit(self, X, y): disp = self.verbose - 2 if self.verbose > 1 else -1 optimizer_params = {'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, - 'args': (X_valid, y_valid, masks, -1.0), + 'args': (X_valid, y_valid, -1.0), 'jac': True, 'x0': transformation, 'tol': self.tol, @@ -397,7 +398,7 @@ def _callback(self, transformation): self.n_iter_ += 1 - def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): + def _loss_grad_lbfgs(self, transformation, X, y, sign=1.0): """Compute the loss and the loss gradient w.r.t. ``transformation``. Parameters @@ -412,9 +413,6 @@ def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): y : array, shape (n_samples,) The corresponding training labels. - masks : array, shape (n_samples, n_classes) - One-hot encoding of y. 
- Returns ------- loss : float @@ -438,36 +436,26 @@ def _loss_grad_lbfgs(self, transformation, X, y, masks, sign=1.0): t_funcall = time.time() transformation = transformation.reshape(-1, X.shape[1]) - loss = 0 - gradient = np.zeros(transformation.shape) - X_embedded = np.dot(X, transformation.T) - - # for every sample x_i, compute its contribution to loss and gradient - for i in range(X.shape[0]): - # compute squared distances to x_i in embedded space - diff_embedded = X_embedded[i] - X_embedded - dist_embedded = np.einsum('ij,ij->i', diff_embedded, - diff_embedded) - dist_embedded[i] = np.inf - - # compute exponentiated distances (use the log-sum-exp trick to - # avoid numerical instabilities - exp_dist_embedded = np.exp(-dist_embedded - - logsumexp(-dist_embedded)) - ci = masks[:, y[i]] # samples that are in the same class as x_i - p_i_j = exp_dist_embedded[ci] - diffs = X[i, :] - X - diff_ci = diffs[ci, :] - diff_not_ci = diffs[~ci, :] - sum_ci = diff_ci.T.dot( - (p_i_j[:, np.newaxis] * diff_embedded[ci, :])) - sum_not_ci = diff_not_ci.T.dot((exp_dist_embedded[~ci][:, - np.newaxis] * - diff_embedded[~ci, :])) - p_i = np.sum(p_i_j) # probability of x_i to be correctly - # classified - gradient += 2 * (p_i * sum_not_ci + (p_i - 1) * sum_ci).T - loss += p_i + X_embedded = np.dot(X, transformation.T) # (n_samples, n_features_out) + mask = y[:, np.newaxis] == y[np.newaxis, :] # (n_samples, n_samples) + + # Compute softmax distances + p_ij = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(p_ij, np.inf) + p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1, keepdims=True)) + # (n_samples, n_samples) + + # Compute loss + masked_p_ij = p_ij * mask + p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1) + loss = np.sum(p) + + # Compute gradient of loss w.r.t. `transform` + weighted_p_ij = masked_p_ij - p_ij * p + gradient = 2 * (X_embedded.T.dot(weighted_p_ij + weighted_p_ij.T) - + X_embedded.T * np.sum(weighted_p_ij, axis=0)).dot(X) + # time complexity: O(n_features_out x n_samples x + # min(n_samples, n_features)) if self.verbose: t_funcall = time.time() - t_funcall diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 1f0bd3d338cde..dd40d26efa50c 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -78,7 +78,7 @@ def test_finite_differences(): point = nca._initialize(X, init) # compute the gradient at `point` - _, gradient = nca._loss_grad_lbfgs(point, X, y, masks) + _, gradient = nca._loss_grad_lbfgs(point, X, y) # create a random direction of norm 1 random_direction = rng.randn(*point.shape) @@ -90,11 +90,9 @@ def test_finite_differences(): # compute finite differences eps = 1e-5 - right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, X, y, - masks) - left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, X, y, - masks) - finite_differences = 1/(2*eps) * (right_loss - left_loss) + right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, X, y) + left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, X, y) + finite_differences = 1 / (2 * eps) * (right_loss - left_loss) # compute relative error relative_error = np.abs(finite_differences - projected_gradient) / \ From 7721221396e6c9f5eec31c2f731dba7ce38bb9a8 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 5 Jun 2018 16:14:39 +0200 Subject: [PATCH 24/79] Update documentation. 
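For reference, the batched computation introduced in the previous commit can be read as a standalone sketch (illustrative only, not part of this patch; the helper name `nca_loss_grad` is made up, but its body mirrors the vectorised `_loss_grad_lbfgs` above and assumes `scipy.special.logsumexp` is available):

    import numpy as np
    from scipy.special import logsumexp
    from sklearn.metrics import pairwise_distances

    def nca_loss_grad(L, X, y):
        # L: (n_components, n_features) transformation, X: (n_samples,
        # n_features) inputs, y: (n_samples,) integer class labels.
        X_embedded = np.dot(X, L.T)                   # (n_samples, n_components)
        mask = y[:, np.newaxis] == y[np.newaxis, :]   # same-class indicator

        # Softmax over negative squared distances, excluding self-pairs;
        # the log-sum-exp trick keeps the normalisation numerically stable.
        p_ij = pairwise_distances(X_embedded, squared=True)
        np.fill_diagonal(p_ij, np.inf)
        p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1)[:, np.newaxis])

        # Expected number of correctly classified samples ...
        masked_p_ij = p_ij * mask
        p = np.sum(masked_p_ij, axis=1, keepdims=True)   # p_i, (n_samples, 1)
        loss = np.sum(p)

        # ... and its gradient with respect to L.
        weighted_p_ij = masked_p_ij - p_ij * p
        gradient = 2 * (X_embedded.T.dot(weighted_p_ij + weighted_p_ij.T) -
                        X_embedded.T * np.sum(weighted_p_ij, axis=0)).dot(X)
        # The estimator maximises this loss, so it hands -loss and the
        # flattened -gradient to scipy's L-BFGS-B minimiser.
        return loss, gradient

A sketch like this can be checked against finite differences in the same way as test_finite_differences does above.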
--- doc/modules/neighbors.rst | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 7b02b9d569c11..abcc08692e135 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -704,15 +704,12 @@ Complexity Training ^^^^^^^^ -First, time complexity depends on the number of iterations done. Besides, -currently the algorithm has to compute, for each sample, its contribution to -the cost and the gradient. The dominating terms in this computation are -the dot products between differences in the input space and differences in the -embedded space, which has complexity ``n_features_out * n_features * -n_samples``. Therefore time complexity is ``O[n_iterations * n_samples^2 * -n_features * n_features_out]`` In addition, the biggest matrix in memory has -size ``max(n_features * n_features_out, n_features * n_samples, -n_features_out * n_samples)``. +NCA stores a matrix of pairwise distances, taking ``n_samples ** 2`` memory. +Time complexity depends on the number of iterations done by the optimisation + algorithm. However, one can set the maximum number of iterations with the + argument ``max_iter``. For each iteration, time complexity is + ``O(n_features_out x n_samples x min(n_samples, n_features)``. + Transform ^^^^^^^^^ From 173a96644519ceb330066d37e780a76698a78d20 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 6 Jun 2018 11:41:35 +0200 Subject: [PATCH 25/79] FIX: import scipy.misc.logsumexp for older versions of scipy, and scipy.special.logsumexp otherwise --- sklearn/neighbors/nca.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index bb0be0152e82e..587c6f08a20a4 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -10,7 +10,10 @@ import numpy as np import sys import time -from scipy.special import logsumexp +try: # scipy.misc.logsumexp is deprecated in scipy 1.0.0 + from scipy.special import logsumexp +except ImportError: + from scipy.misc import logsumexp from scipy.optimize import minimize from ..metrics import pairwise_distances from ..preprocessing import OneHotEncoder From 2cd3bf6c348f37fa639797763a852e50abbb15a0 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 7 Jun 2018 09:32:36 +0200 Subject: [PATCH 26/79] FIX: remove newly introduced keepdims for logsumexp --- sklearn/neighbors/nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 587c6f08a20a4..5f8f885df114a 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -445,7 +445,7 @@ def _loss_grad_lbfgs(self, transformation, X, y, sign=1.0): # Compute softmax distances p_ij = pairwise_distances(X_embedded, squared=True) np.fill_diagonal(p_ij, np.inf) - p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1, keepdims=True)) + p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1)[:, np.newaxis]) # (n_samples, n_samples) # Compute loss From c50c8415c313e1c3329c789ebc71dc0104bb5664 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 7 Jun 2018 12:23:00 +0200 Subject: [PATCH 27/79] FIX: remove unused old masks and use the new mask instead --- examples/neighbors/plot_nca_illustration.py | 6 ------ sklearn/neighbors/nca.py | 18 ++++++++---------- sklearn/neighbors/tests/test_nca.py | 12 ++++++------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/examples/neighbors/plot_nca_illustration.py 
b/examples/neighbors/plot_nca_illustration.py index 254d61b924690..68a4c303366b4 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -18,8 +18,6 @@ from matplotlib import cm from scipy.misc import logsumexp -from sklearn.preprocessing import OneHotEncoder - print(__doc__) n_neighbors = 1 @@ -53,10 +51,6 @@ def p_i(X, i): return exp_dist_embedded -masks = OneHotEncoder(sparse=False, - dtype=bool).fit_transform(y[:, np.newaxis]) - - def relate_point(X, i, ax): pt_i = X[i] for j, pt_j in enumerate(X): diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 5f8f885df114a..e8f0660959b1a 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -16,7 +16,6 @@ from scipy.misc import logsumexp from scipy.optimize import minimize from ..metrics import pairwise_distances -from ..preprocessing import OneHotEncoder from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder from ..decomposition import PCA @@ -182,10 +181,9 @@ def fit(self, X, y): # Measure the total training time t_train = time.time() - # Compute arrays that stay fixed during optimization: - # mask for fast lookup of same-class samples - masks = OneHotEncoder(sparse=False, - dtype=bool).fit_transform(y_valid[:, np.newaxis]) + # Compute mask that stays fixed during optimization: + mask = y_valid[:, np.newaxis] == y_valid[np.newaxis, :] + # (n_samples, n_samples) # Initialize the transformation transformation = self._initialize(X_valid, init) @@ -194,7 +192,7 @@ def fit(self, X, y): disp = self.verbose - 2 if self.verbose > 1 else -1 optimizer_params = {'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, - 'args': (X_valid, y_valid, -1.0), + 'args': (X_valid, mask, -1.0), 'jac': True, 'x0': transformation, 'tol': self.tol, @@ -401,7 +399,7 @@ def _callback(self, transformation): self.n_iter_ += 1 - def _loss_grad_lbfgs(self, transformation, X, y, sign=1.0): + def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): """Compute the loss and the loss gradient w.r.t. ``transformation``. Parameters @@ -413,8 +411,9 @@ def _loss_grad_lbfgs(self, transformation, X, y, sign=1.0): X : array, shape (n_samples, n_features) The training samples. - y : array, shape (n_samples,) - The corresponding training labels. + mask : array, shape (n_samples, n_samples) + A mask where ``mask[i, j] == 1`` if ``X[i]`` and ``X[j]`` belong + to the same class, and ``0`` otherwise. 
Returns ------- @@ -440,7 +439,6 @@ def _loss_grad_lbfgs(self, transformation, X, y, sign=1.0): transformation = transformation.reshape(-1, X.shape[1]) X_embedded = np.dot(X, transformation.T) # (n_samples, n_features_out) - mask = y[:, np.newaxis] == y[np.newaxis, :] # (n_samples, n_samples) # Compute softmax distances p_ij = pairwise_distances(X_embedded, squared=True) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index dd40d26efa50c..0e774ba791963 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,6 +1,5 @@ import numpy as np from numpy.testing import assert_array_equal -from sklearn.preprocessing import OneHotEncoder from sklearn.utils import check_random_state from sklearn.utils.testing import assert_raises, assert_equal from sklearn.datasets import load_iris, make_classification @@ -72,13 +71,12 @@ def test_finite_differences(): nca = NeighborhoodComponentsAnalysis(init=point) X, y, init = nca._validate_params(X, y) - masks = OneHotEncoder(sparse=False, - dtype=bool).fit_transform(y[:, np.newaxis]) + mask = y[:, np.newaxis] == y[np.newaxis, :] # (n_samples, n_samples) nca.n_iter_ = 0 point = nca._initialize(X, init) # compute the gradient at `point` - _, gradient = nca._loss_grad_lbfgs(point, X, y) + _, gradient = nca._loss_grad_lbfgs(point, X, mask) # create a random direction of norm 1 random_direction = rng.randn(*point.shape) @@ -90,8 +88,10 @@ def test_finite_differences(): # compute finite differences eps = 1e-5 - right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, X, y) - left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, X, y) + right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, X, + mask) + left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, X, + mask) finite_differences = 1 / (2 * eps) * (right_loss - left_loss) # compute relative error From 094aa97750ddcc9274203b0acf5bf3a5a66c529d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 20 Jun 2018 11:46:50 +0200 Subject: [PATCH 28/79] FIX: fix doctest CI fail by putting ellipsis --- sklearn/neighbors/nca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index e8f0660959b1a..e70e0b64c0b74 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -114,12 +114,12 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): >>> knn = KNeighborsClassifier(n_neighbors=3) >>> knn.fit(X_train, y_train) # doctest: +ELLIPSIS KNeighborsClassifier(...) - >>> print(knn.score(X_test, y_test)) - 0.933333333333 + >>> print(knn.score(X_test, y_test)) # doctest: +ELLIPSIS + 0.933333... >>> knn.fit(nca.transform(X_train), y_train) # doctest: +ELLIPSIS KNeighborsClassifier(...) - >>> print(knn.score(nca.transform(X_test), y_test)) - 0.961904761905 + >>> print(knn.score(nca.transform(X_test), y_test)) # doctest: +ELLIPSIS + 0.961904... 
Notes ----- From e6daf4e57f60d0f08e9bc29c184e030602d2f1d2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 20 Jun 2018 13:57:08 +0200 Subject: [PATCH 29/79] FIX: fix doctest CI fail by putting ellipsis, this time in rst file --- doc/modules/neighbors.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index abcc08692e135..ff7a19c05d7d4 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -584,8 +584,8 @@ classes: >>> knn = KNeighborsClassifier(n_neighbors=3) >>> knn.fit(nca.transform(X_train), y_train) # doctest: +ELLIPSIS KNeighborsClassifier(...) - >>> print(knn.score(nca.transform(X_test), y_test)) - 0.961904761905 + >>> print(knn.score(nca.transform(X_test), y_test)) # doctest: +ELLIPSIS + 0.96190476... Alternatively, one can create a :class:`sklearn.pipeline.Pipeline` instance that automatically applies the transformation when fitting or predicting: @@ -596,8 +596,8 @@ that automatically applies the transformation when fitting or predicting: >>> nca_pipe = Pipeline([('nca', nca), ('knn', knn)]) >>> nca_pipe.fit(X_train, y_train) # doctest: +ELLIPSIS Pipeline(...) - >>> print(nca_pipe.score(X_test, y_test)) - 0.961904761905 + >>> print(nca_pipe.score(X_test, y_test)) # doctest: +ELLIPSIS + 0.96190476... .. |nca_classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_classification_001.png :target: ../auto_examples/neighbors/plot_nca_classification.html From e160a6e5e14a6558ff2c162ef49f29a8b54e29c9 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 20 Jun 2018 13:57:08 +0200 Subject: [PATCH 30/79] FIX: fix doctest CI fail by putting ellipsis, this time in rst file --- doc/modules/neighbors.rst | 8 +- sklearn/neighbors/nca.py | 118 ++++++++++++------- sklearn/neighbors/tests/test_nca.py | 176 ++++++++++++++++++++++------ 3 files changed, 215 insertions(+), 87 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index abcc08692e135..ff7a19c05d7d4 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -584,8 +584,8 @@ classes: >>> knn = KNeighborsClassifier(n_neighbors=3) >>> knn.fit(nca.transform(X_train), y_train) # doctest: +ELLIPSIS KNeighborsClassifier(...) - >>> print(knn.score(nca.transform(X_test), y_test)) - 0.961904761905 + >>> print(knn.score(nca.transform(X_test), y_test)) # doctest: +ELLIPSIS + 0.96190476... Alternatively, one can create a :class:`sklearn.pipeline.Pipeline` instance that automatically applies the transformation when fitting or predicting: @@ -596,8 +596,8 @@ that automatically applies the transformation when fitting or predicting: >>> nca_pipe = Pipeline([('nca', nca), ('knn', knn)]) >>> nca_pipe.fit(X_train, y_train) # doctest: +ELLIPSIS Pipeline(...) - >>> print(nca_pipe.score(X_test, y_test)) - 0.961904761905 + >>> print(nca_pipe.score(X_test, y_test)) # doctest: +ELLIPSIS + 0.96190476... .. 
|nca_classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_classification_001.png :target: ../auto_examples/neighbors/plot_nca_classification.html diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index e70e0b64c0b74..4980f825c088d 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -7,9 +7,14 @@ from __future__ import print_function +from warnings import warn + import numpy as np import sys import time + +from sklearn.exceptions import ConvergenceWarning + try: # scipy.misc.logsumexp is deprecated in scipy 1.0.0 from scipy.special import logsumexp except ImportError: @@ -30,8 +35,9 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): Parameters ---------- - n_features_out : int, optional (default=None) + n_components : int, optional (default=None) Preferred dimensionality of the embedding. + If None it is inferred from ``init``. init : string or numpy array, optional (default='pca') Initialization of the linear transformation. Possible options are @@ -39,23 +45,23 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): (n_features_a, n_features_b). pca: - ``n_features_out`` many principal components of the inputs passed + ``n_components`` many principal components of the inputs passed to :meth:`fit` will be used to initialize the transformation. identity: - If ``n_features_out`` is strictly smaller than the + If ``n_components`` is strictly smaller than the dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_features_out`` rows. + matrix will be truncated to the first ``n_components`` rows. random: The initial transformation will be a random array of shape - (n_features_out, n_features). Each value is sampled from the + (n_components, n_features). Each value is sampled from the standard normal distribution. numpy array: n_features_b must match the dimensionality of the inputs passed to :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_features_out`` is not None, n_features_a must match it. + If ``n_components`` is not None, n_features_a must match it. max_iter : int, optional (default=50) Maximum number of iterations in the optimization. @@ -89,7 +95,7 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): Attributes ---------- - transformation_ : array, shape (n_features_out, n_features) + components_ : array, shape (n_components, n_features) The linear transformation learned during fitting. n_iter_ : int @@ -97,7 +103,26 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): opt_result_ : scipy.optimize.OptimizeResult (optional) A dictionary of information representing the optimization result. - This is stored only if ``store_opt_result`` was True. + This is stored only if ``store_opt_result`` is True. It contains the + following attributes: + + x : ndarray + The solution of the optimization. + success : bool + Whether or not the optimizer exited successfully. + status : int + Termination status of the optimizer. + message : str + Description of the cause of the termination. + fun, jac : ndarray + Values of objective function and its Jacobian. + hess_inv : scipy.sparse.linalg.LinearOperator + the product of a vector with the approximate inverse of the + Hessian of the objective function.. + nfev : int + Number of evaluations of the objective function.. + nit : int + Number of iterations performed by the optimizer. 
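In practice, this attribute can be inspected after fitting along the following lines (a minimal sketch, assuming ``store_opt_result=True`` as exercised by ``test_store_opt_result`` below, and using the iris data already used in these tests):

    from sklearn.datasets import load_iris
    from sklearn.neighbors import NeighborhoodComponentsAnalysis

    X, y = load_iris(return_X_y=True)
    nca = NeighborhoodComponentsAnalysis(max_iter=30, store_opt_result=True,
                                         random_state=42)
    nca.fit(X, y)

    res = nca.opt_result_                  # scipy.optimize.OptimizeResult
    print(res.success, res.nit)            # convergence flag, iteration count
    print(res.fun)                         # final objective value
    assert res.x.size == nca.components_.size   # flattened solution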
Examples -------- @@ -140,12 +165,12 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): """ - def __init__(self, n_features_out=None, init='pca', max_iter=50, + def __init__(self, n_components=None, init='pca', max_iter=50, tol=1e-5, callback=None, store_opt_result=False, verbose=0, random_state=None): # Parameters - self.n_features_out = n_features_out + self.n_components = n_components self.init = init self.max_iter = max_iter self.tol = tol @@ -205,13 +230,20 @@ def fit(self, X, y): opt_result = minimize(**optimizer_params) # Reshape the solution found by the optimizer - self.transformation_ = opt_result.x.reshape(-1, X_valid.shape[1]) + self.components_ = opt_result.x.reshape(-1, X_valid.shape[1]) # Stop timer t_train = time.time() - t_train if self.verbose: - print('[{}] Training took {:8.2f}s.'.format( - self.__class__.__name__, t_train)) + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warn('[{}] NCA did not converge: {}'.format( + cls_name, opt_result.message), + ConvergenceWarning) + + print('[{}] Training took {:8.2f}s.'.format(cls_name, t_train)) # Optionally store information returned by the optimizer if self.store_opt_result: @@ -229,7 +261,7 @@ def transform(self, X): Returns ------- - X_embedded: array, shape (n_samples, n_features_out) + X_embedded: array, shape (n_samples, n_components) The data samples transformed. Raises @@ -238,10 +270,10 @@ def transform(self, X): If :meth:`fit` has not been called before. """ - check_is_fitted(self, ['transformation_']) + check_is_fitted(self, ['components_']) X = check_array(X) - return np.dot(X, self.transformation_.T) + return np.dot(X, self.components_.T) def _validate_params(self, X, y): """Validate parameters as soon as :meth:`fit` is called. @@ -282,15 +314,15 @@ def _validate_params(self, X, y): y_valid = LabelEncoder().fit_transform(y_valid) # Check the preferred embedding dimensionality - if self.n_features_out is not None: - _check_scalar(self.n_features_out, 'n_features_out', + if self.n_components is not None: + _check_scalar(self.n_components, 'n_components', integer_types, 1) - if self.n_features_out > X.shape[1]: + if self.n_components > X.shape[1]: raise ValueError('The preferred embedding dimensionality ' - '`n_features_out` ({}) cannot be greater ' + '`n_components` ({}) cannot be greater ' 'than the given data dimensionality ({})!' - .format(self.n_features_out, X.shape[1])) + .format(self.n_components, X.shape[1])) _check_scalar(self.max_iter, 'max_iter', integer_types, 1) _check_scalar(self.tol, 'tol', float, 0.) @@ -322,23 +354,21 @@ def _validate_params(self, X, y): 'greater than its input dimensionality ({}).' .format(init.shape[0], init.shape[1])) - if self.n_features_out is not None: - # Assert that self.n_features_out = init.shape[0] - if self.n_features_out != init.shape[0]: - raise ValueError( - 'The preferred embedding dimensionality ' - '`n_features_out` ({}) does not match ' - 'the output dimensionality of the given ' - 'linear transformation `init` ({})!' - .format(self.n_features_out, - init.shape[0])) - + if self.n_components is not None: + # Assert that self.n_components = init.shape[0] + if self.n_components != init.shape[0]: + raise ValueError('The preferred embedding dimensionality ' + '`n_components` ({}) does not match ' + 'the output dimensionality of the given ' + 'linear transformation `init` ({})!' 
+ .format(self.n_components, + init.shape[0])) elif init in ['pca', 'identity', 'random']: pass else: raise ValueError( "`init` must be 'pca', 'identity', 'random' or a numpy " - "array of shape (n_features_out, n_features).") + "array of shape (n_components, n_features).") return X_valid, y_valid, init @@ -348,14 +378,14 @@ def _initialize(self, X, init): Parameters ---------- X : array-like, shape (n_samples, n_features) - Data samples. + The training samples. init : string or numpy array of shape (n_features_a, n_features_b) The validated initialization of the linear transformation. Returns ------- - transformation : array, shape (n_features_out, n_features) + transformation : array, shape (n_components, n_features) The initialized linear transformation. """ @@ -365,14 +395,14 @@ def _initialize(self, X, init): if isinstance(init, np.ndarray): pass else: - n_features_out = self.n_features_out or X.shape[1] + n_components = self.n_components or X.shape[1] if init == 'identity': - transformation = np.eye(n_features_out, X.shape[1]) + transformation = np.eye(n_components, X.shape[1]) elif init == 'random': - transformation = self.random_state_.randn(n_features_out, + transformation = self.random_state_.randn(n_components, X.shape[1]) elif init == 'pca': - pca = PCA(n_components=n_features_out, + pca = PCA(n_components=n_components, random_state=self.random_state_) t_pca = time.time() if self.verbose: @@ -391,7 +421,7 @@ def _callback(self, transformation): Parameters ---------- - transformation : array, shape(n_features_out, n_features) + transformation : array, shape(n_components, n_features) The solution computed by the optimizer in this iteration. """ if self.callback is not None: @@ -404,7 +434,7 @@ def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): Parameters ---------- - transformation : array, shape (n_features_out, n_features) + transformation : array, shape (n_components, n_features) The linear transformation on which to compute loss and evaluate gradient @@ -420,7 +450,7 @@ def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): loss : float The loss computed for the given transformation. - gradient : array, shape (n_features_out * n_features,) + gradient : array, shape (n_components * n_features,) The new (flattened) gradient of the loss. 
""" @@ -438,7 +468,7 @@ def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): t_funcall = time.time() transformation = transformation.reshape(-1, X.shape[1]) - X_embedded = np.dot(X, transformation.T) # (n_samples, n_features_out) + X_embedded = np.dot(X, transformation.T) # (n_samples, n_components) # Compute softmax distances p_ij = pairwise_distances(X_embedded, squared=True) @@ -455,7 +485,7 @@ def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): weighted_p_ij = masked_p_ij - p_ij * p gradient = 2 * (X_embedded.T.dot(weighted_p_ij + weighted_p_ij.T) - X_embedded.T * np.sum(weighted_p_ij, axis=0)).dot(X) - # time complexity: O(n_features_out x n_samples x + # time complexity: O(n_components x n_samples x # min(n_samples, n_features)) if self.verbose: diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 0e774ba791963..2baee4272a3a0 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,7 +1,13 @@ +import sys + import numpy as np from numpy.testing import assert_array_equal + +from sklearn.exceptions import ConvergenceWarning +from sklearn.externals.six import StringIO from sklearn.utils import check_random_state -from sklearn.utils.testing import assert_raises, assert_equal +from sklearn.utils.testing import assert_raises, assert_equal, \ + assert_raise_message, assert_warns_message from sklearn.datasets import load_iris, make_classification from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -26,7 +32,7 @@ def test_simple_example(): """ X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) y = np.array([1, 0, 1, 0]) - nca = NeighborhoodComponentsAnalysis(n_features_out=2, init='identity', + nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity', random_state=42) nca.fit(X, y) Xansformed = nca.transform(X) @@ -110,16 +116,32 @@ def test_params_validation(): assert_raises(TypeError, NCA(max_iter='21').fit, X, y) assert_raises(TypeError, NCA(verbose='true').fit, X, y) assert_raises(TypeError, NCA(tol=1).fit, X, y) - assert_raises(TypeError, NCA(n_features_out='invalid').fit, X, y) + assert_raises(TypeError, NCA(n_components='invalid').fit, X, y) # ValueError - assert_raises(ValueError, NCA(init=1).fit, X, y) - assert_raises(ValueError, NCA(max_iter=-1).fit, X, y) - - fit_func = NCA(init=np.random.rand(5, 3)).fit - assert_raises(ValueError, fit_func, X, y) - assert_raises(ValueError, NCA(n_features_out=10).fit, X, y) - + assert_raise_message(ValueError, + "`init` must be 'pca', 'identity', 'random' or a numpy " + "array of shape (n_components, n_features).", + NCA(init=1).fit, X, y) + assert_raise_message(ValueError, + '`max_iter`= -1, must be >= 1.', + NCA(max_iter=-1).fit, X, y) + + init=np.random.rand(5, 3) + assert_raise_message(ValueError, + 'The output dimensionality ({}) of the given linear ' + 'transformation `init` cannot be greater than its ' + 'input dimensionality ({}).' + .format(init.shape[0], init.shape[1]), + NCA(init=init).fit, X, y) + + n_components = 10 + assert_raise_message(ValueError, + 'The preferred embedding dimensionality ' + '`n_components` ({}) cannot be greater ' + 'than the given data dimensionality ({})!' 
+ .format(n_components, X.shape[1]), + NCA(n_components=n_components).fit, X, y) def test_transformation_dimensions(): X = np.arange(12).reshape(4, 3) @@ -144,23 +166,36 @@ def test_transformation_dimensions(): NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) -def test_n_features_out(): +def test_n_components(): X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] - transformation = np.array([[1, 2, 3], [4, 5, 6]]) - - # n_features_out = X.shape[1] != transformation.shape[0] - nca = NeighborhoodComponentsAnalysis(n_features_out=3, init=transformation) - assert_raises(ValueError, nca.fit, X, y) - - # n_features_out > X.shape[1] - nca = NeighborhoodComponentsAnalysis(n_features_out=5, init=transformation) - assert_raises(ValueError, nca.fit, X, y) - - # n_features_out < X.shape[1] - nca = NeighborhoodComponentsAnalysis(n_features_out=2, init='identity') - nca.fit(X, y) + init = np.random.rand(X.shape[1] - 1, 3) + + # n_components = X.shape[1] != transformation.shape[0] + n_components = X.shape[1] + lmnn = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + assert_raise_message(ValueError, + 'The preferred embedding dimensionality ' + '`n_components` ({}) does not match ' + 'the output dimensionality of the given ' + 'linear transformation `init` ({})!' + .format(n_components, init.shape[0]), + lmnn.fit, X, y) + + # n_components > X.shape[1] + n_components = X.shape[1] + 2 + lmnn = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + assert_raise_message(ValueError, + 'The preferred embedding dimensionality ' + '`n_components` ({}) cannot be greater ' + 'than the given data dimensionality ({})!' + .format(n_components, X.shape[1]), + lmnn.fit, X, y) + + # n_components < X.shape[1] + lmnn = NeighborhoodComponentsAnalysis(n_components=2, init='identity') + lmnn.fit(X, y) def test_init_transformation(): @@ -185,25 +220,70 @@ def test_init_transformation(): # init.shape[1] must match X.shape[1] init = np.random.rand(X.shape[1], X.shape[1] + 1) - nca = NeighborhoodComponentsAnalysis(init=init) - assert_raises(ValueError, nca.fit, X, y) + lmnn = NeighborhoodComponentsAnalysis(init=init) + assert_raise_message(ValueError, + 'The input dimensionality ({}) of the given ' + 'linear transformation `init` must match the ' + 'dimensionality of the given inputs `X` ({}).' + .format(init.shape[1], X.shape[1]), + lmnn.fit, X, y) # init.shape[0] must be <= init.shape[1] init = np.random.rand(X.shape[1] + 1, X.shape[1]) - nca = NeighborhoodComponentsAnalysis(init=init) - assert_raises(ValueError, nca.fit, X, y) - - # init.shape[0] must match n_features_out + lmnn = NeighborhoodComponentsAnalysis(init=init) + assert_raise_message(ValueError, + 'The output dimensionality ({}) of the given ' + 'linear transformation `init` cannot be ' + 'greater than its input dimensionality ({}).' + .format(init.shape[0], init.shape[1]), + lmnn.fit, X, y) + + # init.shape[0] must match n_components init = np.random.rand(X.shape[1], X.shape[1]) - nca = NeighborhoodComponentsAnalysis(n_features_out=X.shape[1] - 2, - init=init) - assert_raises(ValueError, nca.fit, X, y) + n_components = X.shape[1] - 2 + lmnn = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + assert_raise_message(ValueError, + 'The preferred embedding dimensionality ' + '`n_components` ({}) does not match ' + 'the output dimensionality of the given ' + 'linear transformation `init` ({})!' 
+ .format(n_components, init.shape[0]), + lmnn.fit, X, y) def test_verbose(): + # assert there is proper output when verbose = 1 + old_stdout = sys.stdout + sys.stdout = StringIO() + nca = NeighborhoodComponentsAnalysis(verbose=1) - nca.fit(iris_data, iris_target) - # TODO: rather assert that some message is printed + try: + nca.fit(iris_data, iris_target) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + # check output + assert("[NeighborhoodComponentsAnalysis]" in out) + assert("Finding principal components" in out) + assert ("Finding principal components" in out) + assert ("Training took" in out) + + # assert by default there is no output (verbose=0) + old_stdout = sys.stdout + sys.stdout = StringIO() + + nca = NeighborhoodComponentsAnalysis() + try: + nca.fit(iris_data, iris_target) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + # check output + assert(out == '') def test_singleton_class(): @@ -247,13 +327,13 @@ def test_one_class(): y = iris_target[iris_target == 0] nca = NeighborhoodComponentsAnalysis(max_iter=30, - n_features_out=X.shape[1], + n_components=X.shape[1], init='identity') nca.fit(X, y) assert_array_equal(X, nca.transform(X)) -def test_callable(): +def test_callback(): X = iris_data y = iris_target @@ -266,11 +346,21 @@ def my_cb(transformation, n_iter): rem_iter = max_iter - n_iter print('{} iterations remaining...'.format(rem_iter)) + # assert that my_cb is called + old_stdout = sys.stdout + sys.stdout = StringIO() + nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1) - nca.fit(X, y) - # TODO: rather assert that message is printed + try: + nca.fit(iris_data, iris_target) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + # check output + assert('{} iterations remaining...'.format(max_iter-1) in out) def test_store_opt_result(): X = iris_data @@ -281,3 +371,11 @@ def test_store_opt_result(): nca.fit(X, y) transformation = nca.opt_result_.x assert_equal(transformation.size, X.shape[1]**2) + +def test_convergence_warning(): + + nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1) + cls_name = nca.__class__.__name__ + assert_warns_message(ConvergenceWarning, + '[{}] NCA did not converge'.format(cls_name), + nca.fit, iris_data, iris_target) From fbc679b5cb4f3b0f710d9cb847b1f75158cf5c84 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 22 Jun 2018 09:18:21 +0200 Subject: [PATCH 31/79] Updates to be coherent with latest changes from pr #8602 (commits https://github.com/scikit-learn/scikit-learn/pull/8602/commits/7afa6bbea9fee0490843294bf3fe9da99ce5dc27 and https://github.com/scikit-learn/scikit-learn/pull/8602/commits/7e683214217ca7915adad0e9bd67c1b5734618bd) - Rename n_features_out to n_components - Rename transformation_ to components_ - Update tests with assert_raise_message - Improve verbosity --- sklearn/neighbors/nca.py | 5 +--- sklearn/neighbors/tests/test_nca.py | 37 +++++++++++++++-------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 4980f825c088d..9311e58d227c2 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -8,13 +8,9 @@ from __future__ import print_function from warnings import warn - import numpy as np import sys import time - -from sklearn.exceptions import ConvergenceWarning - try: # scipy.misc.logsumexp is deprecated in scipy 1.0.0 from scipy.special import logsumexp except 
ImportError: @@ -28,6 +24,7 @@ from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_X_y from ..externals.six import integer_types +from ..exceptions import ConvergenceWarning class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 2baee4272a3a0..4d2c92105eff6 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,8 +1,6 @@ import sys - import numpy as np from numpy.testing import assert_array_equal - from sklearn.exceptions import ConvergenceWarning from sklearn.externals.six import StringIO from sklearn.utils import check_random_state @@ -120,14 +118,14 @@ def test_params_validation(): # ValueError assert_raise_message(ValueError, - "`init` must be 'pca', 'identity', 'random' or a numpy " - "array of shape (n_components, n_features).", - NCA(init=1).fit, X, y) + "`init` must be 'pca', 'identity', 'random' or a " + "numpy array of shape (n_components, n_features).", + NCA(init=1).fit, X, y) assert_raise_message(ValueError, '`max_iter`= -1, must be >= 1.', NCA(max_iter=-1).fit, X, y) - init=np.random.rand(5, 3) + init = np.random.rand(5, 3) assert_raise_message(ValueError, 'The output dimensionality ({}) of the given linear ' 'transformation `init` cannot be greater than its ' @@ -143,6 +141,7 @@ def test_params_validation(): .format(n_components, X.shape[1]), NCA(n_components=n_components).fit, X, y) + def test_transformation_dimensions(): X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] @@ -174,28 +173,28 @@ def test_n_components(): # n_components = X.shape[1] != transformation.shape[0] n_components = X.shape[1] - lmnn = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, 'The preferred embedding dimensionality ' '`n_components` ({}) does not match ' 'the output dimensionality of the given ' 'linear transformation `init` ({})!' .format(n_components, init.shape[0]), - lmnn.fit, X, y) + nca.fit, X, y) # n_components > X.shape[1] n_components = X.shape[1] + 2 - lmnn = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, 'The preferred embedding dimensionality ' '`n_components` ({}) cannot be greater ' 'than the given data dimensionality ({})!' .format(n_components, X.shape[1]), - lmnn.fit, X, y) + nca.fit, X, y) # n_components < X.shape[1] - lmnn = NeighborhoodComponentsAnalysis(n_components=2, init='identity') - lmnn.fit(X, y) + nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity') + nca.fit(X, y) def test_init_transformation(): @@ -220,35 +219,35 @@ def test_init_transformation(): # init.shape[1] must match X.shape[1] init = np.random.rand(X.shape[1], X.shape[1] + 1) - lmnn = NeighborhoodComponentsAnalysis(init=init) + nca = NeighborhoodComponentsAnalysis(init=init) assert_raise_message(ValueError, 'The input dimensionality ({}) of the given ' 'linear transformation `init` must match the ' 'dimensionality of the given inputs `X` ({}).' 
.format(init.shape[1], X.shape[1]), - lmnn.fit, X, y) + nca.fit, X, y) # init.shape[0] must be <= init.shape[1] init = np.random.rand(X.shape[1] + 1, X.shape[1]) - lmnn = NeighborhoodComponentsAnalysis(init=init) + nca = NeighborhoodComponentsAnalysis(init=init) assert_raise_message(ValueError, 'The output dimensionality ({}) of the given ' 'linear transformation `init` cannot be ' 'greater than its input dimensionality ({}).' .format(init.shape[0], init.shape[1]), - lmnn.fit, X, y) + nca.fit, X, y) # init.shape[0] must match n_components init = np.random.rand(X.shape[1], X.shape[1]) n_components = X.shape[1] - 2 - lmnn = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, 'The preferred embedding dimensionality ' '`n_components` ({}) does not match ' 'the output dimensionality of the given ' 'linear transformation `init` ({})!' .format(n_components, init.shape[0]), - lmnn.fit, X, y) + nca.fit, X, y) def test_verbose(): @@ -362,6 +361,7 @@ def my_cb(transformation, n_iter): # check output assert('{} iterations remaining...'.format(max_iter-1) in out) + def test_store_opt_result(): X = iris_data y = iris_target @@ -372,6 +372,7 @@ def test_store_opt_result(): transformation = nca.opt_result_.x assert_equal(transformation.size, X.shape[1]**2) + def test_convergence_warning(): nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1) From 92faf4fabcad75524fb65210d9f7e2cfecb33e07 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 22 Jun 2018 10:04:22 +0200 Subject: [PATCH 32/79] ENH: Add warm_start feature from LMNN (PR #8602) --- sklearn/neighbors/nca.py | 24 ++++++++++-- sklearn/neighbors/tests/test_nca.py | 60 ++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 9311e58d227c2..5b55a4a09c5b5 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -60,6 +60,11 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. + warm_start : bool, optional, (default=False) + If True and :meth:`fit` has been called before, the solution of the + previous call to :meth:`fit` is used as the initial linear + transformation (``n_components`` and ``init`` will be ignored). + max_iter : int, optional (default=50) Maximum number of iterations in the optimization. @@ -162,13 +167,14 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): """ - def __init__(self, n_components=None, init='pca', max_iter=50, - tol=1e-5, callback=None, store_opt_result=False, verbose=0, - random_state=None): + def __init__(self, n_components=None, init='pca', warm_start=False, + max_iter=50, tol=1e-5, callback=None, store_opt_result=False, + verbose=0, random_state=None): # Parameters self.n_components = n_components self.init = init + self.warm_start = warm_start self.max_iter = max_iter self.tol = tol self.callback = callback @@ -321,6 +327,16 @@ def _validate_params(self, X, y): 'than the given data dimensionality ({})!' 
.format(self.n_components, X.shape[1])) + # If warm_start is enabled, check that the inputs are consistent + _check_scalar(self.warm_start, 'warm_start', bool) + if self.warm_start and hasattr(self, 'components_'): + if self.components_.shape[1] != X.shape[1]: + raise ValueError('The new inputs dimensionality ({}) does not ' + 'match the input dimensionality of the ' + 'previously learned transformation ({}).' + .format(X.shape[1], + self.components_.shape[1])) + _check_scalar(self.max_iter, 'max_iter', integer_types, 1) _check_scalar(self.tol, 'tol', float, 0.) _check_scalar(self.verbose, 'verbose', integer_types, 0) @@ -388,6 +404,8 @@ def _initialize(self, X, init): """ transformation = init + if self.warm_start and hasattr(self, 'components_'): + transformation = self.components_ if isinstance(init, np.ndarray): pass diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 4d2c92105eff6..53e331bcad888 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -5,7 +5,7 @@ from sklearn.externals.six import StringIO from sklearn.utils import check_random_state from sklearn.utils.testing import assert_raises, assert_equal, \ - assert_raise_message, assert_warns_message + assert_raise_message, assert_warns_message, assert_true from sklearn.datasets import load_iris, make_classification from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -115,6 +115,7 @@ def test_params_validation(): assert_raises(TypeError, NCA(verbose='true').fit, X, y) assert_raises(TypeError, NCA(tol=1).fit, X, y) assert_raises(TypeError, NCA(n_components='invalid').fit, X, y) + assert_raises(TypeError, NCA(warm_start=1).fit, X, y) # ValueError assert_raise_message(ValueError, @@ -250,6 +251,63 @@ def test_init_transformation(): nca.fit, X, y) +def test_warm_start_validation(): + X, y = make_classification(n_samples=30, n_features=5, n_classes=4, + n_redundant=0, n_informative=5, random_state=0) + + nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5) + nca.fit(X, y) + + X_less_features, y = \ + make_classification(n_samples=30, n_features=4, n_classes=4, + n_redundant=0, n_informative=4, random_state=0) + assert_raise_message(ValueError, + 'The new inputs dimensionality ({}) does not ' + 'match the input dimensionality of the ' + 'previously learned transformation ({}).' + .format(X_less_features.shape[1], + nca.components_.shape[1]), + nca.fit, X_less_features, y) + + +def test_warm_start_effectiveness(): + # A 1-iteration second fit on same data should give almost same result + # with warm starting, and quite different result without warm starting. 
+ + X, y = make_classification(n_samples=30, n_features=5, + n_redundant=0, random_state=0) + n_iter = 10 + + nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, + max_iter=n_iter, random_state=0) + nca_warm.fit(X, y) + transformation_warm = nca_warm.components_ + nca_warm.max_iter = 1 + nca_warm.fit(X, y) + transformation_warm_plus_one = nca_warm.components_ + + nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, + max_iter=n_iter, random_state=0) + nca_cold.fit(X, y) + transformation_cold = nca_cold.components_ + nca_cold.max_iter = 1 + nca_cold.fit(X, y) + transformation_cold_plus_one = nca_cold.components_ + + diff_warm = np.sum(np.abs(transformation_warm_plus_one - + transformation_warm)) + diff_cold = np.sum(np.abs(transformation_cold_plus_one - + transformation_cold)) + + assert_true(diff_warm < 2.0, + "Transformer changed significantly after one iteration even " + "though it was warm-started.") + + assert_true(diff_cold > diff_warm, + "Cold-started transformer changed less significantly than " + "warm-started transformer after one iteration.") + + def test_verbose(): # assert there is proper output when verbose = 1 old_stdout = sys.stdout From b172898ab1cc5d8dcea0c18ac91cf19714429db6 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 22 Jun 2018 10:06:20 +0200 Subject: [PATCH 33/79] FIX: rename remaining old n_features_out to n_components --- examples/neighbors/plot_nca_dim_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index ababdc849225c..edd7034274b69 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -67,7 +67,7 @@ # Reduce dimension to 2 with NeighborhoodComponentAnalysis nca = make_pipeline(StandardScaler(), - NeighborhoodComponentsAnalysis(n_features_out=2, + NeighborhoodComponentsAnalysis(n_components=2, random_state=random_state)) # Use a nearest neighbor classifier to evaluate the methods From 816f3de53663c533fab3644837a22959bdd7b578 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 22 Jun 2018 10:24:06 +0200 Subject: [PATCH 34/79] FIX: Update doc like in commit https://github.com/scikit-learn/scikit-learn/pull/8602/commits/7afa6bbea9fee0490843294bf3fe9da99ce5dc27 --- doc/modules/decomposition.rst | 4 ++++ doc/modules/neighbors.rst | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index d897377d16269..6411fc9472f5a 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -905,3 +905,7 @@ when data can be fetched sequentially. * `"Stochastic Variational Inference" `_ M. Hoffman, D. Blei, C. Wang, J. Paisley, 2013 + + +See also :ref:`_nca_dim_reduction` for dimensionality reduction with +Neighborhood Components Analysis. diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index ff7a19c05d7d4..2c9823fc2804f 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -613,6 +613,7 @@ The plot shows decision boundaries for Nearest Neighbor Classification and Neighborhood Components Analysis classification on the iris dataset, when training and scoring on only two features, for visualisation purpose. +.. _nca_dim_reduction: Dimensionality reduction ------------------------ @@ -620,7 +621,7 @@ Dimensionality reduction NCA can be used to perform supervised dimensionality reduction. 
The input data are projected onto a linear subspace consisting of the directions which minimize the NCA objective. The desired dimensionality can be set using the -parameter ``n_features_out``. For instance, the following figure shows a +parameter ``n_components``. For instance, the following figure shows a comparison of dimensionality reduction with Principal Component Analysis (:class:`sklearn.decomposition.PCA`), Linear Discriminant Analysis (:class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) and @@ -650,7 +651,7 @@ Mathematical formulation ------------------------ The goal of NCA is to learn an optimal linear transformation matrix of size -``(n_features_out, n_features)``, which maximises in average the probability +``(n_components, n_features)``, which maximises in average the probability :math:`p_i` of sample :math:`i` being correctly classified, i.e.: .. math:: @@ -708,13 +709,13 @@ NCA stores a matrix of pairwise distances, taking ``n_samples ** 2`` memory. Time complexity depends on the number of iterations done by the optimisation algorithm. However, one can set the maximum number of iterations with the argument ``max_iter``. For each iteration, time complexity is - ``O(n_features_out x n_samples x min(n_samples, n_features)``. + ``O(n_components x n_samples x min(n_samples, n_features)``. Transform ^^^^^^^^^ Here the ``transform`` operation returns :math:`LX^T`, therefore its time -complexity equals ``n_features_out * n_features * n_samples_test``. There is no +complexity equals ``n_components * n_features * n_samples_test``. There is no added space complexity in the operation. From 85b2cddf76a4270aba5ca7c046546b6104990384 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 22 Jun 2018 13:28:05 +0200 Subject: [PATCH 35/79] FIX: make test_warm_start_effectiveness_work - Fix if to elif (otherwise warm_start does not work) - Use iris dataset because it (empirically) makes a more significative difference between warm start and cold start NCA - Use the default max_iter (50 at time of writing) instead of 10 to make NCA converge more - Augment comparison threshold from 2 to 3 because in this case differences are a bit bigger --- sklearn/neighbors/nca.py | 2 +- sklearn/neighbors/tests/test_nca.py | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 5b55a4a09c5b5..c657af1f84bb4 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -407,7 +407,7 @@ def _initialize(self, X, init): if self.warm_start and hasattr(self, 'components_'): transformation = self.components_ - if isinstance(init, np.ndarray): + elif isinstance(init, np.ndarray): pass else: n_components = self.n_components or X.shape[1] diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 53e331bcad888..6cc4e059d82d3 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -274,20 +274,16 @@ def test_warm_start_effectiveness(): # A 1-iteration second fit on same data should give almost same result # with warm starting, and quite different result without warm starting. 
- X, y = make_classification(n_samples=30, n_features=5, - n_redundant=0, random_state=0) - n_iter = 10 + X, y = load_iris(return_X_y=True) - nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, - max_iter=n_iter, random_state=0) + nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, random_state=0) nca_warm.fit(X, y) transformation_warm = nca_warm.components_ nca_warm.max_iter = 1 nca_warm.fit(X, y) transformation_warm_plus_one = nca_warm.components_ - nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, - max_iter=n_iter, random_state=0) + nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, random_state=0) nca_cold.fit(X, y) transformation_cold = nca_cold.components_ nca_cold.max_iter = 1 @@ -299,7 +295,7 @@ def test_warm_start_effectiveness(): diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold)) - assert_true(diff_warm < 2.0, + assert_true(diff_warm < 3.0, "Transformer changed significantly after one iteration even " "though it was warm-started.") From 4ed68ddd4916a01297666a054c086e089a2a5e5e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 22 Jun 2018 14:57:33 +0200 Subject: [PATCH 36/79] ENH: Add possible LDA initialization --- sklearn/neighbors/nca.py | 52 +++++++++++++++++++---------- sklearn/neighbors/tests/test_nca.py | 11 ++++-- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index c657af1f84bb4..1fcc9070149b0 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -11,6 +11,7 @@ import numpy as np import sys import time +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis try: # scipy.misc.logsumexp is deprecated in scipy 1.0.0 from scipy.special import logsumexp except ImportError: @@ -38,12 +39,19 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): init : string or numpy array, optional (default='pca') Initialization of the linear transformation. Possible options are - 'pca', 'identity', 'random', and a numpy array of shape + 'pca', 'lda', 'identity', 'random', and a numpy array of shape (n_features_a, n_features_b). pca: ``n_components`` many principal components of the inputs passed to :meth:`fit` will be used to initialize the transformation. + (See :class:`~sklearn.decomposition.PCA`) + + lda: + ``n_components`` many most discriminative components of the inputs + passed to :meth:`fit` will be used to initialize the + transformation. (See + :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) identity: If ``n_components`` is strictly smaller than the @@ -214,7 +222,7 @@ def fit(self, X, y): # (n_samples, n_samples) # Initialize the transformation - transformation = self._initialize(X_valid, init) + transformation = self._initialize(X_valid, y_valid, init) # Create a dictionary of parameters to be passed to the optimizer disp = self.verbose - 2 if self.verbose > 1 else -1 @@ -376,16 +384,16 @@ def _validate_params(self, X, y): 'linear transformation `init` ({})!' .format(self.n_components, init.shape[0])) - elif init in ['pca', 'identity', 'random']: + elif init in ['pca', 'lda', 'identity', 'random']: pass else: raise ValueError( - "`init` must be 'pca', 'identity', 'random' or a numpy " + "`init` must be 'pca', 'lda', 'identity', 'random' or a numpy " "array of shape (n_components, n_features).") return X_valid, y_valid, init - def _initialize(self, X, init): + def _initialize(self, X, y, init): """Initialize the transformation. 
Parameters @@ -393,6 +401,9 @@ def _initialize(self, X, init): X : array-like, shape (n_samples, n_features) The training samples. + y : array-like, shape (n_samples,) + The training labels. + init : string or numpy array of shape (n_features_a, n_features_b) The validated initialization of the linear transformation. @@ -416,19 +427,26 @@ def _initialize(self, X, init): elif init == 'random': transformation = self.random_state_.randn(n_components, X.shape[1]) - elif init == 'pca': - pca = PCA(n_components=n_components, - random_state=self.random_state_) - t_pca = time.time() + elif init in {'pca', 'lda'}: + init_time = time.time() + if init == 'pca': + pca = PCA(n_components=n_components, + random_state=self.random_state_) + if self.verbose: + print('Finding principal components... ', end='') + sys.stdout.flush() + pca.fit(X) + transformation = pca.components_ + elif init == 'lda': + lda = LinearDiscriminantAnalysis(n_components=n_components) + if self.verbose: + print('Finding most discriminative components... ', + end='') + sys.stdout.flush() + lda.fit(X, y) + transformation = lda.scalings_.T[:n_components] if self.verbose: - print('Finding principal components... ', end='') - sys.stdout.flush() - - pca.fit(X) - if self.verbose: - print('done in {:5.2f}s'.format(time.time() - t_pca)) - - transformation = pca.components_ + print('done in {:5.2f}s'.format(time.time() - init_time)) return transformation def _callback(self, transformation): diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 6cc4e059d82d3..b25e9313ef268 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -78,7 +78,7 @@ def test_finite_differences(): mask = y[:, np.newaxis] == y[np.newaxis, :] # (n_samples, n_samples) nca.n_iter_ = 0 - point = nca._initialize(X, init) + point = nca._initialize(X, y, init) # compute the gradient at `point` _, gradient = nca._loss_grad_lbfgs(point, X, mask) @@ -119,8 +119,9 @@ def test_params_validation(): # ValueError assert_raise_message(ValueError, - "`init` must be 'pca', 'identity', 'random' or a " - "numpy array of shape (n_components, n_features).", + "`init` must be 'pca', 'lda', 'identity', 'random' " + "or a numpy array of shape " + "(n_components, n_features).", NCA(init=1).fit, X, y) assert_raise_message(ValueError, '`max_iter`= -1, must be >= 1.', @@ -214,6 +215,10 @@ def test_init_transformation(): nca_pca = NeighborhoodComponentsAnalysis(init='pca') nca_pca.fit(X, y) + # Initialize with LDA + nca_lda = NeighborhoodComponentsAnalysis(init='lda') + nca_lda.fit(X, y) + init = np.random.rand(X.shape[1], X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) nca.fit(X, y) From 1f9c20872a91f6254e852b4d047670a9c1f72d93 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 25 Jun 2018 10:41:34 +0200 Subject: [PATCH 37/79] ENH: add 'auto' initialization - Change nca code + docstrings - Make test - Update illustration example because axes scale should be the same to better see euclidean distances (the example with updated code (lda initialization) makes data more stretched hence we see more the problem --- examples/neighbors/plot_nca_illustration.py | 2 + sklearn/neighbors/nca.py | 39 ++++++++++++----- sklearn/neighbors/tests/test_nca.py | 47 ++++++++++++++++++++- 3 files changed, 76 insertions(+), 12 deletions(-) diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 68a4c303366b4..70f2d8b230b1a 100644 --- 
a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -69,6 +69,7 @@ def relate_point(X, i, ax): ax.set_title("Original points") ax.axes.get_xaxis().set_visible(False) ax.axes.get_yaxis().set_visible(False) +ax.axis('equal') # Learn an embedding with NeighborhoodComponentsAnalysis nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state) @@ -93,4 +94,5 @@ def relate_point(X, i, ax): ax2.set_title("NCA embedding") ax2.axes.get_xaxis().set_visible(False) ax2.axes.get_yaxis().set_visible(False) +ax2.axis('equal') plt.show() diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 1fcc9070149b0..95d3ff3876642 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -37,20 +37,30 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): Preferred dimensionality of the embedding. If None it is inferred from ``init``. - init : string or numpy array, optional (default='pca') + init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are - 'pca', 'lda', 'identity', 'random', and a numpy array of shape + 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape (n_features_a, n_features_b). + 'auto': + Depending on ``n_components``, the most reasonable initialization + will be chosen among the following ones. First, we try to use + 'lda', as it uses labels information: if ``n_components <= + n_classes``, ``init='lda'``. If we can't, we then try 'pca', as it + projects data in meaningful directions (those of higher variance): + if ``n_components < min(n_features, n_samples)``, ``init = 'pca'``. + Otherwise, we just use 'identity'. + pca: ``n_components`` many principal components of the inputs passed to :meth:`fit` will be used to initialize the transformation. (See :class:`~sklearn.decomposition.PCA`) lda: - ``n_components`` many most discriminative components of the inputs - passed to :meth:`fit` will be used to initialize the - transformation. (See + ``min(n_components, n_classes)`` many most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) identity: @@ -175,7 +185,7 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): """ - def __init__(self, n_components=None, init='pca', warm_start=False, + def __init__(self, n_components=None, init='auto', warm_start=False, max_iter=50, tol=1e-5, callback=None, store_opt_result=False, verbose=0, random_state=None): @@ -384,12 +394,12 @@ def _validate_params(self, X, y): 'linear transformation `init` ({})!' 
.format(self.n_components, init.shape[0])) - elif init in ['pca', 'lda', 'identity', 'random']: + elif init in ['auto', 'pca', 'lda', 'identity', 'random']: pass else: raise ValueError( - "`init` must be 'pca', 'lda', 'identity', 'random' or a numpy " - "array of shape (n_components, n_features).") + "`init` must be 'auto', 'pca', 'lda', 'identity', 'random' " + "or a numpy array of shape (n_components, n_features).") return X_valid, y_valid, init @@ -421,7 +431,16 @@ def _initialize(self, X, y, init): elif isinstance(init, np.ndarray): pass else: - n_components = self.n_components or X.shape[1] + n_samples, n_features = X.shape + n_components = self.n_components or n_features + if init == 'auto': + n_classes = len(np.unique(y)) + if n_components <= n_classes: + init = 'lda' + elif n_components < min(n_features, n_samples): + init = 'pca' + else: + init = 'identity' if init == 'identity': transformation = np.eye(n_components, X.shape[1]) elif init == 'random': diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index b25e9313ef268..baeb81921a7ec 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,6 +1,8 @@ +import pytest import sys import numpy as np from numpy.testing import assert_array_equal +from sklearn import clone from sklearn.exceptions import ConvergenceWarning from sklearn.externals.six import StringIO from sklearn.utils import check_random_state @@ -119,8 +121,8 @@ def test_params_validation(): # ValueError assert_raise_message(ValueError, - "`init` must be 'pca', 'lda', 'identity', 'random' " - "or a numpy array of shape " + "`init` must be 'auto', 'pca', 'lda', 'identity', " + "'random' or a numpy array of shape " "(n_components, n_features).", NCA(init=1).fit, X, y) assert_raise_message(ValueError, @@ -211,6 +213,10 @@ def test_init_transformation(): nca_random = NeighborhoodComponentsAnalysis(init='random') nca_random.fit(X, y) + # Initialize with auto + nca_auto = NeighborhoodComponentsAnalysis(init='auto') + nca_auto.fit(X, y) + # Initialize with PCA nca_pca = NeighborhoodComponentsAnalysis(init='pca') nca_pca.fit(X, y) @@ -256,6 +262,43 @@ def test_init_transformation(): nca.fit, X, y) +@pytest.mark.parametrize('n_samples', [17, 19, 23, 29]) +@pytest.mark.parametrize('n_features', [17, 19, 23, 29]) +@pytest.mark.parametrize('n_classes', [17, 19, 23]) +@pytest.mark.parametrize('n_components', [17, 19, 23, 29]) +def test_auto_init(n_samples, n_features, n_classes, n_components): + # Test that auto choose the init as expected with every configuration + # of order of n_samples, n_features, n_classes and n_components. 
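The 'auto' option validated here follows a fixed selection rule. A sketch of that rule as it stands at this point in the series, using a hypothetical helper name that is not part of the patch:

    def resolve_auto_init(n_samples, n_features, n_classes, n_components):
        # Prefer 'lda' because it uses the labels, then 'pca' because it
        # keeps the directions of largest variance, and fall back to a
        # (possibly truncated) identity matrix otherwise.
        if n_components <= n_classes:
            return 'lda'
        elif n_components < min(n_features, n_samples):
            return 'pca'
        else:
            return 'identity'

The test below checks that fitting with ``init='auto'`` yields the same transformation as fitting with the corresponding explicit initialization.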
+ RNG = check_random_state(0) + nca_base = NeighborhoodComponentsAnalysis(init='auto', + n_components=n_components, + max_iter=1, random_state=RNG) + if n_classes >= n_samples: + pass + # n_classes > n_samples is impossible, and n_classes == n_samples + # throws an error from lda but is an absurd case + else: + X = RNG.randn(n_samples, n_features) + y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples] + if n_components > n_features: + pass + else: + nca = clone(nca_base) + nca.fit(X, y) + if n_components <= n_classes: + nca_lda = clone(nca_base).set_params(init='lda') + nca_lda.fit(X, y) + assert_array_equal(nca.components_, nca_lda.components_) + elif n_components < min(n_features, n_samples): + nca_pca = clone(nca_base).set_params(init='pca') + nca_pca.fit(X, y) + assert_array_equal(nca.components_, nca_pca.components_) + else: + nca_id = clone(nca_base).set_params(init='identity') + nca_id.fit(X, y) + assert_array_equal(nca.components_, nca_id.components_) + + def test_warm_start_validation(): X, y = make_classification(n_samples=30, n_features=5, n_classes=4, n_redundant=0, n_informative=5, random_state=0) From e050128f3ea8f7ff3d3b13be791d7b6a3748ea93 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 25 Jun 2018 14:19:57 +0200 Subject: [PATCH 38/79] FIX test appropriate message depending on init --- sklearn/neighbors/tests/test_nca.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index baeb81921a7ec..4cdc8c4964047 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -352,12 +352,22 @@ def test_warm_start_effectiveness(): "warm-started transformer after one iteration.") -def test_verbose(): - # assert there is proper output when verbose = 1 +@pytest.mark.parametrize('init_name', ['pca', 'lda', 'identity', 'random', + 'precomputed']) +def test_verbose(init_name): + # assert there is proper output when verbose = 1, for every initialization + # except auto because auto will call one of the others + msgs = {'pca': "Finding principal components", + 'lda': "Finding most discriminative components", + 'identity': '', 'random': '', 'precomputed': ''} + if init_name == 'precomputed': + init = rng.randn(iris_data.shape[1], iris_data.shape[1]) + else: + init = init_name old_stdout = sys.stdout sys.stdout = StringIO() - nca = NeighborhoodComponentsAnalysis(verbose=1) + nca = NeighborhoodComponentsAnalysis(verbose=1, init=init) try: nca.fit(iris_data, iris_target) finally: @@ -367,8 +377,7 @@ def test_verbose(): # check output assert("[NeighborhoodComponentsAnalysis]" in out) - assert("Finding principal components" in out) - assert ("Finding principal components" in out) + assert(msgs[init_name] in out) assert ("Training took" in out) # assert by default there is no output (verbose=0) From ead98504b0333532596e2fb2a9f23106e72967a2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 25 Jun 2018 14:48:25 +0200 Subject: [PATCH 39/79] FIX import name with relative path --- sklearn/neighbors/nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 95d3ff3876642..6688e5d65d227 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -11,12 +11,12 @@ import numpy as np import sys import time -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis try: # scipy.misc.logsumexp is deprecated in scipy 1.0.0 from 
scipy.special import logsumexp except ImportError: from scipy.misc import logsumexp from scipy.optimize import minimize +from ..discriminant_analysis import LinearDiscriminantAnalysis from ..metrics import pairwise_distances from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder From a807df2ce0580cf26b0c79fd77fca8d9ff3cda26 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 25 Jun 2018 15:29:02 +0200 Subject: [PATCH 40/79] FIX simplify test and check almost equal to pass tests on linux 32 bits --- sklearn/neighbors/tests/test_nca.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 4cdc8c4964047..b4b7434e24fcb 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,7 +1,7 @@ import pytest import sys import numpy as np -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_array_almost_equal from sklearn import clone from sklearn.exceptions import ConvergenceWarning from sklearn.externals.six import StringIO @@ -269,16 +269,15 @@ def test_init_transformation(): def test_auto_init(n_samples, n_features, n_classes, n_components): # Test that auto choose the init as expected with every configuration # of order of n_samples, n_features, n_classes and n_components. - RNG = check_random_state(0) nca_base = NeighborhoodComponentsAnalysis(init='auto', n_components=n_components, - max_iter=1, random_state=RNG) + max_iter=1, random_state=rng) if n_classes >= n_samples: pass # n_classes > n_samples is impossible, and n_classes == n_samples # throws an error from lda but is an absurd case else: - X = RNG.randn(n_samples, n_features) + X = rng.randn(n_samples, n_features) y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples] if n_components > n_features: pass @@ -286,17 +285,13 @@ def test_auto_init(n_samples, n_features, n_classes, n_components): nca = clone(nca_base) nca.fit(X, y) if n_components <= n_classes: - nca_lda = clone(nca_base).set_params(init='lda') - nca_lda.fit(X, y) - assert_array_equal(nca.components_, nca_lda.components_) + nca_other = clone(nca_base).set_params(init='lda') elif n_components < min(n_features, n_samples): - nca_pca = clone(nca_base).set_params(init='pca') - nca_pca.fit(X, y) - assert_array_equal(nca.components_, nca_pca.components_) + nca_other = clone(nca_base).set_params(init='pca') else: - nca_id = clone(nca_base).set_params(init='identity') - nca_id.fit(X, y) - assert_array_equal(nca.components_, nca_id.components_) + nca_other = clone(nca_base).set_params(init='identity') + nca_other.fit(X, y) + assert_array_almost_equal(nca.components_, nca_other.components_) def test_warm_start_validation(): From e00d4a1122890ef21634066efcdee8f4652a841b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Jun 2018 16:48:53 +0200 Subject: [PATCH 41/79] FIX Move LDA import inside NCA class to avoid circular dependencies LDA was importing at one point neighbors hence NCA --- sklearn/neighbors/nca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 6688e5d65d227..6ee2f05e5d9e7 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -16,7 +16,6 @@ except ImportError: from scipy.misc import logsumexp from scipy.optimize import minimize -from ..discriminant_analysis import LinearDiscriminantAnalysis from ..metrics 
import pairwise_distances from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder @@ -457,6 +456,8 @@ def _initialize(self, X, y, init): pca.fit(X) transformation = pca.components_ elif init == 'lda': + from ..discriminant_analysis import \ + LinearDiscriminantAnalysis lda = LinearDiscriminantAnalysis(n_components=n_components) if self.verbose: print('Finding most discriminative components... ', From aa90c9bcbdbed0843b5388d60f1e2551f872091c Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 28 Jun 2018 13:49:08 +0200 Subject: [PATCH 42/79] DOC add what s new entry --- doc/whats_new/v0.20.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 7383b17cf1e6b..42385f5018d79 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -139,6 +139,11 @@ Model evaluation Decomposition, manifold learning and clustering +- Added :class:`neighbors.NeighborhoodComponentsAnalysis`, which implements the + Neighborhood Components Analysis metric learning algorithm described in + Goldberger et al. (2005). :issue:`10058` by :user:`William de Vazelhes + ` and :user:`John Chiotellis `. + - :class:`cluster.AgglomerativeClustering` now supports Single Linkage clustering via ``linkage='single'``. :issue:`9372` by :user:`Leland McInnes ` and :user:`Steve Astels `. From 85bd54fbcb501594fe6747a6cb4a18563cd98fe6 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 29 Jun 2018 14:25:14 +0200 Subject: [PATCH 43/79] MAINT simplify gradient testing --- sklearn/neighbors/tests/test_nca.py | 69 ++++++----------------------- 1 file changed, 13 insertions(+), 56 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index b4b7434e24fcb..1f083cd7938b1 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -2,6 +2,7 @@ import sys import numpy as np from numpy.testing import assert_array_equal, assert_array_almost_equal +from scipy.optimize import check_grad from sklearn import clone from sklearn.exceptions import ConvergenceWarning from sklearn.externals.six import StringIO @@ -41,69 +42,25 @@ def test_simple_example(): def test_finite_differences(): - r"""Test gradient of loss function - - Test if the gradient is correct by computing the relative difference - between the projected gradient PG: - - .. math:: - - PG = \mathbf d^{\top} \cdot \nabla - \mathcal L(\mathbf x) - - and the finite differences FD: - - .. math:: - - FD = \frac{\mathcal L(\mathbf x + \epsilon \mathbf d) - - \mathcal L(\mathbf x - \epsilon \mathbf d)}{2 \epsilon} - - - where :math:`d` is a random direction (random vector of shape `n_features`, - and norm 1), :math:`\epsilon` is a very small number, :math:`\mathcal L` is - the loss function and :math:`\nabla \mathcal L` is its gradient. This - relative difference should be zero: - - .. math :: - - \frac{|PG -FD|}{|PG|} = 0 - + """Test gradient of loss function + Assert that the gradient is almost equal to its finite differences + approximation. 
""" - # Initialize `transformation`, `X` and `y` and `NCA` - X = iris_data - y = iris_target - point = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) - nca = NeighborhoodComponentsAnalysis(init=point) - - X, y, init = nca._validate_params(X, y) - mask = y[:, np.newaxis] == y[np.newaxis, :] # (n_samples, n_samples) + # Initialize the transformation `M`, as well as `X` and `y` and `NCA` + X, y = make_classification() + M = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) + nca = NeighborhoodComponentsAnalysis() nca.n_iter_ = 0 + mask = y[:, np.newaxis] == y[np.newaxis, :] - point = nca._initialize(X, y, init) - # compute the gradient at `point` - _, gradient = nca._loss_grad_lbfgs(point, X, mask) - - # create a random direction of norm 1 - random_direction = rng.randn(*point.shape) - random_direction /= np.linalg.norm(random_direction) - - # computes projected gradient - projected_gradient = random_direction.ravel().dot( - gradient.ravel()) + def fun(M): return nca._loss_grad_lbfgs(M, X, mask)[0] - # compute finite differences - eps = 1e-5 - right_loss, _ = nca._loss_grad_lbfgs(point + eps * random_direction, X, - mask) - left_loss, _ = nca._loss_grad_lbfgs(point - eps * random_direction, X, - mask) - finite_differences = 1 / (2 * eps) * (right_loss - left_loss) + def grad(M): return nca._loss_grad_lbfgs(M, X, mask)[1] # compute relative error - relative_error = np.abs(finite_differences - projected_gradient) / \ - np.abs(projected_gradient) - np.testing.assert_almost_equal(relative_error, 0.) + rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M)) + np.testing.assert_almost_equal(rel_diff, 0., decimal=6) def test_params_validation(): From aa9ace7dc07303cbe9fbf7da48639235600036f0 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 29 Jun 2018 15:11:58 +0200 Subject: [PATCH 44/79] TST FIX be more tolerant on decimals for older versions of numerical packages --- sklearn/neighbors/tests/test_nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 1f083cd7938b1..3f6c4b4ab9c3d 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -60,7 +60,7 @@ def grad(M): return nca._loss_grad_lbfgs(M, X, mask)[1] # compute relative error rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M)) - np.testing.assert_almost_equal(rel_diff, 0., decimal=6) + np.testing.assert_almost_equal(rel_diff, 0., decimal=5) def test_params_validation(): From cc072617e4670cafdec22df71b15021295e95b96 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 29 Jun 2018 15:48:50 +0200 Subject: [PATCH 45/79] STY fix continuation lines, removing backslashes --- sklearn/neighbors/nca.py | 4 ++-- sklearn/neighbors/tests/test_nca.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 6ee2f05e5d9e7..ace942a03091a 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -456,8 +456,8 @@ def _initialize(self, X, y, init): pca.fit(X) transformation = pca.components_ elif init == 'lda': - from ..discriminant_analysis import \ - LinearDiscriminantAnalysis + from ..discriminant_analysis import ( + LinearDiscriminantAnalysis) lda = LinearDiscriminantAnalysis(n_components=n_components) if self.verbose: print('Finding most discriminative components... 
', diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 3f6c4b4ab9c3d..c96fdf5653869 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -7,8 +7,9 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.externals.six import StringIO from sklearn.utils import check_random_state -from sklearn.utils.testing import assert_raises, assert_equal, \ - assert_raise_message, assert_warns_message, assert_true +from sklearn.utils.testing import (assert_raises, assert_equal, + assert_raise_message, assert_warns_message, + assert_true) from sklearn.datasets import load_iris, make_classification from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -258,9 +259,9 @@ def test_warm_start_validation(): nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5) nca.fit(X, y) - X_less_features, y = \ - make_classification(n_samples=30, n_features=4, n_classes=4, - n_redundant=0, n_informative=4, random_state=0) + X_less_features, y = make_classification(n_samples=30, n_features=4, + n_classes=4, n_redundant=0, + n_informative=4, random_state=0) assert_raise_message(ValueError, 'The new inputs dimensionality ({}) does not ' 'match the input dimensionality of the ' From 16cf04d26ea41786e5b158bda66ebcf6f408f25e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Sun, 15 Jul 2018 06:28:49 +0200 Subject: [PATCH 46/79] FIX: fix logsumexp import --- examples/neighbors/plot_nca_illustration.py | 2 +- sklearn/neighbors/nca.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 70f2d8b230b1a..bc020fc4a1d40 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -16,7 +16,7 @@ from sklearn.datasets import make_classification from sklearn.neighbors import NeighborhoodComponentsAnalysis from matplotlib import cm -from scipy.misc import logsumexp +from sklearn.utils.fixes import logsumexp print(__doc__) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index ace942a03091a..d88cf05ec8d24 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -11,11 +11,8 @@ import numpy as np import sys import time -try: # scipy.misc.logsumexp is deprecated in scipy 1.0.0 - from scipy.special import logsumexp -except ImportError: - from scipy.misc import logsumexp from scipy.optimize import minimize +from ..utils.fixes import logsumexp from ..metrics import pairwise_distances from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder From 8c7af3c41a2b8509130837dd407e4ec0fc7a6b08 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 23 Jul 2018 14:21:28 +0200 Subject: [PATCH 47/79] TST: simplify verbose testing with pytest capsys --- sklearn/neighbors/tests/test_nca.py | 44 ++++++++--------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index c96fdf5653869..ccea465cbde0e 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,11 +1,9 @@ import pytest -import sys import numpy as np from numpy.testing import assert_array_equal, assert_array_almost_equal from scipy.optimize import check_grad from sklearn import clone from sklearn.exceptions import ConvergenceWarning -from sklearn.externals.six import 
StringIO from sklearn.utils import check_random_state from sklearn.utils.testing import (assert_raises, assert_equal, assert_raise_message, assert_warns_message, @@ -307,7 +305,7 @@ def test_warm_start_effectiveness(): @pytest.mark.parametrize('init_name', ['pca', 'lda', 'identity', 'random', 'precomputed']) -def test_verbose(init_name): +def test_verbose(init_name, capsys): # assert there is proper output when verbose = 1, for every initialization # except auto because auto will call one of the others msgs = {'pca': "Finding principal components", @@ -317,34 +315,21 @@ def test_verbose(init_name): init = rng.randn(iris_data.shape[1], iris_data.shape[1]) else: init = init_name - old_stdout = sys.stdout - sys.stdout = StringIO() - nca = NeighborhoodComponentsAnalysis(verbose=1, init=init) - try: - nca.fit(iris_data, iris_target) - finally: - out = sys.stdout.getvalue() - sys.stdout.close() - sys.stdout = old_stdout + nca.fit(iris_data, iris_target) + out, _ = capsys.readouterr() # check output assert("[NeighborhoodComponentsAnalysis]" in out) assert(msgs[init_name] in out) assert ("Training took" in out) - # assert by default there is no output (verbose=0) - old_stdout = sys.stdout - sys.stdout = StringIO() +def test_no_verbose(capsys): + # assert by default there is no output (verbose=0) nca = NeighborhoodComponentsAnalysis() - try: - nca.fit(iris_data, iris_target) - finally: - out = sys.stdout.getvalue() - sys.stdout.close() - sys.stdout = old_stdout - + nca.fit(iris_data, iris_target) + out, _ = capsys.readouterr() # check output assert(out == '') @@ -396,7 +381,7 @@ def test_one_class(): assert_array_equal(X, nca.transform(X)) -def test_callback(): +def test_callback(capsys): X = iris_data y = iris_target @@ -410,20 +395,13 @@ def my_cb(transformation, n_iter): print('{} iterations remaining...'.format(rem_iter)) # assert that my_cb is called - old_stdout = sys.stdout - sys.stdout = StringIO() - nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1) - try: - nca.fit(iris_data, iris_target) - finally: - out = sys.stdout.getvalue() - sys.stdout.close() - sys.stdout = old_stdout + nca.fit(iris_data, iris_target) + out, _ = capsys.readouterr() # check output - assert('{} iterations remaining...'.format(max_iter-1) in out) + assert('{} iterations remaining...'.format(max_iter - 1) in out) def test_store_opt_result(): From 27f2b5c9db1c341f5a4e53fba2705111a686ce00 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 1 Aug 2018 15:11:05 +0200 Subject: [PATCH 48/79] TST: check more explicitely verbose --- sklearn/neighbors/tests/test_nca.py | 30 +++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index ccea465cbde0e..e9d3573c444ed 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,4 +1,5 @@ import pytest +import re import numpy as np from numpy.testing import assert_array_equal, assert_array_almost_equal from scipy.optimize import check_grad @@ -308,9 +309,9 @@ def test_warm_start_effectiveness(): def test_verbose(init_name, capsys): # assert there is proper output when verbose = 1, for every initialization # except auto because auto will call one of the others - msgs = {'pca': "Finding principal components", - 'lda': "Finding most discriminative components", - 'identity': '', 'random': '', 'precomputed': ''} + regexp_init = '... 
done in \ *\d+\.\d{2}s' + msgs = {'pca': "Finding principal components" + regexp_init, + 'lda': "Finding most discriminative components" + regexp_init} if init_name == 'precomputed': init = rng.randn(iris_data.shape[1], iris_data.shape[1]) else: @@ -320,9 +321,26 @@ def test_verbose(init_name, capsys): out, _ = capsys.readouterr() # check output - assert("[NeighborhoodComponentsAnalysis]" in out) - assert(msgs[init_name] in out) - assert ("Training took" in out) + lines = re.split('\n+', out) + # if pca or lda init, an additional line is printed, so we test + # it and remove it to test the rest equally among initializations + if init_name in ['pca', 'lda']: + assert re.match(msgs[init_name], lines[0]) + lines = lines[1:] + assert lines[0] == '[NeighborhoodComponentsAnalysis]' + header = '{:>10} {:>20} {:>10}'.format('Iteration', 'Objective Value', + 'Time(s)') + assert lines[1] == '[NeighborhoodComponentsAnalysis] {}'.format(header) + assert lines[2] == ('[NeighborhoodComponentsAnalysis] {}' + .format('-' * len(header))) + for line in lines[3:-2]: + # The following regex will match for instance: + #  '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01' + assert re.match("\[NeighborhoodComponentsAnalysis\]\ *\d+\ *\d\.\d{6}e" + "[+|-]\d+\ *\d+\.\d{2}", line) + assert re.match("\[NeighborhoodComponentsAnalysis\] Training took\ *" + "\d+\.\d{2}s\.", lines[-2]) + assert lines[-1] == '' def test_no_verbose(capsys): From 85f8d21fac948da2349e7ba69419e600f9e954da Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 1 Aug 2018 17:58:27 +0200 Subject: [PATCH 49/79] FIX: remove non-ASCII character --- sklearn/neighbors/tests/test_nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index e9d3573c444ed..dd3cf6f3ac474 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -335,7 +335,7 @@ def test_verbose(init_name, capsys): .format('-' * len(header))) for line in lines[3:-2]: # The following regex will match for instance: - #  '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01' + # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01' assert re.match("\[NeighborhoodComponentsAnalysis\]\ *\d+\ *\d\.\d{6}e" "[+|-]\d+\ *\d+\.\d{2}", line) assert re.match("\[NeighborhoodComponentsAnalysis\] Training took\ *" From 396f30f337cb1ffadd3b80145d7fc2651b231b6d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 17 Aug 2018 19:29:37 +0200 Subject: [PATCH 50/79] ENH: simplify gradient expression --- sklearn/neighbors/nca.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index d88cf05ec8d24..259d47d3cd891 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -533,10 +533,11 @@ def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): # Compute gradient of loss w.r.t. 
`transform` weighted_p_ij = masked_p_ij - p_ij * p - gradient = 2 * (X_embedded.T.dot(weighted_p_ij + weighted_p_ij.T) - - X_embedded.T * np.sum(weighted_p_ij, axis=0)).dot(X) - # time complexity: O(n_components x n_samples x - # min(n_samples, n_features)) + weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T + np.fill_diagonal(weighted_p_ij_sym, - weighted_p_ij.sum(axis=0)) + gradient = 2 * (X_embedded.T.dot(weighted_p_ij_sym)).dot(X) + # time complexity of the gradient: O(n_components x n_samples x ( + # n_samples + n_features)) if self.verbose: t_funcall = time.time() - t_funcall From 88303737ba4fab3df7d86db0ce4798d94cf9bde7 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 29 Nov 2018 18:04:24 +0100 Subject: [PATCH 51/79] MAINT: address review https://github.com/scikit-learn/scikit-learn/pull/10058#pullrequestreview-176109007 --- doc/modules/neighbors.rst | 2 -- sklearn/neighbors/tests/test_nca.py | 28 +++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 2c9823fc2804f..a99db42d867ab 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -512,7 +512,6 @@ the model from 0.81 to 0.82. classification using nearest centroid with different shrink thresholds. - .. _nca: Neighborhood Components Analysis @@ -549,7 +548,6 @@ stochastic neighbors are from the same class as sample 3, guaranteeing that the latter will be well classified. - Classification -------------- diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index dd3cf6f3ac474..fc07f521a610c 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -36,11 +36,34 @@ def test_simple_example(): nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity', random_state=42) nca.fit(X, y) - Xansformed = nca.transform(X) - np.testing.assert_equal(pairwise_distances(Xansformed).argsort()[:, 1], + X_t = nca.transform(X) + np.testing.assert_equal(pairwise_distances(X_t).argsort()[:, 1], np.array([2, 3, 0, 1])) +def test_toy_example_collapse_points(): + """Test on a toy example of three points that should collapse + + Test that on this simple example, the new points are collapsed: + Two same label points with a different label point in the middle. + The objective is 2/(1 + exp(d/2)), with d the euclidean distance + between the two same labels points. This is maximized for d=0 + (because d>=0), with an objective equal to 1 (loss=-1.). + + """ + input_dim = 5 + two_points = rng.randn(2, input_dim) + X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]]) + y = [0, 0, 1] + nca = NeighborhoodComponentsAnalysis(random_state=42, + store_opt_result=True) + X_t = nca.fit_transform(X, y) + print(X_t) + # test that points are collapsed into one point + assert_array_almost_equal(X_t - X_t[0], 0.) + assert nca.opt_result_.fun == -1. 
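The gradient rewrite in ``nca.py`` above is algebraically equivalent to the previous expression whenever ``weighted_p_ij`` has a zero diagonal, which holds in this implementation because a sample is excluded from its own neighbours (p_ii = 0). A standalone sanity check of that identity, with shapes chosen only for illustration:

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 10, 5, 3
    X = rng.randn(n_samples, n_features)
    X_embedded = rng.randn(n_samples, n_components)
    W = rng.rand(n_samples, n_samples)
    np.fill_diagonal(W, 0.)  # mimics p_ii = 0

    old = 2 * (X_embedded.T.dot(W + W.T) -
               X_embedded.T * np.sum(W, axis=0)).dot(X)
    W_sym = W + W.T
    np.fill_diagonal(W_sym, -W.sum(axis=0))
    new = 2 * X_embedded.T.dot(W_sym).dot(X)
    assert np.allclose(old, new)

Folding the column-sum correction into the diagonal lets the gradient be computed as a single product chain, matching the updated complexity comment of O(n_components x n_samples x (n_samples + n_features)).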
+ + def test_finite_differences(): """Test gradient of loss function @@ -434,7 +457,6 @@ def test_store_opt_result(): def test_convergence_warning(): - nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1) cls_name = nca.__class__.__name__ assert_warns_message(ConvergenceWarning, From ded5ecb0a6940c9e068761f00d4529aefb3a6d1b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 29 Nov 2018 18:16:32 +0100 Subject: [PATCH 52/79] DOC: Add what's new entry --- doc/whats_new/v0.21.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 4d472a0bb9835..034fca54e054a 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -106,6 +106,12 @@ Support for Python 3.4 and below has been officially dropped. when called before fit :issue:`12279` by :user:`Krishna Sangeeth `. +- |MajorFeature| A metric learning algorithm: + :class:`neighbors.NeighborhoodComponentsAnalysis`, which implements the + Neighborhood Components Analysis algorithm described in Goldberger et al. + (2005). :issue:`10058` by :user:`William de Vazelhes + ` and :user:`John Chiotellis `. + :mod:`sklearn.pipeline` ....................... From 589f57d92942e49dd9622596b5028f91855486fb Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Dec 2018 15:36:56 +0100 Subject: [PATCH 53/79] FIX: try raw string to pass flake8 (cf. https://github.com/iodide-project/pyodide/pull/204#issuecomment-426269946) --- sklearn/neighbors/tests/test_nca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index fc07f521a610c..777c0b53efdab 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -359,10 +359,10 @@ def test_verbose(init_name, capsys): for line in lines[3:-2]: # The following regex will match for instance: # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01' - assert re.match("\[NeighborhoodComponentsAnalysis\]\ *\d+\ *\d\.\d{6}e" - "[+|-]\d+\ *\d+\.\d{2}", line) - assert re.match("\[NeighborhoodComponentsAnalysis\] Training took\ *" - "\d+\.\d{2}s\.", lines[-2]) + assert re.match(r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e" + "[+|-]\d+\ *\d+\.\d{2}", line) + assert re.match(r"\[NeighborhoodComponentsAnalysis\] Training took\ *" + "\d+\.\d{2}s\.", lines[-2]) assert lines[-1] == '' From 600adf231a9964dd7f5e7a061493cd238c9768e2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Dec 2018 15:44:45 +0100 Subject: [PATCH 54/79] FIX: try the exact syntax that passed the linter --- sklearn/neighbors/tests/test_nca.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 777c0b53efdab..260cbd60b053b 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -332,7 +332,7 @@ def test_warm_start_effectiveness(): def test_verbose(init_name, capsys): # assert there is proper output when verbose = 1, for every initialization # except auto because auto will call one of the others - regexp_init = '... done in \ *\d+\.\d{2}s' + regexp_init = r'... 
done in \ *\d+\.\d{2}s' msgs = {'pca': "Finding principal components" + regexp_init, 'lda': "Finding most discriminative components" + regexp_init} if init_name == 'precomputed': @@ -359,10 +359,10 @@ def test_verbose(init_name, capsys): for line in lines[3:-2]: # The following regex will match for instance: # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01' - assert re.match(r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e" - "[+|-]\d+\ *\d+\.\d{2}", line) - assert re.match(r"\[NeighborhoodComponentsAnalysis\] Training took\ *" - "\d+\.\d{2}s\.", lines[-2]) + assert re.match(r'\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e' + r'[+|-]\d+\ *\d+\.\d{2}', line) + assert re.match(r'\[NeighborhoodComponentsAnalysis\] Training took\ *' + r'\d+\.\d{2}s\.', lines[-2]) assert lines[-1] == '' From d274c4a3770ace6b08f8704cef9eb1177319f3ce Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Dec 2018 16:19:50 +0100 Subject: [PATCH 55/79] TST: give some tolerance for test_toy_example_collapse_points --- sklearn/neighbors/tests/test_nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 260cbd60b053b..80a605ec9f88b 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -61,7 +61,7 @@ def test_toy_example_collapse_points(): print(X_t) # test that points are collapsed into one point assert_array_almost_equal(X_t - X_t[0], 0.) - assert nca.opt_result_.fun == -1. + assert nca.opt_result_.fun + 1 < 1e-10 def test_finite_differences(): From 2dbf064f14650b3b5895f05ca25e0ea0e1e11691 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 7 Dec 2018 10:28:32 +0100 Subject: [PATCH 56/79] relaunch travis From e17003ed9a8778b7a817f84b0d84f5bebdf44f2c Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 12 Dec 2018 12:56:15 +0100 Subject: [PATCH 57/79] FIX: use checked_random_state instead of np.random --- iterate.dat | 29 +++++++++++++++++++++++++++++ sklearn/neighbors/tests/test_nca.py | 12 ++++++------ 2 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 iterate.dat diff --git a/iterate.dat b/iterate.dat new file mode 100644 index 0000000000000..02903b12a3c14 --- /dev/null +++ b/iterate.dat @@ -0,0 +1,29 @@ +RUNNING THE L-BFGS-B CODE + +it = iteration number +nf = number of function evaluations +nseg = number of segments explored during the Cauchy search +nact = number of active bounds at the generalized Cauchy point +sub = manner in which the subspace minimization terminated: + con = converged, bnd = a bound was reached +itls = number of iterations performed in the line search +stepl = step length used +tstep = norm of the displacement (total step) +projg = norm of the projected gradient +f = function value + + * * * + +Machine precision = 2.220D-16 + N = 3 M = 10 + + it nf nseg nact sub itls stepl tstep projg f + 0 1 - - - - - - 6.279D+01 1.386D+02 + 1 2 1 0 --- 0 1.2D-02 1.0D+00 2.422D+01 9.713D+01 + 2 3 0 0 con 0 1.0D+00 4.1D-01 1.605D+01 8.872D+01 + 3 4 0 0 con 0 1.0D+00 7.1D-01 3.502D+00 8.158D+01 + +STOP: TOTAL NO. of ITERATIONS EXCEEDS LIMIT + + Total User time 0.000E+00 seconds. 
+ diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 80a605ec9f88b..a25deacf3a0fe 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -109,7 +109,7 @@ def test_params_validation(): '`max_iter`= -1, must be >= 1.', NCA(max_iter=-1).fit, X, y) - init = np.random.rand(5, 3) + init = rng.rand(5, 3) assert_raise_message(ValueError, 'The output dimensionality ({}) of the given linear ' 'transformation `init` cannot be greater than its ' @@ -153,7 +153,7 @@ def test_n_components(): X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] - init = np.random.rand(X.shape[1] - 1, 3) + init = rng.rand(X.shape[1] - 1, 3) # n_components = X.shape[1] != transformation.shape[0] n_components = X.shape[1] @@ -205,12 +205,12 @@ def test_init_transformation(): nca_lda = NeighborhoodComponentsAnalysis(init='lda') nca_lda.fit(X, y) - init = np.random.rand(X.shape[1], X.shape[1]) + init = rng.rand(X.shape[1], X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) nca.fit(X, y) # init.shape[1] must match X.shape[1] - init = np.random.rand(X.shape[1], X.shape[1] + 1) + init = rng.rand(X.shape[1], X.shape[1] + 1) nca = NeighborhoodComponentsAnalysis(init=init) assert_raise_message(ValueError, 'The input dimensionality ({}) of the given ' @@ -220,7 +220,7 @@ def test_init_transformation(): nca.fit, X, y) # init.shape[0] must be <= init.shape[1] - init = np.random.rand(X.shape[1] + 1, X.shape[1]) + init = rng.rand(X.shape[1] + 1, X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) assert_raise_message(ValueError, 'The output dimensionality ({}) of the given ' @@ -230,7 +230,7 @@ def test_init_transformation(): nca.fit, X, y) # init.shape[0] must match n_components - init = np.random.rand(X.shape[1], X.shape[1]) + init = rng.rand(X.shape[1], X.shape[1]) n_components = X.shape[1] - 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, From 32118aa753df9fc2ee9101a48cd2bfeabfae8dda Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 12 Dec 2018 14:58:24 +0100 Subject: [PATCH 58/79] FIX: delete iterate.dat --- iterate.dat | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 iterate.dat diff --git a/iterate.dat b/iterate.dat deleted file mode 100644 index 02903b12a3c14..0000000000000 --- a/iterate.dat +++ /dev/null @@ -1,29 +0,0 @@ -RUNNING THE L-BFGS-B CODE - -it = iteration number -nf = number of function evaluations -nseg = number of segments explored during the Cauchy search -nact = number of active bounds at the generalized Cauchy point -sub = manner in which the subspace minimization terminated: - con = converged, bnd = a bound was reached -itls = number of iterations performed in the line search -stepl = step length used -tstep = norm of the displacement (total step) -projg = norm of the projected gradient -f = function value - - * * * - -Machine precision = 2.220D-16 - N = 3 M = 10 - - it nf nseg nact sub itls stepl tstep projg f - 0 1 - - - - - - 6.279D+01 1.386D+02 - 1 2 1 0 --- 0 1.2D-02 1.0D+00 2.422D+01 9.713D+01 - 2 3 0 0 con 0 1.0D+00 4.1D-01 1.605D+01 8.872D+01 - 3 4 0 0 con 0 1.0D+00 7.1D-01 3.502D+00 8.158D+01 - -STOP: TOTAL NO. of ITERATIONS EXCEEDS LIMIT - - Total User time 0.000E+00 seconds. 
- From cf55015a98e3e9a49c72212ed0c2f394c00d6c09 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 12 Dec 2018 17:22:20 +0100 Subject: [PATCH 59/79] FIX: Fix dealing with the case of LinearDiscriminantAnalysis initialization: - fixed when to choose LinearDisciminantAnalysis when given 'auto' initialization - adapted the tests where 'lda' init can be set so that this cases will not return an error --- sklearn/neighbors/nca.py | 2 +- sklearn/neighbors/tests/test_nca.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 259d47d3cd891..c0089365aae8d 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -431,7 +431,7 @@ def _initialize(self, X, y, init): n_components = self.n_components or n_features if init == 'auto': n_classes = len(np.unique(y)) - if n_components <= n_classes: + if n_components <= min(n_features, n_classes - 1): init = 'lda' elif n_components < min(n_features, n_samples): init = 'pca' diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index a25deacf3a0fe..716cdd59b421b 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -9,7 +9,7 @@ from sklearn.utils.testing import (assert_raises, assert_equal, assert_raise_message, assert_warns_message, assert_true) -from sklearn.datasets import load_iris, make_classification +from sklearn.datasets import load_iris, make_classification, make_blobs from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -182,8 +182,7 @@ def test_n_components(): def test_init_transformation(): - X, y = make_classification(n_samples=30, n_features=5, - n_redundant=0, random_state=0) + X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) # Start learning from scratch nca = NeighborhoodComponentsAnalysis(init='identity') @@ -260,11 +259,13 @@ def test_auto_init(n_samples, n_features, n_classes, n_components): X = rng.randn(n_samples, n_features) y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples] if n_components > n_features: + # this would return a ValueError, which is already tested in + # test_params_validation pass else: nca = clone(nca_base) nca.fit(X, y) - if n_components <= n_classes: + if n_components <= min(n_classes - 1, n_features): nca_other = clone(nca_base).set_params(init='lda') elif n_components < min(n_features, n_samples): nca_other = clone(nca_base).set_params(init='pca') @@ -332,15 +333,16 @@ def test_warm_start_effectiveness(): def test_verbose(init_name, capsys): # assert there is proper output when verbose = 1, for every initialization # except auto because auto will call one of the others + X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) regexp_init = r'... 
done in \ *\d+\.\d{2}s' msgs = {'pca': "Finding principal components" + regexp_init, 'lda': "Finding most discriminative components" + regexp_init} if init_name == 'precomputed': - init = rng.randn(iris_data.shape[1], iris_data.shape[1]) + init = rng.randn(X.shape[1], X.shape[1]) else: init = init_name nca = NeighborhoodComponentsAnalysis(verbose=1, init=init) - nca.fit(iris_data, iris_target) + nca.fit(X, y) out, _ = capsys.readouterr() # check output From 44839a022ba6c1810f2c20299081541dce63e869 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 18 Jan 2019 16:50:35 +0100 Subject: [PATCH 60/79] Address reviews https://github.com/scikit-learn/scikit-learn/pull/10058#pullrequestreview-191082492 and https://github.com/scikit-learn/scikit-learn/pull/10058#pullrequestreview-191186183 --- doc/modules/neighbors.rst | 23 +---- examples/neighbors/plot_nca_classification.py | 4 +- examples/neighbors/plot_nca_dim_reduction.py | 4 +- sklearn/neighbors/nca.py | 91 +++++++++---------- 4 files changed, 49 insertions(+), 73 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 86820b9e19c52..9a89a13b98f6a 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -537,9 +537,8 @@ data visualization and fast classification. .. centered:: |nca_illustration_1| |nca_illustration_2| - In the above illustrating figure, we consider some points from a randomly -generated dataset. We focus on the stochastic KNN classification of point n°3, +generated dataset. We focus on the stochastic KNN classification of point no. 3, the thickness of a bond representing a softmax distance hence the weight of the neighbor vote in the classification. In the original space, sample 3 has many stochastic neighbors from various classes, so the right class is not very @@ -568,26 +567,6 @@ transformation with a :class:`KNeighborsClassifier` instance that performs the classification in the embedding space. Here is an example using the two classes: - >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis - >>> from sklearn.neighbors import KNeighborsClassifier - >>> from sklearn.datasets import load_iris - >>> from sklearn.model_selection import train_test_split - >>> X, y = load_iris(return_X_y=True) - >>> X_train, X_test, y_train, y_test = train_test_split(X, y, - ... stratify=y, test_size=0.7, random_state=42) - >>> nca = NeighborhoodComponentsAnalysis(random_state=42) - >>> nca.fit(X_train, y_train) # doctest: +ELLIPSIS - NeighborhoodComponentsAnalysis(...) - >>> # Apply the learned transformation when using KNeighborsClassifier - >>> knn = KNeighborsClassifier(n_neighbors=3) - >>> knn.fit(nca.transform(X_train), y_train) # doctest: +ELLIPSIS - KNeighborsClassifier(...) - >>> print(knn.score(nca.transform(X_test), y_test)) # doctest: +ELLIPSIS - 0.96190476... 
- -Alternatively, one can create a :class:`sklearn.pipeline.Pipeline` instance -that automatically applies the transformation when fitting or predicting: - >>> from sklearn.pipeline import Pipeline >>> nca = NeighborhoodComponentsAnalysis(random_state=42) >>> knn = KNeighborsClassifier(n_neighbors=3) diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py index 4cb8f225e8c00..94b81416d802d 100644 --- a/examples/neighbors/plot_nca_classification.py +++ b/examples/neighbors/plot_nca_classification.py @@ -21,8 +21,8 @@ from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -from sklearn.neighbors import KNeighborsClassifier, \ - NeighborhoodComponentsAnalysis +from sklearn.neighbors import (KNeighborsClassifier, + NeighborhoodComponentsAnalysis) from sklearn.pipeline import Pipeline diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index edd7034274b69..1ee0031fdeb1c 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -35,8 +35,8 @@ from sklearn.model_selection import train_test_split from sklearn.decomposition import PCA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from sklearn.neighbors import KNeighborsClassifier, \ - NeighborhoodComponentsAnalysis +from sklearn.neighbors import (KNeighborsClassifier, + NeighborhoodComponentsAnalysis) from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index c0089365aae8d..438b0c8730b68 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -12,7 +12,7 @@ import sys import time from scipy.optimize import minimize -from ..utils.fixes import logsumexp +from ..utils.extmath import softmax from ..metrics import pairwise_distances from ..base import BaseEstimator, TransformerMixin from ..preprocessing import LabelEncoder @@ -27,18 +27,25 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): """Neighborhood Components Analysis + Neighborhood Component Analysis (NCA) is a machine learning algorithm for + metric learning. It learns a linear transformation in a supervised fashion + to improve the classification accuracy of a stochastic nearest neighbors + rule in the transformed space. + + Read more in the :ref:`User Guide `. + Parameters ---------- n_components : int, optional (default=None) Preferred dimensionality of the embedding. - If None it is inferred from ``init``. + If None it will be set to ``n_features``. init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape (n_features_a, n_features_b). - 'auto': + 'auto' Depending on ``n_components``, the most reasonable initialization will be chosen among the following ones. First, we try to use 'lda', as it uses labels information: if ``n_components <= @@ -47,29 +54,29 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): if ``n_components < min(n_features, n_samples)``, ``init = 'pca'``. Otherwise, we just use 'identity'. - pca: - ``n_components`` many principal components of the inputs passed + 'pca' + ``n_components`` principal components of the inputs passed to :meth:`fit` will be used to initialize the transformation. 
- (See :class:`~sklearn.decomposition.PCA`) + (See :class:`PCA`) - lda: - ``min(n_components, n_classes)`` many most discriminative + 'lda' + ``min(n_components, n_classes)`` most discriminative components of the inputs passed to :meth:`fit` will be used to initialize the transformation. (If ``n_components > n_classes``, the rest of the components will be zero.) (See - :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + :class:`LinearDiscriminantAnalysis`) - identity: + 'identity' If ``n_components`` is strictly smaller than the dimensionality of the inputs passed to :meth:`fit`, the identity matrix will be truncated to the first ``n_components`` rows. - random: + 'random' The initial transformation will be a random array of shape (n_components, n_features). Each value is sampled from the standard normal distribution. - numpy array: + numpy array n_features_b must match the dimensionality of the inputs passed to :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. @@ -162,13 +169,6 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): >>> print(knn.score(nca.transform(X_test), y_test)) # doctest: +ELLIPSIS 0.961904... - Notes - ----- - Neighborhood Component Analysis (NCA) is a machine learning algorithm for - metric learning. It learns a linear transformation in a supervised fashion - to improve the classification accuracy of a stochastic nearest neighbors - rule in the transformed space. - References ---------- .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. @@ -184,8 +184,6 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): def __init__(self, n_components=None, init='auto', warm_start=False, max_iter=50, tol=1e-5, callback=None, store_opt_result=False, verbose=0, random_state=None): - - # Parameters self.n_components = n_components self.init = init self.warm_start = warm_start @@ -215,7 +213,7 @@ def fit(self, X, y): # Verify inputs X and y and NCA parameters, and transform a copy if # needed - X_valid, y_valid, init = self._validate_params(X, y) + X, y, init = self._validate_params(X, y) # Initialize the random generator self.random_state_ = check_random_state(self.random_state) @@ -223,18 +221,18 @@ def fit(self, X, y): # Measure the total training time t_train = time.time() - # Compute mask that stays fixed during optimization: - mask = y_valid[:, np.newaxis] == y_valid[np.newaxis, :] + # Compute a mask that stays fixed during optimization: + same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] # (n_samples, n_samples) # Initialize the transformation - transformation = self._initialize(X_valid, y_valid, init) + transformation = self._initialize(X, y, init) # Create a dictionary of parameters to be passed to the optimizer disp = self.verbose - 2 if self.verbose > 1 else -1 optimizer_params = {'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, - 'args': (X_valid, mask, -1.0), + 'args': (X, same_class_mask, -1.0), 'jac': True, 'x0': transformation, 'tol': self.tol, @@ -247,7 +245,7 @@ def fit(self, X, y): opt_result = minimize(**optimizer_params) # Reshape the solution found by the optimizer - self.components_ = opt_result.x.reshape(-1, X_valid.shape[1]) + self.components_ = opt_result.x.reshape(-1, X.shape[1]) # Stop timer t_train = time.time() - t_train @@ -305,10 +303,10 @@ def _validate_params(self, X, y): Returns ------- - X_valid : array, shape (n_samples, n_features) + X : array, shape (n_samples, n_features) The validated training 
samples. - y_valid : array, shape (n_samples,) + y : array, shape (n_samples,) The validated training labels, encoded to be integers in the range(0, n_classes). @@ -326,9 +324,9 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. - X_valid, y_valid = check_X_y(X, y, ensure_min_samples=2) - check_classification_targets(y_valid) - y_valid = LabelEncoder().fit_transform(y_valid) + X, y = check_X_y(X, y, ensure_min_samples=2) + check_classification_targets(y) + y = LabelEncoder().fit_transform(y) # Check the preferred embedding dimensionality if self.n_components is not None: @@ -366,12 +364,12 @@ def _validate_params(self, X, y): init = check_array(init) # Assert that init.shape[1] = X.shape[1] - if init.shape[1] != X_valid.shape[1]: + if init.shape[1] != X.shape[1]: raise ValueError( 'The input dimensionality ({}) of the given ' 'linear transformation `init` must match the ' 'dimensionality of the given inputs `X` ({}).' - .format(init.shape[1], X_valid.shape[1])) + .format(init.shape[1], X.shape[1])) # Assert that init.shape[0] <= init.shape[1] if init.shape[0] > init.shape[1]: @@ -397,7 +395,7 @@ def _validate_params(self, X, y): "`init` must be 'auto', 'pca', 'lda', 'identity', 'random' " "or a numpy array of shape (n_components, n_features).") - return X_valid, y_valid, init + return X, y, init def _initialize(self, X, y, init): """Initialize the transformation. @@ -423,7 +421,6 @@ def _initialize(self, X, y, init): transformation = init if self.warm_start and hasattr(self, 'components_'): transformation = self.components_ - elif isinstance(init, np.ndarray): pass else: @@ -479,19 +476,19 @@ def _callback(self, transformation): self.n_iter_ += 1 - def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): + def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): """Compute the loss and the loss gradient w.r.t. ``transformation``. Parameters ---------- - transformation : array, shape (n_components, n_features) - The linear transformation on which to compute loss and evaluate - gradient + transformation : array, shape (n_components * n_features,) + The raveled linear transformation on which to compute loss and evaluate + gradient. X : array, shape (n_samples, n_features) The training samples. - mask : array, shape (n_samples, n_samples) + same_class_mask : array, shape (n_samples, n_samples) A mask where ``mask[i, j] == 1`` if ``X[i]`` and ``X[j]`` belong to the same class, and ``0`` otherwise. @@ -523,20 +520,20 @@ def _loss_grad_lbfgs(self, transformation, X, mask, sign=1.0): # Compute softmax distances p_ij = pairwise_distances(X_embedded, squared=True) np.fill_diagonal(p_ij, np.inf) - p_ij = np.exp(-p_ij - logsumexp(-p_ij, axis=1)[:, np.newaxis]) - # (n_samples, n_samples) + p_ij = softmax(-p_ij) # (n_samples, n_samples) # Compute loss - masked_p_ij = p_ij * mask + masked_p_ij = p_ij * same_class_mask p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1) loss = np.sum(p) # Compute gradient of loss w.r.t. 
`transform` weighted_p_ij = masked_p_ij - p_ij * p weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T - np.fill_diagonal(weighted_p_ij_sym, - weighted_p_ij.sum(axis=0)) - gradient = 2 * (X_embedded.T.dot(weighted_p_ij_sym)).dot(X) + np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0)) + gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X) # time complexity of the gradient: O(n_components x n_samples x ( + # n_samples + n_features)) if self.verbose: @@ -572,7 +569,7 @@ def _check_scalar(x, name, target_type, min_val=None, max_val=None): The minimum value value the parameter can take. If None (default) it is implied that the parameter does not have a lower bound. - max_val: float or int, optional (default=None) + max_val : float or int, optional (default=None) The maximum valid value the parameter can take. If None (default) it is implied that the parameter does not have an upper bound. From 822620d457d0d0015616dc2e7dff43de20c10a86 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 18 Jan 2019 17:00:20 +0100 Subject: [PATCH 61/79] STY: fix PEP8 line too long error --- sklearn/neighbors/nca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 438b0c8730b68..df5be58aeee33 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -482,8 +482,8 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): Parameters ---------- transformation : array, shape (n_components * n_features,) - The raveled linear transformation on which to compute loss and evaluate - gradient. + The raveled linear transformation on which to compute loss and + evaluate gradient. X : array, shape (n_samples, n_features) The training samples. From 41d3cefda5de1c9d6a1bad4f8100dd078a1a2008 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 18 Jan 2019 18:05:34 +0100 Subject: [PATCH 62/79] Fix doctest --- doc/modules/neighbors.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 9a89a13b98f6a..17d72695a4b9f 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -567,7 +567,14 @@ transformation with a :class:`KNeighborsClassifier` instance that performs the classification in the embedding space. Here is an example using the two classes: + >>> from sklearn.neighbors import (NeighborhoodComponentsAnalysis, + ... KNeighborsClassifier) + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split >>> from sklearn.pipeline import Pipeline + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... 
stratify=y, test_size=0.7, random_state=42) >>> nca = NeighborhoodComponentsAnalysis(random_state=42) >>> knn = KNeighborsClassifier(n_neighbors=3) >>> nca_pipe = Pipeline([('nca', nca), ('knn', knn)]) From faa84fc693241a0b550702d608575b0f67b3bd45 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 22 Jan 2019 09:18:22 +0100 Subject: [PATCH 63/79] FIX: remove deprecated assert_true --- sklearn/neighbors/tests/test_nca.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 716cdd59b421b..da024afa3be05 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -7,8 +7,7 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.utils import check_random_state from sklearn.utils.testing import (assert_raises, assert_equal, - assert_raise_message, assert_warns_message, - assert_true) + assert_raise_message, assert_warns_message) from sklearn.datasets import load_iris, make_classification, make_blobs from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -319,13 +318,13 @@ def test_warm_start_effectiveness(): diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold)) - assert_true(diff_warm < 3.0, - "Transformer changed significantly after one iteration even " - "though it was warm-started.") + assert (diff_warm < 3.0, + "Transformer changed significantly after one iteration even " + "though it was warm-started.") - assert_true(diff_cold > diff_warm, - "Cold-started transformer changed less significantly than " - "warm-started transformer after one iteration.") + assert (diff_cold > diff_warm, + "Cold-started transformer changed less significantly than " + "warm-started transformer after one iteration.") @pytest.mark.parametrize('init_name', ['pca', 'lda', 'identity', 'random', From db2950a8b68826740dd680c37c495fcfdd9e2126 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 22 Jan 2019 09:42:41 +0100 Subject: [PATCH 64/79] TST fix assertion always true in tests --- sklearn/neighbors/tests/test_nca.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index da024afa3be05..2073819f2f1b7 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -317,14 +317,12 @@ def test_warm_start_effectiveness(): transformation_warm)) diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold)) + assert diff_warm < 3.0, ("Transformer changed significantly after one " + "iteration even though it was warm-started.") - assert (diff_warm < 3.0, - "Transformer changed significantly after one iteration even " - "though it was warm-started.") - - assert (diff_cold > diff_warm, - "Cold-started transformer changed less significantly than " - "warm-started transformer after one iteration.") + assert diff_cold > diff_warm, ("Cold-started transformer changed less " + "significantly than warm-started " + "transformer after one iteration.") @pytest.mark.parametrize('init_name', ['pca', 'lda', 'identity', 'random', From f16770c8caf9cdd56636fc486d337041694f2552 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 22 Jan 2019 09:48:33 +0100 Subject: [PATCH 65/79] TST: fix PEP8 indent error --- sklearn/neighbors/tests/test_nca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 2073819f2f1b7..2b30ae6b7c801 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -318,7 +318,7 @@ def test_warm_start_effectiveness(): diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold)) assert diff_warm < 3.0, ("Transformer changed significantly after one " - "iteration even though it was warm-started.") + "iteration even though it was warm-started.") assert diff_cold > diff_warm, ("Cold-started transformer changed less " "significantly than warm-started " From 49189c69c20743426dea83ba0e406a519776478e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 22 Jan 2019 12:03:54 +0100 Subject: [PATCH 66/79] API: remove the possibility to store the opt_result (see https://github.com/scikit-learn/scikit-learn/pull/10058#discussion_r247865298) --- sklearn/neighbors/nca.py | 37 ++-------------------- sklearn/neighbors/tests/test_nca.py | 49 ++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index df5be58aeee33..e8f302fff7eed 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -98,11 +98,6 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): and the number of iterations. This might be useful in case one wants to examine or store the transformation found after each iteration. - store_opt_result : bool, optional (default=False) - If True, the :class:`scipy.optimize.OptimizeResult` object returned by - :meth:`minimize` of `scipy.optimize` will be stored as attribute - ``opt_result_``. - verbose : int, optional (default=0) If 0, no progress messages will be printed. If 1, progress messages will be printed to stdout. @@ -124,29 +119,6 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): n_iter_ : int Counts the number of iterations performed by the optimizer. - opt_result_ : scipy.optimize.OptimizeResult (optional) - A dictionary of information representing the optimization result. - This is stored only if ``store_opt_result`` is True. It contains the - following attributes: - - x : ndarray - The solution of the optimization. - success : bool - Whether or not the optimizer exited successfully. - status : int - Termination status of the optimizer. - message : str - Description of the cause of the termination. - fun, jac : ndarray - Values of objective function and its Jacobian. - hess_inv : scipy.sparse.linalg.LinearOperator - the product of a vector with the approximate inverse of the - Hessian of the objective function.. - nfev : int - Number of evaluations of the objective function.. - nit : int - Number of iterations performed by the optimizer. 
- Examples -------- >>> from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis @@ -182,15 +154,14 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): """ def __init__(self, n_components=None, init='auto', warm_start=False, - max_iter=50, tol=1e-5, callback=None, store_opt_result=False, - verbose=0, random_state=None): + max_iter=50, tol=1e-5, callback=None, verbose=0, + random_state=None): self.n_components = n_components self.init = init self.warm_start = warm_start self.max_iter = max_iter self.tol = tol self.callback = callback - self.store_opt_result = store_opt_result self.verbose = verbose self.random_state = random_state @@ -260,10 +231,6 @@ def fit(self, X, y): print('[{}] Training took {:8.2f}s.'.format(cls_name, t_train)) - # Optionally store information returned by the optimizer - if self.store_opt_result: - self.opt_result_ = opt_result - return self def transform(self, X): diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 2b30ae6b7c801..574e4db9371b0 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -54,13 +54,32 @@ def test_toy_example_collapse_points(): two_points = rng.randn(2, input_dim) X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]]) y = [0, 0, 1] + + class LossStorer: + + def __init__(self, X, y): + self.loss = np.inf # initialize the loss to very high + # Initialize a fake NCA and variables needed to compute the loss: + self.fake_nca = NeighborhoodComponentsAnalysis() + self.fake_nca.n_iter_ = np.inf + self.X, y, _ = self.fake_nca._validate_params(X, y) + self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + + def callback(self, transformation, n_iter): + """Stores the last value of the loss function""" + self.loss, _ = self.fake_nca._loss_grad_lbfgs(transformation, + self.X, + self.same_class_mask, + -1.0) + + loss_storer = LossStorer(X, y) nca = NeighborhoodComponentsAnalysis(random_state=42, - store_opt_result=True) + callback=loss_storer.callback) X_t = nca.fit_transform(X, y) print(X_t) # test that points are collapsed into one point assert_array_almost_equal(X_t - X_t[0], 0.) 
- assert nca.opt_result_.fun + 1 < 1e-10 + assert abs(loss_storer.loss + 1) < 1e-10 def test_finite_differences(): @@ -444,15 +463,31 @@ def my_cb(transformation, n_iter): assert('{} iterations remaining...'.format(max_iter - 1) in out) -def test_store_opt_result(): +def test_expected_transformation_shape(): + """Test that the transformation has the expected shape.""" X = iris_data y = iris_target - nca = NeighborhoodComponentsAnalysis(max_iter=5, - store_opt_result=True) + class TransformationStorer: + + def __init__(self, X, y): + # Initialize a fake NCA and variables needed to call the loss + # function: + self.fake_nca = NeighborhoodComponentsAnalysis() + self.fake_nca.n_iter_ = np.inf + self.X, y, _ = self.fake_nca._validate_params(X, y) + self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + + def callback(self, transformation, n_iter): + """Stores the last value of the transformation taken as input by + the optimizer""" + self.transformation = transformation + + transformation_storer = TransformationStorer(X, y) + cb = transformation_storer.callback + nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=cb) nca.fit(X, y) - transformation = nca.opt_result_.x - assert_equal(transformation.size, X.shape[1]**2) + assert_equal(transformation_storer.transformation.size, X.shape[1]**2) def test_convergence_warning(): From f015bade8cf4b3a3dafdfdd5c9c0778a705b248c Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 25 Feb 2019 12:36:36 +0100 Subject: [PATCH 67/79] Move examples up in documentation and add NCA to manifold examples --- doc/modules/neighbors.rst | 12 ++++++------ examples/manifold/plot_lle_digits.py | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 17d72695a4b9f..8ade8230fbe90 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -631,6 +631,12 @@ each method. Each data sample belongs to one of 10 classes. .. centered:: |nca_dim_reduction_1| |nca_dim_reduction_2| |nca_dim_reduction_3| +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_classification.py` + * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_dim_reduction.py` + * :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` + Mathematical formulation ------------------------ @@ -703,12 +709,6 @@ complexity equals ``n_components * n_features * n_samples_test``. There is no added space complexity in the operation. -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_classification.py` - * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_dim_reduction.py` - - .. topic:: References: * | `"Neighbourhood Components Analysis". Advances in Neural Information" diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py index 133d81bab0f62..699bd612eccd4 100644 --- a/examples/manifold/plot_lle_digits.py +++ b/examples/manifold/plot_lle_digits.py @@ -15,6 +15,11 @@ this example, which is not the default setting. It ensures global stability of the embedding, i.e., the embedding does not depend on random initialization. + +Linear Discriminant Analysis, from the :mod:`sklearn.discriminant_analysis` +module, and Neighborhood Components Analysis, from the :mod:`sklearn.neighbors` +module, are supervised dimensionality reduction method, i.e. they make use of +the provided labels, contrary to other methods. 
""" # Authors: Fabian Pedregosa @@ -30,7 +35,7 @@ import matplotlib.pyplot as plt from matplotlib import offsetbox from sklearn import (manifold, datasets, decomposition, ensemble, - discriminant_analysis, random_projection) + discriminant_analysis, random_projection, neighbors) digits = datasets.load_digits(n_class=6) X = digits.data @@ -227,4 +232,15 @@ def plot_embedding(X, title=None): "t-SNE embedding of the digits (time %.2fs)" % (time() - t0)) +#---------------------------------------------------------------------- +# NCA embedding of the digits dataset +print("Computing NCA embedding") +nca = neighbors.NeighborhoodComponentsAnalysis(n_components=2, random_state=0) +t0 = time() +X_nca = nca.fit_transform(X, y) + +plot_embedding(X_nca, + "NCA embedding of the digits (time %.2fs)" % + (time() - t0)) + plt.show() From 0e5d5b33a6b143ab01696f81f430ccc8af8d731f Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 25 Feb 2019 13:57:37 +0100 Subject: [PATCH 68/79] STY: fix pep8 errors --- examples/manifold/plot_lle_digits.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py index 699bd612eccd4..de893bff4207e 100644 --- a/examples/manifold/plot_lle_digits.py +++ b/examples/manifold/plot_lle_digits.py @@ -44,7 +44,7 @@ n_neighbors = 30 -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Scale and visualize the embedding vectors def plot_embedding(X, title=None): x_min, x_max = np.min(X, 0), np.max(X, 0) @@ -75,7 +75,7 @@ def plot_embedding(X, title=None): plt.title(title) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Plot images of the digits n_img_per_row = 20 img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row)) @@ -91,7 +91,7 @@ def plot_embedding(X, title=None): plt.title('A selection from the 64-dimensional digits dataset') -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Random 2D projection using a random unitary matrix print("Computing random projection") rp = random_projection.SparseRandomProjection(n_components=2, random_state=42) @@ -109,7 +109,7 @@ def plot_embedding(X, title=None): "Principal Components projection of the digits (time %.2fs)" % (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Projection on to the first 2 linear discriminant components print("Computing Linear Discriminant Analysis projection") @@ -122,7 +122,7 @@ def plot_embedding(X, title=None): (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Isomap projection of the digits dataset print("Computing Isomap embedding") t0 = time() @@ -133,7 +133,7 @@ def plot_embedding(X, title=None): (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Locally linear embedding of the digits dataset print("Computing LLE embedding") clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, @@ -146,7 +146,7 @@ def plot_embedding(X, title=None): (time() - t0)) 
-#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Modified Locally linear embedding of the digits dataset print("Computing modified LLE embedding") clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, @@ -159,7 +159,7 @@ def plot_embedding(X, title=None): (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # HLLE embedding of the digits dataset print("Computing Hessian LLE embedding") clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, @@ -172,7 +172,7 @@ def plot_embedding(X, title=None): (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # LTSA embedding of the digits dataset print("Computing LTSA embedding") clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, @@ -184,7 +184,7 @@ def plot_embedding(X, title=None): "Local Tangent Space Alignment of the digits (time %.2fs)" % (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # MDS embedding of the digits dataset print("Computing MDS embedding") clf = manifold.MDS(n_components=2, n_init=1, max_iter=100) @@ -195,7 +195,7 @@ def plot_embedding(X, title=None): "MDS embedding of the digits (time %.2fs)" % (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Random Trees embedding of the digits dataset print("Computing Totally Random Trees embedding") hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, @@ -209,7 +209,7 @@ def plot_embedding(X, title=None): "Random forest embedding of the digits (time %.2fs)" % (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Spectral embedding of the digits dataset print("Computing Spectral embedding") embedder = manifold.SpectralEmbedding(n_components=2, random_state=0, @@ -221,7 +221,7 @@ def plot_embedding(X, title=None): "Spectral embedding of the digits (time %.2fs)" % (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # t-SNE embedding of the digits dataset print("Computing t-SNE embedding") tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) @@ -232,7 +232,7 @@ def plot_embedding(X, title=None): "t-SNE embedding of the digits (time %.2fs)" % (time() - t0)) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # NCA embedding of the digits dataset print("Computing NCA embedding") nca = neighbors.NeighborhoodComponentsAnalysis(n_components=2, random_state=0) From 77dc953afc58425b5f31d9571f6c7b47aeecb12d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Feb 2019 10:20:52 +0100 Subject: [PATCH 69/79] adress gael's review except https://github.com/scikit-learn/scikit-learn/pull/10058#pullrequestreview-207481064 --- sklearn/neighbors/nca.py | 63 ++++------------------------- sklearn/neighbors/tests/test_nca.py | 47 ++++++++++++--------- sklearn/utils/validation.py | 42 
+++++++++++++++++++ 3 files changed, 78 insertions(+), 74 deletions(-) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index e8f302fff7eed..62bea4ab70839 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -19,7 +19,8 @@ from ..decomposition import PCA from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state -from ..utils.validation import check_is_fitted, check_array, check_X_y +from ..utils.validation import (check_is_fitted, check_array, check_X_y, + check_scalar) from ..externals.six import integer_types from ..exceptions import ConvergenceWarning @@ -297,8 +298,8 @@ def _validate_params(self, X, y): # Check the preferred embedding dimensionality if self.n_components is not None: - _check_scalar(self.n_components, 'n_components', - integer_types, 1) + check_scalar(self.n_components, 'n_components', + integer_types, 1) if self.n_components > X.shape[1]: raise ValueError('The preferred embedding dimensionality ' @@ -307,7 +308,7 @@ def _validate_params(self, X, y): .format(self.n_components, X.shape[1])) # If warm_start is enabled, check that the inputs are consistent - _check_scalar(self.warm_start, 'warm_start', bool) + check_scalar(self.warm_start, 'warm_start', bool) if self.warm_start and hasattr(self, 'components_'): if self.components_.shape[1] != X.shape[1]: raise ValueError('The new inputs dimensionality ({}) does not ' @@ -316,9 +317,9 @@ def _validate_params(self, X, y): .format(X.shape[1], self.components_.shape[1])) - _check_scalar(self.max_iter, 'max_iter', integer_types, 1) - _check_scalar(self.tol, 'tol', float, 0.) - _check_scalar(self.verbose, 'verbose', integer_types, 0) + check_scalar(self.max_iter, 'max_iter', integer_types, 1) + check_scalar(self.tol, 'tol', float, 0.) + check_scalar(self.verbose, 'verbose', integer_types, 0) if self.callback is not None: if not callable(self.callback): @@ -500,7 +501,6 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0)) gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X) # time complexity of the gradient: O(n_components x n_samples x ( - # n_samples + n_features)) if self.verbose: @@ -511,50 +511,3 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): sys.stdout.flush() return sign * loss, sign * gradient.ravel() - - -########################## -# Some helper functions # -######################### - - -def _check_scalar(x, name, target_type, min_val=None, max_val=None): - """Validate scalar parameters type and value. - - Parameters - ---------- - x : object - The scalar parameter to validate. - - name : str - The name of the parameter to be printed in error messages. - - target_type : type or tuple - Acceptable data types for the parameter. - - min_val : float or int, optional (default=None) - The minimum value value the parameter can take. If None (default) it - is implied that the parameter does not have a lower bound. - - max_val : float or int, optional (default=None) - The maximum valid value the parameter can take. If None (default) it - is implied that the parameter does not have an upper bound. - - Raises - ------- - TypeError - If the parameter's type does not match the desired type. - - ValueError - If the parameter's value violates the given bounds. - """ - - if not isinstance(x, target_type): - raise TypeError('`{}` must be an instance of {}, not {}.' 
- .format(name, target_type, type(x))) - - if min_val is not None and x < min_val: - raise ValueError('`{}`= {}, must be >= {}.'.format(name, x, min_val)) - - if max_val is not None and x > max_val: - raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 574e4db9371b0..a486cd79fb810 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -50,8 +50,9 @@ def test_toy_example_collapse_points(): (because d>=0), with an objective equal to 1 (loss=-1.). """ + random_state = np.random.RandomState(42) input_dim = 5 - two_points = rng.randn(2, input_dim) + two_points = random_state.randn(2, input_dim) X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]]) y = [0, 0, 1] @@ -89,15 +90,19 @@ def test_finite_differences(): approximation. """ # Initialize the transformation `M`, as well as `X` and `y` and `NCA` + random_state = np.random.RandomState(42) X, y = make_classification() - M = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) + M = random_state.randn(random_state.randint(1, X.shape[1] + 1), + X.shape[1]) nca = NeighborhoodComponentsAnalysis() nca.n_iter_ = 0 mask = y[:, np.newaxis] == y[np.newaxis, :] - def fun(M): return nca._loss_grad_lbfgs(M, X, mask)[0] + def fun(M): + return nca._loss_grad_lbfgs(M, X, mask)[0] - def grad(M): return nca._loss_grad_lbfgs(M, X, mask)[1] + def grad(M): + return nca._loss_grad_lbfgs(M, X, mask)[1] # compute relative error rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M)) @@ -109,6 +114,7 @@ def test_params_validation(): X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] NCA = NeighborhoodComponentsAnalysis + random_state = np.random.RandomState(42) # TypeError assert_raises(TypeError, NCA(max_iter='21').fit, X, y) @@ -127,7 +133,7 @@ def test_params_validation(): '`max_iter`= -1, must be >= 1.', NCA(max_iter=-1).fit, X, y) - init = rng.rand(5, 3) + init = random_state.rand(5, 3) assert_raise_message(ValueError, 'The output dimensionality ({}) of the given linear ' 'transformation `init` cannot be greater than its ' @@ -168,10 +174,11 @@ def test_transformation_dimensions(): def test_n_components(): + random_state = np.random.RandomState(42) X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] - init = rng.rand(X.shape[1] - 1, 3) + init = random_state.rand(X.shape[1] - 1, 3) # n_components = X.shape[1] != transformation.shape[0] n_components = X.shape[1] @@ -200,6 +207,7 @@ def test_n_components(): def test_init_transformation(): + random_state = np.random.RandomState(42) X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) # Start learning from scratch @@ -222,12 +230,12 @@ def test_init_transformation(): nca_lda = NeighborhoodComponentsAnalysis(init='lda') nca_lda.fit(X, y) - init = rng.rand(X.shape[1], X.shape[1]) + init = random_state.rand(X.shape[1], X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) nca.fit(X, y) # init.shape[1] must match X.shape[1] - init = rng.rand(X.shape[1], X.shape[1] + 1) + init = random_state.rand(X.shape[1], X.shape[1] + 1) nca = NeighborhoodComponentsAnalysis(init=init) assert_raise_message(ValueError, 'The input dimensionality ({}) of the given ' @@ -237,7 +245,7 @@ def test_init_transformation(): nca.fit, X, y) # init.shape[0] must be <= init.shape[1] - init = rng.rand(X.shape[1] + 1, X.shape[1]) + init = random_state.rand(X.shape[1] + 1, X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) 
assert_raise_message(ValueError, 'The output dimensionality ({}) of the given ' @@ -247,7 +255,7 @@ def test_init_transformation(): nca.fit, X, y) # init.shape[0] must match n_components - init = rng.rand(X.shape[1], X.shape[1]) + init = random_state.rand(X.shape[1], X.shape[1]) n_components = X.shape[1] - 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, @@ -266,15 +274,17 @@ def test_init_transformation(): def test_auto_init(n_samples, n_features, n_classes, n_components): # Test that auto choose the init as expected with every configuration # of order of n_samples, n_features, n_classes and n_components. + random_state = np.random.RandomState(42) nca_base = NeighborhoodComponentsAnalysis(init='auto', n_components=n_components, - max_iter=1, random_state=rng) + max_iter=1, + random_state=random_state) if n_classes >= n_samples: pass # n_classes > n_samples is impossible, and n_classes == n_samples # throws an error from lda but is an absurd case else: - X = rng.randn(n_samples, n_features) + X = random_state.randn(n_samples, n_features) y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples] if n_components > n_features: # this would return a ValueError, which is already tested in @@ -316,20 +326,18 @@ def test_warm_start_effectiveness(): # A 1-iteration second fit on same data should give almost same result # with warm starting, and quite different result without warm starting. - X, y = load_iris(return_X_y=True) - nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, random_state=0) - nca_warm.fit(X, y) + nca_warm.fit(iris_data, iris_target) transformation_warm = nca_warm.components_ nca_warm.max_iter = 1 - nca_warm.fit(X, y) + nca_warm.fit(iris_data, iris_target) transformation_warm_plus_one = nca_warm.components_ nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, random_state=0) - nca_cold.fit(X, y) + nca_cold.fit(iris_data, iris_target) transformation_cold = nca_cold.components_ nca_cold.max_iter = 1 - nca_cold.fit(X, y) + nca_cold.fit(iris_data, iris_target) transformation_cold_plus_one = nca_cold.components_ diff_warm = np.sum(np.abs(transformation_warm_plus_one - @@ -349,12 +357,13 @@ def test_warm_start_effectiveness(): def test_verbose(init_name, capsys): # assert there is proper output when verbose = 1, for every initialization # except auto because auto will call one of the others + random_state = np.random.RandomState(42) X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) regexp_init = r'... done in \ *\d+\.\d{2}s' msgs = {'pca': "Finding principal components" + regexp_init, 'lda': "Finding most discriminative components" + regexp_init} if init_name == 'precomputed': - init = rng.randn(X.shape[1], X.shape[1]) + init = random_state.randn(X.shape[1], X.shape[1]) else: init = init_name nca = NeighborhoodComponentsAnalysis(verbose=1, init=init) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 9810f7f865fc3..7ad913034471d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -936,3 +936,45 @@ def check_non_negative(X, whom): if X_min < 0: raise ValueError("Negative values in data passed to %s" % whom) + + +def check_scalar(x, name, target_type, min_val=None, max_val=None): + """Validate scalar parameters type and value. + + Parameters + ---------- + x : object + The scalar parameter to validate. + + name : str + The name of the parameter to be printed in error messages. 
+ + target_type : type or tuple + Acceptable data types for the parameter. + + min_val : float or int, optional (default=None) + The minimum value value the parameter can take. If None (default) it + is implied that the parameter does not have a lower bound. + + max_val : float or int, optional (default=None) + The maximum valid value the parameter can take. If None (default) it + is implied that the parameter does not have an upper bound. + + Raises + ------- + TypeError + If the parameter's type does not match the desired type. + + ValueError + If the parameter's value violates the given bounds. + """ + + if not isinstance(x, target_type): + raise TypeError('`{}` must be an instance of {}, not {}.' + .format(name, target_type, type(x))) + + if min_val is not None and x < min_val: + raise ValueError('`{}`= {}, must be >= {}.'.format(name, x, min_val)) + + if max_val is not None and x > max_val: + raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) From a65318972f1c5965bc3bc839bce75b60e46c36da Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Feb 2019 11:14:04 +0100 Subject: [PATCH 70/79] Address aurelien's review --- doc/modules/neighbors.rst | 44 +++++++++++-------- examples/neighbors/plot_nca_classification.py | 15 ++++--- examples/neighbors/plot_nca_dim_reduction.py | 2 +- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 8ade8230fbe90..1c208d9d4fa97 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -538,13 +538,16 @@ data visualization and fast classification. .. centered:: |nca_illustration_1| |nca_illustration_2| In the above illustrating figure, we consider some points from a randomly -generated dataset. We focus on the stochastic KNN classification of point no. 3, -the thickness of a bond representing a softmax distance hence the weight of the -neighbor vote in the classification. In the original space, sample 3 has many -stochastic neighbors from various classes, so the right class is not very -likely. However, in the embedding space learned by NCA, the only non-negligible -stochastic neighbors are from the same class as sample 3, guaranteeing that the -latter will be well classified. +generated dataset. We focus on the stochastic KNN classification of point no. +3. The thickness of a link between sample 3 and another point is proportional +to their distance, and can be seen as the relative weight (or probability) that +a stochastic nearest neighbor prediction rule would assign to this point. In +the original space, sample 3 has many stochastic neighbors from various +classes, so the right class is not very likely. However, in the embedding space +learned by NCA, the only stochastic neighbors with non-negligible weight are +from the same class as sample 3, guaranteeing that the latter will be well +classified. See the :ref:`mathematical formulation` +for more details. Classification @@ -595,7 +598,7 @@ classes: The plot shows decision boundaries for Nearest Neighbor Classification and Neighborhood Components Analysis classification on the iris dataset, when -training and scoring on only two features, for visualisation purpose. +training and scoring on only two features, for visualisation purposes. .. _nca_dim_reduction: @@ -637,12 +640,15 @@ each method. Each data sample belongs to one of 10 classes. * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_dim_reduction.py` * :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` +.. 
_nca_mathematical_formulation: + Mathematical formulation ------------------------ The goal of NCA is to learn an optimal linear transformation matrix of size -``(n_components, n_features)``, which maximises in average the probability -:math:`p_i` of sample :math:`i` being correctly classified, i.e.: +``(n_components, n_features)``, which maximises the sum over all samples +:math:`i` of the probability :math:`p_i` that :math:`i` is correctly classified, +i.e.: .. math:: @@ -682,10 +688,10 @@ where :math:`M = L^T L` is a symmetric positive semi-definite matrix of size Implementation -------------- -This implementation follows what is explained in the original paper. For the -optimisation method, it currently uses scipy's l-bfgs-b with a full gradient -computation at each iteration, to avoid to tune the learning rate and provide -stable learning. +This implementation follows what is explained in the original paper [1]_. For +the optimisation method, it currently uses scipy's l-bfgs-b with a full +gradient computation at each iteration, to avoid tuning the learning rate and +provide stable learning. See the examples below and the doc string of :meth:`NeighborhoodComponentsAnalysis.fit` for further information. @@ -711,10 +717,10 @@ added space complexity in the operation. .. topic:: References: - * | `"Neighbourhood Components Analysis". Advances in Neural Information" - `_, - | J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov, Advances in - | Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. +.. [1] `"Neighbourhood Components Analysis". Advances in Neural Information" + `_, + J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov, Advances in + Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. - * `Wikipedia entry on Neighborhood Components Analysis +.. [2] `Wikipedia entry on Neighborhood Components Analysis `_ diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py index 94b81416d802d..9ba8cd76b24ad 100644 --- a/examples/neighbors/plot_nca_classification.py +++ b/examples/neighbors/plot_nca_classification.py @@ -1,15 +1,16 @@ """ -============================================================================ -Comparing Nearest Neighbors and Neighborhood Components Analysis -============================================================================ +============================================================================= +Comparing Nearest Neighbors with and without Neighborhood Components Analysis +============================================================================= An example comparing nearest neighbors classification with and without Neighborhood Components Analysis. -It will plot the decision boundaries for each class determined by a simple -Nearest Neighbors classifier against the decision boundaries determined by a -Neighborhood Components Analysis classifier. The latter aims to find a distance -metric that maximizes the nearest neighbor classification accuracy on a given +It will plot the class decision boundaries given by a Nearest Neighbors +classifier when using the Euclidean distance on the original features, versus +using the Euclidean distance after the transformation learned by Neighborhood +Components Analysis. The latter aims to find a linear transformation that +maximises the (stochastic) nearest neighbor classification accuracy on the training set. 
""" diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index 1ee0031fdeb1c..127783e00c4cf 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -24,7 +24,7 @@ Like LDA, it is a supervised method. One can see that NCA enforces a clustering of the data that is visually -meaningful even after the large dimensionality reduction. +meaningful despite the large reduction in dimension. """ # License: BSD 3 clause From be9b1e1724977655d1bc73eb007b7164fc820d70 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Feb 2019 11:16:47 +0100 Subject: [PATCH 71/79] Simplify test about auto init even more --- sklearn/neighbors/tests/test_nca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index a486cd79fb810..19bd286c0dee8 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -267,10 +267,10 @@ def test_init_transformation(): nca.fit, X, y) -@pytest.mark.parametrize('n_samples', [17, 19, 23, 29]) -@pytest.mark.parametrize('n_features', [17, 19, 23, 29]) -@pytest.mark.parametrize('n_classes', [17, 19, 23]) -@pytest.mark.parametrize('n_components', [17, 19, 23, 29]) +@pytest.mark.parametrize('n_samples', [3, 5, 7, 11]) +@pytest.mark.parametrize('n_features', [3, 5, 7, 11]) +@pytest.mark.parametrize('n_classes', [5, 7, 11]) +@pytest.mark.parametrize('n_components', [3, 5, 7, 11]) def test_auto_init(n_samples, n_features, n_classes, n_components): # Test that auto choose the init as expected with every configuration # of order of n_samples, n_features, n_classes and n_components. From 2b1c8f21647abd2299a47af236b5a03f854ea572 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Feb 2019 12:28:53 +0100 Subject: [PATCH 72/79] Fix doc and replace embedding by projection for consistency --- doc/modules/decomposition.rst | 2 +- doc/modules/neighbors.rst | 22 ++++++++++---------- examples/manifold/plot_lle_digits.py | 6 +++--- examples/neighbors/plot_nca_dim_reduction.py | 2 +- sklearn/neighbors/nca.py | 20 ++++++++++-------- sklearn/neighbors/tests/test_nca.py | 22 +++++++++++--------- 6 files changed, 39 insertions(+), 35 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 2e318fce505a4..5bfa96bbb759c 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -955,5 +955,5 @@ when data can be fetched sequentially. M. Hoffman, D. Blei, C. Wang, J. Paisley, 2013 -See also :ref:`_nca_dim_reduction` for dimensionality reduction with +See also :ref:`nca_dim_reduction` for dimensionality reduction with Neighborhood Components Analysis. diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 1c208d9d4fa97..2d9b606767bbc 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -522,10 +522,10 @@ Neighborhood Components Analysis Neighborhood Components Analysis (NCA, :class:`NeighborhoodComponentsAnalysis`) is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard Euclidean distance. -The algorithm directly maximizes a stochastic variant of the -leave-one-out k-nearest neighbors (KNN) score on the training set. It can also -learn a low-dimensional linear embedding of data that can be used for -data visualization and fast classification. 
+The algorithm directly maximizes a stochastic variant of the leave-one-out +k-nearest neighbors (KNN) score on the training set. It can also learn a +low-dimensional linear projection of data that can be used for data +visualization and fast classification. .. |nca_illustration_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_illustration_001.png :target: ../auto_examples/neighbors/plot_nca_illustration.html @@ -543,7 +543,7 @@ generated dataset. We focus on the stochastic KNN classification of point no. to their distance, and can be seen as the relative weight (or probability) that a stochastic nearest neighbor prediction rule would assign to this point. In the original space, sample 3 has many stochastic neighbors from various -classes, so the right class is not very likely. However, in the embedding space +classes, so the right class is not very likely. However, in the projected space learned by NCA, the only stochastic neighbors with non-negligible weight are from the same class as sample 3, guaranteeing that the latter will be well classified. See the :ref:`mathematical formulation` @@ -567,7 +567,7 @@ irregular decision boundaries. To use this model for classification, one needs to combine a :class:`NeighborhoodComponentsAnalysis` instance that learns the optimal transformation with a :class:`KNeighborsClassifier` instance that performs the -classification in the embedding space. Here is an example using the two +classification in the projected space. Here is an example using the two classes: >>> from sklearn.neighbors import (NeighborhoodComponentsAnalysis, @@ -616,8 +616,8 @@ Neighborhood Component Analysis (:class:`NeighborhoodComponentsAnalysis`) on the Digits dataset, a dataset with size :math:`n_{samples} = 1797` and :math:`n_{features} = 64`. The data set is split into a training and a test set of equal size, then standardized. For evaluation the 3-nearest neighbor -classification accuracy is computed on the 2-dimensional embedding found by -each method. Each data sample belongs to one of 10 classes. +classification accuracy is computed on the 2-dimensional projected points found +by each method. Each data sample belongs to one of 10 classes. .. |nca_dim_reduction_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nca_dim_reduction_001.png :target: ../auto_examples/neighbors/plot_nca_dim_reduction.html @@ -703,9 +703,9 @@ Training ^^^^^^^^ NCA stores a matrix of pairwise distances, taking ``n_samples ** 2`` memory. Time complexity depends on the number of iterations done by the optimisation - algorithm. However, one can set the maximum number of iterations with the - argument ``max_iter``. For each iteration, time complexity is - ``O(n_components x n_samples x min(n_samples, n_features)``. +algorithm. However, one can set the maximum number of iterations with the +argument ``max_iter``. For each iteration, time complexity is +``O(n_components x n_samples x min(n_samples, n_features)``. 
Transform diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py index de893bff4207e..4a3002a05d0dd 100644 --- a/examples/manifold/plot_lle_digits.py +++ b/examples/manifold/plot_lle_digits.py @@ -124,7 +124,7 @@ def plot_embedding(X, title=None): # ---------------------------------------------------------------------- # Isomap projection of the digits dataset -print("Computing Isomap embedding") +print("Computing Isomap projection") t0 = time() X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X) print("Done.") @@ -233,8 +233,8 @@ def plot_embedding(X, title=None): (time() - t0)) # ---------------------------------------------------------------------- -# NCA embedding of the digits dataset -print("Computing NCA embedding") +# NCA projection of the digits dataset +print("Computing NCA projection") nca = neighbors.NeighborhoodComponentsAnalysis(n_components=2, random_state=0) t0 = time() X_nca = nca.fit_transform(X, y) diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index 127783e00c4cf..ea06b2768e851 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -93,7 +93,7 @@ # Embed the data set in 2 dimensions using the fitted model X_embedded = model.transform(X) - # Plot the embedding and show the evaluation score + # Plot the projected points and show the evaluation score plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1') plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, n_neighbors, diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 62bea4ab70839..caf71134b9bb9 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -38,7 +38,7 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): Parameters ---------- n_components : int, optional (default=None) - Preferred dimensionality of the embedding. + Preferred dimensionality of the projected space. If None it will be set to ``n_features``. init : string or numpy array, optional (default='auto') @@ -296,15 +296,16 @@ def _validate_params(self, X, y): check_classification_targets(y) y = LabelEncoder().fit_transform(y) - # Check the preferred embedding dimensionality + # Check the preferred dimensionality of the projected space if self.n_components is not None: check_scalar(self.n_components, 'n_components', integer_types, 1) if self.n_components > X.shape[1]: - raise ValueError('The preferred embedding dimensionality ' - '`n_components` ({}) cannot be greater ' - 'than the given data dimensionality ({})!' + raise ValueError('The preferred dimensionality of the ' + 'projected space `n_components` ({}) cannot ' + 'be greater than the given data ' + 'dimensionality ({})!' .format(self.n_components, X.shape[1])) # If warm_start is enabled, check that the inputs are consistent @@ -350,10 +351,11 @@ def _validate_params(self, X, y): if self.n_components is not None: # Assert that self.n_components = init.shape[0] if self.n_components != init.shape[0]: - raise ValueError('The preferred embedding dimensionality ' - '`n_components` ({}) does not match ' - 'the output dimensionality of the given ' - 'linear transformation `init` ({})!' + raise ValueError('The preferred dimensionality of the ' + 'projected space `n_components` ({}) does' + ' not match the output dimensionality of ' + 'the given linear transformation ' + '`init` ({})!' 
.format(self.n_components, init.shape[0])) elif init in ['auto', 'pca', 'lda', 'identity', 'random']: diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 19bd286c0dee8..9920190f764d7 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -143,9 +143,10 @@ def test_params_validation(): n_components = 10 assert_raise_message(ValueError, - 'The preferred embedding dimensionality ' - '`n_components` ({}) cannot be greater ' - 'than the given data dimensionality ({})!' + 'The preferred dimensionality of the ' + 'projected space `n_components` ({}) cannot ' + 'be greater than the given data ' + 'dimensionality ({})!' .format(n_components, X.shape[1]), NCA(n_components=n_components).fit, X, y) @@ -184,8 +185,8 @@ def test_n_components(): n_components = X.shape[1] nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, - 'The preferred embedding dimensionality ' - '`n_components` ({}) does not match ' + 'The preferred dimensionality of the ' + 'projected space `n_components` ({}) does not match ' 'the output dimensionality of the given ' 'linear transformation `init` ({})!' .format(n_components, init.shape[0]), @@ -195,9 +196,10 @@ def test_n_components(): n_components = X.shape[1] + 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, - 'The preferred embedding dimensionality ' - '`n_components` ({}) cannot be greater ' - 'than the given data dimensionality ({})!' + 'The preferred dimensionality of the ' + 'projected space `n_components` ({}) cannot ' + 'be greater than the given data ' + 'dimensionality ({})!' .format(n_components, X.shape[1]), nca.fit, X, y) @@ -259,8 +261,8 @@ def test_init_transformation(): n_components = X.shape[1] - 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, - 'The preferred embedding dimensionality ' - '`n_components` ({}) does not match ' + 'The preferred dimensionality of the ' + 'projected space `n_components` ({}) does not match ' 'the output dimensionality of the given ' 'linear transformation `init` ({})!' .format(n_components, init.shape[0]), From af14e5dfa0daa790ca3eb09b0e03ea0a76520718 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Feb 2019 17:01:41 +0100 Subject: [PATCH 73/79] Address Gael's review --- doc/modules/neighbors.rst | 12 ++++----- sklearn/utils/tests/test_validation.py | 35 ++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 2d9b606767bbc..1ce461f59ecc1 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -717,10 +717,10 @@ added space complexity in the operation. .. topic:: References: -.. [1] `"Neighbourhood Components Analysis". Advances in Neural Information" - `_, - J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov, Advances in - Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. + .. [1] `"Neighbourhood Components Analysis". Advances in Neural Information" + `_, + J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov, Advances in + Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. -.. [2] `Wikipedia entry on Neighborhood Components Analysis - `_ + .. 
[2] `Wikipedia entry on Neighborhood Components Analysis + `_ diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index e9d766ed44094..ca5ced164114a 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -39,8 +39,8 @@ assert_all_finite, check_memory, check_non_negative, - _num_samples -) + _num_samples, + check_scalar) import sklearn from sklearn.exceptions import NotFittedError @@ -797,3 +797,34 @@ def __len__(self): X = TestNonNumericShape() assert _num_samples(X) == len(X) + + +@pytest.mark.parametrize('x, target_type, min_val, max_val', + [(3, int, 2, 5), + (2.5, float, 2, 5)]) +def test_check_scalar_valid(x, target_type, min_val, max_val): + """Test that check_scalar returns no error/warning if valid inputs are + provided""" + with pytest.warns(None) as record: + check_scalar(x, "test_name", target_type, min_val, max_val) + assert len(record) == 0 + + +@pytest.mark.parametrize('x, target_name, target_type, min_val, max_val, ' + 'err_msg', + [(1, "test_name1", float, 2, 4, + TypeError("`test_name1` must be an instance of " + ", not .")), + (1, "test_name2", int, 2, 4, + ValueError('`test_name2`= 1, must be >= 2.')), + (5, "test_name3", int, 2, 4, + ValueError('`test_name3`= 5, must be <= 4.'))]) +def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, + err_msg): + """Test that check_scalar returns the right error if a wrong input type is + given""" + with pytest.raises(Exception) as raised_error: + check_scalar(x, target_name, target_type=target_type, + min_val=min_val, max_val=max_val) + assert str(raised_error.value) == str(err_msg) + assert type(raised_error.value) == type(err_msg) From 3a78d1a8c557b53b00a3c745b91f0bc3b349bba3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Feb 2019 14:14:54 +0100 Subject: [PATCH 74/79] few nitpicks and make some links in the doc work --- doc/modules/neighbors.rst | 6 +++--- sklearn/neighbors/nca.py | 14 +++++++------- sklearn/utils/tests/test_validation.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 1ce461f59ecc1..c325f86129dfa 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -546,7 +546,7 @@ the original space, sample 3 has many stochastic neighbors from various classes, so the right class is not very likely. However, in the projected space learned by NCA, the only stochastic neighbors with non-negligible weight are from the same class as sample 3, guaranteeing that the latter will be well -classified. See the :ref:`mathematical formulation` +classified. See the :ref:`mathematical formulation ` for more details. @@ -647,8 +647,8 @@ Mathematical formulation The goal of NCA is to learn an optimal linear transformation matrix of size ``(n_components, n_features)``, which maximises the sum over all samples -:math:`i` of the probability :math:`p_i` that math:`i` is correctly classified, -i.e.: +:math:`i` of the probability :math:`p_i` that :math:`i` is correctly +classified, i.e.: .. math:: diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index caf71134b9bb9..adde1d683d6dc 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -33,7 +33,7 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): to improve the classification accuracy of a stochastic nearest neighbors rule in the transformed space. - Read more in the :ref:`User Guide `. 
+ Read more in the :ref:`User Guide `. Parameters ---------- @@ -58,14 +58,14 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): 'pca' ``n_components`` principal components of the inputs passed to :meth:`fit` will be used to initialize the transformation. - (See :class:`PCA`) + (See `decomposition.PCA`) 'lda' ``min(n_components, n_classes)`` most discriminative components of the inputs passed to :meth:`fit` will be used to initialize the transformation. (If ``n_components > n_classes``, the rest of the components will be zero.) (See - :class:`LinearDiscriminantAnalysis`) + `discriminant_analysis.LinearDiscriminantAnalysis`) 'identity' If ``n_components`` is strictly smaller than the @@ -74,7 +74,7 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): 'random' The initial transformation will be a random array of shape - (n_components, n_features). Each value is sampled from the + `(n_components, n_features)`. Each value is sampled from the standard normal distribution. numpy array @@ -102,9 +102,9 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): verbose : int, optional (default=0) If 0, no progress messages will be printed. If 1, progress messages will be printed to stdout. - If > 1, progress messages will be printed and the ``iprint`` - parameter of :meth:`_minimize_lbfgsb` of `scipy.optimize` will be set - to ``verbose - 2``. + If > 1, progress messages will be printed and the ``disp`` + parameter of :func:`scipy.optimize.minimize` will be set to + ``verbose - 2``. random_state : int or numpy.RandomState or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ca5ced164114a..e2bc9dd8a58b2 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -821,7 +821,7 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): ValueError('`test_name3`= 5, must be <= 4.'))]) def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, err_msg): - """Test that check_scalar returns the right error if a wrong input type is + """Test that check_scalar returns the right error if a wrong input is given""" with pytest.raises(Exception) as raised_error: check_scalar(x, target_name, target_type=target_type, From 58d169c20793645b757f897795cfeb49a436e557 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Feb 2019 15:53:03 +0100 Subject: [PATCH 75/79] Address alex's review --- doc/modules/neighbors.rst | 6 +-- doc/modules/neural_networks_supervised.rst | 2 +- doc/modules/sgd.rst | 2 +- sklearn/neighbors/nca.py | 18 +++---- sklearn/neighbors/tests/test_nca.py | 58 +++++++++++----------- sklearn/utils/validation.py | 2 +- 6 files changed, 45 insertions(+), 43 deletions(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index c325f86129dfa..094eec438d357 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -689,11 +689,11 @@ Implementation -------------- This implementation follows what is explained in the original paper [1]_. For -the optimisation method, it currently uses scipy's l-bfgs-b with a full +the optimisation method, it currently uses scipy's L-BFGS-B with a full gradient computation at each iteration, to avoid to tune the learning rate and provide stable learning. 
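
Concretely, the "full gradient computation at each iteration" means the objective handed to the optimiser returns the loss value together with its analytic gradient, and L-BFGS-B picks the step size internally, which is why no learning rate needs tuning. The snippet below is a toy illustration of that calling convention only; it uses a stand-in quadratic, not the NCA objective::

    import numpy as np
    from scipy.optimize import minimize

    def loss_grad(w):
        # Stand-in for the objective: return the value and its analytic
        # gradient together, the contract implied by jac=True below.
        return 0.5 * np.dot(w, w), w

    res = minimize(loss_grad, x0=np.ones(4), method='L-BFGS-B', jac=True,
                   options={'maxiter': 50, 'disp': True})
    print(res.x)
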
-See the examples below and the doc string of +See the examples below and the docstring of :meth:`NeighborhoodComponentsAnalysis.fit` for further information. Complexity @@ -705,7 +705,7 @@ NCA stores a matrix of pairwise distances, taking ``n_samples ** 2`` memory. Time complexity depends on the number of iterations done by the optimisation algorithm. However, one can set the maximum number of iterations with the argument ``max_iter``. For each iteration, time complexity is -``O(n_components x n_samples x min(n_samples, n_features)``. +``O(n_components x n_samples x min(n_samples, n_features))``. Transform diff --git a/doc/modules/neural_networks_supervised.rst b/doc/modules/neural_networks_supervised.rst index d3e3ac5710cb1..793de7f8212d1 100644 --- a/doc/modules/neural_networks_supervised.rst +++ b/doc/modules/neural_networks_supervised.rst @@ -152,7 +152,7 @@ indices where the value is `1` represents the assigned classes of that sample:: >>> clf.predict([[0., 0.]]) array([[0, 1]]) -See the examples below and the doc string of +See the examples below and the docstring of :meth:`MLPClassifier.fit` for further information. .. topic:: Examples: diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 08e864a71b76e..b28c6918cd0f6 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -154,7 +154,7 @@ one-vs-all classification. :class:`SGDClassifier` supports both weighted classes and weighted instances via the fit parameters ``class_weight`` and ``sample_weight``. See -the examples below and the doc string of :meth:`SGDClassifier.fit` for +the examples below and the docstring of :meth:`SGDClassifier.fit` for further information. .. topic:: Examples: diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index adde1d683d6dc..a85d0a28603da 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -48,12 +48,11 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): 'auto' Depending on ``n_components``, the most reasonable initialization - will be chosen among the following ones. First, we try to use - 'lda', as it uses labels information: if ``n_components <= - n_classes``, ``init='lda'``. If we can't, we then try 'pca', as it - projects data in meaningful directions (those of higher variance): - if ``n_components < min(n_features, n_samples)``, ``init = 'pca'``. - Otherwise, we just use 'identity'. + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. 'pca' ``n_components`` principal components of the inputs passed @@ -95,9 +94,10 @@ class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin): callback : callable, optional (default=None) If not None, this function is called after every iteration of the - optimizer, taking as arguments the current solution (transformation) - and the number of iterations. This might be useful in case one wants - to examine or store the transformation found after each iteration. + optimizer, taking as arguments the current solution (flattened + transformation matrix) and the number of iterations. This might be + useful in case one wants to examine or store the transformation + found after each iteration. verbose : int, optional (default=0) If 0, no progress messages will be printed. 
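
The ``callback`` and ``verbose`` parameters documented in this hunk are easiest to grasp from a tiny usage sketch. The example is illustrative only and assumes the estimator as introduced by this patch; the callback receives the flattened transformation and the iteration number, exactly as the tests further below assert::

    from sklearn.datasets import load_iris
    from sklearn.neighbors import NeighborhoodComponentsAnalysis

    X, y = load_iris(return_X_y=True)

    def log_progress(transformation, n_iter):
        # The optimizer passes the flattened matrix; reshape it back to
        # (n_components, n_features) before inspecting it.
        L = transformation.reshape(-1, X.shape[1])
        print('iteration {:>2}: transformation shape {}'.format(n_iter,
                                                                L.shape))

    nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=log_progress,
                                         verbose=1, random_state=42)
    nca.fit(X, y)
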
diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 9920190f764d7..fed63790461bc 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -26,8 +26,8 @@ def test_simple_example(): """Test on a simple example. Puts four points in the input space where the opposite labels points are - next to each other. After transform the same labels points should be next - to each other. + next to each other. After transform the samples from the same class + should be next to each other. """ X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) @@ -36,23 +36,24 @@ def test_simple_example(): random_state=42) nca.fit(X, y) X_t = nca.transform(X) - np.testing.assert_equal(pairwise_distances(X_t).argsort()[:, 1], - np.array([2, 3, 0, 1])) + assert_array_equal(pairwise_distances(X_t).argsort()[:, 1], + np.array([2, 3, 0, 1])) def test_toy_example_collapse_points(): """Test on a toy example of three points that should collapse - Test that on this simple example, the new points are collapsed: - Two same label points with a different label point in the middle. - The objective is 2/(1 + exp(d/2)), with d the euclidean distance - between the two same labels points. This is maximized for d=0 - (because d>=0), with an objective equal to 1 (loss=-1.). + We build a simple example: two points from the same class and a point from + a different class in the middle of them. On this simple example, the new + (transformed) points should all collapse into one single point. Indeed, the + objective is 2/(1 + exp(d/2)), with d the euclidean distance between the + two samples from the same class. This is maximized for d=0 (because d>=0), + with an objective equal to 1 (loss=-1.). """ - random_state = np.random.RandomState(42) + rng = np.random.RandomState(42) input_dim = 5 - two_points = random_state.randn(2, input_dim) + two_points = rng.randn(2, input_dim) X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]]) y = [0, 0, 1] @@ -90,10 +91,10 @@ def test_finite_differences(): approximation. 
""" # Initialize the transformation `M`, as well as `X` and `y` and `NCA` - random_state = np.random.RandomState(42) + rng = np.random.RandomState(42) X, y = make_classification() - M = random_state.randn(random_state.randint(1, X.shape[1] + 1), - X.shape[1]) + M = rng.randn(rng.randint(1, X.shape[1] + 1), + X.shape[1]) nca = NeighborhoodComponentsAnalysis() nca.n_iter_ = 0 mask = y[:, np.newaxis] == y[np.newaxis, :] @@ -114,7 +115,7 @@ def test_params_validation(): X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] NCA = NeighborhoodComponentsAnalysis - random_state = np.random.RandomState(42) + rng = np.random.RandomState(42) # TypeError assert_raises(TypeError, NCA(max_iter='21').fit, X, y) @@ -133,7 +134,7 @@ def test_params_validation(): '`max_iter`= -1, must be >= 1.', NCA(max_iter=-1).fit, X, y) - init = random_state.rand(5, 3) + init = rng.rand(5, 3) assert_raise_message(ValueError, 'The output dimensionality ({}) of the given linear ' 'transformation `init` cannot be greater than its ' @@ -175,11 +176,11 @@ def test_transformation_dimensions(): def test_n_components(): - random_state = np.random.RandomState(42) + rng = np.random.RandomState(42) X = np.arange(12).reshape(4, 3) y = [1, 1, 2, 2] - init = random_state.rand(X.shape[1] - 1, 3) + init = rng.rand(X.shape[1] - 1, 3) # n_components = X.shape[1] != transformation.shape[0] n_components = X.shape[1] @@ -209,7 +210,7 @@ def test_n_components(): def test_init_transformation(): - random_state = np.random.RandomState(42) + rng = np.random.RandomState(42) X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) # Start learning from scratch @@ -232,12 +233,12 @@ def test_init_transformation(): nca_lda = NeighborhoodComponentsAnalysis(init='lda') nca_lda.fit(X, y) - init = random_state.rand(X.shape[1], X.shape[1]) + init = rng.rand(X.shape[1], X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) nca.fit(X, y) # init.shape[1] must match X.shape[1] - init = random_state.rand(X.shape[1], X.shape[1] + 1) + init = rng.rand(X.shape[1], X.shape[1] + 1) nca = NeighborhoodComponentsAnalysis(init=init) assert_raise_message(ValueError, 'The input dimensionality ({}) of the given ' @@ -247,7 +248,7 @@ def test_init_transformation(): nca.fit, X, y) # init.shape[0] must be <= init.shape[1] - init = random_state.rand(X.shape[1] + 1, X.shape[1]) + init = rng.rand(X.shape[1] + 1, X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) assert_raise_message(ValueError, 'The output dimensionality ({}) of the given ' @@ -257,7 +258,7 @@ def test_init_transformation(): nca.fit, X, y) # init.shape[0] must match n_components - init = random_state.rand(X.shape[1], X.shape[1]) + init = rng.rand(X.shape[1], X.shape[1]) n_components = X.shape[1] - 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) assert_raise_message(ValueError, @@ -276,17 +277,17 @@ def test_init_transformation(): def test_auto_init(n_samples, n_features, n_classes, n_components): # Test that auto choose the init as expected with every configuration # of order of n_samples, n_features, n_classes and n_components. 
- random_state = np.random.RandomState(42) + rng = np.random.RandomState(42) nca_base = NeighborhoodComponentsAnalysis(init='auto', n_components=n_components, max_iter=1, - random_state=random_state) + random_state=rng) if n_classes >= n_samples: pass # n_classes > n_samples is impossible, and n_classes == n_samples # throws an error from lda but is an absurd case else: - X = random_state.randn(n_samples, n_features) + X = rng.randn(n_samples, n_features) y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples] if n_components > n_features: # this would return a ValueError, which is already tested in @@ -359,13 +360,13 @@ def test_warm_start_effectiveness(): def test_verbose(init_name, capsys): # assert there is proper output when verbose = 1, for every initialization # except auto because auto will call one of the others - random_state = np.random.RandomState(42) + rng = np.random.RandomState(42) X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) regexp_init = r'... done in \ *\d+\.\d{2}s' msgs = {'pca': "Finding principal components" + regexp_init, 'lda': "Finding most discriminative components" + regexp_init} if init_name == 'precomputed': - init = random_state.randn(X.shape[1], X.shape[1]) + init = rng.randn(X.shape[1], X.shape[1]) else: init = init_name nca = NeighborhoodComponentsAnalysis(verbose=1, init=init) @@ -461,6 +462,7 @@ def test_callback(capsys): max_iter = 10 def my_cb(transformation, n_iter): + assert transformation.shape == (iris_data.shape[1]**2,) rem_iter = max_iter - n_iter print('{} iterations remaining...'.format(rem_iter)) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 7ad913034471d..96922a8e4af28 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -953,7 +953,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): Acceptable data types for the parameter. min_val : float or int, optional (default=None) - The minimum value value the parameter can take. If None (default) it + The minimum valid value the parameter can take. If None (default) it is implied that the parameter does not have a lower bound. max_val : float or int, optional (default=None) From fbd28e18b9073e85cbd4ad35c99a5a2eec3b5dd1 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 28 Feb 2019 11:04:46 +0100 Subject: [PATCH 76/79] Adress Alex's review --- examples/neighbors/plot_nca_classification.py | 2 +- sklearn/neighbors/nca.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py index 9ba8cd76b24ad..5536e8eb69e89 100644 --- a/examples/neighbors/plot_nca_classification.py +++ b/examples/neighbors/plot_nca_classification.py @@ -35,7 +35,7 @@ X, y = dataset.data, dataset.target # we only take two features. 
We could avoid this ugly -# slicing by using a two-dim datasets +# slicing by using a two-dim dataset X = X[:, [0, 2]] X_train, X_test, y_train, y_test = \ diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index a85d0a28603da..38f62886807f2 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -3,7 +3,9 @@ Neighborhood Component Analysis """ -# License: BSD 3 Clause +# Authors: William de Vazelhes +# John Chiotellis +# License: BSD 3 clause from __future__ import print_function @@ -438,7 +440,7 @@ def _callback(self, transformation): Parameters ---------- - transformation : array, shape(n_components, n_features) + transformation : array, shape=(n_components * n_features,) The solution computed by the optimizer in this iteration. """ if self.callback is not None: From 8d65ebc67e6cd3e892593102079be26a23b855fb Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 28 Feb 2019 12:07:16 +0100 Subject: [PATCH 77/79] Add authors in test too --- sklearn/neighbors/tests/test_nca.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index fed63790461bc..2397af5bc0ed1 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -1,3 +1,12 @@ +# coding: utf-8 +""" +Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca) +""" + +# Authors: William de Vazelhes +# John Chiotellis +# License: BSD 3 clause + import pytest import re import numpy as np From ed0d23ad9efdc799a6a43e4fde4929251998b2e5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 28 Feb 2019 12:17:21 +0100 Subject: [PATCH 78/79] add check_scalar to utils --- doc/modules/classes.rst | 1 + sklearn/utils/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 51628b1bc0081..53723c3f6ea86 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1433,6 +1433,7 @@ Low-level methods utils.assert_all_finite utils.check_X_y utils.check_array + utils.check_scalar utils.check_consistent_length utils.check_random_state utils.class_weight.compute_class_weight diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ebbdbcaa2b702..6150e017e3e28 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -19,7 +19,7 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric) + check_symmetric, check_scalar) from .. 
import get_config @@ -60,7 +60,7 @@ class Parallel(_joblib.Parallel): "check_random_state", "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", - "check_consistent_length", "check_X_y", 'indexable', + "check_consistent_length", "check_X_y", "check_scalar", 'indexable', "check_symmetric", "indices_to_mask", "deprecated", "cpu_count", "Parallel", "Memory", "delayed", "parallel_backend", "register_parallel_backend", "hash", "effective_n_jobs", From 6dbef86d9f83dd823423fc3a9cf932ec3a377a0a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 28 Feb 2019 15:56:20 +0100 Subject: [PATCH 79/79] MajorFeature > API --- doc/whats_new/v0.21.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index d9bb66bdb5a5c..f0df026b2f01e 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -235,6 +235,12 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.neighbors` ........................ +- |MajorFeature| A metric learning algorithm: + :class:`neighbors.NeighborhoodComponentsAnalysis`, which implements the + Neighborhood Components Analysis algorithm described in Goldberger et al. + (2005). :issue:`10058` by :user:`William de Vazelhes + ` and :user:`John Chiotellis `. + - |API| Methods in :class:`neighbors.NearestNeighbors` : :func:`~neighbors.NearestNeighbors.kneighbors`, :func:`~neighbors.NearestNeighbors.radius_neighbors`, @@ -244,12 +250,6 @@ Support for Python 3.4 and below has been officially dropped. when called before ``fit`` :issue:`12279` by :user:`Krishna Sangeeth `. -- |MajorFeature| A metric learning algorithm: - :class:`neighbors.NeighborhoodComponentsAnalysis`, which implements the - Neighborhood Components Analysis algorithm described in Goldberger et al. - (2005). :issue:`10058` by :user:`William de Vazelhes - ` and :user:`John Chiotellis `. - :mod:`sklearn.neural_network` .............................
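
A closing note on the helper that the last commits promote to the public API: ``sklearn.utils.check_scalar`` validates a single hyper-parameter's type and range. A short, illustrative sketch of the intended use, consistent with the signature and the error messages exercised in the tests above (the parameter name ``max_iter`` mirrors the NCA validation tests)::

    from sklearn.utils import check_scalar

    # Passes silently: an int within the allowed range.
    check_scalar(50, 'max_iter', int, min_val=1)

    # Out-of-range values raise ValueError with messages of the form
    # "`max_iter`= -1, must be >= 1." as asserted in the NCA tests.
    try:
        check_scalar(-1, 'max_iter', int, min_val=1)
    except ValueError as exc:
        print(exc)
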