# Source code for sklearn.manifold.locally_linear

"""Locally Linear Embedding"""

# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
#         Jake Vanderplas  -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) INRIA 2011

import numpy as np
from scipy.linalg import eigh, svd, qr, solve
from scipy.sparse import eye, csr_matrix
from scipy.sparse.linalg import eigsh

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_random_state, check_array
from ..utils.extmath import stable_cumsum
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..neighbors import NearestNeighbors

def barycenter_weights(X, Z, reg=1e-3):
"""Compute barycenter weights of X from Y along the first axis

We estimate the weights to assign to each point in Y[i] to recover
the point X[i]. The barycenter weights sum to 1.

Parameters
----------
X : array-like, shape (n_samples, n_dim)

Z : array-like, shape (n_samples, n_neighbors, n_dim)

reg : float, optional
amount of regularization to add for the problem to be
well-posed in the case of n_neighbors > n_dim

Returns
-------
B : array-like, shape (n_samples, n_neighbors)

Notes
-----
"""
X = check_array(X, dtype=FLOAT_DTYPES)
Z = check_array(Z, dtype=FLOAT_DTYPES, allow_nd=True)

n_samples, n_neighbors = X.shape, Z.shape
B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
v = np.ones(n_neighbors, dtype=X.dtype)

# this might raise a LinalgError if G is singular and has trace
# zero
for i, A in enumerate(Z.transpose(0, 2, 1)):
C = A.T - X[i]  # broadcasting
G = np.dot(C, C.T)
trace = np.trace(G)
if trace > 0:
R = reg * trace
else:
R = reg
G.flat[::Z.shape + 1] += R
w = solve(G, v, sym_pos=True)
B[i, :] = w / np.sum(w)
return B

def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=1):
"""Computes the barycenter weighted graph of k-Neighbors for points in X

Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, sparse array, precomputed tree, or NearestNeighbors
object.

n_neighbors : int
Number of neighbors for each sample.

reg : float, optional
Amount of regularization when solving the least-squares
problem. Only relevant if mode='barycenter'. If None, use the
default.

n_jobs : int, optional (default = 1)
The number of parallel jobs to run for neighbors search.
If -1, then the number of jobs is set to the number of CPU cores.

Returns
-------
A : sparse matrix in CSR format, shape = [n_samples, n_samples]
A[i, j] is assigned the weight of edge that connects i to j.

--------
sklearn.neighbors.kneighbors_graph
"""
knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit(X)
X = knn._fit_X
n_samples = X.shape
ind = knn.kneighbors(X, return_distance=False)[:, 1:]
data = barycenter_weights(X, X[ind], reg=reg)
indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
return csr_matrix((data.ravel(), ind.ravel(), indptr),
shape=(n_samples, n_samples))

def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
random_state=None):
"""
Find the null space of a matrix M.

Parameters
----------
M : {array, matrix, sparse matrix, LinearOperator}
Input covariance matrix: should be symmetric positive semi-definite

k : integer
Number of eigenvalues/vectors to return

k_skip : integer, optional
Number of low eigenvalues to skip.

eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems.  It is
best to try several random seeds in order to check results.
dense  : use standard dense matrix operations for the eigenvalue
decomposition.  For this method, M must be an array
or matrix type.  This method should be avoided for
large problems.

tol : float, optional
Tolerance for 'arpack' method.
Not used if eigen_solver=='dense'.

max_iter : maximum number of iterations for 'arpack' method
not used if eigen_solver=='dense'

random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by np.random. Used when solver == 'arpack'.

"""
if eigen_solver == 'auto':
if M.shape > 200 and k + k_skip < 10:
eigen_solver = 'arpack'
else:
eigen_solver = 'dense'

if eigen_solver == 'arpack':
random_state = check_random_state(random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, M.shape)
try:
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
tol=tol, maxiter=max_iter,
v0=v0)
except RuntimeError as msg:
raise ValueError("Error in determining null-space with ARPACK. "
"Error message: '%s'. "
"Note that method='arpack' can fail when the "
"weight matrix is singular or otherwise "
"ill-behaved.  method='dense' is recommended. "
% msg)

return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
elif eigen_solver == 'dense':
if hasattr(M, 'toarray'):
M = M.toarray()
eigen_values, eigen_vectors = eigh(
M, eigvals=(k_skip, k + k_skip - 1), overwrite_a=True)
index = np.argsort(np.abs(eigen_values))
return eigen_vectors[:, index], np.sum(eigen_values)
else:
raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)

def locally_linear_embedding(
X, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', tol=1e-6,
max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12,
random_state=None, n_jobs=1):
"""Perform a Locally Linear Embedding analysis on the data.

Read more in the :ref:User Guide <locally_linear_embedding>.

Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, sparse array, precomputed tree, or NearestNeighbors
object.

n_neighbors : integer
number of neighbors to consider for each point.

n_components : integer
number of coordinates for the manifold.

reg : float
regularization constant, multiplies the trace of the local covariance
matrix of the distances.

eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data

arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems.  It is
best to try several random seeds in order to check results.

dense  : use standard dense matrix operations for the eigenvalue
decomposition.  For this method, M must be an array
or matrix type.  This method should be avoided for
large problems.

tol : float, optional
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.

max_iter : integer
maximum number of iterations for the arpack solver.

method : {'standard', 'hessian', 'modified', 'ltsa'}
standard : use the standard locally linear embedding algorithm.
see reference _
hessian  : use the Hessian eigenmap method.  This method requires
n_neighbors > n_components * (1 + (n_components + 1) / 2.
see reference _
modified : use the modified locally linear embedding algorithm.
see reference _
ltsa     : use local tangent space alignment algorithm
see reference _

hessian_tol : float, optional
Tolerance for Hessian eigenmapping method.
Only used if method == 'hessian'

modified_tol : float, optional
Tolerance for modified LLE method.
Only used if method == 'modified'

random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by np.random. Used when solver == 'arpack'.

n_jobs : int, optional (default = 1)
The number of parallel jobs to run for neighbors search.
If -1, then the number of jobs is set to the number of CPU cores.

Returns
-------
Y : array-like, shape [n_samples, n_components]
Embedding vectors.

squared_error : float
Reconstruction error for the embedding vectors. Equivalent to
norm(Y - W Y, 'fro')**2, where W are the reconstruction weights.

References
----------

..  Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding.  Science 290:2323 (2000).
..  Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A.  100:5591 (2003).
..  Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
..  Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ.  8:406 (2004)
"""
if eigen_solver not in ('auto', 'arpack', 'dense'):
raise ValueError("unrecognized eigen_solver '%s'" % eigen_solver)

if method not in ('standard', 'hessian', 'modified', 'ltsa'):
raise ValueError("unrecognized method '%s'" % method)

nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
nbrs.fit(X)
X = nbrs._fit_X

N, d_in = X.shape

if n_components > d_in:
raise ValueError("output dimension must be less than or equal "
"to input dimension")
if n_neighbors >= N:
raise ValueError("n_neighbors must be less than number of points")

if n_neighbors <= 0:
raise ValueError("n_neighbors must be positive")

M_sparse = (eigen_solver != 'dense')

if method == 'standard':
W = barycenter_kneighbors_graph(
nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs)

# we'll compute M = (I-W)'(I-W)
# depending on the solver, we'll do this differently
if M_sparse:
M = eye(*W.shape, format=W.format) - W
M = (M.T * M).tocsr()
else:
M = (W.T * W - W.T - W).toarray()
M.flat[::M.shape + 1] += 1  # W = W - I = W - I

elif method == 'hessian':
dp = n_components * (n_components + 1) // 2

if n_neighbors <= n_components + dp:
raise ValueError("for method='hessian', n_neighbors must be "
"greater than "
"[n_components * (n_components + 3) / 2]")

neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1,
return_distance=False)
neighbors = neighbors[:, 1:]

Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
Yi[:, 0] = 1

M = np.zeros((N, N), dtype=np.float64)

use_svd = (n_neighbors > d_in)

for i in range(N):
Gi = X[neighbors[i]]
Gi -= Gi.mean(0)

# build Hessian estimator
if use_svd:
U = svd(Gi, full_matrices=0)
else:
Ci = np.dot(Gi, Gi.T)
U = eigh(Ci)[:, ::-1]

Yi[:, 1:1 + n_components] = U[:, :n_components]

j = 1 + n_components
for k in range(n_components):
Yi[:, j:j + n_components - k] = (U[:, k:k + 1] *
U[:, k:n_components])
j += n_components - k

Q, R = qr(Yi)

w = Q[:, n_components + 1:]
S = w.sum(0)

S[np.where(abs(S) < hessian_tol)] = 1
w /= S

nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(w, w.T)

if M_sparse:
M = csr_matrix(M)

elif method == 'modified':
if n_neighbors < n_components:
raise ValueError("modified LLE requires "
"n_neighbors >= n_components")

neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1,
return_distance=False)
neighbors = neighbors[:, 1:]

# find the eigenvectors and eigenvalues of each local covariance
# matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
# where the columns are eigenvectors
V = np.zeros((N, n_neighbors, n_neighbors))
nev = min(d_in, n_neighbors)
evals = np.zeros([N, nev])

# choose the most efficient way to find the eigenvectors
use_svd = (n_neighbors > d_in)

if use_svd:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
V[i], evals[i], _ = svd(X_nbrs,
full_matrices=True)
evals **= 2
else:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
C_nbrs = np.dot(X_nbrs, X_nbrs.T)
evi, vi = eigh(C_nbrs)
evals[i] = evi[::-1]
V[i] = vi[:, ::-1]

# find regularized weights: this is like normal LLE.
# because we've already computed the SVD of each covariance matrix,
# it's faster to use this rather than np.linalg.solve
reg = 1E-3 * evals.sum(1)

tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
tmp[:, :nev] /= evals + reg[:, None]
tmp[:, nev:] /= reg[:, None]

w_reg = np.zeros((N, n_neighbors))
for i in range(N):
w_reg[i] = np.dot(V[i], tmp[i])
w_reg /= w_reg.sum(1)[:, None]

# calculate eta: the median of the ratio of small to large eigenvalues
# across the points.  This is used to determine s_i, below
rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
eta = np.median(rho)

# find s_i, the size of the "almost null space" for each point:
# this is the size of the largest set of eigenvalues
# such that Sum[v; v in set]/Sum[v; v not in set] < eta
s_range = np.zeros(N, dtype=int)
evals_cumsum = stable_cumsum(evals, 1)
eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
for i in range(N):
s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
s_range += n_neighbors - nev  # number of zero eigenvalues

# Now calculate M.
# This is the [N x N] matrix whose null space is the desired embedding
M = np.zeros((N, N), dtype=np.float64)
for i in range(N):
s_i = s_range[i]

# select bottom s_i eigenvectors and calculate alpha
Vi = V[i, :, n_neighbors - s_i:]
alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)

# compute Householder matrix which satisfies
#  Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
# using prescription from paper
h = alpha_i * np.ones(s_i) - np.dot(Vi.T, np.ones(n_neighbors))

norm_h = np.linalg.norm(h)
if norm_h < modified_tol:
h *= 0
else:
h /= norm_h

# Householder matrix is
#  >> Hi = np.identity(s_i) - 2*np.outer(h,h)
# Then the weight matrix is
#  >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
# We do this much more efficiently:
Wi = (Vi - 2 * np.outer(np.dot(Vi, h), h) +
(1 - alpha_i) * w_reg[i, :, None])

# Update M as follows:
# >> W_hat = np.zeros( (N,s_i) )
# >> W_hat[neighbors[i],:] = Wi
# >> W_hat[i] -= 1
# >> M += np.dot(W_hat,W_hat.T)
# We can do this much more efficiently:
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
Wi_sum1 = Wi.sum(1)
M[i, neighbors[i]] -= Wi_sum1
M[neighbors[i], i] -= Wi_sum1
M[i, i] += s_i

if M_sparse:
M = csr_matrix(M)

elif method == 'ltsa':
neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1,
return_distance=False)
neighbors = neighbors[:, 1:]

M = np.zeros((N, N))

use_svd = (n_neighbors > d_in)

for i in range(N):
Xi = X[neighbors[i]]
Xi -= Xi.mean(0)

# compute n_components largest eigenvalues of Xi * Xi^T
if use_svd:
v = svd(Xi, full_matrices=True)
else:
Ci = np.dot(Xi, Xi.T)
v = eigh(Ci)[:, ::-1]

Gi = np.zeros((n_neighbors, n_components + 1))
Gi[:, 1:] = v[:, :n_components]
Gi[:, 0] = 1. / np.sqrt(n_neighbors)

GiGiT = np.dot(Gi, Gi.T)

nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] -= GiGiT
M[neighbors[i], neighbors[i]] += 1

return null_space(M, n_components, k_skip=1, eigen_solver=eigen_solver,
tol=tol, max_iter=max_iter, random_state=random_state)

class LocallyLinearEmbedding(BaseEstimator, TransformerMixin):
"""Locally Linear Embedding

Read more in the :ref:User Guide <locally_linear_embedding>.

Parameters
----------
n_neighbors : integer
number of neighbors to consider for each point.

n_components : integer
number of coordinates for the manifold

reg : float
regularization constant, multiplies the trace of the local covariance
matrix of the distances.

eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data

arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems.  It is
best to try several random seeds in order to check results.

dense  : use standard dense matrix operations for the eigenvalue
decomposition.  For this method, M must be an array
or matrix type.  This method should be avoided for
large problems.

tol : float, optional
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.

max_iter : integer
maximum number of iterations for the arpack solver.
Not used if eigen_solver=='dense'.

method : string ('standard', 'hessian', 'modified' or 'ltsa')
standard : use the standard locally linear embedding algorithm.  see
reference 
hessian  : use the Hessian eigenmap method. This method requires
n_neighbors > n_components * (1 + (n_components + 1) / 2
see reference 
modified : use the modified locally linear embedding algorithm.
see reference 
ltsa     : use local tangent space alignment algorithm
see reference 

hessian_tol : float, optional
Tolerance for Hessian eigenmapping method.
Only used if method == 'hessian'

modified_tol : float, optional
Tolerance for modified LLE method.
Only used if method == 'modified'

neighbors_algorithm : string ['auto'|'brute'|'kd_tree'|'ball_tree']
algorithm to use for nearest neighbors search,
passed to neighbors.NearestNeighbors instance

random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by np.random. Used when eigen_solver == 'arpack'.

n_jobs : int, optional (default = 1)
The number of parallel jobs to run.
If -1, then the number of jobs is set to the number of CPU cores.

Attributes
----------
embedding_vectors_ : array-like, shape [n_components, n_samples]
Stores the embedding vectors

reconstruction_error_ : float
Reconstruction error associated with embedding_vectors_

nbrs_ : NearestNeighbors object
Stores nearest neighbors instance, including BallTree or KDtree
if applicable.

References
----------

..  Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding.  Science 290:2323 (2000).
..  Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A.  100:5591 (2003).
..  Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
..  Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ.  8:406 (2004)
"""

def __init__(self, n_neighbors=5, n_components=2, reg=1E-3,
eigen_solver='auto', tol=1E-6, max_iter=100,
method='standard', hessian_tol=1E-4, modified_tol=1E-12,
neighbors_algorithm='auto', random_state=None, n_jobs=1):
self.n_neighbors = n_neighbors
self.n_components = n_components
self.reg = reg
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.method = method
self.hessian_tol = hessian_tol
self.modified_tol = modified_tol
self.random_state = random_state
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs

def _fit_transform(self, X):
self.nbrs_ = NearestNeighbors(self.n_neighbors,
algorithm=self.neighbors_algorithm,
n_jobs=self.n_jobs)

random_state = check_random_state(self.random_state)
X = check_array(X, dtype=float)
self.nbrs_.fit(X)
self.embedding_, self.reconstruction_error_ = \
locally_linear_embedding(
self.nbrs_, self.n_neighbors, self.n_components,
eigen_solver=self.eigen_solver, tol=self.tol,
max_iter=self.max_iter, method=self.method,
hessian_tol=self.hessian_tol, modified_tol=self.modified_tol,
random_state=random_state, reg=self.reg, n_jobs=self.n_jobs)

[docs]    def fit(self, X, y=None):
"""Compute the embedding vectors for data X

Parameters
----------
X : array-like of shape [n_samples, n_features]
training set.

y: Ignored.

Returns
-------
self : returns an instance of self.
"""
self._fit_transform(X)
return self

[docs]    def fit_transform(self, X, y=None):
"""Compute the embedding vectors for data X and transform X.

Parameters
----------
X : array-like of shape [n_samples, n_features]
training set.

y: Ignored.

Returns
-------
X_new : array-like, shape (n_samples, n_components)
"""
self._fit_transform(X)
return self.embedding_

[docs]    def transform(self, X):
"""
Transform new points into embedding space.

Parameters
----------
X : array-like, shape = [n_samples, n_features]

Returns
-------
X_new : array, shape = [n_samples, n_components]

Notes
-----
Because of scaling performed by this method, it is discouraged to use
it together with methods that are not scale-invariant (like SVMs)
"""
check_is_fitted(self, "nbrs_")

X = check_array(X)
ind = self.nbrs_.kneighbors(X, n_neighbors=self.n_neighbors,
return_distance=False)
weights = barycenter_weights(X, self.nbrs_._fit_X[ind],
reg=self.reg)
X_new = np.empty((X.shape, self.n_components))
for i in range(X.shape):
X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
return X_new