diff --git a/GNUmakefile b/GNUmakefile new file mode 100644 index 0000000..d105fd8 --- /dev/null +++ b/GNUmakefile @@ -0,0 +1,10 @@ +############################################################################# +# Development make file. +############################################################################# + +SOURCE_DIR := diffusion_maps + +$(SOURCE_DIR)/version.py: + @scripts/make-version > $@ + +.PHONY: $(SOURCE_DIR)/version.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a11cb0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) +Copyright (c) 2017 Juan M. Bello Rivas + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.org b/README.org new file mode 100644 index 0000000..0092225 --- /dev/null +++ b/README.org @@ -0,0 +1,65 @@ +#+TITLE: Diffusion Maps and Geometric Harmonics for Python +#+AUTHOR: Juan M. Bello Rivas +#+EMAIL: jmbr@superadditive.com +#+DATE: <2017-05-20 Sat> + +* Overview + +The =diffusion-maps= library for Python provides a fast and accurate implementation of diffusion maps[fn:1] and geometric harmonics[fn:2]. Its speed stems from the use of sparse linear algebra and (optionally) graphics processing units to accelerate computations. +The included code routinely solves eigenvalue problems 3 x faster than SciPy using GPUs on matrices with over 200 million non-zero entries. + +The package includes a command-line utility for the quick calculation of diffusion maps on data sets. + +Some of the features of the =diffusion-maps= module include: + +- Fast evaluation of distance matrices using nearest neighbors. + +- Fast and accurate computation of eigenvalue/eigenvector pairs using sparse linear algebra. + +- Optional GPU-accelerated sparse linear algebra routines. + +- Optional interface to the [[https://github.com/opencollab/arpack-ng][ARPACK-NG]] library. + +- Simple and easily modifiable code. + +[fn:1] Coifman, R. R., & Lafon, S. (2006). Diffusion maps. Applied and Computational Harmonic Analysis, 21(1), 5–30. http://doi.org/10.1016/j.acha.2006.04.006 + +[fn:2] Coifman, R. R., & Lafon, S. (2006). Geometric harmonics: A novel tool for multiscale out-of-sample extension of empirical functions. Applied and Computational Harmonic Analysis, 21(1), 31–52. http://doi.org/10.1016/j.acha.2005.07.005 + +#+CAPTION: Geometric harmonics for $z = sin(x^2 + y^2)$. +#+NAME: fig:geometric-harmonics +[[./geometric-harmonics.png]] + +* Prerequisites + +The library is implemented in Python 3.5+ and uses [[http://www.numpy.org/][NumPy]] and [[https://www.scipy.org/][SciPy]]. It is recommended to install [[https://mathema.tician.de/software/pycuda/][PyCUDA]] to enable the GPU-accelerated eigenvalue solver. + +The =diffusion-maps= command can display the resulting diffusion maps using [[https://matplotlib.org/][Matplotlib]] if it is available. + +* Installation + + Use ~python setup.py install~ to install on your system or ~python setup.py install --user~ for a user-specific installation. + +* Command-line utility + +The ~diffusion-maps~ command reads data sets stored in NPY format. The simplest way to use it is to invoke it as follows: + +#+BEGIN_SRC bash +diffusion-maps DATA-SET.NPY EPSILON-VALUE +#+END_SRC + +There exist parameters to save and visualize different types of results, to specify how many eigenvalue/eigenvector pairs to compute, etc. See the help page displayed by: + +#+BEGIN_SRC bash +diffusion-maps --help +#+END_SRC + +* Additional documentation + +[[http://www.sphinx-doc.org/en/stable/][Sphinx]]-based API documentation is available in the =doc/= folder. Run + +#+BEGIN_SRC bash +make -C doc html +#+END_SRC + +to build the documentation. diff --git a/diffusion_maps/__init__.py b/diffusion_maps/__init__.py new file mode 100644 index 0000000..e775abf --- /dev/null +++ b/diffusion_maps/__init__.py @@ -0,0 +1,7 @@ +"""Diffusion maps module. + +""" + +from .diffusion_maps import * +from .geometric_harmonics import * +from .version import * diff --git a/diffusion_maps/clock.py b/diffusion_maps/clock.py new file mode 100644 index 0000000..21823a1 --- /dev/null +++ b/diffusion_maps/clock.py @@ -0,0 +1,61 @@ +"""Clock for keeping track of the wall time. + +""" + + +__all__ = ['ClockError', 'Clock'] + +import datetime +import time + +from typing import Optional # noqa: F401. Used for mypy. + + +class ClockError(Exception): + """Invalid clock operation.""" + pass + + +class Clock: + """Clock for keeping track of time. + + """ + def __init__(self) -> None: + self.start = None # type: Optional[float] + self.stop = None # type: Optional[float] + + def tic(self) -> None: + """Start the clock.""" + self.start = time.monotonic() + self.stop = None + + def toc(self) -> None: + """Stop the clock.""" + assert self.start is not None + self.stop = time.monotonic() + + def __str__(self) -> str: + """Human-readable representation of elapsed time.""" + if self.start is None: + raise ClockError('The clock has not been started') + else: + start = datetime.datetime.fromtimestamp(self.start) + + if self.stop is None: + stop = datetime.datetime.fromtimestamp(time.monotonic()) + else: + stop = datetime.datetime.fromtimestamp(self.stop) + + delta = stop - start + + return str(delta) + + def __enter__(self): + if self.start is None and self.stop is None: + self.tic() + + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.start is not None: + self.toc() diff --git a/diffusion_maps/command_line_interface.py b/diffusion_maps/command_line_interface.py new file mode 100644 index 0000000..8666a17 --- /dev/null +++ b/diffusion_maps/command_line_interface.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +# PYTHON_ARGCOMPLETE_OK -*- mode: python -*- + +"""Command line interface for the computation of diffusion maps. + +""" + +import argparse +try: + import argcomplete +except ImportError: + argcomplete = None +import logging +import os +import sys + +import numpy as np + +import diffusion_maps.default as default +import diffusion_maps.version as version +from diffusion_maps import downsample, DiffusionMaps +from diffusion_maps.profiler import Profiler +from diffusion_maps.plot import plot_eigenvectors + + +def output_eigenvalues(ew: np.array) -> None: + """Output the table of eigenvalues. + + """ + logging.info('List of eigenvalues:') + fields = ['Real part', 'Imaginary part'] + fmt = '{:>12} | {:<21}' + logging.info('-' * 30) + logging.info(fmt.format(*fields)) + logging.info('-' * 30) + fmt = '{:+2.9f} | {:+2.9f}' + for eigenvalue in ew: + logging.info(fmt.format(eigenvalue.real, eigenvalue.imag)) + + +def use_cuda(args: argparse.Namespace) -> bool: + """Determine whether to use GPU-accelerated code or not. + + """ + try: + import pycuda # noqa + use_cuda = True and not args.no_gpu + except ImportError: + use_cuda = False + + return use_cuda + + +def main(): + parser = argparse.ArgumentParser(description='Diffusion maps') + parser.add_argument('data_file', metavar='FILE', type=str, + help='process %(metavar)s (should be in NPY format)') + parser.add_argument('epsilon', metavar='VALUE', type=float, + help='kernel bandwidth') + parser.add_argument('-n', '--num-samples', type=float, metavar='NUM', + required=False, help='number of data points to use') + parser.add_argument('-e', '--num-eigenpairs', type=int, metavar='NUM', + required=False, default=default.num_eigenpairs, + help='number of eigenvalue/eigenvector pairs to ' + 'compute') + parser.add_argument('-c', '--cut-off', type=float, required=False, + metavar='DISTANCE', help='cut-off to use to enforce ' + 'sparsity in the diffusion maps computation.') + parser.add_argument('-o', '--output-data', type=str, required=False, + default='actual-data.npy', metavar='FILE', help='save ' + 'actual data used in computation to %(metavar)s') + parser.add_argument('-w', '--eigenvalues', type=str, + default='eigenvalues.dat', required=False, + metavar='FILE', help='save eigenvalues to ' + '%(metavar)s') + parser.add_argument('-v', '--eigenvectors', type=str, + default='eigenvectors.npy', required=False, + metavar='FILE', help='save eigenvectors to ' + '%(metavar)s') + parser.add_argument('-m', '--matrix', type=str, required=False, + metavar='FILE', help='save transition matrix to ' + '%(metavar)s') + parser.add_argument('-p', '--plot', action='store_true', default=False, + help='plot first two eigenvectors') + parser.add_argument('--no-gpu', action='store_true', required=False, + help='disable GPU eigensolver') + parser.add_argument('--debug', action='store_true', required=False, + help='print debugging information') + parser.add_argument('--profile', required=False, metavar='FILE', + type=argparse.FileType('w', encoding='utf-8'), + help='run under profiler and save report to ' + '%(metavar)s') + + args = parser.parse_args(sys.argv[1:]) + + if args.debug is True: + logging.basicConfig(level=logging.DEBUG, format='%(message)s') + else: + logging.basicConfig(level=logging.INFO, format='%(message)s') + + prog_name = os.path.basename(sys.argv[0]) + logging.info('{} {}'.format(prog_name, version.v_long)) + logging.info('') + logging.info('Reading data from {!r}...'.format(args.data_file)) + + orig_data = np.load(args.data_file) + if args.num_samples: + data = downsample(orig_data, int(args.num_samples)) + else: + data = orig_data + + logging.info('Computing {} diffusion maps with epsilon = {:g} ' + 'on {} data points...' + .format(args.num_eigenpairs-1, args.epsilon, data.shape[0])) + + with Profiler(args.profile): + dm = DiffusionMaps(data, args.epsilon, + num_eigenpairs=args.num_eigenpairs, + use_cuda=use_cuda(args)) + + if args.profile: + args.profile.close() + + output_eigenvalues(dm.eigenvalues) + + if args.matrix: + logging.info('Saving transition matrix to {!r}' + .format(args.matrix)) + import scipy.io + scipy.io.mmwrite(args.matrix, dm.kernel_matrix) + + if args.eigenvalues: + logging.info('Saving eigenvalues to {!r}' + .format(args.eigenvalues)) + np.savetxt(args.eigenvalues, dm.eigenvalues) + + if args.eigenvectors: + logging.info('Saving eigenvectors to {!r}' + .format(args.eigenvectors)) + np.save(args.eigenvectors, dm.eigenvectors) + + if args.output_data and args.num_samples: + logging.info('Saving downsampled data to {!r}' + .format(args.output_data)) + np.save(args.output_data, data) + + if args.plot is True: + plot_eigenvectors(data, dm.eigenvectors) + + +if __name__ == '__main__': + main() diff --git a/diffusion_maps/cpu_eigensolver.py b/diffusion_maps/cpu_eigensolver.py new file mode 100644 index 0000000..740e824 --- /dev/null +++ b/diffusion_maps/cpu_eigensolver.py @@ -0,0 +1,49 @@ +"""Compute dominant eigenvectors of a sparse matrix. + +""" + +__all__ = ['eigensolver'] + +from typing import Optional, Tuple + +import numpy as np +import scipy.sparse +import scipy.sparse.linalg + +from . import default + + +def eigensolver(matrix: scipy.sparse.csr_matrix, + num_eigenpairs: int = default.num_eigenpairs, + sigma: Optional[float] = None, + initial_vector: Optional[np.array] = None) \ + -> Tuple[np.array, np.array]: + """Solve eigenvalue problem for sparse matrix. + + Parameters + ---------- + matrix : scipy.sparse.csr_matrix + A matrix in compressed sparse row format. + num_eigenpairs : int, optional + Number of eigenvalue/eigenvector pairs to obtain. + sigma : float, optional + Find eigenvalues close to the value of sigma. The default value is + near 1.0. + initial_vector : np.array, optional + Initial vector to use in the Arnoldi iteration. If not set, a vector + with all entries equal to one will be used. + + Returns + ------- + ew : np.array + Eigenvalues in descending order of magnitude. + ev : np.array + Eigenvectors corresponding to the eigenvalues in `ew`. + + """ + if initial_vector is None: + initial_vector = np.ones(matrix.shape[0]) + ew, ev = scipy.sparse.linalg.eigs(matrix, k=num_eigenpairs, which='LM', + sigma=sigma, v0=initial_vector) + ii = np.argsort(np.abs(ew))[::-1] + return ew[ii], ev[:, ii].T diff --git a/diffusion_maps/cusparse.py b/diffusion_maps/cusparse.py new file mode 100644 index 0000000..688afd1 --- /dev/null +++ b/diffusion_maps/cusparse.py @@ -0,0 +1,118 @@ +"""Minimalistic interface to the NVIDIA cuSPARSE library. + +""" + +__all__ = ['cusparseCreate', 'cusparseDestroy', 'cusparseGetVersion', + 'cusparseCreateMatDescr', 'cusparseDestroyMatDescr', + 'cusparseDcsrmv'] + + +import ctypes +import ctypes.util +from enum import IntEnum + +import pycuda.autoinit # noqa +import pycuda.gpuarray as gpuarray + + +libcusparse = ctypes.cdll.LoadLibrary(ctypes.util.find_library('cusparse')) + +libcusparse.cusparseCreate.restype = int +libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p] + +libcusparse.cusparseDestroy.restype = int +libcusparse.cusparseDestroy.argtypes = [ctypes.c_int] + +libcusparse.cusparseGetVersion.restype = int +libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int, ctypes.c_void_p] + +libcusparse.cusparseCreateMatDescr.restype = int +libcusparse.cusparseCreateMatDescr.argtypes = [ctypes.c_void_p] + +libcusparse.cusparseDestroyMatDescr.restype = int +libcusparse.cusparseDestroyMatDescr.argtypes = [ctypes.c_int] + +libcusparse.cusparseDcsrmv.restype = int +libcusparse.cusparseDcsrmv.argtypes = [ctypes.c_int, ctypes.c_int, + ctypes.c_int, ctypes.c_int, + ctypes.c_int, ctypes.c_void_p, + ctypes.c_int, ctypes.c_void_p, + ctypes.c_void_p, ctypes.c_void_p, + ctypes.c_void_p, ctypes.c_void_p, + ctypes.c_void_p] + + +class cusparseOperation(IntEnum): + CUSPARSE_OPERATION_NON_TRANSPOSE = 0 + CUSPARSE_OPERATION_TRANSPOSE = 1 + CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 + + +class cusparseError(Exception): + """Error in call to cuSPARSE library.""" + def __init__(self, status): + self.status = status + + def __repr__(self): + return ('{}(status={})' + .format(self.__class__.__name__, self.status)) + + +def cusparseCreate() -> ctypes.c_int: + handle = ctypes.c_int() + + status = libcusparse.cusparseCreate(ctypes.byref(handle)) + if status != 0: + raise cusparseError(status) + + return handle + + +def cusparseDestroy(handle: ctypes.c_int) -> None: + status = libcusparse.cusparseDestroy(handle) + if status != 0: + raise cusparseError(status) + + return handle + + +def cusparseGetVersion(handle: ctypes.c_int) -> int: + version = ctypes.c_int() + + status = libcusparse.cusparseGetVersion(handle, ctypes.byref(version)) + if status != 0: + raise cusparseError(status) + + return status + + +def cusparseCreateMatDescr() -> ctypes.c_int: + descr = ctypes.c_int() + + status = libcusparse.cusparseCreateMatDescr(ctypes.byref(descr)) + if status != 0: + raise cusparseError(status) + + return descr + + +def cusparseDestroyMatDescr(descr: ctypes.c_int) -> None: + status = libcusparse.cusparseDestroyMatDescr(descr) + if status != 0: + raise cusparseError(status) + + +def cusparseDcsrmv(handle: ctypes.c_int, transA: cusparseOperation, m: int, + n: int, nnz: int, alpha: float, descrA: ctypes.c_int, + csrValA: gpuarray.GPUArray, csrRowPtrA: gpuarray.GPUArray, + csrColIndA: gpuarray.GPUArray, x: gpuarray.GPUArray, beta: + float, y: gpuarray.GPUArray): + alpha_ = ctypes.c_double(alpha) + beta_ = ctypes.c_double(beta) + status = libcusparse.cusparseDcsrmv(handle, transA, m, n, nnz, + ctypes.byref(alpha_), descrA, + csrValA.ptr, csrRowPtrA.ptr, + csrColIndA.ptr, x.ptr, + ctypes.byref(beta_), y.ptr) + if status != 0: + raise cusparseError(status) diff --git a/diffusion_maps/default.py b/diffusion_maps/default.py new file mode 100644 index 0000000..fccb984 --- /dev/null +++ b/diffusion_maps/default.py @@ -0,0 +1,7 @@ +"""Default values. + +""" + +num_eigenpairs = 11 + +use_cuda = True diff --git a/diffusion_maps/diffusion_maps.py b/diffusion_maps/diffusion_maps.py new file mode 100644 index 0000000..29132ca --- /dev/null +++ b/diffusion_maps/diffusion_maps.py @@ -0,0 +1,216 @@ +"""Diffusion maps module. + +This module implements the diffusion maps method for dimensionality +reduction, as introduced in: + +Coifman, R. R., & Lafon, S. (2006). Diffusion maps. Applied and Computational +Harmonic Analysis, 21(1), 5–30. DOI:10.1016/j.acha.2006.04.006 + +""" + +__all__ = ['DiffusionMaps', 'downsample'] + + +import logging +from typing import Optional, Dict + +import numpy as np +import scipy +import scipy.sparse +from scipy.spatial import cKDTree + +from . import default +from . import utils +from . import clock +Clock = clock.Clock + + +def downsample(data: np.array, num_samples: int) -> np.array: + """Randomly sample a subset of a data set while preserving order. + + The sampling is done without replacement. + + Parameters + ---------- + data : np.array + Array whose 0-th axis indexes the data points. + num_samples : int + Number of items to randomly (uniformly) sample from the data. This + is typically less than the total number of elements in the data set. + + Returns + ------- + sampled_data : np.array + A total of `num_samples` uniformly randomly sampled data points from + `data`. + + """ + assert num_samples <= data.shape[0] + indices = sorted(np.random.choice(range(data.shape[0]), num_samples, + replace=False)) + return data[indices, :] + + +def make_stochastic_matrix(matrix: scipy.sparse.csr_matrix) -> None: + """Normalize a sparse, non-negative matrix in CSR format. + + Normalizes (in the 1-norm) each row of a non-negative matrix and returns + the result. + + Parameters + ---------- + matrix : np.array + A matrix with non-negative entries to be normalized. + + """ + data = matrix.data + indptr = matrix.indptr + for i in range(matrix.shape[0]): + a, b = indptr[i:i+2] + norm1 = np.sum(data[a:b]) + data[a:b] /= norm1 + + +class DiffusionMaps: + """Diffusion maps. + + Attributes + ---------- + epsilon : float + Bandwidth for kernel. + _cut_off : float + Cut off for the computation of pairwise distances between points. + _kdtree : cKDTree + KD-tree for accelerating pairwise distance computation. + eigenvectors : np.array + Right eigenvectors of `K`. + eigenvalues : np.array + Eigenvalues of `K`. + kernel_matrix : scipy.sparse.spmatrix + (Possibly stochastic) matrix obtained by evaluating a Gaussian kernel + on the data points. + + """ + def __init__(self, points: np.array, epsilon: float, + cut_off: Optional[float] = None, + num_eigenpairs: Optional[int] = default.num_eigenpairs, + normalize_kernel: Optional[bool] = True, + kdtree_options: Optional[Dict] = None, + use_cuda: Optional[bool] = default.use_cuda) \ + -> None: + """Compute diffusion maps. + + This function computes the eigendecomposition of the transition + matrix associated to a random walk on the data using a bandwidth + (time) equal to epsilon. + + Parameters + ---------- + points : np.array + Data set to analyze. Its 0-th axis must index each data point. + epsilon : float + Bandwidth to use for the kernel. + cut_off : float, optional + Cut-off for the distance matrix computation. It should be at + least equal to `epsilon`. + num_eigenpairs : int, optional + Number of eigenpairs to compute. Default is + `default.num_eigenpairs`. + normalize_kernel : bool, optional + Whether to convert the kernel into a stochastic matrix or + not. Default is `True`. + kdtree_options : dict, optional + A dictionary containing parameters to pass to the underlying + cKDTree object. + use_cuda : bool, optional + Determine whether to use CUDA-enabled eigenvalue solver or not. + + """ + self.epsilon = epsilon + + if cut_off is None: + self._cut_off = self.__get_cut_off(self.epsilon) + else: + self._cut_off = cut_off + + if kdtree_options is None: + kdtree_options = dict() + with Clock() as clock: + self._kdtree = cKDTree(points, **kdtree_options) + logging.debug('KD-tree computation: {} seconds.'.format(clock)) + + with Clock() as clock: + distance_matrix \ + = self._kdtree.sparse_distance_matrix(self._kdtree, + self._cut_off, + output_type='coo_matrix') + logging.debug('Sparse distance matrix computation: {} seconds.' + .format(clock)) + + logging.debug('Distance matrix has {} nonzero entries ({:.4f}% dense).' + .format(distance_matrix.nnz, distance_matrix.nnz + / np.prod(distance_matrix.shape))) + + with Clock() as clock: + distance_matrix = utils.coo_tocsr(distance_matrix) + logging.debug('Conversion from COO to CSR format: {} seconds.' + .format(clock)) + + with Clock() as clock: + self.kernel_matrix = self._compute_kernel_matrix(distance_matrix) + logging.debug('Kernel matrix computation: {} seconds.' + .format(clock)) + + with Clock() as clock: + if normalize_kernel is True: + make_stochastic_matrix(self.kernel_matrix) + logging.debug('Normalization: {} seconds.'.format(clock)) + + with Clock() as clock: + if use_cuda is True: + from .gpu_eigensolver import eigensolver + ew, ev = eigensolver(self.kernel_matrix, num_eigenpairs) + logging.debug('GPU eigensolver: {} seconds.'.format(clock)) + else: + from .cpu_eigensolver import eigensolver + ew, ev = eigensolver(self.kernel_matrix, num_eigenpairs) + logging.debug('CPU eigensolver: {} seconds.'.format(clock)) + + self.eigenvalues = ew + self.eigenvectors = ev + + @staticmethod + def __get_cut_off(epsilon: float) -> float: + """Return a reasonable cut off value. + + """ + return 2.0 * epsilon # XXX Validate this. + + def _compute_kernel_matrix(self, distance_matrix: scipy.sparse.spmatrix) \ + -> scipy.sparse.spmatrix: + """Compute kernel matrix. + + Returns the (unnormalized) Gaussian kernel matrix corresponding to + the data set and choice of bandwidth `epsilon`. + + Parameters + ---------- + distance_matrix : scipy.sparse.spmatrix + A sparse matrix whose entries are the distances between data + points. + + See also + -------- + _compute_distance_matrix, make_stochastic_matrix + + """ + data = distance_matrix.data + transformed_data = self.kernel_function(data) + kernel_matrix = distance_matrix._with_data(transformed_data, copy=True) + return kernel_matrix + + def kernel_function(self, distances: np.array) -> np.array: + """Evaluate kernel function. + + """ + return np.exp(-np.square(distances) / (2.0 * self.epsilon)) diff --git a/diffusion_maps/geometric_harmonics.py b/diffusion_maps/geometric_harmonics.py new file mode 100644 index 0000000..05feb22 --- /dev/null +++ b/diffusion_maps/geometric_harmonics.py @@ -0,0 +1,60 @@ +"""Geometric harmonics module. + +This module implements out-of-sample evaluation of functions using the +Geometric Harmonics method introduced in: + +Coifman, R. R., & Lafon, S. (2006). Geometric harmonics: A novel tool for +multiscale out-of-sample extension of empirical functions. Applied and +Computational Harmonic Analysis, 21(1), 31–52. DOI:10.1016/j.acha.2005.07.005 + +""" + +__all__ = ['GeometricHarmonicsInterpolator'] + +from typing import Optional, Dict + +import numpy as np +import scipy.spatial +from scipy.interpolate.interpnd import (NDInterpolatorBase, + _ndim_coords_from_arrays) + +from diffusion_maps.diffusion_maps import DiffusionMaps + + +class GeometricHarmonicsInterpolator(NDInterpolatorBase): + """Geometric Harmonics interpolator. + + """ + def __init__(self, points: np.array, values: np.array, epsilon: float, + diffusion_maps_options: Optional[Dict] = None) -> None: + NDInterpolatorBase.__init__(self, points, values, + need_contiguous=False, need_values=True) + self.epsilon = epsilon + if diffusion_maps_options is None: + diffusion_maps_options = dict() + diffusion_maps_options['normalize_kernel'] = False + self.diffusion_maps = DiffusionMaps(self.points, epsilon, + **diffusion_maps_options) + + def __call__(self, *args): + """Evaluate interpolator at the given points. + + """ + xi = _ndim_coords_from_arrays(args, ndim=self.points.shape[1]) + xi = self._check_call_shape(xi) + + kdtree = scipy.spatial.cKDTree(xi) + dmaps_kdtree = self.diffusion_maps._kdtree + radius = self.diffusion_maps._cut_off + + ew = self.diffusion_maps.eigenvalues + ev = self.diffusion_maps.eigenvectors + aux = ev.T @ np.diag(1.0 / ew) @ ev @ self.values + + distance_matrix \ + = kdtree.sparse_distance_matrix(dmaps_kdtree, radius, + output_type='coo_matrix') + kernel_matrix \ + = self.diffusion_maps._compute_kernel_matrix(distance_matrix) + + return np.squeeze(kernel_matrix @ aux) diff --git a/diffusion_maps/gpu_eigensolver.py b/diffusion_maps/gpu_eigensolver.py new file mode 100644 index 0000000..f8b6d7c --- /dev/null +++ b/diffusion_maps/gpu_eigensolver.py @@ -0,0 +1,256 @@ +"""Arnoldi iteration eigensolver using ARPACK-NG. + +See also: https://github.com/opencollab/arpack-ng + +""" + + +from ctypes import (byref, cdll, create_string_buffer, c_int, c_char, + c_char_p, c_double, POINTER, sizeof) +import ctypes.util +import logging +from typing import Optional, Tuple +import sys + +try: + import pycuda.gpuarray as gpuarray +except ImportError: + gpuarray = None + +import numpy as np +import scipy.sparse + +from . import matrix_vector_product as mvp +from . import clock + + +EPSILON = sys.float_info.epsilon + +MAX_ITERATIONS = int(1e7) + + +arpack = cdll.LoadLibrary(ctypes.util.find_library('arpack')) + +dnaupd = arpack.dnaupd_ +dnaupd.argtypes = [POINTER(c_int), c_char_p, POINTER(c_int), c_char_p, + POINTER(c_int), POINTER(c_double), POINTER(c_double), + POINTER(c_int), POINTER(c_double), POINTER(c_int), + POINTER(c_int), POINTER(c_int), POINTER(c_double), + POINTER(c_double), POINTER(c_int), POINTER(c_int)] +dnaupd_messages = dict([ + (0, "Normal exit."), + (1, "Maximum number of iterations taken. " + "All possible eigenvalues of OP has been found. " + "IPARAM(5) returns the number of wanted converged Ritz values."), + (2, "No longer an informational error. " + "Deprecated starting with release 2 of ARPACK."), + (3, "No shifts could be applied during a cycle of the Implicitly " + "restarted Arnoldi iteration. One possibility is to increase the" + " size of NCV relative to NEV."), + (-1, "N must be positive."), + (-2, "NEV must be positive."), + (-3, "NCV-NEV >= 2 and less than or equal to N."), + (-4, "The maximum number of Arnoldi update iteration must be " + "greater than zero."), + (-5, "WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI'"), + (-6, "BMAT must be one of 'I' or 'G'."), + (-7, "Length of private work array is not sufficient."), + (-8, "Error return from LAPACK eigenvalue calculation;"), + (-9, "Starting vector is zero."), + (-10, "IPARAM(7) must be 1,2,3,4."), + (-11, "IPARAM(7) = 1 and BMAT = 'G' are incompatible."), + (-12, "IPARAM(1) must be equal to 0 or 1."), + (-9999, "Could not build an Arnoldi factorization. " + "IPARAM(5) returns the size of the current Arnoldi factorization.")]) + +dneupd = arpack.dneupd_ +# dneupd.argtypes = [POINTER(c_int), c_char_p, POINTER(c_int), +# POINTER(c_double), POINTER(c_double), POINTER(c_double), +# POINTER(c_int), POINTER(c_double), POINTER(c_double), +# POINTER(c_double), c_char_p, POINTER(c_int), c_char_p, +# POINTER(c_int), POINTER(c_double), POINTER(c_double), +# POINTER(c_int), POINTER(c_double), POINTER(c_int), +# POINTER(c_int), POINTER(c_int), POINTER(c_double), +# POINTER(c_double), POINTER(c_int), POINTER(c_int)] +dneupd_messages = dict([ + (0, "Normal exit."), + (1, "The Schur form computed by LAPACK routine dlahqr could not be " + "reordered by LAPACK routine dtrsen. Re-enter subroutine dneupd with " + "IPARAM(5)=NCV and increase the size of the arrays DR and DI to have " + "dimension at least dimension NCV and allocate at least NCV columns " + "for Z. NOTE, \"Not necessary if Z and V share the same space. " + "Please notify the authors if this error occurs.\""), + (-1, "N must be positive."), + (-2, "NEV must be positive."), + (-3, "NCV-NEV >= 2 and less than or equal to N."), + (-5, "WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI'"), + (-6, "BMAT must be one of 'I' or 'G'."), + (-7, "Length of private work WORKL array is not sufficient."), + (-8, "Error return from calculation of a real Schur form. " + "Informational error from LAPACK routine dlahqr."), + (-9, "Error return from calculation of eigenvectors. " + "Informational error from LAPACK routine dtrevc."), + (-10, "IPARAM(7) must be 1,2,3,4."), + (-11, "IPARAM(7) = 1 and BMAT = 'G' are incompatible."), + (-12, "HOWMNY = 'S' not yet implemented."), + (-13, "HOWMNY must be one of 'A' or 'P' if RVEC = .true."), + (-14, "DNAUPD did not find any eigenvalues to sufficient accuracy."), + (-15, "DNEUPD got a different count of the number of converged Ritz " + "values than DNAUPD got. This indicates the user probably made an " + "error in passing data from DNAUPD to DNEUPD or that the data was " + "modified before entering DNEUPD.")]) + + +class ArpackError(Exception): + pass + + +def eigensolver(matrix: scipy.sparse.csr_matrix, + num_eigenpairs: Optional[int] = 10, + sigma: Optional[float] = None, + initial_vector: Optional[np.array] = None) \ + -> Tuple[np.array, np.array]: + """Solve eigenvalue problem for sparse matrix. + + Parameters + ---------- + matrix : scipy.sparse.spmatrix + A matrix in compressed sparse row format. + num_eigenpairs : int, optional + Number of eigenpairs to compute. Default is 10. + sigma : float, optional + Find eigenvalues close to the value of sigma. The default value is + near 1.0. Currently unsupported on the GPU. + initial_vector : np.array, optional + Initial vector to use in the Arnoldi iteration. If not set, a vector + with all entries equal to one will be used. + + Returns + ------- + ew : np.array + Eigenvalues in descending order of magnitude. + ev : np.array + Eigenvectors corresponding to the eigenvalues in `ew`. + + """ + if sigma is not None: + raise RuntimeError('Shift/invert mode not implemented on the GPU') + + N = matrix.shape[0] + + if initial_vector is None: + initial_vector = np.ones(N) + + n = c_int(N) # Dimension of the eigenproblem. + maxn = n + nev = c_int(num_eigenpairs + 1) # Number of eigenvalues to compute. + ncv = c_int(num_eigenpairs + 3) # Number of columns of the matrix V. + maxncv = ncv.value + ldv = c_int(maxn.value) + + # assert 0 < nev.value < n.value - 1 + # assert 2 - nev.value <= ncv.value <= n.value + + tol = c_double(EPSILON) + + d = (c_double * (3 * maxncv))() + + resid = initial_vector.ctypes.data_as(POINTER(c_double)) + + vnp = np.zeros((maxncv, ldv.value), dtype=np.float64) + v = vnp.ctypes.data_as(POINTER(c_double)) + + workdnp = np.zeros(3 * maxn.value, dtype=np.float64) + workd = workdnp.ctypes.data_as(POINTER(c_double)) + workev = (c_double * (3 * maxncv))() + workl = (c_double * (3 * maxncv * maxncv + 6 * maxncv))() + + ipntr = (c_int * 14)() + select = (c_int * maxncv)() + + bmat = create_string_buffer(b'I') # B = I, standard eigenvalue problem. + which = create_string_buffer(b'LM') # Eigenvalues of largest magnitude. + ido = c_int(0) + lworkl = c_int(len(workl)) + info = c_int(1) + + ishfts = c_int(1) # Use exact shifts. + maxitr = c_int(MAX_ITERATIONS) + # mode = c_int(3) # A x = lambda x (OP = inv(A - sigma I), B = I) + mode = c_int(1) # A x = lambda x (OP = A, B = I) + iparam = (c_int * 11)(ishfts, 0, maxitr, 0, 0, 0, mode) + + ierr = c_int(0) + rvec = c_int(1) + howmny = c_char(b'A') + if sigma is not None: + sigmar = c_double(np.real(sigma)) + sigmai = c_double(np.imag(sigma)) + else: + sigmar = c_double() + sigmai = c_double() + + MVP = mvp.MatrixVectorProduct(matrix) + + # eye = scipy.sparse.spdiags(np.ones(N), 0, N, N) + # A = (matrix - (sigma) * eye).tocsc() + # solve = scipy.sparse.linalg.factorized(A) + + logging.debug('Running Arnoldi iteration with tolerance {:g}...' + .format(tol.value)) + + clk = clock.Clock() + clk.tic() + for itr in range(maxitr.value): + dnaupd(byref(ido), bmat, byref(n), which, byref(nev), byref(tol), + resid, byref(ncv), v, byref(ldv), iparam, ipntr, workd, workl, + byref(lworkl), byref(info)) + + if info.value != 0: + raise ArpackError(dnaupd_messages[info.value]) + + if ido.value == 99: + break + elif abs(ido.value) != 1: + logging.warning('DNAUPD repoted IDO = {}'.format(ido.value)) + break + + idx_rhs, idx_sol = ipntr[0] - 1, ipntr[1] - 1 + rhs = workdnp[idx_rhs:(idx_rhs+N)] + + # # sol = solve(rhs) + # sol = matrix @ rhs + # workdnp[idx_sol:idx_sol+N] = sol + sol = MVP.product(gpuarray.to_gpu(rhs.astype(np.float64))) + workdnp[idx_sol:idx_sol+N] = sol.get() + clk.toc() + + logging.debug('Done with Arnoldi iteration after {} steps. ' + 'Elapsed time: {} seconds'.format(itr, clk)) + + logging.debug('Running post-processing step...') + + clk.tic() + d0 = byref(d, 0) + d1 = byref(d, maxncv * sizeof(c_double)) + dneupd(byref(rvec), byref(howmny), select, d0, d1, v, byref(ldv), + byref(sigmar), byref(sigmai), workev, byref(bmat), byref(n), + which, byref(nev), byref(tol), resid, byref(ncv), v, byref(ldv), + iparam, ipntr, workd, workl, byref(lworkl), byref(ierr)) + clk.toc() + + logging.debug('Done with postprocessing step after {} seconds. ' + 'Status: {}'.format(clk, ('OK' if ierr.value == 0 + else 'FAIL'))) + + if ierr.value != 0: + raise ArpackError(dneupd_messages[ierr.value]) + + nconv = iparam[4] - 1 + logging.debug('Converged on {} eigenpairs.'.format(nconv)) + + ew = np.array(d[:nconv]) + ii = np.argsort(np.abs(ew))[::-1] + ev = vnp[ii, :] + + return ew[ii], ev diff --git a/diffusion_maps/matrix_vector_product.py b/diffusion_maps/matrix_vector_product.py new file mode 100644 index 0000000..ed68b3d --- /dev/null +++ b/diffusion_maps/matrix_vector_product.py @@ -0,0 +1,37 @@ +import numpy as np + +import scipy.sparse + +import pycuda.gpuarray as gpuarray + +from . import cusparse as cs + + +class MatrixVectorProduct: + """Perform GPU-based, sparse matrix-vector products.""" + def __init__(self, matrix: scipy.sparse.csr_matrix) -> None: + self.m = matrix.shape[0] + self.n = matrix.shape[1] + self.nnz = matrix.nnz + self.csrValA = gpuarray.to_gpu(matrix.data.astype(np.float64)) + self.csrRowPtrA = gpuarray.to_gpu(matrix.indptr) + self.csrColIndA = gpuarray.to_gpu(matrix.indices) + self.handle = cs.cusparseCreate() + self.descr = cs.cusparseCreateMatDescr() + + def __del__(self) -> None: + if self.descr is not None: + cs.cusparseDestroyMatDescr(self.descr) + self.descr = None + if self.handle is not None: + cs.cusparseDestroy(self.handle) + self.handle = None + + def product(self, x: gpuarray.GPUArray) -> gpuarray.GPUArray: + """Multiply sparse matrix by dense vector.""" + y = gpuarray.empty_like(x) + op = cs.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE + cs.cusparseDcsrmv(self.handle, op, self.m, self.n, self.nnz, 1.0, + self.descr, self.csrValA, self.csrRowPtrA, + self.csrColIndA, x, 0.0, y) + return y diff --git a/diffusion_maps/plot.py b/diffusion_maps/plot.py new file mode 100644 index 0000000..8704c0d --- /dev/null +++ b/diffusion_maps/plot.py @@ -0,0 +1,67 @@ +"""Module for plotting eigenvectors. + +""" + +__all__ = ['plot_eigenvectors'] + +from typing import Tuple + +import matplotlib.pyplot as plt + +import numpy as np + +from . import default + + +def get_rows_and_columns(num_plots: int) -> Tuple[int, int]: + """Get optimal number of rows and columns to display figures. + + Parameters + ---------- + num_plots : int + Number of subplots + + Returns + ------- + rows : int + Optimal number of rows. + cols : int + Optimal number of columns. + + """ + if num_plots <= 10: + layouts = { + 1: (1, 1), 2: (1, 2), 3: (1, 3), 4: (2, 2), 5: (2, 3), + 6: (2, 3), 7: (2, 4), 8: (2, 4), 9: (3, 9), 10: (2, 5) + } + rows, cols = layouts[num_plots] + else: + rows = int(np.ceil(np.sqrt(num_plots))) + cols = rows + + return rows, cols + + +def plot_eigenvectors(data: np.array, eigenvectors: np.array) -> None: + """Plot eigenvectors onto 2D data. + + """ + x = data[:, 0] + y = data[:, 1] + + num_eigenvectors = min(eigenvectors.shape[0]-1, default.num_eigenpairs-1) + + rows, cols = get_rows_and_columns(num_eigenvectors) + + for k in range(1, eigenvectors.shape[0]): + plt.subplot(rows, cols, k) + plt.scatter(x, y, c=eigenvectors[k, :], cmap='RdBu_r', + rasterized=True) + plt.xlabel('$x$') + plt.ylabel('$y$') + cb = plt.colorbar() + cb.set_label('Eigenvector value') + plt.title('$\\psi_{{{}}}$'.format(k)) + + # plt.tight_layout() + plt.show() diff --git a/diffusion_maps/profiler.py b/diffusion_maps/profiler.py new file mode 100644 index 0000000..4e5deb0 --- /dev/null +++ b/diffusion_maps/profiler.py @@ -0,0 +1,29 @@ +"""Context manager for running code under the Python profiler. + +""" + + +__all__ = ['Profiler'] + +import cProfile as profile + + +class Profiler: + """Run code under the profiler and report at the end.""" + def __init__(self, stream): + """Initialize profiler with an open stream.""" + self.stream = stream + + def __enter__(self): + if self.stream is not None: + self.profile = profile.Profile() + self.profile.enable() + + def __exit__(self, exc_type, exc_value, traceback): + if self.stream is not None: + self.profile.disable() + + import pstats + ps = pstats.Stats(self.profile, stream=self.stream) + ps.sort_stats('cumulative') + ps.print_stats() diff --git a/diffusion_maps/tests/__init__.py b/diffusion_maps/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/diffusion_maps/tests/test_diffusion_maps.py b/diffusion_maps/tests/test_diffusion_maps.py new file mode 100644 index 0000000..91c65c5 --- /dev/null +++ b/diffusion_maps/tests/test_diffusion_maps.py @@ -0,0 +1,120 @@ +"""Unit test for the diffusion_maps module. + +""" + +import logging +import unittest + +import numpy as np +# import matplotlib.pyplot as plt + +from diffusion_maps import DiffusionMaps, downsample + + +def make_strip(xmin: float, ymin: float, width: float, + height: float, num_samples: int) -> np.array: + """Draw samples from a 2D strip with uniform distribution. + + """ + x = width * np.random.rand(num_samples) - xmin + y = height * np.random.rand(num_samples) - ymin + + return np.stack((x, y), axis=-1) + + +class DiffusionMapsTest(unittest.TestCase): + def setUp(self): + logging.basicConfig(level=logging.DEBUG) + self.xmin = 0.0 + self.ymin = 0.0 + self.width = 1.0 + self.height = 1e-1 + self.num_samples = 50000 + self.data = make_strip(self.xmin, self.ymin, + self.width, self.height, + self.num_samples) + + @staticmethod + def _compute_rayleigh_quotients(matrix, eigenvectors): + """Compute Rayleigh quotients.""" + N = eigenvectors.shape[0] + rayleigh_quotients = np.zeros(N) + for n in range(N): + v = eigenvectors[n, :] + rayleigh_quotients[n] = np.dot(v, matrix @ v) / np.dot(v, v) + rayleigh_quotients = np.sort(np.abs(rayleigh_quotients)) + return rayleigh_quotients[::-1] + + def test_accuracy(self): + num_samples = 10000 + logging.debug('Computing diffusion maps on a matrix of size {}' + .format(num_samples)) + num_eigenpairs = 10 + epsilon = 5e-1 + downsampled_data = downsample(self.data, num_samples) + + dm = DiffusionMaps(downsampled_data, epsilon, + num_eigenpairs=num_eigenpairs) + + ew = dm.eigenvalues + rq = self._compute_rayleigh_quotients(dm.kernel_matrix, + dm.eigenvectors) + + logging.debug('Eigenvalues: {}'.format(ew)) + logging.debug('Rayleigh quotients: {}'.format(rq)) + + self.assertTrue(np.allclose(np.abs(ew), np.abs(rq))) + + # import scipy.io + # scipy.io.mmwrite('kernel_matrix.mtx', dm.kernel_matrix) + + def test_multiple_epsilon_values(self): + num_samples = 5000 + num_maps = 10 + num_eigenpairs = 10 + epsilon_min, epsilon_max = 1e-1, 1e1 + epsilons = np.logspace(np.log10(epsilon_min), + np.log10(epsilon_max), num_maps) + + downsampled_data = downsample(self.data, num_samples) + + evs = np.zeros((num_maps, num_eigenpairs, downsampled_data.shape[0])) + ews = np.zeros((num_maps, num_eigenpairs)) + + logging.basicConfig(level=logging.WARNING) + + for i, epsilon in enumerate(reversed(epsilons)): + dm = DiffusionMaps(downsampled_data, epsilon, + num_eigenpairs=num_eigenpairs) + + evs[i, :, :] = dm.eigenvectors + ews[i, :] = dm.eigenvalues + + ew = dm.eigenvalues + rq = self._compute_rayleigh_quotients(dm.kernel_matrix, + dm.eigenvectors) + self.assertTrue(np.allclose(np.abs(ew), np.abs(rq))) + + # plt.title('$\\epsilon$ = {:.3f}'.format(epsilon)) + # for k in range(1, 10): + # plt.subplot(2, 5, k) + # plt.scatter(downsampled_data[:, 0], downsampled_data[:, 1], + # c=evs[i, k, :]) + # plt.xlim([self.xmin, self.xmin + self.width]) + # plt.ylim([self.ymin, self.ymin + self.height]) + # plt.tight_layout() + # plt.gca().set_title('$\\psi_{}$'.format(k)) + # plt.subplot(2, 5, 10) + # plt.step(range(ews[i, :].shape[0]), np.abs(ews[i, :])) + # plt.title('epsilon = {:.2f}'.format(epsilon)) + # plt.show() + + +if __name__ == '__main__': + import os + verbose = os.getenv('VERBOSE') + if verbose is not None: + logging.basicConfig(level=logging.DEBUG, format='%(message)s') + else: + logging.basicConfig(level=logging.ERROR, format='%(message)s') + unittest.main() diff --git a/diffusion_maps/tests/test_geometric_harmonics.py b/diffusion_maps/tests/test_geometric_harmonics.py new file mode 100644 index 0000000..030e932 --- /dev/null +++ b/diffusion_maps/tests/test_geometric_harmonics.py @@ -0,0 +1,120 @@ +"""Unit test for the Geometric Harmonics module. + +""" + +import logging +import unittest + +import numpy as np + +import matplotlib.pyplot as plt + +from diffusion_maps import GeometricHarmonicsInterpolator + +from diffusion_maps.clock import Clock + + +def make_points(num_points: int) -> np.array: + xx, yy = np.meshgrid(np.linspace(-4, 4, num_points), + np.linspace(-4, 4, num_points)) + return np.stack((xx.ravel(), yy.ravel())).T + + +def plot(points: np.array, values: np.array, **kwargs) -> None: + title = kwargs.pop('title', None) + if title: + plt.title(title) + plt.scatter(points[:, 0], points[:, 1], c=values, + marker='o', rasterized=True, s=2.5, **kwargs) + cb = plt.colorbar() + cb.set_clim([np.min(values), np.max(values)]) + cb.set_ticks(np.linspace(np.min(values), np.max(values), 5)) + plt.xlim([-4, 4]) + plt.ylim([-4, 4]) + plt.xlabel('$x$') + plt.ylabel('$y$') + plt.gca().set_aspect('equal') + + +def f(points: np.array) -> np.array: + """Function to interpolate. + + """ + # return np.ones(points.shape[0]) + # return np.arange(points.shape[0]) + return np.sin(np.linalg.norm(points, axis=-1)) + + +def show_info(title: str, values: np.array) -> None: + """Log relevant information. + + """ + logging.info('{}: mean = {:g}, std. = {:g}, max. abs. = {:g}' + .format(title, np.mean(values), np.std(values), + np.max(np.abs(values)))) + + +class GeometricHarmonicsTest(unittest.TestCase): + def setUp(self): + # self.num_points = 1000 + # self.points = downsample(np.load('data.npy'), self.num_points) + # self.values = np.ones(self.num_points) + + # np.save('actual-data.npy', self.points) + + # self.points = np.load('actual-data.npy') + # self.num_points = self.points.shape[0] + # self.values = np.ones(self.num_points) + + self.points = make_points(23) + self.num_points = self.points.shape[0] + self.values = f(self.points) + + def test_geometric_harmonics_interpolator(self): + logging.basicConfig(level=logging.DEBUG) + + eps = 1e-1 + dmaps_opts = {'num_eigenpairs': self.num_points-3, + 'cut_off': 1e1 * eps} + ghi = GeometricHarmonicsInterpolator(self.points, self.values, + eps, dmaps_opts) + + points = make_points(100) + + with Clock() as clock: + values = ghi(points) + logging.debug('Evaluation of geometric harmonics done ({} seconds).'. + format(clock)) + + residual = values - f(points) + self.assertLess(np.max(np.abs(residual)), 7.5e-2) + + show_info('Original function', f(points)) + show_info('Sampled points', self.values) + show_info('Reconstructed function', values) + show_info('Residual', residual) + + # plt.subplot(2, 2, 1) + # plot(points, f(points), title='Original function') + # + # plt.subplot(2, 2, 2) + # plot(self.points, self.values, title='Sampled function') + # + # plt.subplot(2, 2, 4) + # plot(points, values, title='Reconstructed function') + # + # plt.subplot(2, 2, 3) + # plot(points, residual, title='Residual', cmap='RdBu_r') + # + # plt.tight_layout() + # plt.show() + + +if __name__ == '__main__': + import os + verbose = os.getenv('VERBOSE') + if verbose is not None: + logging.basicConfig(level=logging.DEBUG, format='%(message)s') + else: + logging.basicConfig(level=logging.ERROR, format='%(message)s') + unittest.main() diff --git a/diffusion_maps/utils.py b/diffusion_maps/utils.py new file mode 100644 index 0000000..8ef648d --- /dev/null +++ b/diffusion_maps/utils.py @@ -0,0 +1,40 @@ +"""Miscellaneous utilities. + +""" + + +__all__ = ['coo_tocsr'] + + +import numpy as np +import scipy.sparse + + +def coo_tocsr(A: scipy.sparse.coo_matrix) -> scipy.sparse.csr_matrix: + """Convert matrix to Compressed Sparse Row format, fast. + + This function is derived from the corresponding SciPy code but it avoids + the sanity checks that slow `scipy.sparse.coo_matrix.to_csr down`. In + particular, by not summing duplicates we can attain important speed-ups + for large matrices. + + """ + from scipy.sparse import csr_matrix + if A.nnz == 0: + return csr_matrix(A.shape, dtype=A.dtype) + + m, n = A.shape + # Using 32-bit integer indices allows for matrices of up to 2,147,483,647 + # non-zero entries. + idx_dtype = np.int32 + row = A.row.astype(idx_dtype, copy=False) + col = A.col.astype(idx_dtype, copy=False) + + indptr = np.empty(n+1, dtype=idx_dtype) + indices = np.empty_like(row, dtype=idx_dtype) + data = np.empty_like(A.data, dtype=A.dtype) + + scipy.sparse._sparsetools.coo_tocsr(m, n, A.nnz, row, col, A.data, + indptr, indices, data) + + return csr_matrix((data, indices, indptr), shape=A.shape) diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..943885b --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = diffusion-maps +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..30c31a0 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# diffusion-maps documentation build configuration file, created by +# sphinx-quickstart on Sat May 20 12:03:02 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'diffusion-maps' +copyright = '2017, Juan M. Bello-Rivas' +author = 'Juan M. Bello-Rivas' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '' +# The full version, including alpha/beta/rc tags. +release = '' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'diffusion-mapsdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'diffusion-maps.tex', 'diffusion-maps Documentation', + 'Juan M. Bello-Rivas', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'diffusion-maps', 'diffusion-maps Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'diffusion-maps', 'diffusion-maps Documentation', + author, 'diffusion-maps', 'One line description of project.', + 'Miscellaneous'), +] + + + diff --git a/doc/diffusion_maps.rst b/doc/diffusion_maps.rst new file mode 100644 index 0000000..0ea3aad --- /dev/null +++ b/doc/diffusion_maps.rst @@ -0,0 +1,125 @@ +diffusion\_maps package +======================= + +Subpackages +----------- + +.. toctree:: + + diffusion_maps.tests + +Submodules +---------- + +diffusion\_maps\.clock module +----------------------------- + +.. automodule:: diffusion_maps.clock + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.command\_line\_interface module +------------------------------------------------ + +.. automodule:: diffusion_maps.command_line_interface + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.cpu\_eigensolver module +---------------------------------------- + +.. automodule:: diffusion_maps.cpu_eigensolver + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.cusparse module +-------------------------------- + +.. automodule:: diffusion_maps.cusparse + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.default module +------------------------------- + +.. automodule:: diffusion_maps.default + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.diffusion\_maps module +--------------------------------------- + +.. automodule:: diffusion_maps.diffusion_maps + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.geometric\_harmonics module +-------------------------------------------- + +.. automodule:: diffusion_maps.geometric_harmonics + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.gpu\_eigensolver module +---------------------------------------- + +.. automodule:: diffusion_maps.gpu_eigensolver + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.matrix\_vector\_product module +----------------------------------------------- + +.. automodule:: diffusion_maps.matrix_vector_product + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.plot module +---------------------------- + +.. automodule:: diffusion_maps.plot + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.profiler module +-------------------------------- + +.. automodule:: diffusion_maps.profiler + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.utils module +----------------------------- + +.. automodule:: diffusion_maps.utils + :members: + :undoc-members: + :show-inheritance: + +diffusion\_maps\.version module +------------------------------- + +.. automodule:: diffusion_maps.version + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: diffusion_maps + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..3e209a6 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,22 @@ +.. diffusion-maps documentation master file, created by + sphinx-quickstart on Sat May 20 12:03:02 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to diffusion-maps's documentation! +========================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + modules + +.. automodule:: diffusion_maps + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..4e0923d --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=diffusion-maps + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/doc/modules.rst b/doc/modules.rst new file mode 100644 index 0000000..64f1ec4 --- /dev/null +++ b/doc/modules.rst @@ -0,0 +1,7 @@ +diffusion_maps +============== + +.. toctree:: + :maxdepth: 4 + + diffusion_maps diff --git a/geometric-harmonics.png b/geometric-harmonics.png new file mode 100644 index 0000000..03efcdd Binary files /dev/null and b/geometric-harmonics.png differ diff --git a/scripts/make-version b/scripts/make-version new file mode 100755 index 0000000..c6139e6 --- /dev/null +++ b/scripts/make-version @@ -0,0 +1,21 @@ +#!/bin/sh + +# GIT_ID=`git rev-parse HEAD` +GIT_ID=`git describe --always` +GIT_NUM=`git rev-list HEAD --count` +DATE=`date +%Y-%b-%d` +DNUM=`date +%Y%m%d` + +(cat <&2 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..72f7f88 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[flake8] +ignore = E402, W503, E226 + +#[easy_install] +#find-links = ../offline +#allow-hosts = None diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..17ac700 --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python + +from setuptools import setup, find_packages + +import diffusion_maps + + +setup(name='diffusion-maps', + version=diffusion_maps.version.v_short, + description='Diffusion maps', + long_description='Library for computing diffusion maps', + license='MIT License', + author='Juan M. Bello-Rivas', + author_email='jmbr@superadditive.com', + packages=find_packages(), + package_dir={'diffusion_maps': 'diffusion_maps'}, + package_data={'': ['LICENSE']}, + test_suite='nose.collector', + tests_require=['nose'], + install_requires=['scipy', 'numpy'], + extras_require={ + 'plotting': ['matplotlib'], + 'cuda': ['pycuda'] + }, + entry_points={ + 'console_scripts': 'diffusion-maps = diffusion_maps.command_line_interface:main' + })