# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
"""
This module provides functions for principal components analysis (PCA).
PCA is an orthonormal, linear transform (i.e., a rotation) that maps the
data to a new coordinate system such that the maximal variability of the
data lies on the first coordinate (or the first principal component), the
second greatest variability is projected onto the second coordinate, and
so on. The projected data are mutually decorrelated (their covariance
matrix is diagonal).
This technique can be used to reduce the dimensionality of the data.
More specifically, the data is projected onto the eigenvectors of the
covariance matrix.
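
For example, in plain NumPy terms (an illustrative sketch of the idea, not
using this module's API)::

    import numpy as np

    rng = np.random.RandomState(42)
    X = rng.normal(size=(100, 5))         # 100 observations, 5 variables
    Xc = X - X.mean(axis=0)               # remove the mean
    C = np.dot(Xc.T, Xc) / Xc.shape[0]    # covariance matrix
    evals, evecs = np.linalg.eigh(C)      # eigendecomposition of covariance
    order = np.argsort(-evals)            # sort by descending variance
    components = np.dot(Xc, evecs[:, order])  # project onto eigenvectors
    # the projected components are mutually decorrelated:
    # dot(components.T, components) / X.shape[0] is (close to) diagonal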
"""
from __future__ import absolute_import
import numpy as np
import numpy.linalg as npl
from ...core.image.image import rollimg
from ...core.reference.coordinate_map import (io_axis_indices, orth_axes,
drop_io_dim, AxisError)


def pca(data, axis=0, mask=None, ncomp=None, standardize=True,
        design_keep=None, design_resid='mean', tol_ratio=0.01):
"""Compute the SVD PCA of an array-like thing over `axis`.
Parameters
----------
data : ndarray-like (np.float)
The array on which to perform PCA over axis `axis` (below)
axis : int, optional
The axis over which to perform PCA (axis identifying
observations). Default is 0 (first)
mask : ndarray-like (np.bool), optional
        An optional mask, should have shape given by data axes, with
        `axis` removed, i.e.: ``s = list(data.shape); s.pop(axis);
        msk_shape = tuple(s)``
ncomp : {None, int}, optional
How many component basis projections to return. If ncomp is None
(the default) then the number of components is given by the
calculated rank of the data, after applying `design_keep`,
`design_resid` and `tol_ratio` below. We always return all the
basis vectors and percent variance for each component; `ncomp`
refers only to the number of basis_projections returned.
standardize : bool, optional
If True, standardize so each time series (after application of
`design_keep` and `design_resid`) has the same standard
deviation, as calculated by the ``np.std`` function.
design_keep : None or ndarray, optional
Data is projected onto the column span of design_keep.
None (default) equivalent to ``np.identity(data.shape[axis])``
design_resid : str or None or ndarray, optional
After projecting onto the column span of design_keep, data is
projected perpendicular to the column span of this matrix. If
None, we do no such second projection. If a string 'mean', then
the mean of the data is removed, equivalent to passing a column
vector matrix of 1s.
    tol_ratio : float, optional
        If ``XZ`` is the projection matrix built from `design_keep` and
        `design_resid`, and ``S`` is the vector of singular values of
        ``XZ``, then `tol_ratio` is the value used to calculate the
        effective rank of that projection, as in ``rank = ((S / S.max())
        > tol_ratio).sum()`` (see the sketch in the Notes section below).

Returns
-------
results : dict
        ``G`` is the number of non-trivial components found after applying
`tol_ratio` to the projections of `design_keep` and
`design_resid`.
`results` has keys:
* ``basis_vectors``: series over `axis`, shape (data.shape[axis], G) -
the eigenvectors of the PCA
* ``pcnt_var``: percent variance explained by component, shape
(G,)
* ``basis_projections``: PCA components, with components varying
over axis `axis`; thus shape given by: ``s = list(data.shape);
s[axis] = ncomp``
* ``axis``: axis over which PCA has been performed.
Notes
-----
See ``pca_image.m`` from ``fmristat`` for Keith Worsley's code on
which some of this is based.
See: http://en.wikipedia.org/wiki/Principal_component_analysis for
some inspiration for naming - particularly 'basis_vectors' and
'basis_projections'
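
    The effective-rank calculation controlled by `tol_ratio` amounts to the
    following (an illustrative sketch of what the implementation does with
    the projected design, here called ``XZ``)::

        S = np.linalg.svd(XZ, compute_uv=False)   # singular values of XZ
        rank = ((S / S.max()) > tol_ratio).sum()  # count the non-trivial ones
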
    Examples
    --------
    >>> arr = np.random.normal(size=(17, 10, 12, 14))
    >>> msk = np.all(arr > -2, axis=0)
    >>> res = pca(arr, mask=msk, ncomp=9)

    Basis vectors are columns. There is one column for each component. The
    number of components is the calculated rank of the data matrix after
    applying the various projections listed in the parameters. In this case
    we are only removing the mean, so the number of components is one less
    than the length of the axis over which we do the PCA (here axis=0 by
    default, with length 17).

    >>> res['basis_vectors'].shape
    (17, 16)

    Basis projections are arrays with components in the dimension over which
    we have done the PCA (axis=0 by default). Because we set `ncomp` above,
    we only retain `ncomp` components.

>>> res['basis_projections'].shape
(9, 10, 12, 14)
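
    The default ``design_resid='mean'`` should be equivalent to projecting
    out an explicit column of ones (a quick check of that equivalence):

    >>> res_ones = pca(arr, mask=msk, ncomp=9,
    ...                design_resid=np.ones((arr.shape[0], 1)))
    >>> np.allclose(res['pcnt_var'], res_ones['pcnt_var'])
    True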
"""
data = np.asarray(data)
# We roll the PCA axis to be first, for convenience
if axis is None:
raise ValueError('axis cannot be None')
data = np.rollaxis(data, axis)
if mask is not None:
mask = np.asarray(mask)
if not data.shape[1:] == mask.shape:
raise ValueError('Mask should match dimensions of data other than '
'the axis over which to do the PCA')
if design_resid == 'mean':
# equivalent to: design_resid = np.ones((data.shape[0], 1))
def project_resid(Y):
return Y - Y.mean(0)[None,...]
elif design_resid is None:
def project_resid(Y): return Y
else: # matrix passed, we hope
projector = np.dot(design_resid, npl.pinv(design_resid))
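        # design_resid.dot(pinv(design_resid)) is the orthogonal projector
        # onto the column span of design_resid; subtracting that projection
        # of Y leaves the component of Y perpendicular to the span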
def project_resid(Y):
return Y - np.dot(projector, Y)
if standardize:
def rmse_scales_func(std_source):
            # compute per-series scaling; does not modify the input array
resid = project_resid(std_source)
# root mean square of the residual
rmse = np.sqrt(np.square(resid).sum(axis=0) / resid.shape[0])
# positive 1/rmse
return np.where(rmse<=0, 0, 1. / rmse)
else:
rmse_scales_func = None
"""
Perform the computations needed for the PCA. This stores the
covariance/correlation matrix of the data in the attribute 'C'. The
components are stored as the attributes 'components', for an fMRI
image these are the time series explaining the most variance.
Now, we compute projection matrices. First, data is projected onto
the columnspace of design_keep, then it is projected perpendicular
to column space of design_resid.
"""
if design_keep is None:
X = np.eye(data.shape[0])
else:
X = np.dot(design_keep, npl.pinv(design_keep))
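        # orthogonal projector onto the column span of design_keep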
XZ = project_resid(X)
UX, SX, VX = npl.svd(XZ, full_matrices=0)
# The matrix UX has orthonormal columns and represents the
# final "column space" that the data will be projected onto.
rank = (SX/SX.max() > tol_ratio).sum()
UX = UX[:,:rank].T
    # calculate covariance matrix in full-rank column space. The returned
    # array is roughly: YX = dot(UX, data); C = dot(YX, YX.T), perhaps where
    # the data has been standardized, perhaps summed over slices
C_full_rank = _get_covariance(data, UX, rmse_scales_func, mask)
# find the eigenvalues D and eigenvectors Vs of the covariance
# matrix
D, Vs = npl.eigh(C_full_rank)
# Compute basis vectors in original column space
basis_vectors = np.dot(UX.T, Vs).T
# sort both in descending order of eigenvalues
order = np.argsort(-D)
D = D[order]
basis_vectors = basis_vectors[order]
pcntvar = D * 100 / D.sum()
"""
Output the component basis_projections
"""
if ncomp is None:
ncomp = rank
subVX = basis_vectors[:ncomp]
out = _get_basis_projections(data, subVX, rmse_scales_func)
# Roll PCA image axis back to original position in data array
if axis < 0:
axis += data.ndim
out = np.rollaxis(out, 0, axis+1)
return {'basis_vectors': basis_vectors.T,
'pcnt_var': pcntvar,
'basis_projections': out,
'axis': axis}


def _get_covariance(data, UX, rmse_scales_func, mask):
# number of points in PCA dimension
rank, n_pts = UX.shape
C = np.zeros((rank, rank))
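    # We want C = YX.dot(YX.T), where YX is the (rank, n_voxels) array of
    # data projected onto the full-rank column space (possibly scaled and
    # mask-weighted); we accumulate it slice by slice to limit memory use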
# nan_to_num only for floating point masks
if mask is not None:
nan_to_num = mask.dtype.type in (np.sctypes['float'] +
np.sctypes['complex'])
# loop over next dimension to save memory
if data.ndim == 2:
# If we have 2D data, just do the covariance all in one shot, by using
# a slice that is the equivalent of the ':' slice syntax
slices = [slice(None)]
else:
        # If we have more than 2D, then we iterate over slices in the second
# dimension, in order to save memory
slices = [slice(i,i+1) for i in range(data.shape[1])]
for i, s_slice in enumerate(slices):
Y = data[:,s_slice].reshape((n_pts, -1))
# project data into required space
YX = np.dot(UX, Y)
if rmse_scales_func is not None:
YX *= rmse_scales_func(Y)
if mask is not None:
# weight data with mask. Usually the weights will be 0,1
msk_slice = mask[s_slice].reshape(Y.shape[1])
if nan_to_num: # but if floats, check for NaNs too.
msk_slice = np.nan_to_num(msk_slice)
YX = YX * msk_slice
C += np.dot(YX, YX.T)
return C


def _get_basis_projections(data, subVX, rmse_scales_func):
ncomp = subVX.shape[0]
out = np.empty((ncomp,) + data.shape[1:], np.float)
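    # Project each slice of the second axis onto the selected basis vectors
    # (applying the same optional rmse scaling used for the covariance),
    # looping over slices to limit memory use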
for i in range(data.shape[1]):
Y = data[:,i].reshape((data.shape[0], -1))
U = np.dot(subVX, Y)
if rmse_scales_func is not None:
U *= rmse_scales_func(Y)
U.shape = (U.shape[0],) + data.shape[2:]
out[:,i] = U
return out


def pca_image(img, axis='t', mask=None, ncomp=None, standardize=True,
              design_keep=None, design_resid='mean', tol_ratio=0.01):
""" Compute the PCA of an image over a specified axis
Parameters
----------
img : Image
The image on which to perform PCA over the given `axis`
axis : str or int, optional
        Axis over which to perform PCA. Default is 't'. If `axis` is an
        integer, it gives the index of the input (domain) axis of `img`. If
        `axis` is a str, it can be an input (domain) name, or an output
        (range) name that maps to an input (domain) name.
    mask : Image, optional
        An optional mask, should have shape ``img.shape[:3]`` and the same
        coordinate map as `img` but with `axis` dropped
ncomp : {None, int}, optional
How many component basis projections to return. If ncomp is None
(the default) then the number of components is given by the
calculated rank of the data, after applying `design_keep`,
`design_resid` and `tol_ratio` below. We always return all the
basis vectors and percent variance for each component; `ncomp`
refers only to the number of basis_projections returned.
standardize : bool, optional
If True, standardize so each time series (after application of
`design_keep` and `design_resid`) has the same standard
deviation, as calculated by the ``np.std`` function.
design_keep : None or ndarray, optional
Data is projected onto the column span of design_keep.
None (default) equivalent to ``np.identity(data.shape[axis])``
design_resid : str or None or ndarray, optional
After projecting onto the column span of design_keep, data is
projected perpendicular to the column span of this matrix. If
None, we do no such second projection. If a string 'mean', then
the mean of the data is removed, equivalent to passing a column
vector matrix of 1s.
    tol_ratio : float, optional
        If ``XZ`` is the projection matrix built from `design_keep` and
        `design_resid`, and ``S`` is the vector of singular values of
        ``XZ``, then `tol_ratio` is the value used to calculate the
        effective rank of that projection, as in ``rank = ((S / S.max())
        > tol_ratio).sum()``

Returns
-------
results : dict
        ``L`` is the number of non-trivial components found after applying
`tol_ratio` to the projections of `design_keep` and
`design_resid`.
`results` has keys:
* ``basis_vectors``: series over `axis`, shape (data.shape[axis], L) -
the eigenvectors of the PCA
* ``pcnt_var``: percent variance explained by component, shape
(L,)
* ``basis_projections``: PCA components, with components varying
over axis `axis`; thus shape given by: ``s = list(data.shape);
s[axis] = ncomp``
* ``axis``: axis over which PCA has been performed.

    Examples
    --------
    >>> from nipy.testing import funcfile
    >>> from nipy import load_image
    >>> func_img = load_image(funcfile)

    Time is the fourth axis

    >>> func_img.coordmap.function_range
    CoordinateSystem(coord_names=('aligned-x=L->R', 'aligned-y=P->A', 'aligned-z=I->S', 't'), name='aligned', coord_dtype=float64)
    >>> func_img.shape
    (17, 21, 3, 20)

    Calculate the PCA over time, by default

    >>> res = pca_image(func_img)
    >>> res['basis_projections'].coordmap.function_range
    CoordinateSystem(coord_names=('aligned-x=L->R', 'aligned-y=P->A', 'aligned-z=I->S', 'PCA components'), name='aligned', coord_dtype=float64)

    The number of components is one less than the number of time points

>>> res['basis_projections'].shape
(17, 21, 3, 19)
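
    The results dictionary also carries the basis vectors keyed by the axis
    name, alongside the standard keys:

    >>> sorted(res.keys())
    ['axis', 'basis_projections', 'basis_vectors', 'basis_vectors over t', 'pcnt_var']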
"""
img_klass = img.__class__
# Which axes are we operating over?
in_ax, out_ax = io_axis_indices(img.coordmap, axis)
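    # in_ax indexes the input (array) axis, out_ax the matching output
    # (world) axis of the coordinate map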
if None in (in_ax, out_ax):
raise AxisError('Cannot identify matching input output axes with "%s"'
% axis)
if not orth_axes(in_ax, out_ax, img.coordmap.affine):
        raise AxisError('Input and output axes found from "%s" not orthogonal '
'to rest of affine' % axis)
# Roll the chosen axis to input position zero
work_img = rollimg(img, axis)
if mask is not None:
if not mask.coordmap.similar_to(drop_io_dim(img.coordmap, axis)):
raise ValueError("Mask should have matching coordmap to `img` "
"coordmap with dropped axis %s" % axis)
data = work_img.get_data()
if mask is not None:
mask_data = mask.get_data()
else:
mask_data = None
# do the PCA
res = pca(data, 0, mask_data, ncomp, standardize,
design_keep, design_resid, tol_ratio)
# Clean up images after PCA
# Rename the axis we dropped, at position 0 after rollimg
output_coordmap = work_img.coordmap.renamed_domain(
{0: 'PCA components'})
# And the matching output axis - which has not moved position
output_coordmap = output_coordmap.renamed_range(
{out_ax: 'PCA components'})
output_img = img_klass(res['basis_projections'], output_coordmap)
# We have to roll the axis back to the original position
output_img = rollimg(output_img, 0, in_ax + 1)
key = 'basis_vectors over %s' % axis
res[key] = res['basis_vectors']
res['basis_projections'] = output_img
# Signal the roll in results
res['axis'] = in_ax
return res