Source code for nipy.algorithms.statistics.mixed_effects_stat

"""
Module for computation of mixed effects statistics with an EM algorithm.
i.e.
solves problems of the form
y = X beta + e1 + e2,
where X and Y are known, e1 and e2 are centered with diagonal covariance.
V1 = var(e1) is known, and V2 = var(e2) = lambda identity.
the code estimates beta and lambda using an EM algorithm.
Likelihood ratio tests can then be used to test the columns of beta.

Author: Bertrand Thirion, 2012.

>>> N, P = 15, 500
>>> V1 = np.random.randn(N, P) ** 2
>>> effects = np.ones(P)
>>> Y = generate_data(np.ones(N), effects, .25, V1)
>>> T1 = one_sample_ttest(Y, V1, n_iter=5)
>>> T2 = t_stat(Y)
>>> assert(T1.std() < T2.std())
"""
from __future__ import absolute_import
from __future__ import print_function

import numpy as np

EPS = 100 * np.finfo(float).eps


[docs]def generate_data(X, beta, V2, V1):
    """ Generate a group of individuals from the provided parameters

    Parameters
    ----------
    X: array of shape (n_samples, n_reg),
       the design matrix of the model
    beta: float or array of shape (n_reg, n_tests),
          the associated effects
    V2: float or array of shape (n_tests),
              group variance
    V1: array of shape(n_samples, n_tests),
               the individual variances

    Returns
    -------
    Y: array of shape(n_samples, n_tests)
       the individual data related to the two-level normal model
    """
    # check that the variances are  positive
    if (V1 < 0).any():
        raise ValueError('Variance should be positive')
    Y = np.random.randn(*V1.shape)
    Y *= np.sqrt(V2 + V1)
    if X.ndim == 1:
        X = X[:, np.newaxis]
    if np.isscalar(beta):
        beta = beta * np.ones((X.shape[1], V1.shape[1]))
    if beta.ndim == 1:
        beta = beta[np.newaxis]

    Y += np.dot(X, beta)
    return Y


[docs]def check_arrays(Y, V1):
    """Check that the given data can be used for the models

    Parameters
    ----------
    Y: array of shape (n_samples, n_tests) or (n_samples)
       the estimated effects
    V1: array of shape (n_samples, n_tests) or (n_samples)
          first-level variance
    """
    if (V1 < 0).any():
        raise ValueError("a negative variance has been provided")

    if np.size(Y) == Y.shape[0]:
        Y = Y[:, np.newaxis]

    if np.size(V1) == V1.shape[0]:
        V1 = V1[:, np.newaxis]

    if Y.shape != V1.shape:
        raise ValueError("Y and V1 do not have the same shape")
    return Y, V1


[docs]def t_stat(Y):
    """ Returns the t stat of the sample on each row of the matrix

    Parameters
    ----------
    Y, array of shape (n_samples, n_tests)

    Returns
    -------
    t_variates, array of shape (n_tests)
    """
    return Y.mean(0) / Y.std(0) * np.sqrt(Y.shape[0] - 1)


[docs]class MixedEffectsModel(object):
    """Class to handle multiple one-sample mixed effects models
    """

[docs]    def __init__(self, X, n_iter=5, verbose=False):
        """
        Set the effects and first-level variance,
        and initialize related quantities

        Parameters
        ----------
        X: array of shape(n_samples, n_effects),
           the design matrix
        n_iter: int, optional,
               number of iterations of the EM algorithm
        verbose: bool, optional, verbosity mode
        """
        self.n_iter = n_iter
        self.verbose = verbose
        self.X = X
        self.pinv_X = np.linalg.pinv(X)

[docs]    def log_like(self, Y, V1):
        """ Compute the log-likelihood of (Y, V1) under the model

        Parameters
        ----------
        Y, array of shape (n_samples, n_tests) or (n_samples)
           the estimated effects
        V1, array of shape (n_samples, n_tests) or (n_samples)
            first-level variance

        Returns
        -------
        logl: array of shape self.n_tests,
              the log-likelihood of the model
        """
        Y, V1 = check_arrays(Y, V1)
        tvar = self.V2 + V1
        logl = np.sum(((Y - self.Y_) ** 2) / tvar, 0)
        logl += np.sum(np.log(tvar), 0)
        logl += np.log(2 * np.pi) * Y.shape[0]
        logl *= (- 0.5)
        return logl

[docs]    def predict(self, Y, V1):
        """Return the log_likelihood of the data.See the log_like method"""
        return self.log_like(Y, V1)

[docs]    def score(self, Y, V1):
        """Return the log_likelihood of the data. See the log_like method"""
        return self.log_like(Y, V1)

    def _one_step(self, Y, V1):
        """Applies one step of an EM algorithm to estimate self.mean_, self.var

        Parameters
        ----------
        Y, array of shape (n_samples, n_tests) or (n_samples)
              the estimated effects
        V1, array of shape (n_samples, n_tests) or (n_samples)
                 first-level variance
        """
        # E step
        prec = 1. / (self.V2 + V1)
        Y_ = prec * (self.V2 * Y + V1 * self.Y_)
        cvar = V1 * self.V2 * prec

        # M step
        self.beta_ = np.dot(self.pinv_X, Y_)
        self.Y_ = np.dot(self.X, self.beta_)
        self.V2 = np.mean((Y_ - self.Y_) ** 2, 0) + cvar.mean(0)

[docs]    def fit(self, Y, V1):
        """ Launches the EM algorithm to estimate self

        Parameters
        ----------
        Y, array of shape (n_samples, n_tests) or (n_samples)
              the estimated effects
        V1, array of shape (n_samples, n_tests) or (n_samples)
                 first-level variance

        Returns
        -------
        self
        """
        # Basic data checks
        if self.X.shape[0] != Y.shape[0]:
            raise ValueError('X and Y must have the same numbers of rows')
        Y, V1 = check_arrays(Y, V1)
        self.beta_ = np.dot(self.pinv_X, Y)
        self.Y_ = np.dot(self.X, self.beta_)
        self.V2 = np.mean((Y - self.Y_) ** 2, 0)

        if self.verbose:
            log_like_init = self.log_like(Y, V1)
            print('Average log-likelihood: ', log_like_init.mean())

        for i in range(self.n_iter):
            self._one_step(Y, V1)

            if self.verbose:
                log_like_ = self.log_like(Y, V1)
                if (log_like_ < (log_like_init - EPS)).any():
                    raise ValueError('The log-likelihood cannot decrease')
                log_like_init = log_like_
                print('Iteration %d, average log-likelihood: %f' % (
                        i, log_like_.mean()))
        return self


[docs]def two_sample_ftest(Y, V1, group, n_iter=5, verbose=False):
    """Returns the mixed effects t-stat for each row of the X
    (one sample test)
    This uses the Formula in Roche et al., NeuroImage 2007

    Parameters
    ----------
    Y: array of shape (n_samples, n_tests)
       the data
    V1: array of shape (n_samples, n_tests)
        first-level variance assocated with the data
    group: array of shape (n_samples)
       a vector of indicators yielding the samples membership
    n_iter: int, optional,
           number of iterations of the EM algorithm
    verbose: bool, optional, verbosity mode

    Returns
    -------
    tstat: array of shape (n_tests),
           statistical values obtained from the likelihood ratio test
    """
    # check that group is correct
    if group.size != Y.shape[0]:
        raise ValueError('The number of labels is not the number of samples')
    if (np.unique(group) != np.array([0, 1])).all():
        raise ValueError('group should be composed only of zeros and ones')

    # create design matrices
    X = np.vstack((np.ones_like(group), group)).T
    return mfx_stat(Y, V1, X, 1, n_iter=n_iter, verbose=verbose,
                    return_t=False, return_f=True)[0]


[docs]def two_sample_ttest(Y, V1, group, n_iter=5, verbose=False):
    """Returns the mixed effects t-stat for each row of the X
    (one sample test)
    This uses the Formula in Roche et al., NeuroImage 2007

    Parameters
    ----------
    Y: array of shape (n_samples, n_tests)
       the data
    V1: array of shape (n_samples, n_tests)
        first-level variance assocated with the data
    group: array of shape (n_samples)
       a vector of indicators yielding the samples membership
    n_iter: int, optional,
           number of iterations of the EM algorithm
    verbose: bool, optional, verbosity mode

    Returns
    -------
    tstat: array of shape (n_tests),
           statistical values obtained from the likelihood ratio test
    """
    X = np.vstack((np.ones_like(group), group)).T
    return mfx_stat(Y, V1, X, 1, n_iter=n_iter, verbose=verbose,
                    return_t=True)[0]


[docs]def one_sample_ftest(Y, V1, n_iter=5, verbose=False):
    """Returns the mixed effects F-stat for each row of the X
    (one sample test)
    This uses the Formula in Roche et al., NeuroImage 2007

    Parameters
    ----------
    Y: array of shape (n_samples, n_tests)
       the data
    V1: array of shape (n_samples, n_tests)
        first-level variance ssociated with the data
    n_iter: int, optional,
           number of iterations of the EM algorithm
    verbose: bool, optional, verbosity mode

    Returns
    -------
    fstat, array of shape (n_tests),
           statistical values obtained from the likelihood ratio test
    sign, array of shape (n_tests),
          sign of the mean for each test (allow for post-hoc signed tests)
    """
    return mfx_stat(Y, V1, np.ones((Y.shape[0], 1)), 0, n_iter=n_iter,
                    verbose=verbose, return_t=False, return_f=True)[0]


[docs]def one_sample_ttest(Y, V1, n_iter=5, verbose=False):
    """Returns the mixed effects t-stat for each row of the X
    (one sample test)
    This uses the Formula in Roche et al., NeuroImage 2007

    Parameters
    ----------
    Y: array of shape (n_samples, n_tests)
       the observations
    V1: array of shape (n_samples, n_tests)
        first-level variance associated with the observations
    n_iter: int, optional,
           number of iterations of the EM algorithm
    verbose: bool, optional, verbosity mode

    Returns
    -------
    tstat: array of shape (n_tests),
           statistical values obtained from the likelihood ratio test
    """
    return mfx_stat(Y, V1, np.ones((Y.shape[0], 1)), 0, n_iter=n_iter,
                    verbose=verbose, return_t=True)[0]


[docs]def mfx_stat(Y, V1, X, column, n_iter=5, return_t=True,
             return_f=False, return_effect=False,
             return_var=False, verbose=False):
    """Run a mixed-effects model test on the column of the design matrix

    Parameters
    ----------
    Y: array of shape (n_samples, n_tests)
       the data
    V1: array of shape (n_samples, n_tests)
        first-level variance assocated with the data
    X: array of shape(n_samples, n_regressors)
       the design matrix of the model
    column: int,
            index of the column of X to be tested
    n_iter: int, optional,
           number of iterations of the EM algorithm
    return_t: bool, optional,
              should one return the t test (True by default)
    return_f: bool, optional,
              should one return the F test (False by default)
    return_effect: bool, optional,
              should one return the effect estimate (False by default)
    return_var: bool, optional,
              should one return the variance estimate (False by default)

    verbose: bool, optional, verbosity mode

    Returns
    -------
    (tstat, fstat, effect, var): tuple of arrays of shape (n_tests),
                                 those required by the input return booleans

    """
    # check that X/columns are correct
    column = int(column)
    if X.shape[0] != Y.shape[0]:
        raise ValueError('X.shape[0] is not the number of samples')
    if (column > X.shape[1]):
        raise ValueError('the column index is more than the number of columns')

    # create design matrices
    contrast_mask = 1 - np.eye(X.shape[1])[column]
    X0 = X * contrast_mask

    # instantiate the mixed effects models
    model_0 = MixedEffectsModel(X0, n_iter=n_iter, verbose=verbose).fit(Y, V1)
    model_1 = MixedEffectsModel(X, n_iter=n_iter, verbose=verbose).fit(Y, V1)

    # compute the log-likelihood ratio statistic
    fstat = 2 * (model_1.log_like(Y, V1) - model_0.log_like(Y, V1))
    fstat = np.maximum(0, fstat)
    sign = np.sign(model_1.beta_[column])

    output = ()
    if return_t:
        output += (np.sqrt(fstat) * sign,)
    if return_f:
        output += (fstat,)
    if return_var:
        output += (model_1.V2,)
    if return_effect:
        output += (model_1.beta_[column],)
    return output