# Source code for nipy.labs.spatial_models.bayesian_structural_analysis

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
"""
The main routine of this package that aims at performing the
extraction of ROIs from multisubject dataset using the localization
and activation strength of extracted regions.

This has been published in:
- Thirion et al. High level group analysis of FMRI data based on
Dirichlet process mixture models, IPMI 2007
- Thirion et al.
Accurate Definition of Brain Regions Position Through the
Functional Landmark Approach, MICCAI 2010

Author : Bertrand Thirion, 2006-2013
"""
from __future__ import absolute_import

import numpy as np
import scipy.stats as st

from .structural_bfls import build_landmarks
from nipy.algorithms.graph import wgraph_from_coo_matrix
from ...algorithms.statistics.empirical_pvalue import \
    NormalEmpiricalNull, three_classes_GMM_fit, gamma_gaussian_fit
from .hroi import HROI_as_discrete_domain_blobs

####################################################################
# Ancillary functions
####################################################################


def _stat_to_proba(test, learn=None, method='prior', alpha=0.01, verbose=0):
    """Convert a set of statistics to posterior probabilities of  being
    generated under H0

    Parameters
    ----------
    test: array of shape(n_samples),
           data that is assessed
    learn: array of shape(n_samples), optional,
           data to learn a mixture model
           defaults to learn
    method: {'gauss_mixture', 'emp_null', 'gam_gauss', 'prior'}, optional,
           'gauss_mixture' A Gaussian Mixture Model is used
           'emp_null' a null mode is fitted to test
           'gam_gauss' a Gamma-Gaussian mixture is used
           'prior' a hard-coded function is used
    alpha: float in the [0,1] range, optional,
           parameter that yields the prior probability that a region is active
           should be chosen close to 0
    verbose: int, optional,
             verbosity mode

    Returns
    -------
    posterior_null: array of shape(n_samples)
                    an estimation of the probability that the observation
                    is generated under the null
    """
    if method == 'gauss_mixture':
        prior_strength = 100
        fixed_scale = True
        posterior_probas = three_classes_GMM_fit(
            learn, test, alpha, prior_strength, verbose, fixed_scale)
        posterior_null = posterior_probas[:, 1]
    elif method == 'emp_null':
        enn = NormalEmpiricalNull(learn)
        enn.learn()
        posterior_null = np.reshape(enn.fdr(test), np.size(test))
    elif method == 'gam_gauss':
        posterior_probas = gamma_gaussian_fit(learn, test, verbose)
        posterior_null = posterior_probas[:, 1]
    elif method == 'prior':
        y0 = st.norm.pdf(test)
        shape_, scale_ = 3., 2.
        y1 = st.gamma.pdf(test, shape_, scale=scale_)
        posterior_null = np.ravel(
            (1 - alpha) * y0 / (alpha * y1 + (1 - alpha) * y0))
    else:
        raise ValueError('Unknown method')
    return posterior_null


def _compute_individual_regions(domain, stats, threshold=3.0, smin=5,
                                method='gauss_mixture'):
    """ Compute the individual regions that are real activation candidates

    Parameters
    ----------
    domain : StructuredDomain instance,
          generic descriptor of the space domain
    stats: an array of shape (n_voxels, n_subjects)
           the multi-subject statistical maps
    threshold: float, optional
           first level threshold
    smin: int, optional
          minimal size of the regions to validate them
    method: {'gauss_mixture', 'emp_null', 'gam_gauss', 'prior'}, optional,
           'gauss_mixture' A Gaussian Mixture Model is used
           'emp_null' a null mode is fitted to test
           'gam_gauss' a Gamma-Gaussian mixture is used
           'prior' a hard-coded function is used

    Returns
    -------
    hrois: list of nipy.labs.spatial_models.hroi.HierarchicalROI instances
           that represent individual ROIs
           let nr be the number of terminal regions across subjects
    prior_h0: array of shape (nr),
              the mixture-based prior probability
              that the terminal regions are false positives
    subjects: array of shape (nr),
              the subject index associated with the terminal regions
    coords: array of shape (nr, coord.shape[1]),
            the coordinates of the terminal regions

    Notes
    -----
    FIXME: should allow for subject specific domains
    """
    hrois = []
    coords = []
    prior_h0 = []
    subjects = []
    n_subjects = stats.shape[1]
    nvox = stats.shape[0]

    for subject in range(n_subjects):
        # description in terms of blobs
        stats_ = np.reshape(stats[:, subject], (nvox, 1))
        hroi = HROI_as_discrete_domain_blobs(
            domain, stats_, threshold=threshold, smin=smin)

        if hroi is not None and hroi.k > 0:
            # get the leaf regions (i.e. the local maxima)
            leaves = [hroi.select_id(id_) for id_ in hroi.get_leaves_id()]

            # get the region mean statistical value
            mean_val = hroi.representative_feature('signal', 'weighted mean')
            mean_val = mean_val[leaves]

            # get the regions position
            mean_pos = np.asarray(
                [np.mean(coord, 0) for coord in hroi.get_coord()])
            hroi.set_roi_feature('position', mean_pos)
            coords.append(mean_pos[leaves])

            # compute the prior proba of being null, learning the mixture
            # on the subject's supra-zero statistics
            learning_set = np.squeeze(stats_[stats_ != 0])
            prior_h0.append(_stat_to_proba(mean_val, learning_set, method))
            # np.int was removed from NumPy (>= 1.24); the builtin is enough
            subjects.append(subject * np.ones(mean_val.size).astype(int))
        else:
            # no region survived for this subject: keep empty placeholders
            # so the concatenations below stay well-defined
            subjects.append([])
            prior_h0.append([])
            coords.append(np.empty((0, domain.dim)))
        hrois.append(hroi)

    prior_h0 = np.concatenate(prior_h0)
    subjects = np.concatenate(subjects)
    coords = np.concatenate(coords)
    return hrois, prior_h0, subjects, coords


def _dpmm(coords, alpha, null_density, dof, prior_precision, prior_h0,
          subjects, sampling_coords=None, n_iter=1000, burnin=100,
          co_clust=False):
    """Run a Dirichlet process mixture model on the region coordinates.

    Returns (likelihood, 1 - null posterior) and, when `co_clust` is
    True, additionally the co-clustering matrix of the samples.
    """
    from nipy.algorithms.clustering.imm import MixedIMM

    n_dim = coords.shape[1]
    model = MixedIMM(alpha, n_dim)
    model.set_priors(coords)
    model.set_constant_densities(null_dens=null_density,
                                 prior_dens=null_density)
    # set an inverse-Wishart-like prior on the component covariances
    model._prior_dof = dof
    model._prior_scale = np.diag(prior_precision[0] / dof)
    model._inv_prior_scale_ = [np.diag(dof * 1. / prior_precision[0])]

    # burn-in pass: advance the chain without recording anything
    model.sample(coords, null_class_proba=prior_h0, niter=burnin,
                 init=False, kfold=subjects)

    # recorded sampling pass
    likelihood, null_posterior, co_clustering = model.sample(
        coords, null_class_proba=prior_h0, niter=n_iter, kfold=subjects,
        sampling_points=sampling_coords, co_clustering=True)

    if co_clust:
        return likelihood, 1 - null_posterior, co_clustering
    return likelihood, 1 - null_posterior


def _update_hroi_labels(hrois, new_labels):
    """Update the labels of the hroisusing new_labels"""
    for subject in range(len(hrois)):
        if hrois[subject].k > 0:
            us = hrois[subject].get_roi_feature('label')
            us[us > - 1] = new_labels[us[us > - 1]]
            hrois[subject].set_roi_feature('label', us)


def _bsa_dpmm(hrois, prior_h0, subjects, coords, sigma, prevalence_pval,
              prevalence_threshold, dof=10, alpha=.5, n_iter=1000, burnin=100,
              algorithm='density'):
    """ Estimation of the population level model of activation density using
    dpmm and inference

    Parameters
    ----------
    hrois: list of nipy.labs.spatial_models.hroi.HierarchicalROI instances
           representing individual ROIs
           Let nr be the number of terminal regions across subjects
    prior_h0: array of shape (nr)
              mixture-based prior probability
              that the terminal regions are false positives
    subjects: array of shape (nr)
              subject index associated with the terminal regions
    coords: array of shape (nr, coord.shape[1])
            coordinates of the terminal regions
    sigma: float > 0,
           expected cluster scatter in the common space in units of coord
    prevalence_pval: float in the [0,1] interval, optional
                     p-value of the prevalence test
    prevalence_threshold: float in the range [0, nsubj]
                          null hypothesis on region prevalence
    dof: float > 0, optional,
         degrees of freedom of the prior
    alpha: float > 0, optional,
           creation parameter of the DPMM
    n_iter: int, optional,
            number of iterations of the DPMM
    burnin: int, optional,
            number of burn-in iterations of the DPMM
    algorithm: {'density', 'co-occurrence'}, optional,
               algorithm used in the DPMM inference

    Returns
    -------
    landmarks: instance of sbf.LandmarkRegions
               that describes the ROIs found in inter-subject inference
               If no such thing can be defined landmarks is set to None
    hrois: List of nipy.labs.spatial_models.hroi.HierarchicalROI instances
           representing individual ROIs

    Raises
    ------
    ValueError
        If `algorithm` is neither 'density' nor 'co-occurrence'.
    """
    from nipy.algorithms.graph.field import field_from_coo_matrix_and_data
    domain = hrois[0].domain
    n_subjects = len(hrois)

    landmarks = None
    if len(subjects) < 1:
        # no terminal region in any subject: nothing to infer on
        return landmarks, hrois

    # uniform density over the domain, used as the null/prior density
    null_density = 1. / domain.local_volume.sum()

    # prepare the DPMM: isotropic precision prior derived from sigma
    dim = domain.em_dim
    prior_precision = 1. / (sigma ** 2) * np.ones((1, dim))

    if algorithm == 'density':
        # estimate the activation density over the whole domain,
        # plus per-region posterior probabilities
        density, post_proba = _dpmm(
            coords, alpha, null_density, dof, prior_precision, prior_h0,
            subjects, domain.coord, n_iter=n_iter, burnin=burnin)
        # associate labels with coords through a watershed of the density
        Fbeta = field_from_coo_matrix_and_data(domain.topology, density)
        _, label = Fbeta.custom_watershed(0, null_density)
        # map each region coordinate to its nearest domain node
        midx = np.array([np.argmin(np.sum((domain.coord - coord_) ** 2, 1))
                         for coord_ in coords])
        components = label[midx]
    elif algorithm == 'co-occurrence':
        # cluster the regions through their co-occurrence in the sampler
        post_proba, density, co_clustering = _dpmm(
            coords, alpha, null_density, dof, prior_precision, prior_h0,
            subjects,  n_iter=n_iter, burnin=burnin, co_clust=True)
        contingency_graph = wgraph_from_coo_matrix(co_clustering)
        if contingency_graph.E > 0:
            contingency_graph.remove_edges(contingency_graph.weights > .5)

        components = contingency_graph.cc()
        # regions below the null density each get a fresh singleton label
        components[density < null_density] = components.max() + 1 +\
            np.arange(np.sum(density < null_density))
    else:
        raise ValueError('Unknown algorithm')

    # append some information to the hroi in each subject
    for subject in range(n_subjects):
        bfs = hrois[subject]
        if bfs is None:
            continue
        if bfs.k == 0:
            bfs.set_roi_feature('label', np.array([]))
            continue

        leaves_pos = [bfs.select_id(k) for k in bfs.get_leaves_id()]
        # save posterior proba
        post_proba_ = np.zeros(bfs.k)
        post_proba_[leaves_pos] = post_proba[subjects == subject]
        bfs.set_roi_feature('posterior_proba', post_proba_)

        # save prior proba
        prior_proba = np.zeros(bfs.k)
        prior_proba[leaves_pos] = 1 - prior_h0[subjects == subject]
        bfs.set_roi_feature('prior_proba', prior_proba)

        # assign labels to ROIs; -1 marks non-leaf/unlabelled regions
        # (np.int was removed from NumPy >= 1.24; the builtin is enough)
        roi_label = - np.ones(bfs.k).astype(int)
        roi_label[leaves_pos] = components[subjects == subject]
        # when a parent region has similarly labelled children,
        # include it also
        roi_label = bfs.make_forest().propagate_upward(roi_label)
        bfs.set_roi_feature('label', roi_label)

    # derive the group-level landmarks
    # with a threshold on the number of subjects
    # that are represented in each one
    landmarks, new_labels = build_landmarks(
        domain, coords, subjects, np.array(components), 1 - prior_h0,
        prevalence_pval, prevalence_threshold, sigma)

    # relabel the regions
    _update_hroi_labels(hrois, new_labels)

    return landmarks, hrois


###########################################################################
# Main function
###########################################################################


def compute_landmarks(domain, stats, sigma, prevalence_pval=0.5,
                      prevalence_threshold=0, threshold=3.0, smin=5,
                      method='prior', algorithm='density', n_iter=1000,
                      burnin=100):
    """ Compute the Bayesian Structural Activation patterns

    Parameters
    ----------
    domain: StructuredDomain instance,
            Description of the spatial context of the data
    stats: array of shape (nbnodes, subjects):
           the multi-subject statistical maps
    sigma: float > 0:
           expected cluster std in the common space in units of coord
    prevalence_pval: float in the [0,1] interval, optional
                     posterior significance threshold
    prevalence_threshold: float, optional,
                          reference threshold for the prevalence value
    threshold: float, optional,
               first level threshold
    smin: int, optional,
          minimal size of the regions to validate them
    method: {'gauss_mixture', 'emp_null', 'gam_gauss', 'prior'}, optional,
            'gauss_mixture' A Gaussian Mixture Model is used
            'emp_null' a null mode is fitted to test
            'gam_gauss' a Gamma-Gaussian mixture is used
            'prior' a hard-coded function is used
    algorithm: string, one of ['density', 'co-occurrence'], optional
               method used to compute the landmarks
    n_iter: int, optional,
            number of iterations of the DPMM
    burnin: int, optional,
            number of burn-in iterations of the DPMM

    Returns
    -------
    landmarks: Instance of sbf.LandmarkRegions or None,
               Describes the ROIs found in inter-subject inference
               None if nothing can be defined
    hrois: list of nipy.labs.spatial_models.hroi.Nroi instances
           representing individual ROIs
    """
    # first-level analysis: extract candidate regions per subject
    hrois, prior_h0, subjects, coords = _compute_individual_regions(
        domain, stats, threshold, smin, method)
    # group-level analysis: DPMM clustering and prevalence inference
    landmarks, hrois = _bsa_dpmm(
        hrois, prior_h0, subjects, coords, sigma, prevalence_pval,
        prevalence_threshold, algorithm=algorithm, n_iter=n_iter,
        burnin=burnin)
    return landmarks, hrois