# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
"""Routines to get corrected p-values estimates, based on the observations.

It implements 3 approaches:

- Benjamini-Hochberg FDR: http://en.wikipedia.org/wiki/False_discovery_rate

- a class that fits a Gaussian model to the central part of a
  histogram, following [1]

  [1] Schwartzman A, Dougherty RF, Lee J, Ghahremani D, Taylor
  JE. Empirical null and false discovery rate analysis in
  neuroimaging. Neuroimage. 2009 Jan 1;44(1):71-82.  PubMed PMID:
  18547821. DOI: 10.1016/j.neuroimage.2008.04.182

  This is typically necessary to estimate an FDR when one is not
  certain that the data behaves as a standard normal under H_0.

- a model based on Gaussian mixture modelling 'a la Oxford'

Authors: Bertrand Thirion, Yaroslav Halchenko, 2008-2012
"""
from __future__ import print_function
from __future__ import absolute_import

import numpy as np
from numpy.linalg import pinv
import scipy.stats as st


def check_p_values(p_values):
    """Basic checks on the p_values array: values should be within [0, 1]

    Assures also that p_values are at least in a 1d array. None of the
    checks is performed if p_values is None.

    Parameters
    ----------
    p_values : array of shape (n)
        The sample p-values

    Returns
    -------
    p_values : array of shape (n)
        The sample p-values
    """
    if p_values is None:
        return None
    # Take all elements unfolded and assure having at least 1d
    p_values = np.atleast_1d(np.ravel(p_values))
    if np.any(np.isnan(p_values)):
        raise ValueError("%d values are NaN" % (sum(np.isnan(p_values))))
    if p_values.min() < 0:
        raise ValueError("Negative p-values. Min=%g" % (p_values.min(),))
    if p_values.max() > 1:
        raise ValueError("P-values greater than 1! Max=%g" % (
            p_values.max(),))
    return p_values
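
# Illustrative usage sketch (hypothetical helper, arbitrary numbers):
# `check_p_values` accepts any array-like of p-values, returns them as a
# 1d ndarray, and raises ValueError on NaNs or values outside [0, 1].
def _example_check_p_values():
    p = check_p_values([[0.01, 0.2], [0.5, 0.99]])  # flattened to shape (4,)
    # check_p_values([0.3, 1.5]) would raise ValueError ("greater than 1")
    return p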

def gaussian_fdr(x):
    """Return the FDR associated with each value assuming a Gaussian
    distribution
    """
    return fdr(st.norm.sf(np.squeeze(x)))

def gaussian_fdr_threshold(x, alpha=0.05):
    """Return FDR threshold given normal variates

    Given an array x of normal variates, this function returns the
    critical p-value associated with alpha.
    x is explicitly assumed to be normally distributed under H_0

    Parameters
    ----------
    x : ndarray
        input data
    alpha : float, optional
        desired significance

    Returns
    -------
    threshold : float
        threshold, given as a Gaussian critical value
    """
    pvals = st.norm.sf(x)
    pth = fdr_threshold(pvals, alpha)
    return st.norm.isf(pth)
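
# Illustrative usage sketch (hypothetical helper, simulated data): starting
# from scores assumed to be N(0, 1) under H_0, `gaussian_fdr` gives one
# q-value per score and `gaussian_fdr_threshold` the critical z value at the
# requested FDR level.
def _example_gaussian_fdr():
    z = np.random.randn(1000)       # null z-scores
    z[:20] += 4                     # a few shifted, "activated" values
    q = gaussian_fdr(z)             # q-value for each z-score
    z_crit = gaussian_fdr_threshold(z, alpha=0.05)
    return q, z_crit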

def fdr_threshold(p_values, alpha=0.05):
    """Return FDR threshold given p values

    Parameters
    ----------
    p_values : array of shape (n)
        The sample p-values
    alpha : float, optional
        The desired FDR significance

    Returns
    -------
    critical_p_value : float
        The p value corresponding to the FDR alpha
    """
    p_values = check_p_values(p_values)
    n_samples = np.size(p_values)
    p_corr = alpha / n_samples
    sp_values = np.sort(p_values)
    critical_set = sp_values[
        sp_values < p_corr * np.arange(1, n_samples + 1)]
    if len(critical_set) > 0:
        critical_p_value = critical_set.max()
    else:
        critical_p_value = p_corr
    return critical_p_value
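
# Illustrative usage sketch (hypothetical helper, arbitrary p-values): the
# Benjamini-Hochberg critical p-value at FDR alpha; inputs below this
# threshold are declared significant.
def _example_fdr_threshold():
    p = np.array([0.0001, 0.003, 0.02, 0.04, 0.3, 0.5, 0.7])
    return fdr_threshold(p, alpha=0.05)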

def fdr(p_values=None, verbose=0):
    """Returns the FDR associated with each p value

    Parameters
    ----------
    p_values : ndarray of shape (n)
        The sample p-values
    verbose : int, optional
        If nonzero, plot the resulting fdr values against the input p-values

    Returns
    -------
    q : array of shape (n)
        The corresponding fdr values
    """
    p_values = check_p_values(p_values)
    n_samples = p_values.size
    order = p_values.argsort()
    sp_values = p_values[order]

    # compute q while in ascending order
    q = np.minimum(1, n_samples * sp_values / np.arange(1, n_samples + 1))
    for i in range(n_samples - 1, 0, -1):
        q[i - 1] = min(q[i], q[i - 1])

    # reorder the results
    inverse_order = np.arange(n_samples)
    inverse_order[order] = np.arange(n_samples)
    q = q[inverse_order]

    if verbose:
        import matplotlib.pyplot as mp
        mp.figure()
        mp.xlabel('Input p-value')
        mp.plot(p_values, q, '.')
        mp.ylabel('Associated fdr')
    return q
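
# Illustrative usage sketch (hypothetical helper, arbitrary p-values): `fdr`
# returns the Benjamini-Hochberg q-value of every input p-value, in the
# original order; q[i] is the smallest FDR at which p[i] would be rejected.
def _example_fdr():
    p = np.array([0.0001, 0.003, 0.02, 0.04, 0.3, 0.5, 0.7])
    return fdr(p)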

class NormalEmpiricalNull(object):
    """Class to compute the empirical null normal fit to the data.

    The data are used to estimate the FDR, assuming a Gaussian null,
    following Schwartzman et al., NeuroImage 44 (2009) 71--82.
    """

    def __init__(self, x):
        """Initialize an empirical null normal object.

        Parameters
        ----------
        x : 1D ndarray
            The data used to estimate the empirical null.
        """
        x = np.reshape(x, (-1))
        self.x = np.sort(x)
        self.n = np.size(x)
        self.learned = 0

    def learn(self, left=0.2, right=0.8):
        """Estimate the proportion, mean and variance of a Gaussian
        distribution for a fraction of the data

        Parameters
        ----------
        left : float, optional
            Left cut parameter to prevent fitting non-gaussian data
        right : float, optional
            Right cut parameter to prevent fitting non-gaussian data

        Notes
        -----
        This method stores the following attributes:

        * mu: the mean of the fitted normal distribution
        * p0 = min(1, np.exp(lp0)): the estimated proportion of null values
        * sqsigma: the variance of the fitted normal distribution
        * sigma = np.sqrt(sqsigma): the standard deviation of the fitted
          normal distribution
        """
        # take a central subsample of x
        x = self.x[int(self.n * left): int(self.n * right)]

        # generate the histogram
        step = 3.5 * np.std(self.x) / np.exp(np.log(self.n) / 3)
        bins = max(10, int((self.x.max() - self.x.min()) // step))
        hist, ledge = np.histogram(x, bins=bins)
        step = ledge[1] - ledge[0]
        medge = ledge + 0.5 * step

        # remove null bins
        nonzero = hist > 0
        hist = hist[nonzero].astype(np.float64)
        medge = medge[:-1][nonzero]  # edges include rightmost outer

        # fit the histogram
        dmtx = np.ones((3, len(hist)))
        dmtx[1] = medge
        dmtx[2] = medge ** 2
        coef = np.dot(np.log(hist), pinv(dmtx))
        sqsigma = -1.0 / (2 * coef[2])
        sqsigma = max(sqsigma, 1.e-6)
        mu = coef[1] * sqsigma
        lp0 = (coef[0] - np.log(step * self.n)
               + 0.5 * np.log(2 * np.pi * sqsigma) + mu ** 2 / (2 * sqsigma))
        self.mu = mu
        self.p0 = min(1, np.exp(lp0))
        self.sigma = np.sqrt(sqsigma)
        self.sqsigma = sqsigma
        # mark the null model as fitted
        self.learned = 1

    def fdrcurve(self):
        """Returns the FDR associated with any point of self.x
        """
        if self.learned == 0:
            self.learn()
        efp = (self.p0 * st.norm.sf(self.x, self.mu, self.sigma)
               * self.n / np.arange(self.n, 0, -1))
        efp = np.minimum(efp, 1)
        ix = np.argsort(self.x)
        for i in range(np.size(efp) - 1, 0, -1):
            efp[ix[i - 1]] = np.maximum(efp[ix[i]], efp[ix[i - 1]])
        self.sorted_x = self.x[ix]
        self.sorted_fdr = efp[ix]
        return efp

    def threshold(self, alpha=0.05, verbose=0):
        """Compute the threshold corresponding to an alpha-level FDR for x

        Parameters
        ----------
        alpha : float, optional
            the chosen false discovery rate threshold.
        verbose : boolean, optional
            the verbosity level, if True a plot is generated.

        Returns
        -------
        theta : float
            the critical value associated with the provided FDR
        """
        efp = self.fdrcurve()

        if verbose:
            self.plot(efp, alpha)

        if efp[-1] > alpha:
            print("The maximal value is %f, the corresponding FDR is %f"
                  % (self.x[-1], efp[-1]))
            return np.inf
        j = np.argmin(efp[::-1] < alpha) + 1
        return 0.5 * (self.x[-j] + self.x[-j + 1])

    def uncorrected_threshold(self, alpha=0.001, verbose=0):
        """Compute the threshold corresponding to a specificity alpha for x

        Parameters
        ----------
        alpha : float, optional
            the chosen specificity (uncorrected p-value threshold).
        verbose : boolean, optional
            the verbosity level, if True a plot is generated.

        Returns
        -------
        theta : float
            the critical value associated with the provided p-value
        """
        if self.learned == 0:
            self.learn()
        threshold = st.norm.isf(alpha, self.mu, self.sigma)
        if not np.isfinite(threshold):
            threshold = np.inf
        if verbose:
            self.plot()
        return threshold

    def fdr(self, theta):
        """Given a threshold theta, find the estimated FDR

        Parameters
        ----------
        theta : float or array of shape (n_samples)
            values to test

        Returns
        -------
        efp : float or array of shape (n_samples)
            the estimated FDR at each value of theta
        """
        self.fdrcurve()
        if np.isscalar(theta):
            if theta > self.sorted_x[-1]:
                return 0
            maj = np.where(self.sorted_x >= theta)[0][0]
            efp = (self.p0 * st.norm.sf(theta, self.mu, self.sigma) * self.n
                   / np.sum(self.x >= theta))
            efp = np.maximum(self.sorted_fdr[maj], efp)
        else:
            efp = []
            for th in theta:
                if th > self.sorted_x[-1]:
                    efp.append(0)
                    continue
                maj = self.sorted_fdr[np.where(self.sorted_x >= th)[0][0]]
                efp.append(np.maximum(
                    maj, self.p0 * st.norm.sf(th, self.mu, self.sigma)
                    * self.n / np.sum(self.x >= th)))
            efp = np.array(efp)
        # efp = np.minimum(efp, 1)
        return efp

    def plot(self, efp=None, alpha=0.05, bar=1, mpaxes=None):
        """Plot the histogram of x

        Parameters
        ----------
        efp : float, optional
            The empirical FDR (corresponding to x)
            if efp is None, the false positive rate threshold plot is not
            drawn.
        alpha : float, optional
            The chosen FDR threshold
        bar : bool, optional
            if True, plot the histogram as a bar plot
        mpaxes : optional
            if not None, handle to an axes where the figure will be drawn.
            Avoids creating unnecessarily new figures
        """
        if not self.learned:
            self.learn()
        n = np.size(self.x)
        bins = max(10, int(2 * np.exp(np.log(n) / 3.)))
        hist, ledge = np.histogram(self.x, bins=bins)
        hist = hist.astype('f') / hist.sum()
        step = ledge[1] - ledge[0]
        medge = ledge + 0.5 * step
        g = self.p0 * st.norm.pdf(medge, self.mu, self.sigma)
        hist /= step

        import matplotlib.pyplot as mp
        if mpaxes is None:
            mp.figure()
            ax = mp.subplot(1, 1, 1)
        else:
            ax = mpaxes
        if bar:
            # We need to cut ledge to len(hist) to accommodate for pre and
            # post numpy 1.3 hist semantic change.
            ax.bar(ledge[:len(hist)], hist, step)
        else:
            ax.plot(medge[:len(hist)], hist, linewidth=2)
        ax.plot(medge, g, 'r', linewidth=2)
        ax.set_title('Robust fit of the histogram', fontsize=12)
        legend = ax.legend(('empirical null', 'data'), loc=0)
        for t in legend.get_texts():
            t.set_fontsize(12)
        ax.set_xticklabels(ax.get_xticks(), fontsize=12)
        ax.set_yticklabels(ax.get_yticks(), fontsize=12)

        if efp is not None:
            ax.plot(self.x, np.minimum(alpha, efp), 'k')
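
# Illustrative usage sketch (hypothetical helper, simulated data): fit the
# empirical null to a vector of statistics, then query the FDR-controlling
# and the uncorrected (specificity-based) thresholds.
def _example_normal_empirical_null():
    x = np.random.randn(10000)
    x[:300] += 3                        # a small "activated" subset
    enn = NormalEmpiricalNull(x)
    enn.learn()                         # fit mu, sigma and p0 on the center
    th_fdr = enn.threshold(alpha=0.05)
    th_unc = enn.uncorrected_threshold(alpha=0.001)
    return th_fdr, th_unc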

def three_classes_GMM_fit(x, test=None, alpha=0.01, prior_strength=100,
                          verbose=0, fixed_scale=False, mpaxes=None, bias=0,
                          theta=0, return_estimator=False):
    """Fit the data with a 3-class Gaussian Mixture Model,
    i.e. compute some probability that the voxels of a certain map
    are in class deactivated, null or active

    Parameters
    ----------
    x : array of shape (nvox, 1)
        The map to be analysed
    test : array of shape (nbitems, 1), optional
        the test values for which the p-value needs to be computed
        by default (if None), test = x
    alpha : float, optional
        the prior weights of the positive and negative classes
    prior_strength : float, optional
        the confidence on the prior (should be compared to size(x))
    verbose : int, optional
        verbosity mode
    fixed_scale : bool, optional
        variance parameterization: if True, the variance is locked to 1,
        otherwise it is estimated from the data
    mpaxes : optional
        axes handle used to plot the figure in verbose mode
        if None, new axes are created
    bias : bool, optional
        allows a rescaling of the posterior probability
        that takes into account the threshold theta. Not rigorous.
    theta : float, optional
        the threshold used to correct the posterior p-values
        when bias=1; normally, it is such that test > theta
        note that if theta = -np.inf, the method has a standard behaviour
    return_estimator : boolean, optional
        If return_estimator is true, the estimator object is returned.

    Returns
    -------
    bfp : array of shape (nbitems, 3)
        the posterior probability of each test item belonging to each
        component in the GMM (sum to 1 across the 3 classes);
        if np.size(test) == 0, i.e. nbitems == 0, None is returned
    estimator : nipy.labs.clustering.GMM object
        The estimator object, returned only if return_estimator is true.

    Notes
    -----
    Our convention is that:

    * class 1 represents the negative class
    * class 2 represents the null class
    * class 3 represents the positive class
    """
    from ..clustering.bgmm import VBGMM
    from ..clustering.gmm import GridDescriptor

    nvox = np.size(x)
    x = np.reshape(x, (nvox, 1))
    if test is None:
        test = x
    if np.size(test) == 0:
        return None

    sx = np.sort(x, 0)
    nclasses = 3

    # set the priors from a reasonable model of the data (!)
    # prior means
    mb0 = np.mean(sx[:int(alpha * nvox)])
    mb2 = np.mean(sx[int((1 - alpha) * nvox):])
    prior_means = np.reshape(np.array([mb0, 0, mb2]), (nclasses, 1))
    if fixed_scale:
        prior_scale = np.ones((nclasses, 1, 1)) / prior_strength
    else:
        prior_scale = (np.ones((nclasses, 1, 1))
                       / (prior_strength * np.var(x)))
    prior_dof = np.ones(nclasses) * prior_strength
    prior_weights = np.array([alpha, 1 - 2 * alpha, alpha]) * prior_strength
    prior_shrinkage = np.ones(nclasses) * prior_strength

    # instantiate the class and set the priors
    BayesianGMM = VBGMM(nclasses, 1, prior_means, prior_scale,
                        prior_weights, prior_shrinkage, prior_dof)
    BayesianGMM.set_priors(prior_means, prior_weights,
                           prior_scale, prior_dof, prior_shrinkage)

    # estimate the model
    BayesianGMM.estimate(x, delta=1.e-8, verbose=max(0, verbose - 1))

    # create a sampling grid
    if (verbose or bias):
        gd = GridDescriptor(1)
        gd.set([x.min(), x.max()], 100)
        gdm = gd.make_grid().squeeze()
        lj = BayesianGMM.likelihood(gd.make_grid())

    # estimate the prior weights
    bfp = BayesianGMM.likelihood(test)
    if bias:
        lw = np.sum(lj[gdm > theta], 0)
        weights = BayesianGMM.weights / (BayesianGMM.weights.sum())
        bfp = (lw / weights) * BayesianGMM.slikelihood(test)

    if verbose and (mpaxes is not False):
        BayesianGMM.show_components(x, gd, lj, mpaxes)

    bfp = (bfp.T / bfp.sum(1)).T
    if not return_estimator:
        return bfp
    else:
        return bfp, BayesianGMM
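
# Illustrative usage sketch (hypothetical helper, simulated data): posterior
# probability of the negative / null / positive classes for each value of a
# map under the variational Bayes GMM above; requires the sibling
# `..clustering` modules.
def _example_three_classes_GMM_fit():
    x = np.random.randn(1000)
    x[:50] += 4
    bfp = three_classes_GMM_fit(x)   # shape (1000, 3); rows sum to 1
    return bfp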

def gamma_gaussian_fit(x, test=None, verbose=0, mpaxes=False,
                       bias=1, gaussian_mix=0, return_estimator=False):
    """Compute some prior probabilities that the voxels of a certain map
    are in class deactivated, null or active, using a gamma-Gaussian mixture

    Parameters
    ----------
    x : array of shape (nvox,)
        the map to be analysed
    test : array of shape (nbitems,), optional
        the test values for which the p-value needs to be computed
        by default, test = x
    verbose : 0, 1 or 2, optional
        verbosity mode, 0 is quiet, and 2 calls matplotlib to display
        graphs.
    mpaxes : matplotlib axes, optional
        axes handle used to plot the figure in verbose mode
        if None, new axes are created
        if False, nothing is done
    bias : float, optional
        lower bound on the Gaussian variance (to avoid shrinkage)
    gaussian_mix : float, optional
        if nonzero, lower bound on the Gaussian mixing weight
        (to avoid shrinkage)
    return_estimator : boolean, optional
        if return_estimator is true, the estimator object is returned.

    Returns
    -------
    bfp : array of shape (nbitems, 3)
        The probability of each component in the mixture model for each
        test value
    estimator : nipy.labs.clustering.ggmixture.GGGM object
        The estimator object, returned only if return_estimator is true.
    """
    from ..clustering import ggmixture
    Ggg = ggmixture.GGGM()
    Ggg.init_fdr(x)
    Ggg.estimate(x, niter=100, delta=1.e-8, bias=bias, verbose=0,
                 gaussian_mix=gaussian_mix)
    if mpaxes is not False:
        # hyper-verbose mode
        Ggg.show(x, mpaxes=mpaxes)
        Ggg.parameters()
    if test is None:
        test = x

    test = np.reshape(test, np.size(test))

    bfp = np.array(Ggg.posterior(test)).T
    if return_estimator:
        return bfp, Ggg
    return bfp
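
# Illustrative usage sketch (hypothetical helper, simulated data): the same
# three-class read-out as above, but with a gamma-Gaussian mixture; requires
# the sibling `..clustering.ggmixture` module.
def _example_gamma_gaussian_fit():
    x = np.random.randn(1000)
    x[:50] += 4
    bfp = gamma_gaussian_fit(x)      # shape (1000, 3) posterior probabilities
    return bfp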

def smoothed_histogram_from_samples(x, bins=None, nbins=256,
                                    normalized=False):
    """Smooth histogram corresponding to the density underlying the samples
    in `x`

    Parameters
    ----------
    x : array of shape (n_samples,)
        input data
    bins : array of shape (nbins + 1,), optional
        the bins location
    nbins : int, optional
        the number of bins of the resulting histogram
    normalized : bool, optional
        if True, the result is returned as a density value

    Returns
    -------
    h : array of shape (nbins,)
        the histogram
    bins : array of shape (nbins + 1,)
        the bins location
    """
    from scipy.ndimage import gaussian_filter1d

    # first define the bins
    if bins is None:
        h, bins = np.histogram(x, nbins)
        bins = bins.mean() + 1.2 * (bins - bins.mean())
    h, bins = np.histogram(x, bins)

    # possibly normalize to density
    h = 1.0 * h
    dc = bins[1] - bins[0]
    if normalized:
        h /= (dc * h.sum())

    # define the optimal width
    sigma = x.std() / (dc * np.exp(.2 * np.log(x.size)))

    # smooth the histogram
    h = gaussian_filter1d(h, sigma, mode='constant')
    return h, bins
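
# Illustrative usage sketch (hypothetical helper, simulated data): a smoothed
# density estimate of the samples; with normalized=True the bin values times
# the bin width sum approximately to 1.
def _example_smoothed_histogram():
    x = np.random.randn(5000)
    h, bins = smoothed_histogram_from_samples(x, nbins=128, normalized=True)
    return h, bins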