Source code for selectinf.distributions.pvalue

"""

This module contains functions needed to evaluate post selection
p-values for non polyhedral selection procedures through a variety of means.

These p-values appear for the group LASSO global null test as well as the nuclear norm
p-value test.

They are described in the `Kac Rice`_ paper.

.. _Kac Rice: http://arxiv.org/abs/1308.3020
.. _Spacings: http://arxiv.org/abs/1401.3889
.. _post selection LASSO: http://arxiv.org/abs/1311.6238


"""

import numpy as np
from scipy.stats import chi
from scipy.stats import norm as ndist

from mpmath import mp
mp.dps = 80

def norm_q(prob):
    r"""
    A multi-precision calculation of the
    standard normal quantile function:

    .. math::

       \int_{-\infty}^{q(p)} \frac{e^{-z^2/2}}{\sqrt{2\pi}} \; dz = p

    where $p$ is `prob`.

    Parameters
    ----------

    prob : float

    Returns
    -------

    quantile : float

    """
    return np.array(mp.erfinv(2*prob-1)*mp.sqrt(2))
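
# Illustrative check (a sketch added for exposition; `_example_norm_q` is not
# part of the original module): in the bulk of the distribution the
# multi-precision quantile should agree with scipy's `ndist.ppf` to within
# double-precision rounding.
def _example_norm_q():
    q_mp = float(norm_q(0.975))     # multi-precision quantile, about 1.959964
    q_sp = float(ndist.ppf(0.975))  # scipy reference value
    assert abs(q_mp - q_sp) < 1e-8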
def norm_pdf(observed):
    r"""
    A multi-precision calculation of the
    standard normal density function:

    .. math::

       \frac{e^{-T^2/2}}{\sqrt{2\pi}}

    where $T$ is `observed`.

    Parameters
    ----------

    observed : float

    Returns
    -------

    density : float

    """
    return np.array(mp.npdf(observed))
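
# Illustrative check (a sketch for exposition; `_example_norm_pdf` is not part
# of the original module): the density matches $1/\sqrt{2\pi}$ at zero, and
# deep in the tail it stays positive as an mpf where a double would underflow.
def _example_norm_pdf():
    assert abs(float(norm_pdf(0.)) - 1 / np.sqrt(2 * np.pi)) < 1e-12
    assert norm_pdf(50.) > 0  # about 1e-543; ndist.pdf(50.) returns exactly 0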
def norm_interval(lower, upper):
    r"""
    A multi-precision evaluation of

    .. math::

       \Phi(U) - \Phi(L)

    Parameters
    ----------

    lower : float
        The lower limit $L$

    upper : float
        The upper limit $U$

    Returns
    -------

    probability : float

    """
    if lower > 0 and upper > 0:
        # reflect into the left tail, where the difference of CDFs
        # does not suffer catastrophic cancellation near 1
        return mp.ncdf(-lower) - mp.ncdf(-upper)
    else:
        return mp.ncdf(upper) - mp.ncdf(lower)
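
# Illustrative check (a sketch for exposition; `_example_norm_interval` is not
# part of the original module): far in the right tail the naive
# double-precision difference of CDFs cancels to zero, while the reflected
# form remains positive.
def _example_norm_interval():
    naive = ndist.cdf(10.) - ndist.cdf(9.)  # both terms round to 1.0, so 0.0
    stable = norm_interval(9., 10.)         # about 1.1e-19 as an mpf
    assert naive == 0. and stable > 0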
def truncnorm_cdf(observed, lower, upper):
    r"""
    Compute the truncated normal distribution function.

    .. math::

       \frac{\Phi(T) - \Phi(L)}{\Phi(U) - \Phi(L)}

    where $T$ is `observed`, $L$ is `lower` and $U$ is `upper`.

    Parameters
    ----------

    observed : float

    lower : float

    upper : float

    Returns
    -------

    P : float

    """
    x, a, b = observed, lower, upper
    x = max(x, a)
    x = min(x, b)

    if a > 0 and b > 0:
        # reflect into the left tail for numerical accuracy
        Fx, Fa, Fb = mp.ncdf(-x), mp.ncdf(-a), mp.ncdf(-b)
        return float((Fa - Fx) / (Fa - Fb))
    else:
        Fx, Fa, Fb = mp.ncdf(x), mp.ncdf(a), mp.ncdf(b)
        return float((Fx - Fa) / (Fb - Fa))
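
# Illustrative check (a sketch for exposition; `_example_truncnorm_cdf` is not
# part of the original module): the reflected branch keeps the truncated CDF
# meaningful for an interval deep in the right tail, where a naive
# double-precision ratio would be 0/0.
def _example_truncnorm_cdf():
    p = truncnorm_cdf(9.5, 9., 10.)  # observed 9.5, truncated to [9, 10]
    assert 0. < p < 1.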
def chi_pvalue(observed, lower_bound, upper_bound, sd, df, method='MC', nsim=1000):
    r"""
    Compute a truncated $\chi$ p-value based on the
    conditional survival function.

    Parameters
    ----------

    observed : float

    lower_bound : float

    upper_bound : float

    sd : float
        Standard deviation.

    df : float
        Degrees of freedom.

    method : string
        One of ['MC', 'cdf', 'sf']

    nsim : int
        Number of Monte Carlo samples when `method` is 'MC'.

    Returns
    -------

    pvalue : float

    Notes
    -----

    Let $T$ be `observed`, $L$ be `lower_bound` and $U$ be `upper_bound`,
    and $\sigma$ be `sd`. The p-value, for $L \leq T \leq U$ is

    .. math::

         \frac{P(\chi^2_k \geq T^2/\sigma^2) - P(\chi^2_k \geq U^2/\sigma^2)}
              {P(\chi^2_k \geq L^2/\sigma^2) - P(\chi^2_k \geq U^2/\sigma^2)}

    It can be computed using `scipy.stats.chi`, through either its `cdf`
    (distribution function) or `sf` (survival function), or evaluated by
    Monte Carlo if `method` is 'MC'.

    """
    L, T, U = lower_bound, observed, upper_bound # shorthand

    # curvature representation of the chi_k polynomial factor; built up
    # front because the Monte Carlo fallback below may need it for any method
    if df == 1:
        H = []
    else:
        H = [0] * (int(df) - 1)

    if method == 'cdf':
        pval = ((chi.cdf(U / sd, df) - chi.cdf(T / sd, df)) /
                (chi.cdf(U / sd, df) - chi.cdf(L / sd, df)))
    elif method == 'sf':
        pval = ((chi.sf(U / sd, df) - chi.sf(T / sd, df)) /
                (chi.sf(U / sd, df) - chi.sf(L / sd, df)))
    elif method == 'MC':
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=nsim)
    else:
        raise ValueError('method should be one of ["cdf", "sf", "MC"]')

    if pval == 1: # the distribution functions may have failed -- use MC
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=50000)
    if pval > 1:
        pval = 1
    return pval
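
# Illustrative check (a sketch for exposition; `_example_chi_pvalue` is not
# part of the original module): for a well-conditioned problem the 'cdf' and
# 'sf' methods compute the same ratio two ways and should agree closely.
def _example_chi_pvalue():
    p_cdf = chi_pvalue(2., 1., 5., 1., 3, method='cdf')
    p_sf = chi_pvalue(2., 1., 5., 1., 3, method='sf')
    assert abs(p_cdf - p_sf) < 1e-10  # both are about 0.33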
def gauss_poly(lower_bound, upper_bound, curvature, nsim=100):
    r"""
    Computes the integral of a polynomial times the
    standard Gaussian density over an interval.

    Introduced in `Kac Rice`_, display (33) of v2.

    Parameters
    ----------

    lower_bound : float

    upper_bound : float

    curvature : np.array
        A diagonal matrix related to curvature.
        It is assumed that `curvature + lower_bound I` is non-negative definite.

    nsim : int
        How many draws from $N(0,1)$ should we use?

    Returns
    -------

    value, C : float, float
        A Monte Carlo estimate of the integral on a log-stabilized scale:
        the integral is approximately `value * exp(C)`.

    Notes
    -----

    The return value is a Monte Carlo estimate of

    .. math::

        \int_{L}^{U} \det(\Lambda + z I)
        \frac{e^{-z^2/2}}{\sqrt{2\pi}} \, dz

    where $L$ is `lower_bound`, $U$ is `upper_bound` and $\Lambda$
    is the diagonal matrix `curvature`.

    """
    T = lower_bound
    H = np.asarray(curvature)

    # draw Z from a standard normal truncated to [0, upper_bound - T];
    # the substitution z = T + Z moves the factor exp(-T*Z - T**2/2)
    # into the integrand
    proportion = ndist.cdf(upper_bound - T) - ndist.cdf(0)
    U = np.random.sample(nsim)
    Z = ndist.ppf(U * proportion + ndist.cdf(0))

    if H.size > 0:
        HT = H + T
        exponent = np.log(np.fabs(np.add.outer(Z, HT))).sum(1) - T*Z - T**2/2.
    else:
        exponent = - T*Z - T**2/2.

    # subtract the largest exponent before exponentiating, for stability
    C = exponent.max()
    return np.exp(exponent - C).mean() * proportion, C
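
# Illustrative check (a sketch for exposition; `_example_gauss_poly` is not
# part of the original module): with an empty `curvature` and lower bound 0
# the determinant factor is 1 and the importance weights are constant, so the
# estimate reduces exactly to int_0^2 phi(z) dz = Phi(2) - 1/2.
def _example_gauss_poly():
    value, C = gauss_poly(0., 2., [], nsim=10)
    assert abs(value * np.exp(C) - (ndist.cdf(2.) - 0.5)) < 1e-12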
def general_pvalue(observed, lower_bound, upper_bound, curvature, nsim=100):
    r"""
    Computes a Monte Carlo p-value as the ratio of two integrals of
    a polynomial times the standard Gaussian density.

    Introduced in `Kac Rice`_, display (35) of v2.

    Parameters
    ----------

    observed : float

    lower_bound : float

    upper_bound : float

    curvature : np.array
        A diagonal matrix related to curvature.
        It is assumed that `curvature + lower_bound I` is non-negative definite.

    nsim : int
        How many draws from $N(0,1)$ should we use?

    Returns
    -------

    pvalue : float

    Notes
    -----

    The return value is a Monte Carlo estimate of

    .. math::

        \frac{\int_{T}^{U} \det(\Lambda + z I)
            \frac{e^{-z^2/2}}{\sqrt{2\pi}} \, dz}
        {\int_{L}^{U} \det(\Lambda + z I)
            \frac{e^{-z^2/2}}{\sqrt{2\pi}} \, dz}

    where $T$ is `observed`, $L$ is `lower_bound`, $U$ is `upper_bound`
    and $\Lambda$ is the diagonal matrix `curvature`.

    """
    # each call returns (value, C) with integral approximately value * exp(C)
    value_1, C1 = gauss_poly(observed, upper_bound, curvature, nsim=nsim)
    value_2, C2 = gauss_poly(lower_bound, upper_bound, curvature, nsim=nsim)

    return np.exp(C1 - C2) * value_1 / value_2
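
# Illustrative check (a sketch for exposition; `_example_general_pvalue` is
# not part of the original module): with curvature [0, 0] the polynomial is
# z**2, so the Monte Carlo ratio estimates the truncated chi_3 p-value and
# should land near the exact 'cdf' answer from `chi_pvalue`.
def _example_general_pvalue():
    np.random.seed(0)
    p_mc = chi_pvalue(2., 1., 5., 1., 3, method='MC', nsim=50000)
    p_exact = chi_pvalue(2., 1., 5., 1., 3, method='cdf')
    assert abs(p_mc - p_exact) < 0.05  # Monte Carlo error, not exact agreement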
class SelectionInterval(object):
    """
    Compute a selection interval for a Gaussian
    truncated to an interval.
    """

    def __init__(self, lower_bound, observed, upper_bound, sigma):
        (self.lower_bound,
         self.observed,
         self.upper_bound,
         self.sigma) = (lower_bound, observed, upper_bound, sigma)

    def pivot(self, exp):
        # truncated Gaussian CDF pivot, evaluated at hypothesized mean `exp`
        L, obs, U, sd = self.lower_bound, self.observed, self.upper_bound, self.sigma
        return truncnorm_cdf((obs - exp) / sd, (L - exp) / sd, (U - exp) / sd)

    def conf_int(self, lb, ub, alpha=0.05):
        # invert the pivot, which is monotone decreasing in `exp`
        F = lambda exp: self.pivot(exp)
        L = _find_root(F, 1.0 - 0.5 * alpha, lb, ub)
        U = _find_root(F, 0.5 * alpha, L, ub)
        return L, U
def _find_root(f, y, lb, ub, tol=1e-6):
    """
    Searches for a solution to f(x) = y in (lb, ub), where
    f is a monotone decreasing function.
    """

    # make sure solution is in range
    a, b = lb, ub
    fa, fb = f(a), f(b)

    # assume a < b; expand the bracket geometrically if both
    # endpoints sit on the same side of y
    if fa > y and fb > y:
        while fb > y:
            b, fb = b + (b - a), f(b + (b - a))
    elif fa < y and fb < y:
        while fa < y:
            a, fa = a - (b - a), f(a - (b - a))

    # determine the necessary number of iterations
    max_iter = int(np.ceil((np.log(tol) - np.log(b - a)) / np.log(0.5)))

    # bisect (slow but sure) until solution is obtained
    c = (a + b) / 2
    for _ in range(max_iter):
        c, fc = (a + b) / 2, f((a + b) / 2)
        if fc > y:
            a = c
        elif fc < y:
            b = c
        else:
            return c

    return c
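
# Illustrative usage (a sketch for exposition; `_example_selection_interval`
# is not part of the original module): a 95% selection interval for the mean
# of a N(theta, 1) observation of 2.0, truncated to [0, 5]; the search
# bracket (-10, 10) comfortably contains both endpoints.
def _example_selection_interval():
    S = SelectionInterval(0., 2., 5., 1.)
    lower, upper = S.conf_int(-10., 10., alpha=0.05)
    assert lower < 2. < upper  # the observed value sits inside the interval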