Source code for selectinf.distributions.pvalue

"""

This module contains functions needed to evaluate post selection
p-values for non polyhedral selection procedures through a variety of means.

These p-values appear for the group LASSO global null test as well as the nuclear norm
p-value test.

They are described in the `Kac Rice`_ paper.

.. _Kac Rice: http://arxiv.org/abs/1308.3020
.. _Spacings: http://arxiv.org/abs/1401.3889
.. _post selection LASSO: http://arxiv.org/abs/1311.6238


"""

import numpy as np
from scipy.stats import chi
from scipy.stats import norm as ndist

from mpmath import mp
mp.dps = 80

def norm_q(prob):
    r"""
    A multi-precision calculation of the
    standard normal quantile function:

    .. math::

       \int_{-\infty}^{q(p)} \frac{e^{-z^2/2}}{\sqrt{2\pi}} \; dz = p

    where $p$ is `prob`.

    Parameters
    ----------

    prob : float

    Returns
    -------

    quantile : float

    """
    return np.array(mp.erfinv(2*prob-1)*mp.sqrt(2))
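
# Illustrative check (a sketch added for exposition; `_example_norm_q` is not
# part of the original module): in the bulk of the distribution the
# multi-precision quantile should agree with scipy's `ndist.ppf` to within
# double-precision rounding.
def _example_norm_q():
    q_mp = float(norm_q(0.975))     # multi-precision quantile, about 1.959964
    q_sp = float(ndist.ppf(0.975))  # scipy reference value
    assert abs(q_mp - q_sp) < 1e-8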
def norm_pdf(observed):
    r"""
    A multi-precision calculation of the
    standard normal density function:

    .. math::

       \frac{e^{-T^2/2}}{\sqrt{2\pi}}

    where $T$ is `observed`.

    Parameters
    ----------

    observed : float

    Returns
    -------

    density : float

    """
    return np.array(mp.npdf(observed))
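
# Illustrative check (a sketch for exposition; `_example_norm_pdf` is not part
# of the original module): the density matches $1/\sqrt{2\pi}$ at zero, and
# deep in the tail it stays positive as an mpf where a double would underflow.
def _example_norm_pdf():
    assert abs(float(norm_pdf(0.)) - 1 / np.sqrt(2 * np.pi)) < 1e-12
    assert norm_pdf(50.) > 0  # about 1e-543; ndist.pdf(50.) returns exactly 0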
def norm_interval(lower, upper):
    r"""
    A multi-precision evaluation of

    .. math::

       \Phi(U) - \Phi(L)

    Parameters
    ----------

    lower : float
        The lower limit $L$

    upper : float
        The upper limit $U$

    Returns
    -------

    probability : float

    """
    if lower > 0 and upper > 0:
        # reflect into the left tail, where the difference of CDFs
        # does not suffer catastrophic cancellation near 1
        return mp.ncdf(-lower) - mp.ncdf(-upper)
    else:
        return mp.ncdf(upper) - mp.ncdf(lower)
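
# Illustrative check (a sketch for exposition; `_example_norm_interval` is not
# part of the original module): far in the right tail the naive
# double-precision difference of CDFs cancels to zero, while the reflected
# form remains positive.
def _example_norm_interval():
    naive = ndist.cdf(10.) - ndist.cdf(9.)  # both terms round to 1.0, so 0.0
    stable = norm_interval(9., 10.)         # about 1.1e-19 as an mpf
    assert naive == 0. and stable > 0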
def truncnorm_cdf(observed, lower, upper):
    r"""
    Compute the truncated normal distribution function.

    .. math::

       \frac{\Phi(T) - \Phi(L)}{\Phi(U) - \Phi(L)}

    where $T$ is `observed`, $L$ is `lower` and $U$ is `upper`.

    Parameters
    ----------

    observed : float

    lower : float

    upper : float

    Returns
    -------

    P : float

    """
    x, a, b = observed, lower, upper
    x = max(x, a)
    x = min(x, b)

    if a > 0 and b > 0:
        # reflect into the left tail for numerical accuracy
        Fx, Fa, Fb = mp.ncdf(-x), mp.ncdf(-a), mp.ncdf(-b)
        return float((Fa - Fx) / (Fa - Fb))
    else:
        Fx, Fa, Fb = mp.ncdf(x), mp.ncdf(a), mp.ncdf(b)
        return float((Fx - Fa) / (Fb - Fa))
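
# Illustrative check (a sketch for exposition; `_example_truncnorm_cdf` is not
# part of the original module): the reflected branch keeps the truncated CDF
# meaningful for an interval deep in the right tail, where a naive
# double-precision ratio would be 0/0.
def _example_truncnorm_cdf():
    p = truncnorm_cdf(9.5, 9., 10.)  # observed 9.5, truncated to [9, 10]
    assert 0. < p < 1.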
def chi_pvalue(observed, lower_bound, upper_bound, sd, df, method='MC', nsim=1000):
    r"""
    Compute a truncated $\chi$ p-value based on the
    conditional survival function.

    Parameters
    ----------

    observed : float

    lower_bound : float

    upper_bound : float

    sd : float
        Standard deviation.

    df : float
        Degrees of freedom.

    method : string
        One of ['MC', 'cdf', 'sf']

    nsim : int
        Number of Monte Carlo samples when `method` is 'MC'.

    Returns
    -------

    pvalue : float

    Notes
    -----

    Let $T$ be `observed`, $L$ be `lower_bound` and $U$ be `upper_bound`,
    and $\sigma$ be `sd`. The p-value, for $L \leq T \leq U$ is

    .. math::

         \frac{P(\chi^2_k \geq T^2/\sigma^2) - P(\chi^2_k \geq U^2/\sigma^2)}
              {P(\chi^2_k \geq L^2/\sigma^2) - P(\chi^2_k \geq U^2/\sigma^2)}

    It can be computed using `scipy.stats.chi`, through either its `cdf`
    (distribution function) or `sf` (survival function), or evaluated by
    Monte Carlo if `method` is 'MC'.

    """
    L, T, U = lower_bound, observed, upper_bound # shorthand

    # curvature representation of the chi_k polynomial factor; built up
    # front because the Monte Carlo fallback below may need it for any method
    if df == 1:
        H = []
    else:
        H = [0] * (int(df) - 1)

    if method == 'cdf':
        pval = ((chi.cdf(U / sd, df) - chi.cdf(T / sd, df)) /
                (chi.cdf(U / sd, df) - chi.cdf(L / sd, df)))
    elif method == 'sf':
        pval = ((chi.sf(U / sd, df) - chi.sf(T / sd, df)) /
                (chi.sf(U / sd, df) - chi.sf(L / sd, df)))
    elif method == 'MC':
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=nsim)
    else:
        raise ValueError('method should be one of ["cdf", "sf", "MC"]')

    if pval == 1: # the distribution functions may have failed -- use MC
        pval = general_pvalue(T / sd, L / sd, U / sd, H, nsim=50000)
    if pval > 1:
        pval = 1
    return pval
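
# Illustrative check (a sketch for exposition; `_example_chi_pvalue` is not
# part of the original module): for a well-conditioned problem the 'cdf' and
# 'sf' methods compute the same ratio two ways and should agree closely.
def _example_chi_pvalue():
    p_cdf = chi_pvalue(2., 1., 5., 1., 3, method='cdf')
    p_sf = chi_pvalue(2., 1., 5., 1., 3, method='sf')
    assert abs(p_cdf - p_sf) < 1e-10  # both are about 0.33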
def gauss_poly(lower_bound, upper_bound, curvature, nsim=100):
    r"""
    Computes the integral of a polynomial times the
    standard Gaussian density over an interval.

    Introduced in `Kac Rice`_, display (33) of v2.

    Parameters
    ----------

    lower_bound : float

    upper_bound : float

    curvature : np.array
        A diagonal matrix related to curvature.
        It is assumed that `curvature + lower_bound I` is non-negative definite.

    nsim : int
        How many draws from $N(0,1)$ should we use?

    Returns
    -------

    value, C : float, float
        A Monte Carlo estimate of the integral on a log-stabilized scale:
        the integral is approximately `value * exp(C)`.

    Notes
    -----

    The return value is a Monte Carlo estimate of

    .. math::

        \int_{L}^{U} \det(\Lambda + z I)
        \frac{e^{-z^2/2}}{\sqrt{2\pi}} \, dz

    where $L$ is `lower_bound`, $U$ is `upper_bound` and $\Lambda$
    is the diagonal matrix `curvature`.

    """
    T = lower_bound
    H = np.asarray(curvature)

    # draw Z from a standard normal truncated to [0, upper_bound - T];
    # the substitution z = T + Z moves the factor exp(-T*Z - T**2/2)
    # into the integrand
    proportion = ndist.cdf(upper_bound - T) - ndist.cdf(0)
    U = np.random.sample(nsim)
    Z = ndist.ppf(U * proportion + ndist.cdf(0))

    if H.size > 0:
        HT = H + T
        exponent = np.log(np.fabs(np.add.outer(Z, HT))).sum(1) - T*Z - T**2/2.
    else:
        exponent = - T*Z - T**2/2.

    # subtract the largest exponent before exponentiating, for stability
    C = exponent.max()
    return np.exp(exponent - C).mean() * proportion, C
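
# Illustrative check (a sketch for exposition; `_example_gauss_poly` is not
# part of the original module): with an empty `curvature` and lower bound 0
# the determinant factor is 1 and the importance weights are constant, so the
# estimate reduces exactly to int_0^2 phi(z) dz = Phi(2) - 1/2.
def _example_gauss_poly():
    value, C = gauss_poly(0., 2., [], nsim=10)
    assert abs(value * np.exp(C) - (ndist.cdf(2.) - 0.5)) < 1e-12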
def general_pvalue(observed, lower_bound, upper_bound, curvature, nsim=100):
    r"""
    Computes a Monte Carlo p-value as the ratio of two integrals of
    a polynomial times the standard Gaussian density.

    Introduced in `Kac Rice`_, display (35) of v2.

    Parameters
    ----------

    observed : float

    lower_bound : float

    upper_bound : float

    curvature : np.array
        A diagonal matrix related to curvature.
        It is assumed that `curvature + lower_bound I` is non-negative definite.

    nsim : int
        How many draws from $N(0,1)$ should we use?

    Returns
    -------

    pvalue : float

    Notes
    -----

    The return value is a Monte Carlo estimate of

    .. math::

        \frac{\int_{T}^{U} \det(\Lambda + z I)
            \frac{e^{-z^2/2}}{\sqrt{2\pi}} \, dz}
        {\int_{L}^{U} \det(\Lambda + z I)
            \frac{e^{-z^2/2}}{\sqrt{2\pi}} \, dz}

    where $T$ is `observed`, $L$ is `lower_bound`, $U$ is `upper_bound`
    and $\Lambda$ is the diagonal matrix `curvature`.

    """
    # each call returns (value, C) with integral approximately value * exp(C)
    value_1, C1 = gauss_poly(observed, upper_bound, curvature, nsim=nsim)
    value_2, C2 = gauss_poly(lower_bound, upper_bound, curvature, nsim=nsim)

    return np.exp(C1 - C2) * value_1 / value_2
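
# Illustrative check (a sketch for exposition; `_example_general_pvalue` is
# not part of the original module): with curvature [0, 0] the polynomial is
# z**2, so the Monte Carlo ratio estimates the truncated chi_3 p-value and
# should land near the exact 'cdf' answer from `chi_pvalue`.
def _example_general_pvalue():
    np.random.seed(0)
    p_mc = chi_pvalue(2., 1., 5., 1., 3, method='MC', nsim=50000)
    p_exact = chi_pvalue(2., 1., 5., 1., 3, method='cdf')
    assert abs(p_mc - p_exact) < 0.05  # Monte Carlo error, not exact agreement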
class SelectionInterval(object):
    """
    Compute a selection interval for a Gaussian
    truncated to an interval.
    """

    def __init__(self, lower_bound, observed, upper_bound, sigma):
        (self.lower_bound,
         self.observed,
         self.upper_bound,
         self.sigma) = (lower_bound, observed, upper_bound, sigma)

    def pivot(self, exp):
        # truncated Gaussian CDF pivot, evaluated at hypothesized mean `exp`
        L, obs, U, sd = self.lower_bound, self.observed, self.upper_bound, self.sigma
        return truncnorm_cdf((obs - exp) / sd, (L - exp) / sd, (U - exp) / sd)

    def conf_int(self, lb, ub, alpha=0.05):
        # invert the pivot, which is monotone decreasing in `exp`
        F = lambda exp: self.pivot(exp)
        L = _find_root(F, 1.0 - 0.5 * alpha, lb, ub)
        U = _find_root(F, 0.5 * alpha, L, ub)
        return L, U
def _find_root(f, y, lb, ub, tol=1e-6):
    """
    Searches for a solution to f(x) = y in (lb, ub), where
    f is a monotone decreasing function.
    """

    # make sure solution is in range
    a, b = lb, ub
    fa, fb = f(a), f(b)

    # assume a < b; expand the bracket geometrically if both
    # endpoints sit on the same side of y
    if fa > y and fb > y:
        while fb > y:
            b, fb = b + (b - a), f(b + (b - a))
    elif fa < y and fb < y:
        while fa < y:
            a, fa = a - (b - a), f(a - (b - a))

    # determine the necessary number of iterations
    max_iter = int(np.ceil((np.log(tol) - np.log(b - a)) / np.log(0.5)))

    # bisect (slow but sure) until solution is obtained
    c = (a + b) / 2
    for _ in range(max_iter):
        c, fc = (a + b) / 2, f((a + b) / 2)
        if fc > y:
            a = c
        elif fc < y:
            b = c
        else:
            return c

    return c
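
# Illustrative usage (a sketch for exposition; `_example_selection_interval`
# is not part of the original module): a 95% selection interval for the mean
# of a N(theta, 1) observation of 2.0, truncated to [0, 5]; the search
# bracket (-10, 10) comfortably contains both endpoints.
def _example_selection_interval():
    S = SelectionInterval(0., 2., 5., 1.)
    lower, upper = S.conf_int(-10., 10., alpha=0.05)
    assert lower < 2. < upper  # the observed value sits inside the interval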