Source code for selectinf.glm

import functools # for bootstrap partial mapping

import numpy as np
from scipy.stats import norm as ndist

from regreg.api import glm, identity_quadratic

from .base import restricted_estimator

import regreg.api as rr
import regreg.affine as ra

def pairs_bootstrap_glm(glm_loss,
                        active,
                        beta_full=None,
                        inactive=None,
                        scaling=1.,
                        solve_args={'min_its': 50, 'tol': 1.e-10}):
    r"""
    Construct a non-parametric bootstrap sampler that
    samples the estimates ($\bar{\beta}_E^*$) of a generalized
    linear model (GLM) restricted to `active`
    as well as, optionally, the inactive coordinates of the score of the
    GLM evaluated at the estimates ($\nabla \ell(\bar{\beta}_E)[-E]$) where
    $\bar{\beta}_E$ is padded with zeros where necessary.

    Parameters
    ----------

    glm_loss : regreg.smooth.glm.glm
        The loss of the generalized linear model.

    active : np.bool
        Boolean indexing array

    beta_full : np.float (optional)
        Solution to the restricted problem, zero except where active is nonzero.

    inactive : np.bool (optional)
        Boolean indexing array

    scaling : float
        Scaling to keep entries of roughly constant order. Active entries
        are multiplied by sqrt(scaling); inactive ones are divided by sqrt(scaling).

    solve_args : dict
        Arguments passed to solver of restricted problem (`restricted_estimator`)
        if beta_full is None.

    Returns
    -------

    bootstrap_sampler : callable
        A callable object that takes a sample of indices
        and returns the corresponding bootstrap sample.

    """
    X, Y = glm_loss.data

    if beta_full is None:
        beta_active = restricted_estimator(glm_loss, active, solve_args=solve_args)
        beta_full = np.zeros(glm_loss.shape)
        beta_full[active] = beta_active
    else:
        beta_active = beta_full[active]

    X_active = X[:, active]

    nactive = active.sum()
    ntotal = nactive

    if inactive is not None:
        X_inactive = X[:, inactive]
        ntotal += inactive.sum()

    _bootW = np.diag(glm_loss.saturated_loss.hessian(X_active.dot(beta_active)))
    _bootQ = X_active.T.dot(_bootW.dot(X_active))
    _bootQinv = np.linalg.inv(_bootQ)
    if inactive is not None:
        _bootC = X_inactive.T.dot(_bootW.dot(X_active))
        _bootI = _bootC.dot(_bootQinv)
    else:
        _bootI = None

    if inactive is not None:
        X_full = np.hstack([X_active, X_inactive])
        beta_overall = np.zeros(X_full.shape[1])
        beta_overall[:nactive] = beta_active
    else:
        X_full = X_active
        beta_overall = beta_active

    _boot_mu = lambda X_full, beta_overall: glm_loss.saturated_loss.mean_function(X_full.dot(beta_overall))

    if ntotal > nactive:
        observed = np.hstack([beta_active,
                              -glm_loss.smooth_objective(beta_full, 'grad')[inactive]])
    else:
        observed = beta_active

    # scaling is a lipschitz constant for a gradient squared
    _sqrt_scaling = np.sqrt(scaling)

    def _boot_score(X_full, Y, ntotal, _bootQinv, _bootI, nactive,
                    _sqrt_scaling, beta_overall, indices):
        X_star = X_full[indices]
        Y_star = Y[indices]
        score = X_star.T.dot(Y_star - _boot_mu(X_star, beta_overall))
        result = np.zeros(ntotal)
        result[:nactive] = _bootQinv.dot(score[:nactive])
        if ntotal > nactive:
            result[nactive:] = score[nactive:] - _bootI.dot(score[:nactive])
        result[:nactive] *= _sqrt_scaling
        result[nactive:] /= _sqrt_scaling
        return result

    observed[:nactive] *= _sqrt_scaling
    observed[nactive:] /= _sqrt_scaling

    return functools.partial(_boot_score, X_full, Y, ntotal, _bootQinv, _bootI,
                             nactive, _sqrt_scaling, beta_overall), observed
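# A hedged usage sketch (illustrative only, not part of the original module):
# build a pairs bootstrap sampler for a Gaussian GLM with a hand-picked active
# set on synthetic data. The helper name `_example_pairs_bootstrap_glm` is
# hypothetical; it assumes regreg's `glm.gaussian` constructor imported above.
def _example_pairs_bootstrap_glm():
    np.random.seed(0)
    n, p = 100, 5
    X = np.random.standard_normal((n, p))
    Y = X[:, 0] + np.random.standard_normal(n)

    active = np.zeros(p, bool)
    active[:2] = True                     # pretend these were selected

    loss = glm.gaussian(X, Y)
    boot_sampler, observed = pairs_bootstrap_glm(loss, active, inactive=~active)

    # one bootstrap replicate: resample row indices with replacement
    indices = np.random.choice(n, n, replace=True)
    return boot_sampler(indices), observed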
def pairs_inactive_score_glm(glm_loss,
                             active,
                             beta_active,
                             scaling=1.,
                             inactive=None,
                             solve_args={'min_its': 50, 'tol': 1.e-10}):
    r"""
    Construct a non-parametric bootstrap sampler that
    samples the inactive coordinates of the score of the
    GLM evaluated at the estimates ($\nabla \ell(\bar{\beta}_E)[-E]$) where
    $\bar{\beta}_E$ is padded with zeros where necessary.

    Parameters
    ----------

    glm_loss : regreg.smooth.glm.glm
        The loss of the generalized linear model.

    active : np.bool
        Boolean indexing array

    beta_active : np.float
        Solution to the restricted problem.

    scaling : float
        Scaling to keep entries of roughly constant order. Active entries
        are multiplied by sqrt(scaling); inactive ones are divided by sqrt(scaling).

    inactive : np.bool (optional)
        Which coordinates to return. If None, defaults to ~active.

    solve_args : dict
        Arguments passed to solver of restricted problem (`restricted_estimator`)
        if beta_active is None.

    Returns
    -------

    bootstrap_sampler : callable
        A callable object that takes a sample of indices
        and returns the corresponding bootstrap sample.

    """
    if inactive is None:
        inactive = ~active

    beta_full = np.zeros(glm_loss.shape)
    beta_full[active] = beta_active

    _full_boot_score = pairs_bootstrap_glm(glm_loss,
                                           active,
                                           beta_full=beta_full,
                                           inactive=inactive,
                                           scaling=scaling,
                                           solve_args=solve_args)[0]
    nactive = active.sum()

    def _boot_score(indices):
        return _full_boot_score(indices)[nactive:]

    return _boot_score
def pairs_bootstrap_score(glm_loss,
                          active,
                          beta_active=None,
                          solve_args={'min_its': 50, 'tol': 1.e-10}):
    r"""
    Construct a non-parametric bootstrap sampler that
    samples the full score ($\nabla \ell(\bar{\beta}_E)$) of a generalized
    linear model (GLM) evaluated at the solution restricted to `active`,
    where $\bar{\beta}_E$ is padded with zeros where necessary.

    Parameters
    ----------

    glm_loss : regreg.smooth.glm.glm
        The loss of the generalized linear model.

    active : np.bool
        Boolean indexing array

    beta_active : np.float (optional)
        Solution to the restricted problem.

    solve_args : dict
        Arguments passed to solver of restricted problem (`restricted_estimator`)
        if beta_active is None.

    Returns
    -------

    bootstrap_sampler : callable
        A callable object that takes a sample of indices
        and returns the corresponding bootstrap sample.

    """
    X, Y = glm_loss.data

    if beta_active is None:
        beta_active = restricted_estimator(glm_loss, active, solve_args=solve_args)

    X_active = X[:, active]

    _bootW = np.diag(glm_loss.saturated_loss.hessian(X_active.dot(beta_active)))
    _boot_mu = lambda X_active, beta_active: glm_loss.saturated_loss.mean_function(X_active.dot(beta_active))

    def _boot_score(X, Y, active, beta_active, indices):
        X_star = X[indices]
        Y_star = Y[indices]
        score = -X_star.T.dot(Y_star - _boot_mu(X_star[:, active], beta_active))
        return score

    return functools.partial(_boot_score, X, Y, active, beta_active)
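# A hedged usage sketch (illustrative, hypothetical helper name): draw one
# bootstrap replicate of the full score vector; synthetic Gaussian data as in
# the earlier example.
def _example_pairs_bootstrap_score():
    np.random.seed(0)
    n, p = 100, 5
    X = np.random.standard_normal((n, p))
    Y = X[:, 0] + np.random.standard_normal(n)

    active = np.zeros(p, bool)
    active[:2] = True

    loss = glm.gaussian(X, Y)
    boot_score = pairs_bootstrap_score(loss, active)

    indices = np.random.choice(n, n, replace=True)
    return boot_score(indices)   # p-vector: -X_star^T (Y_star - mu_star)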
def set_alpha_matrix(glm_loss,
                     active,
                     beta_full=None,
                     inactive=None,
                     scaling=1.,
                     solve_args={'min_its': 50, 'tol': 1.e-10}):
    r"""
    Compute the per-observation influence matrix
    $Q_E^{-1} X_E^T \mathrm{diag}(Y - \mu(X\bar{\beta}_E))$
    of the GLM restricted to `active`: column $i$ is observation $i$'s
    contribution to the restricted estimator.

    Parameters
    ----------

    glm_loss : regreg.smooth.glm.glm
        The loss of the generalized linear model.

    active : np.bool
        Boolean indexing array

    beta_full : np.float (optional)
        Solution to the restricted problem, zero except where active is nonzero.

    inactive : np.bool (optional)
        Boolean indexing array

    scaling : float
        Scaling to keep entries of roughly constant order. Active entries
        are multiplied by sqrt(scaling); inactive ones are divided by sqrt(scaling).

    solve_args : dict
        Arguments passed to solver of restricted problem (`restricted_estimator`)
        if beta_full is None.

    Returns
    -------

    alpha_mat : np.float
        Matrix of shape (nactive, n) whose columns are the per-observation
        contributions described above.

    """
    X, Y = glm_loss.data

    if beta_full is None:
        beta_active = restricted_estimator(glm_loss, active, solve_args=solve_args)
        beta_full = np.zeros(glm_loss.shape)
        beta_full[active] = beta_active
    else:
        beta_active = beta_full[active]

    X_active = X[:, active]

    nactive = active.sum()
    ntotal = nactive

    if inactive is not None:
        X_inactive = X[:, inactive]
        ntotal += inactive.sum()

    _W = np.diag(glm_loss.saturated_loss.hessian(X_active.dot(beta_active)))
    _Q = X_active.T.dot(_W.dot(X_active))
    _Qinv = np.linalg.inv(_Q)

    if inactive is not None:
        X_full = np.hstack([X_active, X_inactive])
        beta_overall = np.zeros(X_full.shape[1])
        beta_overall[:nactive] = beta_active
    else:
        X_full = X_active
        beta_overall = beta_active

    obs_residuals = Y - glm_loss.saturated_loss.mean_function(X_full.dot(beta_overall))

    return np.dot(np.dot(_Qinv, X_active.T), np.diag(obs_residuals))
# Methods to form appropriate covariances
def bootstrap_cov(sampler, boot_target, cross_terms=(), nsample=2000):
    """
    m out of n bootstrap.

    Returns estimates of covariance matrices: boot_target with itself,
    and the blocks of (boot_target, boot_other) for other in cross_terms.
    """
    _mean_target = 0.
    if len(cross_terms) > 0:
        _mean_cross = [0.] * len(cross_terms)
        _outer_cross = [0.] * len(cross_terms)
    _outer_target = 0.

    for j in range(nsample):
        indices = sampler()
        _boot_target = boot_target(indices)

        _mean_target += _boot_target
        _outer_target += np.multiply.outer(_boot_target, _boot_target)

        for i, _boot in enumerate(cross_terms):
            _boot_sample = _boot(indices)
            _mean_cross[i] += _boot_sample
            _outer_cross[i] += np.multiply.outer(_boot_target, _boot_sample)

    _mean_target /= nsample
    _outer_target /= nsample

    for i in range(len(cross_terms)):
        _mean_cross[i] /= nsample
        _outer_cross[i] /= nsample

    _cov_target = _outer_target - np.multiply.outer(_mean_target, _mean_target)
    if len(cross_terms) == 0:
        return _cov_target
    return [_cov_target] + [_o - np.multiply.outer(_mean_target, _m)
                            for _m, _o in zip(_mean_cross, _outer_cross)]
def glm_nonparametric_bootstrap(m, n):
    """
    The m out of n bootstrap.
    """
    return functools.partial(bootstrap_cov,
                             lambda: np.random.choice(n, size=(m,), replace=True))
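# A hedged sketch (hypothetical helper name): estimate the bootstrap covariance
# of the restricted estimator by combining `glm_nonparametric_bootstrap` with a
# `pairs_bootstrap_glm` sampler. `nsample` is kept small for illustration; the
# synthetic data match the earlier examples.
def _example_glm_nonparametric_bootstrap():
    np.random.seed(0)
    n, p = 100, 5
    X = np.random.standard_normal((n, p))
    Y = X[:, 0] + np.random.standard_normal(n)

    active = np.zeros(p, bool)
    active[:2] = True

    loss = glm.gaussian(X, Y)
    boot_target, observed = pairs_bootstrap_glm(loss, active)

    cov_constructor = glm_nonparametric_bootstrap(n, n)   # m = n here
    # the partial fixes the index sampler; this call runs bootstrap_cov
    cov_target = cov_constructor(boot_target, nsample=500)
    return cov_target                                     # (nactive, nactive)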
def resid_bootstrap(gaussian_loss,
                    active,           # boolean
                    inactive=None,
                    scaling=1.):
    """
    Residual bootstrap for a Gaussian loss: the sampler takes a
    resampled response Y_star rather than a vector of indices.
    """
    X, Y = gaussian_loss.data
    X_active = X[:, active]

    nactive = active.sum()
    ntotal = nactive

    if inactive is not None:
        X_inactive = X[:, inactive]
        ntotal += inactive.sum()

    X_active_inv = np.linalg.pinv(X_active)
    beta_active = X_active_inv.dot(Y)

    if ntotal > nactive:
        beta_full = np.zeros(X.shape[1])
        beta_full[active] = beta_active
        observed = np.hstack([beta_active,
                              -gaussian_loss.smooth_objective(beta_full, 'grad')[inactive]])
    else:
        observed = beta_active

    if ntotal > nactive:
        X_inactive_resid = X_inactive - X_active.dot(X_active_inv.dot(X_inactive))

    _sqrt_scaling = np.sqrt(scaling)

    def _boot_score(Y_star):
        beta_hat = X_active_inv.dot(Y_star)
        result = np.zeros(ntotal)
        result[:nactive] = beta_hat
        if ntotal > nactive:
            result[nactive:] = X_inactive_resid.T.dot(Y_star)
        result[:nactive] *= _sqrt_scaling
        result[nactive:] /= _sqrt_scaling
        return result

    return _boot_score, observed
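# A hedged sketch (hypothetical helper name): form one residual-bootstrap
# replicate by adding resampled OLS residuals to the fitted values and passing
# the resulting Y_star to the sampler. Synthetic data as in the earlier
# examples.
def _example_resid_bootstrap():
    np.random.seed(0)
    n, p = 100, 5
    X = np.random.standard_normal((n, p))
    Y = X[:, 0] + np.random.standard_normal(n)

    active = np.zeros(p, bool)
    active[:2] = True

    loss = glm.gaussian(X, Y)
    boot_score, observed = resid_bootstrap(loss, active, inactive=~active)

    fitted = X[:, active].dot(np.linalg.pinv(X[:, active]).dot(Y))
    resid = Y - fitted
    Y_star = fitted + resid[np.random.choice(n, n, replace=True)]
    return boot_score(Y_star)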
def parametric_cov(glm_loss,
                   target_with_linear_func,
                   cross_terms=(),
                   dispersion=None,
                   solve_args={'min_its': 50, 'tol': 1.e-10}):
    """
    Parametric (plug-in) covariance of a linear functional of the
    restricted estimator, with optional cross-covariance blocks.
    """
    # cross_terms are different active sets

    target, linear_func = target_with_linear_func
    target_bool = np.zeros(glm_loss.shape, bool)
    target_bool[target] = True
    target = target_bool

    linear_funcT = linear_func.T

    X, Y = glm_loss.data
    n, p = X.shape

    def _WQ(active):
        beta_active = restricted_estimator(glm_loss, active, solve_args=solve_args)
        W = glm_loss.saturated_loss.hessian(X[:, active].dot(beta_active))
        return W

    # weights and Q at the target
    W_T = _WQ(target)
    X_T = X[:, target]
    XW_T = W_T[:, None] * X_T
    Q_T_inv = np.linalg.inv(X_T.T.dot(XW_T))

    beta_T = restricted_estimator(glm_loss, target, solve_args=solve_args)

    # this is Pearson's X^2 dispersion estimator
    if dispersion is None:
        sigma_T = np.sqrt(np.sum((Y - glm_loss.saturated_loss.mean_function(X_T.dot(beta_T)))**2 / W_T) /
                          (n - np.sum(target)))
    else:
        sigma_T = dispersion

    covariances = [linear_func.dot(Q_T_inv).dot(linear_funcT) * sigma_T**2]

    for cross in cross_terms:
        # the covariances are for (\bar{\beta}_{C}, N_C) -- C for cross
        cross_bool = np.zeros(X.shape[1], bool)
        cross_bool[cross] = True
        cross = cross_bool

        X_C = X[:, cross]
        X_IT = X[:, ~cross].T
        Q_C_inv = np.linalg.inv(X_C.T.dot(W_T[:, None] * X_C))
        beta_block = Q_C_inv.dot(X_C.T.dot(XW_T)).dot(Q_T_inv)
        null_block = X_IT.dot(XW_T) - X_IT.dot(W_T[:, None] * X_C).dot(Q_C_inv).dot(X_C.T.dot(XW_T))
        null_block = null_block.dot(Q_T_inv)

        beta_C = restricted_estimator(glm_loss, cross, solve_args=solve_args)
        if dispersion is None:
            sigma_C = sigma_T
            # Hmm... not sure here
            # sigma_C = np.sqrt(np.sum((Y - glm_loss.saturated_loss.mean_function(X_C.dot(beta_C)) / W_C)**2) / (n - np.sum(cross)))
        else:
            sigma_C = dispersion

        covariances.append(np.vstack([beta_block, null_block]).dot(linear_funcT).T * sigma_T * sigma_C)

    return covariances
def glm_parametric_covariance(glm_loss, solve_args={'min_its': 50, 'tol': 1.e-10}):
    """
    A constructor for parametric covariance
    """
    return functools.partial(parametric_cov, glm_loss, solve_args=solve_args)
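# A hedged sketch (hypothetical helper name), derived from reading
# `parametric_cov` above: `target` is an index array into the coefficients and
# `linear_func` a matrix applied to the restricted estimator; here it selects
# both target coordinates. Assumes the saturated loss `hessian` returns a
# length-n weight vector, as the code above requires.
def _example_glm_parametric_covariance():
    np.random.seed(0)
    n, p = 100, 5
    X = np.random.standard_normal((n, p))
    Y = X[:, 0] + np.random.standard_normal(n)

    loss = glm.gaussian(X, Y)
    target = np.array([0, 1])                    # indices of the target set
    linear_func = np.identity(target.shape[0])   # identity functional

    cov_constructor = glm_parametric_covariance(loss)
    (cov_target,) = cov_constructor((target, linear_func))
    return cov_target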
def standard_split_ci(glm_loss, X, y, active, leftout_indices, alpha=0.1):
    """
    Data splitting confidence intervals via bootstrap.
    """
    loss = glm_loss(X[leftout_indices], y[leftout_indices])
    boot_target, target_observed = pairs_bootstrap_glm(loss, active)
    nactive = np.sum(active)
    size = np.sum(leftout_indices)
    observed = target_observed[:nactive]
    boot_target_restricted = lambda indices: boot_target(indices)[:nactive]
    sampler = lambda: np.random.choice(size, size=(size,), replace=True)

    target_cov = bootstrap_cov(sampler, boot_target_restricted)

    quantile = -ndist.ppf(alpha / 2.)
    LU = np.zeros((2, target_observed.shape[0]))
    for j in range(observed.shape[0]):
        sigma = np.sqrt(target_cov[j, j])
        LU[0, j] = observed[j] - sigma * quantile
        LU[1, j] = observed[j] + sigma * quantile
    return LU.T
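# A hedged sketch (hypothetical helper name): data-splitting intervals. Note
# that `glm_loss` here is the loss *constructor* (e.g. regreg's `glm.gaussian`),
# not an instance, since `standard_split_ci` re-fits the loss on the held-out
# rows. The 50/50 split and the active set below are arbitrary illustrations.
def _example_standard_split_ci():
    np.random.seed(0)
    n, p = 100, 5
    X = np.random.standard_normal((n, p))
    Y = X[:, 0] + np.random.standard_normal(n)

    active = np.zeros(p, bool)
    active[:2] = True                 # pretend this was selected on half the data

    leftout = np.zeros(n, bool)
    leftout[n // 2:] = True           # rows not used for selection

    return standard_split_ci(glm.gaussian, X, Y, active, leftout, alpha=0.1)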