from __future__ import print_function
import functools
from copy import copy
import numpy as np
from scipy.stats import norm as ndist
import regreg.api as rr
from .query import query, affine_gaussian_sampler
from .randomization import randomization
from ..base import restricted_estimator
from ..algorithms.debiased_lasso import (debiasing_matrix,
pseudoinverse_debiasing_matrix)
from ..algorithms.sqrt_lasso import solve_sqrt_lasso
#### High dimensional version
#### - parametric covariance
#### - Gaussian randomization
class group_lasso(query):
r"""
A class for the randomized group LASSO for post-selection inference.
The problem solved is
.. math::
    \text{minimize}_{\beta} \ell(\beta) +
    \sum_{g \in G} \lambda_g \|\beta_g\|_2 - \omega^T\beta + \frac{\epsilon}{2} \|\beta\|^2_2
where $\lambda_g$ are the group weights in `weights`, $\omega$ is a randomization generated below
and the last term is a small ridge penalty. Each static method
forms $\ell$ as well as the group LASSO penalty. The generic class
forms the remaining two terms in the objective.
"""
def __init__(self,
loglike,
groups,
weights,
ridge_term,
randomizer,
perturb=None):
r"""
Create a new post-selection object for the group LASSO problem
Parameters
----------
loglike : `regreg.smooth.glm.glm`
A (negative) log-likelihood as implemented in `regreg`.
groups : np.ndarray
Group labels, one label per feature.
weights : dict
Weight on the norm of each group, keyed by group label;
a weight of 0 leaves that group unpenalized.
ridge_term : float
How big a ridge term to add?
randomizer : object
Randomizer -- contains representation of randomization density.
perturb : np.ndarray
Random perturbation subtracted as a linear
term in the objective function.
"""
self.loglike = loglike
self.nfeature = p = self.loglike.shape[0]
self.ridge_term = ridge_term
self.penalty = rr.group_lasso(groups,
weights=weights,
lagrange=1.)
self._initial_omega = perturb # random perturbation
self.randomizer = randomizer
def fit(self,
solve_args={'tol': 1.e-12, 'min_its': 50},
perturb=None):
"""
Fit the randomized lasso using `regreg`.
Parameters
----------
solve_args : keyword args
Passed to `regreg.problems.simple_problem.solve`.
Returns
-------
selection_variable : dict
Dictionary containing the active groups and the observed
unit-norm direction of the solution within each active group.
"""
p = self.nfeature
(self.initial_soln,
self.initial_subgrad) = self._solve_randomized_problem(
perturb=perturb,
solve_args=solve_args)
# assuming solution is non-zero here!
active = []
active_dirs = {}
unpenalized = []
overall = np.ones(p, bool)
ordered_groups = []
ordered_opt = []
ordered_vars = []
tol = 1.e-6
for g in sorted(np.unique(self.penalty.groups)):
group = self.penalty.groups == g
soln = self.initial_soln
if np.linalg.norm(soln[group]) > tol * np.linalg.norm(soln):
ordered_groups.append(g)
ordered_vars.extend(np.nonzero(group)[0])
if self.penalty.weights[g] == 0:
unpenalized.append(g)
ordered_opt.append(soln[group])
else:
active.append(g)
dir = soln[group] / np.linalg.norm(soln[group])
active_dirs[g] = dir
ordered_opt.append(np.linalg.norm(soln[group]) - self.penalty.weights[g])
else:
overall[group] = False
self.selection_variable = {'directions': active_dirs,
'active_groups':active}
self._ordered_groups = ordered_groups
# initial state for opt variables
self.observed_opt_state = np.hstack(ordered_opt)
_beta_unpenalized = restricted_estimator(self.loglike,
overall,
solve_args=solve_args)
beta_bar = np.zeros(p)
beta_bar[overall] = _beta_unpenalized
self._beta_full = beta_bar
# form linear part
num_opt_var = self.observed_opt_state.shape[0]
# setup hessian and ridge term
X, y = self.loglike.data
W = self._W = self.loglike.saturated_loss.hessian(X.dot(beta_bar))
opt_linear = np.dot(X.T, X[:, ordered_vars] * W[:, None])
# set the observed score (data dependent) state
# observed_score_state is
# \nabla \ell(\bar{\beta}_E) + Q(\bar{\beta}_E) \bar{\beta}_E
# in linear regression this is _ALWAYS_ -X^TY
#
# should be asymptotically equivalent to
# \nabla \ell(\beta^*) + Q(\beta^*)\beta^*
self.observed_score_state = -opt_linear.dot(_beta_unpenalized)
self.observed_score_state[~overall] += self.loglike.smooth_objective(beta_bar, 'grad')[~overall]
# now adjust for ridge term
for i, var in enumerate(ordered_vars):
opt_linear[var, i] += self.ridge_term
opt_offset = self.initial_subgrad
# for group LASSO, we will have
# a different sampler for each group
# based on conditioning on all scalings
# except that group
# means targets will also need to say what group
# they are "targeting"...
self._samplers = {}
# dispersion will have to be adjusted for data splitting
dispersion = 1.
(prec_opt_linear,
logdens_linear) = self._get_precision_opt_linear(opt_linear,
ordered_vars,
dispersion)
for group, dens_info in _reference_density_info(soln,
ordered_groups,
ordered_vars,
opt_linear,
opt_offset,
self.observed_score_state,
self.initial_subgrad,
self.penalty,
prec_opt_linear).items():
(dir_g,
idx_g,
implied_mean,
implied_variance,
log_det,
log_cond_density) = dens_info
group_idx = self.penalty.groups == group
initial_scaling = np.linalg.norm(soln[group_idx]) - self.penalty.weights[group]
sampler = polynomial_gaussian_sampler(implied_mean,
implied_variance,
initial_scaling,
self.observed_score_state,
log_cond_density,
log_det,
(np.atleast_2d(logdens_linear.T[:,idx_g].dot(dir_g).T),
opt_offset))
self._samplers[group] = sampler
self._setup = True
return self.selection_variable
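# A hedged usage sketch (hypothetical data): after construction via one of the
# static methods below, `fit` solves the randomized problem and records one
# conditional sampler per active group in `self._samplers`.
#
#     conv = group_lasso.gaussian(X, Y, groups, weights)
#     signs = conv.fit()
#     active_groups = signs['active_groups']
#     directions = signs['directions']          # unit vector per active group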
def summary(self,
observed_target,
group_assignments,
target_cov,
target_score_cov,
alternatives,
parameter=None,
level=0.9,
ndraw=10000,
compute_intervals=False):
# inference is carried out group by group,
# using the sampler conditioned for that group
if parameter is None:
parameter = np.zeros_like(observed_target)
pvalues = np.zeros_like(observed_target)
pivots = np.zeros_like(observed_target)
intervals = np.zeros((parameter.shape[0], 2))
for group in np.unique(group_assignments):
group_idx = group_assignments == group
(pivots_,
pvalues_,
intervals_) = self._inference_for_target(
observed_target[group_idx],
group,
target_cov[group_idx][:, group_idx],
target_score_cov[group_idx],
[alternatives[i] for i in np.nonzero(group_idx)[0]],
parameter=parameter[group_idx],
level=level,
ndraw=ndraw,
compute_intervals=compute_intervals)
pvalues[group_idx] = pvalues_
pivots[group_idx] = pivots_
if intervals_ is not None:
    intervals[group_idx] = np.array(intervals_)
return pvalues, pivots, intervals
def _inference_for_target(self,
observed_target,
group,
target_cov,
target_score_cov,
alternatives,
opt_sample=None,
target_sample=None,
parameter=None,
level=0.9,
ndraw=10000,
compute_intervals=False):
"""
Produce p-values and confidence intervals for targets
of model including selected features
Parameters
----------
target : one of ['selected', 'full']
features : np.bool
Binary encoding of which features to use in final
model and targets.
parameter : np.array
Hypothesized value for parameter -- defaults to 0.
level : float
Confidence level.
ndraw : int (optional)
Defaults to 1000.
compute_intervals : bool
Compute confidence intervals?
dispersion : float (optional)
Use a known value for dispersion, or Pearson's X^2?
"""
sampler = self._samplers[group]
if parameter is None:
parameter = np.zeros_like(observed_target)
if opt_sample is None:
opt_sample, logW = sampler.sample(ndraw)
else:
ndraw = opt_sample.shape[0]
pivots = sampler.coefficient_pvalues(observed_target,
target_cov,
target_score_cov,
parameter=parameter,
sample=(opt_sample, logW),
normal_sample=target_sample,
alternatives=alternatives)
if not np.all(parameter == 0):
pvalues = sampler.coefficient_pvalues(observed_target,
target_cov,
target_score_cov,
parameter=np.zeros_like(parameter),
sample=(opt_sample, logW),
normal_sample=target_sample,
alternatives=alternatives)
else:
pvalues = pivots
intervals = None
if compute_intervals:
intervals = sampler.confidence_intervals(observed_target,
target_cov,
target_score_cov,
sample=(opt_sample, logW),
normal_sample=target_sample,
level=level)
return pivots, pvalues, intervals
def _get_precision_opt_linear(self, opt_linear, variables, dispersion=1):
"""
Precision of randomization times columns of restricted Hessian
"""
_, prec = self.randomizer.cov_prec
if np.asarray(prec).shape in [(), (0,)]:
value = prec * opt_linear / dispersion
else:
value = prec.dot(opt_linear) / dispersion
cond_precision = opt_linear.T.dot(value)
cond_cov = np.linalg.inv(cond_precision)
logdens_linear = cond_cov.dot(value.T) * dispersion # is this last dispersion correct?
return value, logdens_linear
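# In the notation of the code above, with A = opt_linear and Sigma_omega the
# randomization covariance, the two returned quantities are (up to dispersion)
#
#     value          = Sigma_omega^{-1} A
#     logdens_linear = (A^T Sigma_omega^{-1} A)^{-1} A^T Sigma_omega^{-1}
#
# i.e. the precision-weighted columns of the restricted Hessian, and the linear
# map used to build each group's `logdens_transform` for `polynomial_gaussian_sampler`.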
def _solve_randomized_problem(self,
perturb=None,
solve_args={'tol': 1.e-12, 'min_its': 50}):
# take a new perturbation if supplied
if perturb is not None:
self._initial_omega = perturb
if self._initial_omega is None:
self._initial_omega = self.randomizer.sample()
quad = rr.identity_quadratic(self.ridge_term,
0,
-self._initial_omega,
0)
problem = rr.simple_problem(self.loglike, self.penalty)
initial_soln = problem.solve(quad, **solve_args)
initial_subgrad = -(self.loglike.smooth_objective(initial_soln,
'grad') +
quad.objective(initial_soln, 'grad'))
return initial_soln, initial_subgrad
@staticmethod
def gaussian(X,
Y,
groups,
weights,
sigma=1.,
quadratic=None,
ridge_term=None,
randomizer_scale=None):
r"""
Squared-error LASSO with feature weights.
Objective function is (before randomization)
.. math::
\beta \mapsto \frac{1}{2} \|Y-X\beta\|^2_2 + \sum_{g \in G} \lambda_g \|\beta_g\|_2
where $\lambda_g$ are the group weights in `weights`. The ridge term
is determined by the Hessian and `np.std(Y)` by default,
as is the randomizer scale.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
Y : ndarray
Shape (n,) -- the response.
groups : np.ndarray
Group labels, one label per feature.
weights : dict
Weight on the norm of each group, keyed by group label. An intercept,
or other unpenalized groups are handled by setting the
corresponding weights to 0.
sigma : float (optional)
Noise variance. This scales the loglikelihood by `sigma**(-2)`
and the group weights by the same factor.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
ridge_term : float
How big a ridge term to add?
randomizer_scale : float
Scale for IID components of randomizer.
Returns
-------
L : `group_lasso`
"""
loglike = rr.glm.gaussian(X, Y, coef=1. / sigma ** 2, quadratic=quadratic)
n, p = X.shape
mean_diag = np.mean((X ** 2).sum(0))
if ridge_term is None:
ridge_term = np.std(Y) * np.sqrt(mean_diag) / np.sqrt(n - 1)
if randomizer_scale is None:
randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(Y) * np.sqrt(n / (n - 1.))
randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
if sigma != 1:
    weights = copy(weights)
    for k in weights.keys():
        weights[k] = weights[k] / sigma**2
return group_lasso(loglike,
groups,
weights,
ridge_term,
randomizer)
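# A small usage sketch (hypothetical data), relying on the default ridge term and
# randomizer scale derived from `np.std(Y)` and the mean diagonal of X^T X:
#
#     n, p = 200, 20
#     X = np.random.standard_normal((n, p))
#     Y = np.random.standard_normal(n)
#     groups = np.arange(p) // 2                        # groups of size 2
#     weights = dict((g, 3.) for g in np.unique(groups))
#     conv = group_lasso.gaussian(X, Y, groups, weights)
#     signs = conv.fit()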
@staticmethod
def logistic(X,
successes,
groups,
weights,
trials=None,
quadratic=None,
ridge_term=None,
randomizer_scale=None):
r"""
Logistic LASSO with feature weights (before randomization)
.. math::
\beta \mapsto \ell(X\beta) + \sum_{g \in G} \lambda_g \|\beta_g\|_2
where $\ell$ is the negative of the logistic
log-likelihood (half the logistic deviance)
and $\lambda_g$ are the group weights in `weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
successes : ndarray
Shape (n,) -- response vector. An integer number of successes.
For data that is proportions, multiply the proportions
by the number of trials first.
groups : np.ndarray
Group labels, one label per feature.
weights : dict
Weight on the norm of each group, keyed by group label. An intercept,
or other unpenalized groups are handled by setting the
corresponding weights to 0.
trials : ndarray (optional)
Number of trials per response, defaults to
ones the same shape as Y.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
ridge_term : float
How big a ridge term to add?
randomizer_scale : float
Scale for IID components of randomizer.
Returns
-------
L : `group_lasso`
"""
n, p = X.shape
loglike = rr.glm.logistic(X, successes, trials=trials, quadratic=quadratic)
mean_diag = np.mean((X ** 2).sum(0))
if ridge_term is None:
ridge_term = np.std(successes) * np.sqrt(mean_diag) / np.sqrt(n - 1)
if randomizer_scale is None:
randomizer_scale = np.sqrt(mean_diag) * 0.5
randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
return group_lasso(loglike,
groups,
weights,
ridge_term,
randomizer)
@staticmethod
def coxph(X,
times,
status,
groups,
weights,
quadratic=None,
ridge_term=None,
randomizer_scale=None):
r"""
Cox proportional hazards LASSO with feature weights.
Objective function is (before randomization)
.. math::
\beta \mapsto \ell^{\text{Cox}}(\beta) +
\sum_{g \in G} \lambda_g \|\beta_g\|_2
where $\ell^{\text{Cox}}$ is the
negative of the log of the Cox partial
likelihood and $\lambda_g$ are the group weights in `weights`.
Uses Efron's tie breaking method.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
times : ndarray
Shape (n,) -- the survival times.
status : ndarray
Shape (n,) -- the censoring status.
groups : np.ndarray
Group labels, one label per feature.
weights : dict
Weight on the norm of each group, keyed by group label. An intercept,
or other unpenalized groups are handled by setting the
corresponding weights to 0.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
ridge_term : float
How big a ridge term to add?
randomizer_scale : float
Scale for IID components of randomizer.
Returns
-------
L : `group_lasso`
"""
n, p = X.shape
loglike = coxph_obj(X, times, status, quadratic=quadratic)
# scale for randomization seems kind of meaningless here...
mean_diag = np.mean((X ** 2).sum(0))
if ridge_term is None:
    ridge_term = np.std(times) * np.sqrt(mean_diag) / np.sqrt(n - 1)
if randomizer_scale is None:
    randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(times) * np.sqrt(n / (n - 1.))
randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
return group_lasso(loglike,
groups,
weights,
ridge_term,
randomizer)
@staticmethod
def poisson(X,
counts,
groups,
weights,
quadratic=None,
ridge_term=None,
randomizer_scale=None):
r"""
Poisson log-linear LASSO with feature weights.
Objective function is (before randomization)
.. math::
\beta \mapsto \ell^{\text{Poisson}}(\beta) + \sum_{g \in G} \lambda_g \|\beta_g\|_2
where $\ell^{\text{Poisson}}$ is the negative
of the log of the Poisson likelihood (half the deviance)
and $\lambda_g$ are the group weights in `weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
counts : ndarray
Shape (n,) -- the response.
groups : np.ndarray
Group labels, one label per feature.
weights : dict
Weight on the norm of each group, keyed by group label. An intercept,
or other unpenalized groups are handled by setting the
corresponding weights to 0.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
ridge_term : float
How big a ridge term to add?
randomizer_scale : float
Scale for IID components of randomizer.
Returns
-------
L : `group_lasso`
"""
n, p = X.shape
loglike = rr.glm.poisson(X, counts, quadratic=quadratic)
# scale for randomizer seems kind of meaningless here...
mean_diag = np.mean((X ** 2).sum(0))
if ridge_term is None:
ridge_term = np.std(counts) * np.sqrt(mean_diag) / np.sqrt(n - 1)
if randomizer_scale is None:
randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(counts) * np.sqrt(n / (n - 1.))
randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
return group_lasso(loglike,
groups,
weights,
ridge_term,
randomizer)
@staticmethod
def sqrt_lasso(X,
Y,
groups,
weights,
quadratic=None,
ridge_term=None,
randomizer_scale=None,
solve_args={'min_its': 200},
perturb=None):
r"""
Use sqrt-LASSO to choose variables.
Objective function is (before randomization)
.. math::
\beta \mapsto \|Y-X\beta\|_2 + \sum_{g \in G} \lambda_g \|\beta_g\|_2
where $\lambda_g$ are the group weights in `weights`. After solving the problem
treat as if `gaussian` with implied variance and choice of
multiplier. See arxiv.org/abs/1504.08031 for details.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
Y : ndarray
Shape (n,) -- the response.
groups : np.ndarray
Group labels, one label per feature.
weights : dict
Weight on the norm of each group, keyed by group label. An intercept,
or other unpenalized groups are handled by setting the
corresponding weights to 0.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
solve_args : dict
Arguments passed to solver.
ridge_term : float
How big a ridge term to add?
randomizer_scale : float
Scale for IID components of randomizer.
Returns
-------
L : `group_lasso`
Notes
-----
Unlike other variants of LASSO, this
solves the problem on construction as the active
set is needed to find equivalent gaussian LASSO.
Assumes parametric model is correct for inference,
i.e. does not accept a covariance estimator.
"""
n, p = X.shape
# per-feature weights for the sqrt-LASSO solver, taken from the group weights
feature_weights = np.zeros(p)
for g in np.unique(groups):
    feature_weights[groups == g] = weights[g]
mean_diag = np.mean((X ** 2).sum(0))
if ridge_term is None:
ridge_term = np.sqrt(mean_diag) / (n - 1)
if randomizer_scale is None:
randomizer_scale = 0.5 * np.sqrt(mean_diag) / np.sqrt(n - 1)
if perturb is None:
perturb = np.random.standard_normal(p) * randomizer_scale
randomQ = rr.identity_quadratic(ridge_term, 0, -perturb, 0) # a ridge + linear term
if quadratic is not None:
totalQ = randomQ + quadratic
else:
totalQ = randomQ
soln, sqrt_loss = solve_sqrt_lasso(X,
Y,
weights=feature_weights,
quadratic=totalQ,
solve_args=solve_args,
force_fat=True)
denom = np.linalg.norm(Y - X.dot(soln))
loglike = rr.glm.gaussian(X, Y)
randomizer = randomization.isotropic_gaussian((p,), randomizer_scale * denom)
weights = copy(weights)
for k in weights.keys():
weights[k] = weights[k] * denom
obj = group_lasso(loglike,
groups,
weights,
ridge_term * denom,
randomizer,
perturb=perturb * denom)
obj._sqrt_soln = soln
return obj
# private functions
def _reference_density_info(soln,
ordered_groups, # ordering is used in assumptions about columns opt_linear
ordered_variables,
opt_linear,
opt_offset,
observed_score_state,
observed_subgrad,
group_lasso_penalty,
# randomization precision times opt_linear
precision_opt_linear,
tol=1.e-6):
r'''
For each active group $g$, compute generalized eigenvalues and return
a function to evaluate the Jacobian as a function of $r_g=\|z_g\|_2$,
fixing everything in the optimization variables except $r_g$.
Above, $A_0$ is the Hessian of the loss evaluated at an appropriate point.
'''
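# Reading off the code below: for an active group g with weight lambda_g,
# observed scaling r_g^* = ||z_g||_2, size k_g and eigenvalues gamma_j of the
# projected diagonal block of inv(Hr), the log-Jacobian at a candidate scaling
# r has the form
#
#     log J_g(r) = (k_g - 1) log(lambda_g + r)
#                  + sum_j log(1 + f_j (r - r_g^*) / (lambda_g + r)),
#     f_j = lambda_g / ((gamma_j + 1) r_g^*)
#
# which is the `logdet_g` term added to the 1-dimensional Gaussian log-density
# in `log_cond_density`.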
pen = group_lasso_penalty # shorthand
nz = soln != 0 # nonzero
nnz = nz.sum() # num nonzero
Hr = np.zeros((nnz, nnz)) # restricted hessian
nz_groups = []
for group in ordered_groups:
group_idx = pen.groups == group
group_soln = soln[pen.groups == group]
ng = group_idx.sum()
group_direction = u_g = group_soln / np.linalg.norm(group_soln)
group_norm = r_g = np.linalg.norm(group_soln) # really r_g^*
group_weight = lambda_g = pen.weights[group]
fraction = np.sqrt(r_g / (lambda_g + r_g))
# one of the blocks in D_0^{1/2}
group_block = (np.identity(ng) * fraction + (1 - fraction) *
np.multiply.outer(u_g, u_g))
group_P = np.identity(ng) - np.multiply.outer(u_g, u_g)
nz_groups.append((group, # a group index g
group_idx, # indices where group==idx
group_block,
group_P,
r_g,
lambda_g,
group_direction)
)
# setup the block hessian Hr=D_0^{1/2}A_0D_0^{1/2}
ctr_g = 0
for group_g in nz_groups:
which_idx_g, sqrt_g = group_g[1], group_g[2]
idx_g = slice(ctr_g, ctr_g + which_idx_g.sum())
ctr_h = 0
for group_h in nz_groups:
which_idx_h, sqrt_h = group_h[1], group_h[2]
idx_h = slice(ctr_h, ctr_h + which_idx_h.sum())
H_hg = opt_linear[which_idx_h][:,idx_g] # E columns of Hessian + epsilon I
assert(np.allclose(H_hg, opt_linear[which_idx_g][:,idx_h].T))
if group_h[0] == group_g[0]: # subtract the identity matrix
H_hg -= np.identity(H_hg.shape[0])
Hr[idx_g][:,idx_h] += sqrt_h.dot(H_hg).dot(sqrt_g).T # multiply left and right by D_0^{1/2}
ctr_h += which_idx_h.sum()
ctr_g += which_idx_g.sum()
implied_precision = np.dot(opt_linear.T,
precision_opt_linear)
# invert the block hessian Hr
final_matrix = np.linalg.inv(Hr)
ctr_g = 0
ref_dens_info = {}
for group_g in nz_groups:
which_g, which_idx_g, _, P_g, r_g, lambda_g, u_g = group_g
idx_g = slice(ctr_g, ctr_g + which_idx_g.sum())
if which_idx_g.sum() > 1:
block_g = final_matrix[idx_g][:,idx_g]
block_g = P_g.dot(block_g).dot(P_g)
# \tilde{\gamma}'s
eigvals_g = np.linalg.eigvalsh(block_g)[1:]
# factors in the determinant
factors_g = lambda_g / ((eigvals_g + 1) * r_g)
k_g = which_idx_g.sum()
def logdet_g(factors_g, r_g, k_g, lambda_g, r):
r = np.reshape(r, (-1))
num = np.multiply.outer(factors_g, r - r_g)
den = np.add.outer(lambda_g * np.ones_like(factors_g),
r)
return np.squeeze(np.log(1 + num / den).sum(0) +
np.log(lambda_g + r) * (k_g - 1))
logdet_g = functools.partial(logdet_g,
factors_g,
r_g,
k_g,
lambda_g)
else:
logdet_g = lambda r: np.zeros_like(r).reshape(-1)
implied_variance = 1 / (np.dot(implied_precision[idx_g][:,idx_g],
u_g) * u_g).sum()
# zero out this group's coordinate in the solution
soln_idx = soln.copy()[ordered_variables]
soln_idx[idx_g] = 0
offset = (observed_subgrad +
opt_linear.dot(soln_idx))
direction = precision_opt_linear[:,idx_g].dot(u_g)
num_implied_mean = -((observed_score_state + offset) *
direction).sum()
implied_mean = num_implied_mean * implied_variance
def log_cond_density(offset,
direction,
implied_variance,
log_det,
r,
score_state):
r = np.reshape(r, (-1,))
num_implied_mean = ((score_state + offset) * direction).sum()
implied_mean = num_implied_mean * implied_variance
return np.squeeze(
(log_det(r) - 0.5 *
(r - implied_mean)**2 / implied_variance))
log_cond_density_g = functools.partial(log_cond_density,
offset,
direction,
implied_variance,
logdet_g)
ref_dens_info[which_g] = (u_g,
idx_g,
implied_mean,
implied_variance,
logdet_g,
log_cond_density_g)
ctr_g += which_idx_g.sum()
return ref_dens_info
class polynomial_gaussian_sampler(affine_gaussian_sampler):
r"""
1-dimensional Gaussian density restricted to $[0,\infty)$ times a polynomial
"""
def __init__(self,
implied_mean,
implied_covariance,
initial_point,
observed_score_state,
log_cond_density,
log_det,
logdens_transform, # describes how score enters log_density.
selection_info=None,
useC=False):
self.mean = implied_mean
self.covariance = np.zeros((1, 1))
self.covariance[0, 0] = implied_covariance
self._log_cond_density = log_cond_density
self._log_det = log_det
self.initial_point = initial_point
self.observed_score_state = observed_score_state
self.selection_info = selection_info
self.logdens_transform = logdens_transform
def sample(self, ndraw):
'''
Sample from a Gaussian truncated at zero
with our mean and covariance,
but give weight based on `self.log_det`
Parameters
----------
ndraw : int
How long a chain to return?
'''
mean, variance = self.mean, self.covariance[0,0]
sd = np.sqrt(variance)
Zscore = mean / sd
selection_prob = ndist.sf(-Zscore)
truncated_Z = ndist.ppf((1 - selection_prob) + selection_prob * np.random.sample(ndraw))
truncated_normal = truncated_Z * sd + mean
logW = self._log_det(truncated_normal)
return truncated_normal.reshape((-1,1)), logW
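# A brief note on the draw above: `truncated_normal` is an exact inverse-CDF
# sample from N(mean, sd^2) restricted to [0, oo), and `logW` carries the
# log-Jacobian (`self._log_det`) evaluated at each draw, so downstream pivots
# and intervals can reweight the truncated-Gaussian reference by that
# polynomial factor.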
def selective_MLE(self,
observed_target,
target_cov,
target_score_cov,
# initial (observed) value of optimization variables --
# used as a feasible point.
# precise value used only for independent estimator
init_soln,
solve_args={'tol':1.e-12},
level=0.9):
raise NotImplementedError
def reparam_map(self,
parameter_target,
observed_target,
target_cov,
target_score_cov,
init_soln,
solve_args={'tol':1.e-12},
useC=True):
raise NotImplementedError
def _log_density_ray(self,
candidate,
direction,
nuisance,
gaussian_sample,
opt_sample):
value = affine_gaussian_sampler._log_density_ray(self,
candidate,
direction,
nuisance,
gaussian_sample,
opt_sample)
value += self._log_det(opt_sample)
return value
# functions to construct targets of inference
# and covariance with score representation
def selected_targets(loglike,
W,
active_groups,
penalty,
sign_info={},
dispersion=None,
solve_args={'tol': 1.e-12, 'min_its': 50}):
X, y = loglike.data
n, p = X.shape
features = []
group_assignments = []
for group in active_groups:
group_idx = penalty.groups == group
features.extend(np.nonzero(group_idx)[0])
group_assignments.extend([group] * group_idx.sum())
Xfeat = X[:, features]
Qfeat = Xfeat.T.dot(W[:, None] * Xfeat)
observed_target = restricted_estimator(loglike, features, solve_args=solve_args)
cov_target = np.linalg.inv(Qfeat)
_score_linear = -Xfeat.T.dot(W[:, None] * X).T
crosscov_target_score = _score_linear.dot(cov_target)
alternatives = ['twosided'] * len(features)
if dispersion is None: # use Pearson's X^2
dispersion = ((y - loglike.saturated_loss.mean_function(
Xfeat.dot(observed_target))) ** 2 / W).sum() / (n - Xfeat.shape[1])
return (observed_target,
group_assignments,
cov_target * dispersion,
crosscov_target_score.T * dispersion,
alternatives)
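# A hedged end-to-end sketch (hypothetical data) combining `selected_targets`
# with `group_lasso.summary`; `conv._W` holds the GLM weights computed in `fit`:
#
#     conv = group_lasso.gaussian(X, Y, groups, weights)
#     signs = conv.fit()
#     (observed_target,
#      group_assignments,
#      cov_target,
#      crosscov_target_score,
#      alternatives) = selected_targets(conv.loglike, conv._W,
#                                       signs['active_groups'], conv.penalty)
#     pvalues, pivots, intervals = conv.summary(observed_target,
#                                               group_assignments,
#                                               cov_target,
#                                               crosscov_target_score,
#                                               alternatives,
#                                               compute_intervals=True)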
def full_targets(loglike,
W,
active_groups,
penalty,
dispersion=None,
solve_args={'tol': 1.e-12, 'min_its': 50}):
X, y = loglike.data
n, p = X.shape
features = []
group_assignments = []
for group in active_groups:
group_idx = penalty.groups == group
features.extend(np.nonzero(group_idx)[0])
group_assignments.extend([group] * group_idx.sum())
# target is one-step estimator
Qfull = X.T.dot(W[:, None] * X)
Qfull_inv = np.linalg.inv(Qfull)
full_estimator = loglike.solve(**solve_args)
cov_target = Qfull_inv[features][:, features]
observed_target = full_estimator[features]
crosscov_target_score = np.zeros((p, cov_target.shape[0]))
crosscov_target_score[features] = -np.identity(cov_target.shape[0])
if dispersion is None: # use Pearson's X^2
dispersion = (((y - loglike.saturated_loss.mean_function(X.dot(full_estimator))) ** 2 / W).sum() /
(n - p))
alternatives = ['twosided'] * len(features)
return (observed_target,
group_assignments,
cov_target * dispersion,
crosscov_target_score.T * dispersion,
alternatives)
def debiased_targets(loglike,
W,
active_groups,
penalty,
sign_info={},
dispersion=None,
approximate_inverse='JM',
debiasing_args={}):
X, y = loglike.data
n, p = X.shape
features = []
group_assignments = []
for group in active_groups:
group_idx = penalty.groups == group
features.extend(np.nonzero(group_idx)[0])
group_assignments.extend([group] * group_idx.sum())
# relevant rows of approximate inverse
if approximate_inverse == 'JM':
Qinv_hat = np.atleast_2d(debiasing_matrix(X * np.sqrt(W)[:, None],
features,
**debiasing_args)) / n
else:
Qinv_hat = np.atleast_2d(pseudoinverse_debiasing_matrix(X * np.sqrt(W)[:, None],
features,
**debiasing_args))
problem = rr.simple_problem(loglike, penalty)
nonrand_soln = problem.solve()
G_nonrand = loglike.smooth_objective(nonrand_soln, 'grad')
observed_target = nonrand_soln[features] - Qinv_hat.dot(G_nonrand)
if p > n:
M1 = Qinv_hat.dot(X.T)
cov_target = (M1 * W[None, :]).dot(M1.T)
crosscov_target_score = -(M1 * W[None, :]).dot(X).T
else:
Qfull = X.T.dot(W[:, None] * X)
cov_target = Qinv_hat.dot(Qfull.dot(Qinv_hat.T))
crosscov_target_score = -Qinv_hat.dot(Qfull).T
if dispersion is None: # use Pearson's X^2
Xfeat = X[:, features]
Qrelax = Xfeat.T.dot(W[:, None] * Xfeat)
relaxed_soln = nonrand_soln[features] - np.linalg.inv(Qrelax).dot(G_nonrand[features])
dispersion = (((y - loglike.saturated_loss.mean_function(Xfeat.dot(relaxed_soln)))**2 / W).sum() /
(n - len(features)))
alternatives = ['twosided'] * len(features)
return (observed_target,
group_assignments,
cov_target * dispersion,
crosscov_target_score.T * dispersion,
alternatives)