"""
This module contains a class `lasso`_ that implements
post-selection inference for the LASSO,
as described in `post selection LASSO`_.
.. _covTest: http://arxiv.org/abs/1301.7161
.. _Kac Rice: http://arxiv.org/abs/1308.3020
.. _Spacings: http://arxiv.org/abs/1401.3889
.. _post selection LASSO: http://arxiv.org/abs/1311.6238
.. _sample carving: http://arxiv.org/abs/1410.2597
"""
from __future__ import division
import warnings, functools
from copy import copy
import numpy as np
import pandas as pd
from scipy.stats import norm as ndist, t as tdist
from scipy.linalg import block_diag
from regreg.api import (glm,
weighted_l1norm,
simple_problem,
coxph as coxph_obj,
smooth_sum,
squared_error,
identity_quadratic,
quadratic_loss)
from .sqrt_lasso import solve_sqrt_lasso, estimate_sigma
from .debiased_lasso import (debiasing_matrix,
pseudoinverse_debiasing_matrix)
from ..constraints.affine import (constraints,
selection_interval,
interval_constraints,
sample_from_constraints,
gibbs_test,
stack)
from ..distributions.discrete_family import discrete_family
from ..truncated.gaussian import truncated_gaussian_old as TG
from ..glm import pairs_bootstrap_glm
class lasso(object):
r"""
A class for the LASSO for post-selection inference.
The problem solved is
.. math::
\text{minimize}_{\beta} \frac{1}{2n} \|y-X\beta\|^2_2 +
\lambda \|\beta\|_1
where $\lambda$ is given by `feature_weights`.
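Examples
--------
A minimal sketch on synthetic data (the import path follows the
`selection.algorithms.lasso` naming used in the Returns sections below;
the weight 3. is an arbitrary illustrative choice):

>>> import numpy as np
>>> from selection.algorithms.lasso import lasso
>>> rng = np.random.RandomState(0)
>>> X = rng.standard_normal((100, 10))
>>> y = X[:, 0] + rng.standard_normal(100)
>>> L = lasso.gaussian(X, y, feature_weights=3., sigma=1.)
>>> soln = L.fit()
>>> summary = L.summary('twosided', compute_intervals=True)  # one row per active variable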
"""
def __init__(self,
loglike,
feature_weights,
covariance_estimator=None,
ignore_inactive_constraints=False):
r"""
Create a new post-selection object for the LASSO problem
Parameters
----------
loglike : `regreg.smooth.glm.glm`
A (negative) log-likelihood as implemented in `regreg`.
feature_weights : np.ndarray
Feature weights for L-1 penalty. If a float,
it is broadcast to all features.
covariance_estimator : callable (optional)
If None, use the parametric
covariance estimate of the selected model.
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
"""
self.loglike = loglike
if np.asarray(feature_weights).shape == ():
feature_weights = np.ones(loglike.shape) * feature_weights
self.feature_weights = np.asarray(feature_weights)
self.covariance_estimator = covariance_estimator
self.ignore_inactive_constraints = ignore_inactive_constraints
def fit(self, lasso_solution=None, solve_args={'tol': 1.e-12, 'min_its': 50}):
"""
Fit the lasso using `regreg`.
This sets the attributes `soln`, `onestep` and
forms the constraints necessary for post-selection inference
by calling `form_constraints()`.
Parameters
----------
lasso_solution : optional
If not None, this is taken to be the solution
of the optimization problem. No checks
are done, though the implied affine
constraints will generally not be satisfied.
solve_args : keyword args
Passed to `regreg.problems.simple_problem.solve`.
Returns
-------
soln : np.float
Solution to lasso.
Notes
-----
If `self` already has an attribute `lasso_solution`
this will be taken to be the solution and
no optimization problem will be solved. Supplying
the optional argument `lasso_solution` will
overwrite `self`'s `lasso_solution`.
"""
self._penalty = weighted_l1norm(self.feature_weights, lagrange=1.)
if lasso_solution is None and not hasattr(self, "lasso_solution"):
problem = simple_problem(self.loglike, self._penalty)
self.lasso_solution = problem.solve(**solve_args)
elif lasso_solution is not None:
self.lasso_solution = lasso_solution
lasso_solution = self.lasso_solution # shorthand after setting it correctly above
if not np.all(lasso_solution == 0):
self.active = np.nonzero(lasso_solution != 0)[0]
self.inactive = lasso_solution == 0
self.active_signs = np.sign(lasso_solution[self.active])
self._active_soln = lasso_solution[self.active]
H = self.loglike.hessian(self.lasso_solution)
H_AA = H[self.active][:, self.active]
H_AAinv = np.linalg.inv(H_AA)
Q = self.loglike.quadratic
G_Q = Q.objective(self.lasso_solution, 'grad')
G = self.loglike.gradient(self.lasso_solution) + G_Q
G_A = G[self.active]
G_I = self._G_I = G[self.inactive]
dbeta_A = H_AAinv.dot(G_A)
self.onestep_estimator = self._active_soln - dbeta_A
self.active_penalized = self.feature_weights[self.active] != 0
if self.active_penalized.sum():
self._constraints = constraints(-np.diag(self.active_signs)[self.active_penalized],
(self.active_signs * dbeta_A)[self.active_penalized],
covariance=H_AAinv)
else:
self._constraints = constraints(np.identity(self.active.shape[0]),
1.e12 * np.ones(self.active.shape[0]) * H_AAinv.max(),
# XXX np.inf seems to fail tests
covariance=H_AAinv)
if self.inactive.sum():
# inactive constraints
H_IA = H[self.inactive][:, self.active]
H_II = H[self.inactive][:, self.inactive]
inactive_cov = H_II - H_IA.dot(H_AAinv).dot(H_IA.T)
irrepresentable = H_IA.dot(H_AAinv)
inactive_mean = irrepresentable.dot(-G_A)
self._inactive_constraints = constraints(np.vstack([np.identity(self.inactive.sum()),
-np.identity(self.inactive.sum())]),
np.hstack([self.feature_weights[self.inactive],
self.feature_weights[self.inactive]]),
covariance=inactive_cov,
mean=inactive_mean)
if not self._inactive_constraints(G_I):
warnings.warn(
'inactive constraint of KKT conditions not satisfied -- perhaps need to solve with more accuracy')
if self.covariance_estimator is not None:
# make full constraints
# A: active
# I: inactive
# F: full, (A,I) stacked
_cov_FA = self.covariance_estimator(self.onestep_estimator,
self.active,
self.inactive)
_cov_IA = _cov_FA[len(self.active):]
_cov_AA = _cov_FA[:len(self.active)]
_beta_bar = self.onestep_estimator
if not self.ignore_inactive_constraints:
# X_{-E}^T(y - X_E \bar{\beta}_E)
_inactive_score = - G_I - inactive_mean
_indep_linear_part = _cov_IA.dot(np.linalg.inv(_cov_AA))
# we "fix" _nuisance, effectively conditioning on it
_nuisance = _inactive_score - _indep_linear_part.dot(_beta_bar)
_upper_lim = (self.feature_weights[self.inactive] -
_nuisance -
inactive_mean)
_lower_lim = (_nuisance +
self.feature_weights[self.inactive] +
inactive_mean)
_upper_linear = _indep_linear_part
_lower_linear = -_indep_linear_part
C = self._constraints
_full_linear = np.vstack([C.linear_part,
_upper_linear,
_lower_linear])
_full_offset = np.hstack([C.offset,
_upper_lim,
_lower_lim])
self._constraints = constraints(_full_linear,
_full_offset,
covariance=_cov_AA)
else:
self._constraints.covariance[:] = _cov_AA
if not self._constraints(_beta_bar):
warnings.warn('constraints of KKT conditions on one-step estimator ' +
'not satisfied -- perhaps need to solve with more ' +
'accuracy')
else:
self._inactive_constraints = None
else:
self.active = []
self.inactive = np.arange(lasso_solution.shape[0])
self._constraints = None
self._inactive_constraints = None
return self.lasso_solution
def summary(self,
alternative='twosided',
level=0.95,
compute_intervals=False,
truth=None):
"""
Summary table for inference adjusted for selection.
Parameters
----------
alternative : str
One of ["twosided","onesided"]
level : float
Form level*100% selective confidence intervals.
compute_intervals : bool
Should we compute confidence intervals?
truth : np.array
True values of each beta for the selected variables. If not None, the 'pval' column contains p-values
computed under the corresponding null hypotheses.
Returns
-------
pval_summary : pd.DataFrame
Data frame with one row per active variable.
Columns are 'variable', 'pval', 'lasso', 'onestep', 'lower_confidence', 'upper_confidence', 'lower_trunc', 'upper_trunc', 'sd'.
"""
if alternative not in ['twosided', 'onesided']:
raise ValueError("alternative must be one of ['twosided', 'onesided']")
if truth is None:
truth = np.zeros_like(self.active_signs)
result = []
C = self._constraints
if C is not None:
one_step = self.onestep_estimator
for i in range(one_step.shape[0]):
eta = np.zeros_like(one_step)
eta[i] = self.active_signs[i]
_alt = {"onesided": 'greater',
'twosided': "twosided"}[alternative]
if C.linear_part.shape[0] > 0: # there were some constraints
_pval = C.pivot(eta, one_step, null_value=truth[i], alternative=_alt)
else:
obs = (eta * one_step).sum()
sd = np.sqrt((eta * C.covariance.dot(eta)).sum())
Z = obs / sd
_pval = 2 * ndist.sf(np.fabs(Z))
alpha = 1 - level
if compute_intervals:
if C.linear_part.shape[0] > 0: # there were some constraints
try:
_interval = C.interval(eta, one_step,
alpha=alpha)
except OverflowError:
_interval = (-np.inf, np.inf)
_interval = sorted([_interval[0] * self.active_signs[i],
_interval[1] * self.active_signs[i]])
else:
_interval = (obs - ndist.ppf(1 - alpha / 2) * sd,
obs + ndist.ppf(1 - alpha / 2) * sd)
else:
_interval = [np.nan, np.nan]
_bounds = np.array(C.bounds(eta, one_step))
sd = _bounds[-1]
lower_trunc, est, upper_trunc = sorted(_bounds[:3] * self.active_signs[i])
result.append((self.active[i],
_pval,
self.lasso_solution[self.active[i]],
one_step[i],
_interval[0],
_interval[1],
lower_trunc,
upper_trunc,
sd))
df = pd.DataFrame(index=self.active,
data=dict([(n, d) for n, d in zip(['variable',
'pval',
'lasso',
'onestep',
'lower_confidence',
'upper_confidence',
'lower_trunc',
'upper_trunc',
'sd'],
np.array(result).T)]))
df['variable'] = df['variable'].astype(int)
return df
@property
def soln(self):
"""
Solution to the lasso problem, set by `fit` method.
"""
if not hasattr(self, "lasso_solution"):
self.fit()
return self.lasso_solution
@property
def constraints(self):
"""
Affine constraints for this LASSO problem.
These are the constraints determined only
by the active block.
"""
return self._constraints
@classmethod
def gaussian(klass,
X,
Y,
feature_weights,
sigma=1.,
covariance_estimator=None,
quadratic=None):
r"""
Squared-error LASSO with feature weights.
Objective function is
$$
\beta \mapsto \frac{1}{2} \|Y-X\beta\|^2_2 + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\lambda$ is `feature_weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
Y : ndarray
Shape (n,) -- the response.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
sigma : float (optional)
Standard deviation of the noise. Set to 1 if `covariance_estimator` is not None.
This scales the loglikelihood by `sigma**(-2)`.
covariance_estimator : callable (optional)
If None, use the parametric
covariance estimate of the selected model.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of some of the
rows and columns of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
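Examples
--------
A sketch showing per-feature weights with an unpenalized intercept
column (the data and weights are synthetic and purely illustrative):

>>> import numpy as np
>>> rng = np.random.RandomState(1)
>>> X = np.hstack([np.ones((50, 1)), rng.standard_normal((50, 4))])
>>> y = 1 + X[:, 1] + rng.standard_normal(50)
>>> weights = np.array([0., 2., 2., 2., 2.])  # 0 leaves the intercept unpenalized
>>> L = lasso.gaussian(X, y, weights, sigma=1.)
>>> soln = L.fit()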
"""
if covariance_estimator is not None:
sigma = 1.
loglike = glm.gaussian(X, Y, coef=1. / sigma ** 2, quadratic=quadratic)
return klass(loglike, np.asarray(feature_weights) / sigma ** 2,
covariance_estimator=covariance_estimator)
@classmethod
def logistic(klass,
X,
successes,
feature_weights,
trials=None,
covariance_estimator=None,
quadratic=None):
r"""
Logistic LASSO with feature weights.
Objective function is
$$
\beta \mapsto \ell(X\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\ell$ is the negative of the logistic
log-likelihood (half the logistic deviance)
and $\lambda$ is `feature_weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
successes : ndarray
Shape (n,) -- response vector. An integer number of successes.
If the data are proportions, multiply the proportions
by the number of trials first.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
trials : ndarray (optional)
Number of trials per response, defaults to
a vector of ones with the same shape as `successes`.
covariance_estimator : optional
If None, use the parametric
covariance estimate of the selected model.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
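Examples
--------
A sketch with a synthetic binary response (one trial per observation,
so `trials` is left at its default):

>>> import numpy as np
>>> rng = np.random.RandomState(2)
>>> X = rng.standard_normal((200, 5))
>>> successes = rng.binomial(1, 0.5, 200)
>>> L = lasso.logistic(X, successes, feature_weights=2.)
>>> soln = L.fit()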
"""
loglike = glm.logistic(X, successes, trials=trials, quadratic=quadratic)
return klass(loglike, feature_weights,
covariance_estimator=covariance_estimator)
@classmethod
def coxph(klass,
X,
times,
status,
feature_weights,
covariance_estimator=None,
quadratic=None):
r"""
Cox proportional hazards LASSO with feature weights.
Objective function is
$$
\beta \mapsto \ell^{\text{Cox}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\ell^{\text{Cox}}$ is the
negative of the log of the Cox partial
likelihood and $\lambda$ is `feature_weights`.
Uses Efron's tie breaking method.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
times : ndarray
Shape (n,) -- the survival times.
status : ndarray
Shape (n,) -- the censoring status.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
covariance_estimator : optional
If None, use the parametric
covariance estimate of the selected model.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
"""
loglike = coxph_obj(X, times, status, quadratic=quadratic)
return klass(loglike, feature_weights,
covariance_estimator=covariance_estimator)
@classmethod
def poisson(klass,
X,
counts,
feature_weights,
covariance_estimator=None,
quadratic=None):
r"""
Poisson log-linear LASSO with feature weights.
Objective function is
$$
\beta \mapsto \ell^{\text{Poisson}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\ell^{\text{Poisson}}$ is the negative
of the log of the Poisson likelihood (half the deviance)
and $\lambda$ is `feature_weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
counts : ndarray
Shape (n,) -- the response.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
covariance_estimator : optional
If None, use the parametric
covariance estimate of the selected model.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
"""
loglike = glm.poisson(X, counts, quadratic=quadratic)
return klass(loglike, feature_weights,
covariance_estimator=covariance_estimator)
@classmethod
def sqrt_lasso(klass,
X,
Y,
feature_weights,
quadratic=None,
covariance='parametric',
sigma_estimate='truncated',
solve_args={'min_its': 200}):
r"""
Use sqrt-LASSO to choose variables.
Objective function is
$$
\beta \mapsto \|Y-X\beta\|_2 + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\lambda$ is `feature_weights`. After solving the problem,
inference proceeds as in `gaussian` with an implied variance and
choice of multiplier. See arxiv.org/abs/1504.08031 for details.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
Y : ndarray
Shape (n,) -- the response.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
covariance : str
One of 'parametric' or 'sandwich'. Method
used to estimate covariance for inference
in second stage.
sigma_estimate : str
One of 'truncated' or 'OLS'. Method
used to estimate $\sigma$ when using
parametric covariance.
solve_args : dict
Arguments passed to solver.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
Unlike other variants of the LASSO, this
solves the problem on construction, as the active
set is needed to find the equivalent Gaussian LASSO.
Assumes parametric model is correct for inference,
i.e. does not accept a covariance estimator.
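Examples
--------
A sketch on synthetic data (the weight 2. is an arbitrary illustrative
choice, not a recommended tuning):

>>> import numpy as np
>>> rng = np.random.RandomState(3)
>>> X = rng.standard_normal((100, 20))
>>> y = 3 * X[:, 0] + rng.standard_normal(100)
>>> L = lasso.sqrt_lasso(X, y, feature_weights=2.)  # solves on construction
>>> soln = L.fit()
>>> summary = L.summary()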
"""
n, p = X.shape
if np.asarray(feature_weights).shape == ():
feature_weights = np.ones(p) * feature_weights
feature_weights = np.asarray(feature_weights)
# TODO: refits sqrt lasso more than once -- make an override for avoiding refitting?
soln = solve_sqrt_lasso(X, Y, weights=feature_weights, quadratic=quadratic, solve_args=solve_args)[0]
# find active set, and estimate of sigma
active = (soln != 0)
nactive = active.sum()
if nactive:
subgrad = np.sign(soln[active]) * feature_weights[active]
X_E = X[:, active]
X_Ei = np.linalg.pinv(X_E)
sigma_E = np.linalg.norm(Y - X_E.dot(X_Ei.dot(Y))) / np.sqrt(n - nactive)
multiplier = np.sqrt((n - nactive) / (1 - np.linalg.norm(X_Ei.T.dot(subgrad)) ** 2))
# check truncation interval for sigma_E
# the KKT conditions imply an inequality like
# \hat{\sigma}_E \cdot LHS \leq RHS
penalized = feature_weights[active] != 0
if penalized.sum():
D_E = np.sign(soln[active][penalized]) # diagonal matrix of signs
LHS = D_E * np.linalg.solve(X_E.T.dot(X_E), subgrad)[penalized]
RHS = D_E * X_Ei.dot(Y)[penalized]
ratio = RHS / LHS
group1 = LHS > 0
upper_bound = np.inf
if group1.sum():
upper_bound = min(upper_bound, np.min(ratio[group1])) # necessarily these will have RHS > 0
group2 = ((LHS <= 0) *
(RHS <= 0)) # we can ignore the other possibility since this gives a lower bound of 0
lower_bound = 0
if group2.sum():
lower_bound = max(lower_bound, np.max(ratio[group2]))
upper_bound /= multiplier
lower_bound /= multiplier
else:
lower_bound = 0
upper_bound = np.inf
_sigma_estimator_args = (sigma_E,
n - nactive,
lower_bound,
upper_bound)
if sigma_estimate == 'truncated':
_sigma_hat = estimate_sigma(*_sigma_estimator_args)
elif sigma_estimate == 'OLS':
_sigma_hat = sigma_E
else:
raise ValueError('sigma_estimate must be one of ["truncated", "OLS"]')
else:
_sigma_hat = np.linalg.norm(Y) / np.sqrt(n)
multiplier = np.sqrt(n)
sigma_E = _sigma_hat
# XXX how should quadratic be changed?
# multiply everything by sigma_E?
if quadratic is not None:
qc = quadratic.collapsed()
qc.coef *= np.sqrt(n - nactive) / sigma_E
qc.linear_term *= np.sqrt(n - nactive) / sigma_E
quadratic = qc
loglike = glm.gaussian(X, Y, quadratic=quadratic)
if covariance == 'parametric':
cov_est = glm_parametric_estimator(loglike, dispersion=_sigma_hat)
elif covariance == 'sandwich':
cov_est = glm_sandwich_estimator(loglike, B=2000)
else:
raise ValueError('covariance must be one of ["parametric", "sandwich"]')
L = klass(loglike, feature_weights * multiplier * sigma_E,
covariance_estimator=cov_est,
ignore_inactive_constraints=True)
# these arguments are reused for data carving
if nactive:
L._sigma_hat = _sigma_hat
L._sigma_estimator_args = _sigma_estimator_args
L._weight_multiplier = multiplier * sigma_E
L._multiplier = multiplier
L.lasso_solution = soln
return L
def nominal_intervals(lasso_obj, level=0.95):
"""
Intervals for OLS parameters of active variables
that have not been adjusted for selection.
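A sketch, reusing a fitted `lasso` instance `L` (as in the `lasso`
class docstring):

>>> unadjusted = nominal_intervals(L, level=0.95)
>>> # each entry is (variable index, contrast, center, (lower, upper))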
"""
unadjusted_intervals = []
alpha = 1 - level
if len(lasso_obj.active) > 0:
SigmaE = lasso_obj.constraints.covariance
for i in range(lasso_obj.active.shape[0]):
eta = np.zeros_like(lasso_obj.onestep_estimator)
eta[i] = 1.
center = lasso_obj.onestep_estimator[i]
width = ndist.ppf(1 - alpha / 2.) * np.sqrt(SigmaE[i, i])
_interval = [center - width, center + width]
unadjusted_intervals.append((lasso_obj.active[i], eta, center,
_interval))
return unadjusted_intervals
def glm_sandwich_estimator(loss, B=1000):
"""
Bootstrap estimator of covariance of
.. math::
(\bar{\beta}_E, X_{-E}^T(y-X_E\bar{\beta}_E))
the OLS estimator of population regression
coefficients and inactive correlation with the
OLS residuals.
Returns
-------
estimator : callable
Takes arguments (beta, active, inactive)
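Examples
--------
A sketch passing the bootstrap estimator to `lasso.gaussian`
(the number of bootstrap samples, B=500, is an arbitrary illustration):

>>> import numpy as np
>>> from regreg.api import glm
>>> rng = np.random.RandomState(4)
>>> X = rng.standard_normal((100, 5))
>>> y = 3 * X[:, 0] + rng.standard_normal(100)
>>> cov_est = glm_sandwich_estimator(glm.gaussian(X, y), B=500)
>>> L = lasso.gaussian(X, y, feature_weights=2., covariance_estimator=cov_est)
>>> soln = L.fit()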
"""
def _estimator(loss, B, beta, active, inactive):
X, Y = loss.data
n, p = X.shape # shorthand
beta_full = np.zeros(p)
beta_full[active] = beta
# make sure active / inactive are bool
active_bool = np.zeros(p, bool)
active_bool[active] = 1
inactive_bool = np.zeros(p, bool)
inactive_bool[inactive] = 1
bootstrapper = pairs_bootstrap_glm(loss,
active_bool,
beta_full=beta_full,
inactive=inactive_bool)[0]
nactive = active_bool.sum()
first_moment = np.zeros(p)
second_moment = np.zeros((p, nactive))
for b in range(B):
indices = np.random.choice(n, n, replace=True)
Z_star = bootstrapper(indices)
first_moment += Z_star
second_moment += np.multiply.outer(Z_star, Z_star[:nactive])
first_moment /= B
second_moment /= B
cov = second_moment - np.multiply.outer(first_moment,
first_moment[:nactive])
return cov
return functools.partial(_estimator, loss, B)
def glm_parametric_estimator(loglike, dispersion=None):
"""
Parametric estimator of covariance of
.. math::
(\bar{\beta}_E, X_{-E}^T(y-\nabla \ell(X_E\bar{\beta}_E)))
the OLS estimator of population regression
coefficients and inactive correlation with the
OLS residuals.
If `dispersion` is None, a Pearson X^2 estimate of the
dispersion is computed and plugged in,
assuming the parametric form is correct.
Returns
-------
estimator : callable
Takes arguments (beta, active, inactive)
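Examples
--------
A sketch mirroring the sandwich example above, reusing its `X`, `y`
and fixing the dispersion at 1 (purely illustrative):

>>> from regreg.api import glm
>>> cov_est = glm_parametric_estimator(glm.gaussian(X, y), dispersion=1.)
>>> L = lasso.gaussian(X, y, feature_weights=2., covariance_estimator=cov_est)
>>> soln = L.fit()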
"""
def _estimator(loglike, dispersion, beta, active, inactive):
X, Y = loglike.data
n, p = X.shape
nactive = len(active)
linear_predictor = X[:, active].dot(beta)
W = loglike.saturated_loss.hessian(linear_predictor)
Sigma_A = X[:, active].T.dot(W[:, None] * X[:, active])
Sigma_Ainv = np.linalg.inv(Sigma_A)
_unscaled = np.zeros((p, len(active)))
_unscaled[:nactive] = Sigma_Ainv
# the lower block is left at 0 because
# under the parametric model, these pieces
# are independent
# if no dispersion, use Pearson's X^2
if dispersion is None:
eta = X[:, active].dot(beta)
dispersion = ((loglike.saturated_loss.smooth_objective(eta, 'grad')) ** 2 / W).sum() / (n - nactive)
_unscaled *= dispersion ** 2
return _unscaled
return functools.partial(_estimator, loglike, dispersion)
def standard_lasso(X, y, sigma=1, lam_frac=1., **solve_args):
"""
Fit a LASSO with a default choice of Lagrange parameter
equal to `lam_frac` times $\sigma \cdot E(|X^T\epsilon|)$
with $\epsilon$ IID N(0,1).
Parameters
----------
y : np.float
Response vector
X : np.float
Design matrix
sigma : np.float
Standard deviation of the noise
lam_frac : float
Multiplier for choice of $\lambda$
solve_args : keyword args
Passed to `regreg.problems.simple_problem.solve`.
Returns
-------
lasso_selection : `lasso`
Instance of `lasso` after fitting.
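Examples
--------
A sketch; the Lagrange parameter is chosen internally from the
Monte Carlo approximation described above:

>>> import numpy as np
>>> rng = np.random.RandomState(5)
>>> X = rng.standard_normal((100, 10))
>>> y = 4 * X[:, 0] + rng.standard_normal(100)
>>> L = standard_lasso(X, y, sigma=1., lam_frac=1.)
>>> summary = L.summary('twosided')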
"""
n, p = X.shape
lam = lam_frac * np.mean(np.fabs(X.T.dot(np.random.standard_normal((n, 50000)))).max(0)) * sigma
lasso_selector = lasso.gaussian(X, y, lam, sigma=sigma)
lasso_selector.fit(**solve_args)
return lasso_selector
class data_carving(lasso):
"""
Notes
-----
Even if a covariance estimator is supplied,
we assume that we can drop inactive constraints,
i.e. the same (asymptotic) independence that
holds for parametric model is assumed to hold here
as well.
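Examples
--------
A sketch of data carving on synthetic data: the model is selected on a
90% split and inference for an active variable uses both splits.

>>> import numpy as np
>>> rng = np.random.RandomState(6)
>>> X = rng.standard_normal((100, 5))
>>> y = 4 * X[:, 0] + rng.standard_normal(100)
>>> DC = data_carving.gaussian(X, y, feature_weights=3., split_frac=0.9)
>>> DC.fit()
>>> if 0 in DC.active:
...     pval = DC.hypothesis_test(0)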
"""
def __init__(self,
loglike_select,
loglike_inference,
loglike_full,
feature_weights,
covariance_estimator=None):
lasso.__init__(self, loglike_select, feature_weights, covariance_estimator=covariance_estimator)
self.loglike_inference = loglike_inference
self.loglike_full = loglike_full
@classmethod
def gaussian(klass,
X,
Y,
feature_weights,
split_frac=0.9,
sigma=1.,
stage_one=None):
n, p = X.shape
if stage_one is None:
splitn = int(n * split_frac)
indices = np.arange(n)
np.random.shuffle(indices)
stage_one = indices[:splitn]
stage_two = indices[splitn:]
else:
stage_two = [i for i in np.arange(n) if i not in stage_one]
Y1, X1 = Y[stage_one], X[stage_one]
Y2, X2 = Y[stage_two], X[stage_two]
loglike = glm.gaussian(X, Y, coef=1. / sigma ** 2)
loglike1 = glm.gaussian(X1, Y1, coef=1. / sigma ** 2)
loglike2 = glm.gaussian(X2, Y2, coef=1. / sigma ** 2)
return klass(loglike1, loglike2, loglike, feature_weights / sigma ** 2)
@classmethod
def logistic(klass,
X,
successes,
feature_weights,
trials=None,
split_frac=0.9,
sigma=1.,
stage_one=None):
n, p = X.shape
if stage_one is None:
splitn = int(n * split_frac)
indices = np.arange(n)
np.random.shuffle(indices)
stage_one = indices[:splitn]
stage_two = indices[splitn:]
else:
stage_two = [i for i in np.arange(n) if i not in stage_one]
if trials is None:
trials = np.ones_like(successes)
successes1, X1, trials1 = successes[stage_one], X[stage_one], trials[stage_one]
successes2, X2, trials2 = successes[stage_two], X[stage_two], trials[stage_two]
loglike = glm.logistic(X, successes, trials=trials)
loglike1 = glm.logistic(X1, successes1, trials=trials1)
loglike2 = glm.logistic(X2, successes2, trials=trials2)
return klass(loglike1, loglike2, loglike, feature_weights)
@classmethod
def poisson(klass,
X,
counts,
feature_weights,
split_frac=0.9,
sigma=1.,
stage_one=None):
n, p = X.shape
if stage_one is None:
splitn = int(n * split_frac)
indices = np.arange(n)
np.random.shuffle(indices)
stage_one = indices[:splitn]
stage_two = indices[splitn:]
else:
stage_two = [i for i in np.arange(n) if i not in stage_one]
counts1, X1 = counts[stage_one], X[stage_one]
counts2, X2 = counts[stage_two], X[stage_two]
loglike = glm.poisson(X, counts)
loglike1 = glm.poisson(X1, counts1)
loglike2 = glm.poisson(X2, counts2)
return klass(loglike1, loglike2, loglike, feature_weights)
@classmethod
def coxph(klass,
X,
times,
status,
feature_weights,
split_frac=0.9,
sigma=1.,
stage_one=None):
n, p = X.shape
if stage_one is None:
splitn = int(n * split_frac)
indices = np.arange(n)
np.random.shuffle(indices)
stage_one = indices[:splitn]
stage_two = indices[splitn:]
else:
stage_two = [i for i in np.arange(n) if i not in stage_one]
times1, X1, status1 = times[stage_one], X[stage_one], status[stage_one]
times2, X2, status2 = times[stage_two], X[stage_two], status[stage_two]
loglike = coxph_obj(X, times, status)
loglike1 = coxph_obj(X1, times1, status1)
loglike2 = coxph_obj(X2, times2, status2)
return klass(loglike1, loglike2, loglike, feature_weights)
@classmethod
def sqrt_lasso(klass,
X,
Y,
feature_weights,
split_frac=0.9,
stage_one=None,
solve_args={'min_its': 200}):
n, p = X.shape
if stage_one is None:
splitn = int(n * split_frac)
indices = np.arange(n)
np.random.shuffle(indices)
stage_one = indices[:splitn]
stage_two = indices[splitn:]
else:
stage_two = [i for i in np.arange(n) if i not in stage_one]
Y1, X1 = Y[stage_one], X[stage_one]
Y2, X2 = Y[stage_two], X[stage_two]
# TODO: refits sqrt lasso more than once
L = lasso.sqrt_lasso(X1, Y1, feature_weights, solve_args=solve_args)
soln = L.fit(solve_args=solve_args)
_sigma_E1, _df1, _lower, _upper = L._sigma_estimator_args
_df2 = max(len(stage_two) - len(L.active), 0)
if _df2:
X_E2 = X2[:, L.active]
_sigma_E2 = np.linalg.norm(Y2 - X_E2.dot(np.linalg.pinv(X_E2).dot(Y2))) / np.sqrt(len(stage_two) - len(L.active))
_sigma_hat = estimate_sigma(np.sqrt((_sigma_E1 ** 2 * _df1 + _sigma_E2 ** 2 * _df2) / (_df1 + _df2)),
_df1,
_lower,
_upper,
untruncated_df=_df2)
else:
_sigma_hat = L._sigma_hat
cov_est = glm_parametric_estimator(L.loglike, dispersion=_sigma_hat)
loglike = glm.gaussian(X, Y)
loglike1 = glm.gaussian(X1, Y1)
loglike2 = glm.gaussian(X2, Y2)
L = klass(loglike1, loglike2, loglike, feature_weights * L._weight_multiplier,
covariance_estimator=cov_est)
L.lasso_solution = soln
return L
def fit(self, solve_args={'tol': 1.e-12, 'min_its': 50}):
lasso.fit(self, solve_args=solve_args)
n1 = self.loglike.get_data()[0].shape[0]
n = self.loglike_full.get_data()[0].shape[0]
_feature_weights = self.feature_weights.copy()
_feature_weights[self.active] = 0.
_feature_weights[self.inactive] = np.inf
_unpenalized_problem = simple_problem(self.loglike_full,
weighted_l1norm(_feature_weights, lagrange=1.))
_unpenalized = _unpenalized_problem.solve(**solve_args)
_unpenalized_active = _unpenalized[self.active]
s = len(self.active)
if self.covariance_estimator is None:
H = self.loglike_full.hessian(_unpenalized)
H_AA = H[self.active][:, self.active]
_cov_block = np.linalg.inv(H_AA)
self._carve_invcov = H_AA
else:
C = self.covariance_estimator(_unpenalized_active, self.active, self.inactive)
_cov_block = C[:len(self.active)][:, :len(self.active)]
self._carve_invcov = np.linalg.pinv(_cov_block)
_subsample_block = (n * 1. / n1) * _cov_block
_carve_cov = np.zeros((2 * s, 2 * s))
_carve_cov[:s][:, :s] = _cov_block
_carve_cov[s:][:, :s] = _subsample_block
_carve_cov[:s][:, s:] = _subsample_block
_carve_cov[s:][:, s:] = _subsample_block
_carve_linear_part = self._constraints.linear_part.dot(np.identity(2 * s)[s:])
_carve_offset = self._constraints.offset
self._carve_constraints = constraints(_carve_linear_part,
_carve_offset,
covariance=_carve_cov)
self._carve_feasible = np.hstack([_unpenalized_active, self.onestep_estimator])
self._unpenalized_active = _unpenalized_active
def hypothesis_test(self,
variable,
burnin=2000,
ndraw=8000,
compute_intervals=False):
if variable not in self.active:
raise ValueError('expecting an active variable')
# shorthand
j = list(self.active).index(variable)
twice_s = self._carve_constraints.linear_part.shape[1]
s = sparsity = int(twice_s / 2)
keep = np.ones(s, bool)
keep[j] = 0
conditioning = self._carve_invcov.dot(np.identity(twice_s)[:s])[keep]
contrast = np.zeros(2 * s)
contrast[j] = 1.
# condition to remove dependence on nuisance parameters
if len(self.active) > 1:
conditional_law = self._carve_constraints.conditional(conditioning,
conditioning.dot(self._carve_feasible))
else:
conditional_law = self._carve_constraints
observed = (contrast * self._carve_feasible).sum()
if self._carve_constraints.linear_part.shape[0] > 0:
_, _, _, family = gibbs_test(conditional_law,
self._carve_feasible,
contrast,
sigma_known=True,
white=False,
ndraw=ndraw,
burnin=burnin,
how_often=10,
UMPU=False)
pval = family.cdf(0, observed)
pval = 2 * min(pval, 1 - pval)
else: # only unpenalized coefficients were nonzero, no constraints
sd = np.sqrt((contrast * self._carve_constraints.covariance.dot(contrast)).sum())
Z = observed / sd
pval = 2 * ndist.sf(np.fabs(Z))
return pval
class data_splitting(data_carving):
def fit(self, solve_args={'tol': 1.e-12, 'min_its': 500}, use_full_cov=True):
lasso.fit(self, solve_args=solve_args)
_feature_weights = self.feature_weights.copy()
_feature_weights[self.active] = 0.
_feature_weights[self.inactive] = np.inf
_unpenalized_problem = simple_problem(self.loglike_inference,
weighted_l1norm(_feature_weights, lagrange=1.))
_unpenalized = _unpenalized_problem.solve(**solve_args)
self._unpenalized_active = _unpenalized[self.active]
if use_full_cov:
H = self.loglike_full.hessian(_unpenalized)
n_inference = self.loglike_inference.data[0].shape[0]
n_full = self.loglike_full.data[0].shape[0]
H *= (1. * n_inference / n_full)
else:
H = self.loglike_inference.hessian(_unpenalized)
H_AA = H[self.active][:, self.active]
self._cov_inference = np.linalg.inv(H_AA)
def hypothesis_test(self,
variable,
df=np.inf):
"""
Wald test for an active variable.
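Examples
--------
A sketch: select on stage one, then test an active variable with a
Wald statistic computed from the held-out data.

>>> import numpy as np
>>> rng = np.random.RandomState(7)
>>> X = rng.standard_normal((100, 5))
>>> y = 4 * X[:, 0] + rng.standard_normal(100)
>>> DS = data_splitting.gaussian(X, y, feature_weights=3., split_frac=0.5)
>>> DS.fit(use_full_cov=False)
>>> if 0 in DS.active:
...     pval = DS.hypothesis_test(0)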
"""
if variable not in self.active:
raise ValueError('expecting an active variable')
# shorthand
j = list(self.active).index(variable)
Z = self._unpenalized_active[j] / np.sqrt(self._cov_inference[j, j])
if df == np.inf:
return 2 * ndist.sf(np.abs(Z))
else:
return 2 * tdist.sf(np.abs(Z), df)
def _data_carving_deprec(X, y,
lam_frac=2.,
sigma=1.,
stage_one=None,
split_frac=0.9,
coverage=0.95,
ndraw=8000,
burnin=2000,
splitting=False,
compute_intervals=True,
UMPU=False):
"""
Fit a LASSO with a default choice of Lagrange parameter
equal to `lam_frac` times $\sigma \cdot E(|X^T\epsilon|)$
with $\epsilon$ IID N(0,1) on a proportion (`split_frac`) of
the data.
Parameters
----------
y : np.float
Response vector
X : np.float
Design matrix
sigma : np.float
Standard deviation of the noise
lam_frac : float (optional)
Multiplier for choice of $\lambda$. Defaults to 2.
coverage : float
Coverage for selective intervals. Defaults to 0.95.
stage_one : [np.array(np.int), None] (optional)
Index of data points to be used in first stage.
If None, a randomly chosen set of entries is used based on
`split_frac`.
split_frac : float (optional)
What proportion of the data to use in the first stage?
Defaults to 0.9.
ndraw : int (optional)
How many draws to keep from Gibbs hit-and-run sampler.
Defaults to 8000.
burnin : int (optional)
Defaults to 2000.
splitting : bool (optional)
If True, also return splitting pvalues and intervals.
compute_intervals : bool (optional)
Compute selective intervals?
UMPU : bool (optional)
Perform the UMPU test?
Returns
-------
results : [(variable, pvalue, interval), ...]
Indices of active variables,
selected (twosided) pvalue and selective interval.
If splitting, then each entry also includes
a (split_pvalue, split_interval) using stage_two
for inference.
stage_one : `lasso`
Results of fitting LASSO to stage one data.
"""
n, p = X.shape
first_stage, stage_one, stage_two = split_model(X,
y,
sigma=sigma,
lam_frac=lam_frac,
split_frac=split_frac,
stage_one=stage_one)
splitn = stage_one.shape[0]
L = first_stage # shorthand
s = sparsity = L.active.shape[0]
if splitn < n:
# JT: this is all about computing constraints for active
# variables -- we already have this!
# quantities related to models fit on
# stage_one and full dataset
y1, X1 = y[stage_one], X[stage_one]
X_E = X[:, L.active]
X_Ei = np.linalg.pinv(X_E)
X_E1 = X1[:, L.active]
X_Ei1 = np.linalg.pinv(X_E1)
inv_info_E = X_Ei.dot(X_Ei.T)
inv_info_E1 = X_Ei1.dot(X_Ei1.T)
beta_E = X_Ei.dot(y)
beta_E1 = X_Ei1.dot(y[stage_one])
if n - splitn > s:
linear_part = np.zeros((s, 2 * s))
linear_part[:, s:] = -np.diag(L.active_signs)
b = L.constraints.offset
con = constraints(linear_part, b)
# specify covariance of 2s Gaussian vector
cov = np.zeros((2 * s, 2 * s))
cov[:s, :s] = inv_info_E
cov[s:, :s] = inv_info_E
cov[:s, s:] = inv_info_E
cov[s:, s:] = inv_info_E1
con.covariance[:] = cov * sigma ** 2
# for the conditional law
# we will change the linear function for each coefficient
selector = np.zeros((s, 2 * s))
selector[:, :s] = np.identity(s)
conditional_linear = X_E.T.dot(X_E).dot(selector)
# a valid initial condition
initial = np.hstack([beta_E, beta_E1])
OLS_func = selector
else:
linear_part = np.zeros((s, s + n - splitn))
linear_part[:, :s] = -np.diag(L.active_signs)
b = L.constraints.offset
con = constraints(linear_part, b)
# specify covariance of Gaussian vector
cov = np.zeros((s + n - splitn, s + n - splitn))
cov[:s, :s] = inv_info_E1
cov[s:, :s] = 0
cov[:s, s:] = 0
cov[s:, s:] = np.identity(n - splitn)
con.covariance[:] = cov * sigma ** 2
conditional_linear = np.zeros((s, s + n - splitn))
conditional_linear[:, :s] = np.linalg.pinv(inv_info_E1)
conditional_linear[:, s:] = X[stage_two, :][:, L.active].T
selector1 = np.zeros((s, s + n - splitn))
selector1[:, :s] = np.identity(s)
selector2 = np.zeros((n - splitn, s + n - splitn))
selector2[:, s:] = np.identity(n - splitn)
# write the OLS estimates of full model in terms of X_E1^{dagger}y_1, y2
OLS_func = inv_info_E.dot(conditional_linear)
# a valid initial condition
initial = np.hstack([beta_E1, y[stage_two]])
pvalues = []
intervals = []
if splitting:
y2, X2 = y[stage_two], X[stage_two]
X_E2 = X2[:, L.active]
X_Ei2 = np.linalg.pinv(X_E2)
beta_E2 = X_Ei2.dot(y2)
inv_info_E2 = X_Ei2.dot(X_Ei2.T)
splitting_pvalues = []
splitting_intervals = []
if n - splitn < s:
warnings.warn('not enough data for second stage of sample splitting')
split_cutoff = np.fabs(ndist.ppf((1. - coverage) / 2))
# compute p-values intervals
cov_inv = np.linalg.pinv(con.covariance)
for j in range(X_E.shape[1]):
keep = np.ones(s, bool)
keep[j] = 0
eta = OLS_func[j]
con_cp = copy(con)
conditional_law = con_cp.conditional(conditional_linear[keep], \
X_E.T.dot(y)[keep])
# tilt so that samples are closer to observed values
# the multiplier should be the pseudoMLE so that
# the observed value is likely
observed = (initial * eta).sum()
if compute_intervals:
_, _, _, family = gibbs_test(conditional_law,
initial,
eta,
sigma_known=True,
white=False,
ndraw=ndraw,
burnin=burnin,
how_often=10,
UMPU=UMPU,
tilt=conditional_law.covariance.dot(eta))
lower_lim, upper_lim = family.equal_tailed_interval(observed, 1 - coverage)
# in the model we've chosen, the parameter beta is associated
# to the natural parameter as below
# exercise: justify this!
lower_lim_final = eta.dot(conditional_law.covariance.dot(eta)) * lower_lim
upper_lim_final = eta.dot(conditional_law.covariance.dot(eta)) * upper_lim
intervals.append((lower_lim_final, upper_lim_final))
else: # we do not really need to tilt just for p-values
_, _, _, family = gibbs_test(conditional_law,
initial,
eta,
sigma_known=True,
white=False,
ndraw=ndraw,
burnin=burnin,
how_often=10,
UMPU=UMPU)
intervals.append((np.nan, np.nan))
pval = family.cdf(0, observed)
pval = 2 * min(pval, 1 - pval)
pvalues.append(pval)
if splitting:
if s < n - splitn: # enough data to generically
# test hypotheses. proceed as usual
split_pval = ndist.cdf(beta_E2[j] / (np.sqrt(inv_info_E2[j, j]) * sigma))
split_pval = 2 * min(split_pval, 1. - split_pval)
splitting_pvalues.append(split_pval)
splitting_interval = (beta_E2[j] -
split_cutoff * np.sqrt(inv_info_E2[j, j]) * sigma,
beta_E2[j] +
split_cutoff * np.sqrt(inv_info_E2[j, j]) * sigma)
splitting_intervals.append(splitting_interval)
else:
splitting_pvalues.append(np.random.sample())
splitting_intervals.append((np.nan, np.nan))
if not splitting:
return zip(L.active,
pvalues,
intervals), L
else:
return zip(L.active,
pvalues,
intervals,
splitting_pvalues,
splitting_intervals), L
else:
pvalues = [p for _, p in L.summary("twosided")['pval']]
intervals = np.array([L.intervals['lower'], L.intervals['upper']]).T
if splitting:
splitting_pvalues = np.random.sample(len(pvalues))
splitting_intervals = [(np.nan, np.nan) for _ in
range(len(pvalues))]
return zip(L.active,
pvalues,
intervals,
splitting_pvalues,
splitting_intervals), L
else:
return zip(L.active,
pvalues,
intervals), L
def split_model(X,
y,
sigma=1,
lam_frac=1.,
split_frac=0.9,
stage_one=None):
"""
Fit a LASSO with a default choice of Lagrange parameter
equal to `lam_frac` times $\sigma \cdot E(|X^T\epsilon|)$
with $\epsilon$ IID N(0,1) on a proportion (`split_frac`) of
the data.
Parameters
----------
y : np.float
Response vector
X : np.float
Design matrix
sigma : np.float
Standard deviation of the noise
lam_frac : float (optional)
Multiplier for choice of $\lambda$. Defaults to 1.
split_frac : float (optional)
What proportion of the data to use in the first stage?
Defaults to 0.9.
stage_one : [np.array(np.int), None] (optional)
Index of data points to be used in first stage.
If None, a randomly chosen set of entries is used based on
`split_frac`.
Returns
-------
first_stage : `lasso`
Lasso object from stage one.
stage_one : np.array(int)
Indices used for stage one.
stage_two : np.array(int)
Indices used for stage two.
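Examples
--------
A sketch: split the data, fit the LASSO on stage one, and keep the
index sets for second-stage inference.

>>> import numpy as np
>>> rng = np.random.RandomState(8)
>>> X = rng.standard_normal((100, 5))
>>> y = 4 * X[:, 0] + rng.standard_normal(100)
>>> first_stage, stage_one, stage_two = split_model(X, y, sigma=1., split_frac=0.8)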
"""
n, p = X.shape
if stage_one is None:
splitn = int(n * split_frac)
indices = np.arange(n)
np.random.shuffle(indices)
stage_one = indices[:splitn]
stage_two = indices[splitn:]
else:
stage_two = [i for i in np.arange(n) if i not in stage_one]
y1, X1 = y[stage_one], X[stage_one]
first_stage = standard_lasso(X1, y1, sigma=sigma, lam_frac=lam_frac)
return first_stage, stage_one, stage_two
def additive_noise(X,
y,
sigma,
lam_frac=1.,
perturb_frac=0.2,
y_star=None,
coverage=0.95,
ndraw=8000,
compute_intervals=True,
burnin=2000):
"""
Additive noise LASSO.
Parameters
----------
y : np.float
Response vector
X : np.float
Design matrix
sigma : np.float
Standard deviation of the noise
lam_frac : float (optional)
Multiplier for choice of $\lambda$. Defaults to 1.
perturb_frac : float (optional)
How much noise to add? Noise added has variance
proportional to existing variance.
coverage : float
Coverage for selective intervals. Defaults to 0.95.
ndraw : int (optional)
How many draws to keep from Gibbs hit-and-run sampler.
Defaults to 8000.
burnin : int (optional)
Defaults to 2000.
compute_intervals : bool (optional)
Compute selective intervals?
Returns
-------
results : [(variable, pvalue, interval), ...]
Indices of active variables,
selected (twosided) pvalue and selective interval.
randomized_lasso : `lasso`
Results of fitting LASSO to randomized data.
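Examples
--------
A sketch of the additive-noise (randomized) LASSO; `sigma` is taken as
known here and the settings are purely illustrative:

>>> import numpy as np
>>> rng = np.random.RandomState(9)
>>> X = rng.standard_normal((100, 5))
>>> y = 4 * X[:, 0] + rng.standard_normal(100)
>>> results, randomized_lasso = additive_noise(X, y, sigma=1.,
...                                            perturb_frac=0.2,
...                                            compute_intervals=False)
>>> results = list(results)  # [(variable, pvalue, interval), ...]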
"""
n, p = X.shape
# Add some noise to y and fit the LASSO at a fixed lambda
gamma = np.sqrt(perturb_frac) * sigma
sigma_star = np.sqrt(sigma ** 2 + gamma ** 2)
lam = lam_frac * np.mean(np.fabs(X.T.dot(np.random.standard_normal((n, 5000)))).max(0)) * sigma_star
y_star = y + np.random.standard_normal(n) * gamma
randomized_lasso = L = standard_lasso(X, y_star, sigma=sigma_star, lam_frac=lam_frac)
L.fit()
# Form the constraint matrix on (y,y^*)
X_E = X[:, L.active]
X_Ei = np.linalg.pinv(X_E)
Cov_E = X_Ei.dot(X_Ei.T)
W_E = Cov_E.dot(L.active_signs)
pvalues = []
intervals = []
beta_E = X_Ei.dot(y)
# compute each pvalue
for j in range(X_E.shape[1]):
s_obs = L.active.shape[0]
keep = np.ones(s_obs, bool)
keep[j] = 0
# form the 2s Gaussian vector we will condition on
X_minus_j = X_E[:, keep]
P_minus_j = X_minus_j.dot(np.linalg.pinv(X_minus_j))
R_minus_j = np.identity(n) - P_minus_j
theta_E = L.active_signs * (X_Ei.dot(P_minus_j.dot(y)) - lam * W_E)
scale = np.sqrt(Cov_E[j, j])
kappa = 1. / scale ** 2
alpha_E = kappa * L.active_signs * Cov_E[j]
A = np.hstack([-alpha_E.reshape((s_obs, 1)), np.identity(s_obs)])
con = constraints(A, theta_E)
cov = np.zeros((s_obs + 1, s_obs + 1))
cov[0, 0] = scale ** 2 * sigma ** 2
cov[1:, 1:] = Cov_E * gamma ** 2 * np.outer(L.active_signs, L.active_signs)
con.covariance[:] = cov
initial = np.zeros(s_obs + 1)
initial[0] = beta_E[j]
initial[1:] = -X_Ei.dot(y_star - y) * L.active_signs
eta = np.zeros(s_obs + 1)
eta[0] = 1.
observed = (initial * eta).sum()
if compute_intervals:
_, _, _, family = gibbs_test(con,
initial,
eta,
UMPU=False,
sigma_known=True,
ndraw=ndraw,
burnin=burnin,
how_often=5,
tilt=con.covariance.dot(eta))
lower_lim, upper_lim = family.equal_tailed_interval(observed, 1 - coverage)
# in the model we've chosen, the parameter beta is associated
# to the natural parameter as below
# exercise: justify this!
lower_lim_final = eta.dot(con.covariance.dot(eta)) * lower_lim
upper_lim_final = eta.dot(con.covariance.dot(eta)) * upper_lim
intervals.append((lower_lim_final, upper_lim_final))
else:
_, _, _, family = gibbs_test(con,
initial,
eta,
UMPU=False,
sigma_known=True,
ndraw=ndraw,
burnin=burnin,
how_often=5,
tilt=con.covariance.dot(eta))
intervals.append((np.nan, np.nan))
pval = family.cdf(0, observed)
pval = 2 * min(pval, 1 - pval)
pvalues.append(pval)
return zip(L.active,
pvalues,
intervals), randomized_lasso
## Liu, Markovic and Tibshirani method based on full model
## conditioning only on the event j \in E for each active j
# Liu, Markovic, Tibs selection
def _solve_restricted_problem(Qbeta_bar, Xinfo, lagrange, initial=None,
wide=True,
min_its=30, tol=1.e-12):
p = Qbeta_bar.shape[0]
if wide:
X, W = Xinfo
loss = squared_error(X * np.sqrt(W)[:, None], np.zeros(X.shape[0]))
else:
Q = Xinfo
loss = quadratic_loss(Q.shape[0], Q=Q)
loss.quadratic = identity_quadratic(0,
0,
-Qbeta_bar,
0)
lagrange = np.asarray(lagrange)
if lagrange.shape in [(), (1,)]:
lagrange = np.ones(p) * lagrange
pen = weighted_l1norm(lagrange, lagrange=1.)
problem = simple_problem(loss, pen)
if initial is not None:
problem.coefs[:] = initial
soln = problem.solve(tol=tol, min_its=min_its)
return soln
def _truncation_interval(Qbeta_bar,
Xinfo,
Qi_jj,
j,
beta_barj,
lagrange,
wide=True):
if lagrange[j] != 0:
lagrange_cp = lagrange.copy()
else:
return -np.inf, np.inf
lagrange_cp[j] = np.inf
# TODO: use initial solution for speed
restricted_soln = _solve_restricted_problem(Qbeta_bar,
Xinfo,
lagrange_cp,
wide=wide)
p = Qbeta_bar.shape[0]
Ij = np.zeros(p)
Ij[j] = 1.
nuisance = Qbeta_bar - Ij / Qi_jj * beta_barj
if wide:
X, W = Xinfo
Qj = X.T.dot(X[:, j] * W)
else:
Q = Xinfo
Qj = Q[j]
center = nuisance[j] - Qj.dot(restricted_soln)
upper = (lagrange[j] - center) * Qi_jj
lower = (-lagrange[j] - center) * Qi_jj
if not (beta_barj < lower or beta_barj > upper):
warnings.warn("implied KKT constraint not satisfied")
return lower, upper
class ROSI(lasso):
r"""
A class for the LASSO for post-selection inference.
The problem solved is
.. math::
\text{minimize}_{\beta} \frac{1}{2n} \|y-X\beta\|^2_2 +
\lambda \|\beta\|_1
where $\lambda$ is given by `feature_weights`.
Notes
-----
In solving the debiasing problem to approximate the inverse
of (X^TWX) in a GLM, this class makes the implicit assumption
that the scaling of X is such that diag(X^TWX) is O(n)
with n=X.shape[0]. That is, X's are similar to IID samples
from a population that does not depend on n.
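Examples
--------
A sketch in a p > n regime, using the "BN" approximate inverse
(the default in `__init__`); data and weights are illustrative:

>>> import numpy as np
>>> rng = np.random.RandomState(10)
>>> n, p = 50, 100
>>> X = rng.standard_normal((n, p))
>>> y = 4 * X[:, 0] + rng.standard_normal(n)
>>> R = ROSI.gaussian(X, y, feature_weights=4., approximate_inverse='BN')
>>> soln = R.fit()
>>> summary = R.summary(compute_intervals=False)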
"""
def __init__(self,
loglike,
feature_weights,
approximate_inverse='BN'):
r"""
Create a new post-selection object for the LASSO problem
Parameters
----------
loglike : `regreg.smooth.glm.glm`
A (negative) log-likelihood as implemented in `regreg`.
feature_weights : np.ndarray
Feature weights for L-1 penalty. If a float,
it is broadcast to all features.
approximate_inverse : str (optional)
One of "JM" (Javanmard, Montanari) or "BN" (Boot, Niedderling) or None.
A form of approximate inverse used when p is close to (or larger than) n.
"""
self.loglike = loglike
if np.asarray(feature_weights).shape == ():
feature_weights = np.ones(loglike.shape) * feature_weights
self.feature_weights = np.asarray(feature_weights)
self.approximate_inverse = approximate_inverse
def fit(self,
lasso_solution=None,
solve_args={'tol': 1.e-12, 'min_its': 50},
debiasing_args={}):
"""
Fit the lasso using `regreg`.
This sets the attributes `soln`, `onestep` and
forms the constraints necessary for post-selection inference
by calling `form_constraints()`.
Parameters
----------
lasso_solution : optional
If not None, this is taken to be the solution
of the optimization problem. No checks
are done, though the implied affine
constraints will generally not be satisfied.
solve_args : keyword args
Passed to `regreg.problems.simple_problem.solve`.
debiasing_args : dict
Arguments passed to `.debiased_lasso.debiasing_matrix`
or `.debiased_lasso.pseudoinverse_debiasing_matrix` depending
on `self.approximate_inverse`.
Returns
-------
soln : np.float
Solution to lasso.
Notes
-----
If `self` already has an attribute `lasso_solution`
this will be taken to be the solution and
no optimization problem will be solved. Supplying
the optional argument `lasso_solution` will
overwrite `self`'s `lasso_solution`.
"""
self._penalty = weighted_l1norm(self.feature_weights, lagrange=1.)
if lasso_solution is None and not hasattr(self, "lasso_solution"):
problem = simple_problem(self.loglike, self._penalty)
self.lasso_solution = problem.solve(**solve_args)
elif lasso_solution is not None:
self.lasso_solution = lasso_solution
lasso_solution = self.lasso_solution # shorthand after setting it correctly above
if not np.all(lasso_solution == 0):
self.active = np.nonzero(lasso_solution != 0)[0]
self.inactive = lasso_solution == 0
self.active_signs = np.sign(lasso_solution[self.active])
self._active_soln = lasso_solution[self.active]
X, y = self.loglike.data # presuming GLM here
n, p = X.shape
W = self.loglike.saturated_loss.hessian(X.dot(lasso_solution))
# Needed for finding truncation intervals
self._Qbeta_bar = X.T.dot(W * X.dot(lasso_solution)) - self.loglike.smooth_objective(lasso_solution, 'grad')
self._W = W
if n > p and self.approximate_inverse is None:
Q = self.loglike.hessian(lasso_solution)
E = self.active
Qi = np.linalg.inv(Q)
self._QiE = Qi[E][:, E]
_beta_bar = Qi.dot(self._Qbeta_bar)
self._beta_barE = _beta_bar[E]
one_step = self._beta_barE
# Pearson's X^2 to estimate sigma
y_hat = self.loglike.saturated_loss.mean_function(X.dot(_beta_bar))
self._pearson_sigma = np.sqrt(((y - y_hat) ** 2 / self._W).sum() / (n - p))
elif self.approximate_inverse is None:
raise ValueError('n is less than or equal to p, an approximate inverse is needed.')
else:
X, y = self.loglike.data
# target is one-step estimator
G = self.loglike.smooth_objective(lasso_solution, 'grad')
if self.approximate_inverse == 'JM':
M = self._M = debiasing_matrix(X * np.sqrt(W)[:, None],
self.active,
**debiasing_args) / n
# the n is to make sure we get rows of the inverse
# of (X^TWX) instead of (X^TWX/n).
elif self.approximate_inverse == 'BN':
M = self._M = pseudoinverse_debiasing_matrix(X * np.sqrt(W)[:, None],
self.active,
**debiasing_args)
else:
raise ValueError('approximate inverse should be one of ["JM", "BN"]')
Qinv_hat = np.atleast_2d(M)
observed_target = lasso_solution[self.active] - Qinv_hat.dot(G)
M1 = Qinv_hat.dot(X.T)
self._QiE = (M1 * self._W[None, :]).dot(M1.T)
Xfeat = X[:, self.active]
Qrelax = Xfeat.T.dot(self._W[:, None] * Xfeat)
relaxed_soln = lasso_solution[self.active] - np.linalg.inv(Qrelax).dot(G[self.active])
self._beta_barE = observed_target
# relaxed Pearson's X^2 to estimate sigma
y_hat = self.loglike.saturated_loss.mean_function(Xfeat.dot(relaxed_soln))
self._pearson_sigma = np.sqrt(((y - y_hat) ** 2 / self._W).sum() / (n - len(self.active)))
else:
self.active = []
self.inactive = np.arange(lasso_solution.shape[0])
return self.lasso_solution
def summary(self,
level=0.95,
compute_intervals=False,
dispersion=None,
truth=None):
"""
Summary table for inference adjusted for selection.
Parameters
----------
level : float
Form level*100% selective confidence intervals.
compute_intervals : bool
Should we compute confidence intervals?
dispersion : float
Estimate of dispersion. Defaults to a Pearson's X^2 estimate in the relaxed model.
truth : np.array
True values of each beta for the selected variables. If not None, the 'pval' column contains p-values
computed under the corresponding null hypotheses.
Returns
-------
pval_summary : pd.DataFrame
Data frame with one row per active variable.
Columns are 'variable', 'pval', 'lasso', 'onestep', 'sd', 'lower_confidence', 'upper_confidence', 'lower_truncation', 'upper_truncation'.
"""
if len(self.active) > 0:
X, y = self.loglike.data
active_set, QiE, beta_barE, Qbeta_bar = self.active, self._QiE, self._beta_barE, self._Qbeta_bar
W, sigma = self._W, self._pearson_sigma
if dispersion is None:
sqrt_dispersion = sigma
else:
sqrt_dispersion = np.sqrt(dispersion)
result = []
# note that QiE is the dispersion free covariance
# matrix!
# dispersion comes into truncated Gaussian below
if truth is None:
truth = np.zeros(len(active_set))
for j in range(len(active_set)):
idx = self.active[j]
lower, upper = _truncation_interval(Qbeta_bar,
(X, W),
QiE[j, j],
idx,
beta_barE[j],
self.feature_weights,
wide=True)
sd = sqrt_dispersion * np.sqrt(QiE[j, j])
tg = TG([(-np.inf, lower), (upper, np.inf)], scale=sd, mu=truth[j])
pvalue = tg.cdf(beta_barE[j])
pvalue = float(2 * min(pvalue, 1 - pvalue))
if compute_intervals:
l, u = tg.equal_tailed_interval(beta_barE[j], alpha=1-level)
else:
l, u = np.nan, np.nan
result.append((idx, pvalue, self.lasso_solution[idx], beta_barE[j], sd, l, u, lower, upper))
df = pd.DataFrame(index=self.active,
data=dict([(n, d) for n, d in zip(['variable',
'pval',
'lasso',
'onestep',
'sd',
'lower_confidence',
'upper_confidence',
'lower_truncation',
'upper_truncation'],
np.array(result).T)]))
df['variable'] = df['variable'].astype(int)
return df
@property
def soln(self):
"""
Solution to the lasso problem, set by `fit` method.
"""
if not hasattr(self, "lasso_solution"):
self.fit()
return self.lasso_solution
@classmethod
def gaussian(klass,
X,
Y,
feature_weights,
sigma=1.,
covariance_estimator=None,
quadratic=None,
approximate_inverse=None):
r"""
Squared-error LASSO with feature weights.
Objective function is
$$
\beta \mapsto \frac{1}{2} \|Y-X\beta\|^2_2 + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\lambda$ is `feature_weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
Y : ndarray
Shape (n,) -- the response.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
sigma : float (optional)
Standard deviation of the noise. Set to 1 if `covariance_estimator` is not None.
This scales the loglikelihood by `sigma**(-2)`.
covariance_estimator : callable (optional)
If None, use the parametric
covariance estimate of the selected model.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of some of the
rows and columns of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
"""
if covariance_estimator is not None:
sigma = 1.
loglike = glm.gaussian(X, Y, coef=1. / sigma ** 2, quadratic=quadratic)
return klass(loglike, np.asarray(feature_weights) / sigma ** 2,
approximate_inverse=approximate_inverse)
@classmethod
def logistic(klass,
X,
successes,
feature_weights,
trials=None,
covariance_estimator=None,
quadratic=None,
approximate_inverse=None):
r"""
Logistic LASSO with feature weights.
Objective function is
$$
\beta \mapsto \ell(X\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\ell$ is the negative of the logistic
log-likelihood (half the logistic deviance)
and $\lambda$ is `feature_weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
successes : ndarray
Shape (n,) -- response vector. An integer number of successes.
If the data are proportions, multiply the proportions
by the number of trials first.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
trials : ndarray (optional)
Number of trials per response, defaults to
a vector of ones with the same shape as `successes`.
covariance_estimator : optional
If None, use the parametric
covariance estimate of the selected model.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
"""
loglike = glm.logistic(X, successes, trials=trials, quadratic=quadratic)
return klass(loglike, feature_weights,
approximate_inverse=approximate_inverse)
@classmethod
def poisson(klass,
X,
counts,
feature_weights,
covariance_estimator=None,
quadratic=None,
approximate_inverse=None):
r"""
Poisson log-linear LASSO with feature weights.
Objective function is
$$
\beta \mapsto \ell^{\text{Poisson}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i|
$$
where $\ell^{\text{Poisson}}$ is the negative
of the log of the Poisson likelihood (half the deviance)
and $\lambda$ is `feature_weights`.
Parameters
----------
X : ndarray
Shape (n,p) -- the design matrix.
counts : ndarray
Shape (n,) -- the response.
feature_weights: [float, sequence]
Penalty weights. An intercept, or other unpenalized
features are handled by setting those entries of
`feature_weights` to 0. If `feature_weights` is
a float, then all parameters are penalized equally.
covariance_estimator : optional
If None, use the parametric
covariance estimate of the selected model.
quadratic : `regreg.identity_quadratic.identity_quadratic` (optional)
An optional quadratic term to be added to the objective.
Can also be a linear term by setting quadratic
coefficient to 0.
Returns
-------
L : `selection.algorithms.lasso.lasso`
Notes
-----
If not None, `covariance_estimator` should
take arguments (beta, active, inactive)
and return an estimate of the covariance of
$(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
the unpenalized estimator and the inactive
coordinates of the gradient of the likelihood at
the unpenalized estimator.
"""
loglike = glm.poisson(X, counts, quadratic=quadratic)
return klass(loglike, feature_weights,
approximate_inverse=approximate_inverse)
class ROSI_modelQ(lasso):
r"""
A class for the LASSO for post-selection inference
in which the quadratic form is specified by a fixed matrix $Q$
(for example, a population or semi-supervised estimate of $X^TX$).
The problem solved is
.. math::
\text{minimize}_{\beta} -(X\beta)^Ty + \frac{1}{2} \beta^TQ\beta +
\sum_i \lambda_i |\beta_i|
where $\lambda$ is `feature_weights`.
Notes
-----
In solving the debiasing problem to approximate the inverse
of (X^TWX) in a GLM, this class makes the implicit assumption
that the scaling of X is such that diag(X^TWX) is O(n)
with n=X.shape[0]. That is, X's are similar to IID samples
from a population that does not depend on n.
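Examples
--------
A sketch in which the sample Gram matrix $X^TX$ is plugged in for $Q$
(the data and the weight 20. are purely illustrative):

>>> import numpy as np
>>> rng = np.random.RandomState(11)
>>> X = rng.standard_normal((100, 5))
>>> y = 4 * X[:, 0] + rng.standard_normal(100)
>>> Q = X.T.dot(X)
>>> RQ = ROSI_modelQ(Q, X, y, feature_weights=20.)
>>> soln = RQ.fit()
>>> summary = RQ.summary(compute_intervals=False)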
"""
def __init__(self,
Q, # population or semi-supervised version of X.T.dot(X)
X,
y,
feature_weights):
r"""
Create a new post-selection object for the LASSO problem
Parameters
----------
Q : np.ndarray((p,p))
    Population or semi-supervised estimate of $X^TX$.
X : np.ndarray((n, p))
y : np.ndarray(n)
feature_weights : np.ndarray
Feature weights for L-1 penalty. If a float,
it is broadcast to all features.
"""
self.Q = Q
self.X, self.y = X, y
self._loss = quadratic_loss(Q.shape[0], Q=Q)
self._linear_term = identity_quadratic(0, 0, -X.T.dot(y), 0)
if np.asarray(feature_weights).shape == ():
feature_weights = np.ones(Q.shape[0]) * feature_weights
self.feature_weights = np.asarray(feature_weights)
def fit(self,
solve_args={'tol': 1.e-12, 'min_its': 50},
debiasing_args={}):
"""
Fit the lasso using `regreg`.
This sets the attributes `soln`, `onestep` and
forms the constraints necessary for post-selection inference
by calling `form_constraints()`.
Parameters
----------
lasso_solution : optional
If not None, this is taken to be the solution
of the optimization problem. No checks
are done, though the implied affine
constraints will generally not be satisfied.
solve_args : keyword args
Passed to `regreg.problems.simple_problem.solve`.
Returns
-------
soln : np.float
Solution to lasso.
Notes
-----
If `self` already has an attribute `lasso_solution`
this will be taken to be the solution and
no optimization problem will be solved. Supplying
the optional argument `lasso_solution` will
overwrite `self`'s `lasso_solution`.
"""
self._penalty = weighted_l1norm(self.feature_weights, lagrange=1.)
problem = simple_problem(self._loss, self._penalty)
self.lasso_solution = problem.solve(self._linear_term, **solve_args)
lasso_solution = self.lasso_solution # shorthand after setting it correctly above
if not np.all(lasso_solution == 0):
self.active = np.nonzero(lasso_solution != 0)[0]
self.inactive = lasso_solution == 0
self.active_signs = np.sign(lasso_solution[self.active])
self._active_soln = lasso_solution[self.active]
# Needed for finding truncation intervals
G = (self._loss.smooth_objective(lasso_solution, 'grad') +
self._linear_term.objective(lasso_solution, 'grad'))
self._Qbeta_bar = self.Q.dot(lasso_solution) - G
Q = self.Q
E = self.active
QiE = np.linalg.inv(Q)[E] # maybe we want to use a debiased estimate
self._QiE = QiE[:, E]
self._beta_barE = QiE.dot(self._Qbeta_bar)
# Pearson's X^2 to estimate sigma from relaxed estimator
y, X = self.y, self.X
n, p = X.shape
relaxed_beta_barE = np.linalg.inv(Q[E][:, E]).dot(X[:, E].T.dot(y))
self._pearson_sigma = np.sqrt(((y - X[:, E].dot(relaxed_beta_barE)) ** 2).sum() / (n - len(self.active)))
else:
self.active = []
self.inactive = np.arange(lasso_solution.shape[0])
return self.lasso_solution
def summary(self, level=0.95,
compute_intervals=False,
dispersion=None):
"""
Summary table for inference adjusted for selection.
Parameters
----------
level : float
Form level*100% selective confidence intervals.
compute_intervals : bool
Should we compute confidence intervals?
dispersion : float
Estimate of dispersion. Defaults to a Pearson's X^2 estimate in the relaxed model.
Returns
-------
pval_summary : pd.DataFrame
Data frame with one row per active variable.
Columns are 'variable', 'pval', 'lasso', 'onestep', 'sd', 'lower_confidence', 'upper_confidence', 'lower_truncation', 'upper_truncation'.
"""
if len(self.active) > 0:
X, y = self.X, self.y
sigma = self._pearson_sigma
if dispersion is None:
sqrt_dispersion = sigma
else:
sqrt_dispersion = np.sqrt(dispersion)
active_set, QiE, beta_barE, Qbeta_bar = self.active, self._QiE, self._beta_barE, self._Qbeta_bar
result = []
for j in range(len(active_set)):
idx = self.active[j]
lower, upper = _truncation_interval(Qbeta_bar,
self.Q,
QiE[j, j],
idx,
beta_barE[j],
self.feature_weights,
wide=False)
sd = sqrt_dispersion * np.sqrt(QiE[j, j])
tg = TG([(-np.inf, lower), (upper, np.inf)], scale=sd)
pvalue = tg.cdf(beta_barE[j])
pvalue = float(2 * min(pvalue, 1 - pvalue))
if compute_intervals:
l, u = tg.equal_tailed_interval(beta_barE[j], alpha=1 - level)
else:
l, u = np.nan, np.nan
result.append((idx, pvalue, self.lasso_solution[idx], beta_barE[j], sd, l, u, lower, upper))
df = pd.DataFrame(index=self.active,
data=dict([(n, d) for n, d in zip(['variable',
'pval',
'lasso',
'onestep',
'sd',
'lower_confidence',
'upper_confidence',
'lower_truncation',
'upper_truncation'],
np.array(result).T)]))
df['variable'] = df['variable'].astype(int)
return df