Source code for selectinf.learning.samplers

import numpy as np
from scipy.stats import norm as ndist

# randomization mechanism

[docs]class normal_sampler(object):

    """
    Our basic model for noise, and input to 
    selection algorithms. This represents
    Gaussian data with a center, e.g. X.T.dot(y)
    in linear regression and a covariance Sigma.

    This object emits noisy versions of `center` as

    center + scale * N(0, Sigma)

    """
[docs]    def __init__(self, center, covariance):
        '''
        Parameters
        ----------

        center : np.float(p)
            Center of Gaussian noise source.

        covariance : np.float((p, p))
            Covariance of noise added (up to scale factor).

        '''
        (self.center,
         self.covariance) = (np.asarray(center),
                             np.asarray(covariance))
        self.shape = self.center.shape

    def __call__(self, size=None, scale=1.):

        '''

        Parameters
        ----------

        size : tuple or int
            How many copies to draw

        scale : float
            Scale (in data units) applied to unitless noise before adding.

        Returns
        -------

        noisy_sample : np.float

        Generate noisy version of the center. With scale==0.,
        return the full center.

        TODO: for some calculations, a log of each call would be helpful
        for constructing UMVU, say.

        '''

        if not hasattr(self, 'cholT'):
            self.cholT = np.linalg.cholesky(self.covariance).T

        if type(size) == type(1):
            size = (size,)
        size = size or (1,)
        if self.shape == ():
            _shape = (1,)
        else:
            _shape = self.shape
        return scale * np.squeeze(np.random.standard_normal(size + _shape).dot(self.cholT)) + self.center

[docs]class split_sampler(object):

    """
    Data splitting noise source.
    This is approximately
    Gaussian with center np.sum(sample_stat, 0)
    and noise suitably scaled, depending
    on splitting fraction.

    This object emits noisy versions of `center` as

    center + scale * N(0, Sigma)

    """

[docs]    def __init__(self, sample_stat, covariance): # covariance of sum of rows
        '''
        Parameters
        ----------

        sample_stat : np.float((n, p))
             Data matrix. In linear regression this is X * y[:, None]

        covariance : np.float((p, p))
             Covariance of np.sum(sample_stat, 0). Could be computed
             e.g. by bootstrap or parametric method given a design X.
        '''
        self.sample_stat = np.asarray(sample_stat)
        self.nsample = self.sample_stat.shape[0]
        self.center = np.sum(self.sample_stat, 0)
        self.covariance = covariance
        self.shape = self.center.shape

    def __call__(self, size=None, scale=0.5):

        '''
        Parameters
        ----------

        size : tuple or int
            How many copies to draw

        scale : float
            Scale (in data units) applied to unitless noise before adding.

        Returns
        -------

        noisy_sample : np.float

        Generate noisy version of the center. With scale==0.,
        return the full center.

        The equivalent data splitting fraction is 1 / (scale**2 + 1).
        Argument is kept as `scale` instead of `frac` so that the general
        learning algorithm can replace this `splitter_source` with a corresponding
        `normal_source`.

        TODO: for some calculations, a log of each call would be helpful
        for constructing UMVU, say.
        '''

        # (1 - frac) / frac = scale**2

        frac = 1 / (scale**2 + 1)

        if type(size) == type(1):
            size = (size,)
        size = size or (1,)
        if self.shape == ():
            _shape = (1,)
        else:
            _shape = self.shape

        final_sample = []
        idx = np.arange(self.nsample)
        for _ in range(np.product(size)):
            sample_ = self.sample_stat[np.random.choice(idx, int(frac * self.nsample), replace=False)]
            final_sample.append(np.sum(sample_, 0) / frac) # rescale to the scale of a sum of nsample rows
        val = np.squeeze(np.array(final_sample).reshape(size + _shape))
        return val