Source code for nipy.algorithms.statistics.formula.formulae

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:

'''
Formula objects
===============

A formula is basically a sympy expression for the mean of something of
the form::

   mean = sum([Beta(e)*e for e in expr])

Or, a linear combination of sympy expressions, with each one multiplied
by its own "Beta". The elements of expr can be instances of Term (for a
linear regression formula, they would all be instances of Term). But, in
general, there might be some other parameters (i.e. sympy.Symbol
instances) that are not Terms.

The design matrix is made up of columns that are the derivatives of mean
with respect to everything that is not a Term, evaluted at a recarray
that has field names given by [str(t) for t in self.terms].

For those familiar with R's formula syntax, if we wanted a design matrix
like the following::

    > s.table = read.table("http://www-stat.stanford.edu/~jtaylo/courses/stats191/data/supervisor.table", header=T)
    > d = model.matrix(lm(Y ~ X1*X3, s.table)
    )
    > d
       (Intercept) X1 X3 X1:X3
    1            1 51 39  1989
    2            1 64 54  3456
    3            1 70 69  4830
    4            1 63 47  2961
    5            1 78 66  5148
    6            1 55 44  2420
    7            1 67 56  3752
    8            1 75 55  4125
    9            1 82 67  5494
    10           1 61 47  2867
    11           1 53 58  3074
    12           1 60 39  2340
    13           1 62 42  2604
    14           1 83 45  3735
    15           1 77 72  5544
    16           1 90 72  6480
    17           1 85 69  5865
    18           1 60 75  4500
    19           1 70 57  3990
    20           1 58 54  3132
    21           1 40 34  1360
    22           1 61 62  3782
    23           1 66 50  3300
    24           1 37 58  2146
    25           1 54 48  2592
    26           1 77 63  4851
    27           1 75 74  5550
    28           1 57 45  2565
    29           1 85 71  6035
    30           1 82 59  4838
    attr(,"assign")
    [1] 0 1 2 3
    >

With the Formula, it looks like this:

>>> r = np.rec.array([
...     (43, 51, 30, 39, 61, 92, 45), (63, 64, 51, 54, 63, 73, 47), 
...     (71, 70, 68, 69, 76, 86, 48), (61, 63, 45, 47, 54, 84, 35),
...     (81, 78, 56, 66, 71, 83, 47), (43, 55, 49, 44, 54, 49, 34),
...     (58, 67, 42, 56, 66, 68, 35), (71, 75, 50, 55, 70, 66, 41),
...     (72, 82, 72, 67, 71, 83, 31), (67, 61, 45, 47, 62, 80, 41),
...     (64, 53, 53, 58, 58, 67, 34), (67, 60, 47, 39, 59, 74, 41),
...     (69, 62, 57, 42, 55, 63, 25), (68, 83, 83, 45, 59, 77, 35),
...     (77, 77, 54, 72, 79, 77, 46), (81, 90, 50, 72, 60, 54, 36),
...     (74, 85, 64, 69, 79, 79, 63), (65, 60, 65, 75, 55, 80, 60),
...     (65, 70, 46, 57, 75, 85, 46), (50, 58, 68, 54, 64, 78, 52),
...     (50, 40, 33, 34, 43, 64, 33), (64, 61, 52, 62, 66, 80, 41),
...     (53, 66, 52, 50, 63, 80, 37), (40, 37, 42, 58, 50, 57, 49),
...     (63, 54, 42, 48, 66, 75, 33), (66, 77, 66, 63, 88, 76, 72),
...     (78, 75, 58, 74, 80, 78, 49), (48, 57, 44, 45, 51, 83, 38),
...     (85, 85, 71, 71, 77, 74, 55), (82, 82, 39, 59, 64, 78, 39)],
...              dtype=[('y', '<i8'), ('x1', '<i8'), ('x2', '<i8'),
...                     ('x3', '<i8'), ('x4', '<i8'), ('x5', '<i8'),
...                     ('x6', '<i8')])
>>> x1 = Term('x1'); x3 = Term('x3')
>>> f = Formula([x1, x3, x1*x3]) + I
>>> f.mean
_b0*x1 + _b1*x3 + _b2*x1*x3 + _b3

The I is the "intercept" term, I have explicity not used R's default of
adding it to everything.

>>> f.design(r)  #doctest: +STRUCTARR_EQUAL
array([(51.0, 39.0, 1989.0, 1.0), (64.0, 54.0, 3456.0, 1.0),
       (70.0, 69.0, 4830.0, 1.0), (63.0, 47.0, 2961.0, 1.0),
       (78.0, 66.0, 5148.0, 1.0), (55.0, 44.0, 2420.0, 1.0),
       (67.0, 56.0, 3752.0, 1.0), (75.0, 55.0, 4125.0, 1.0),
       (82.0, 67.0, 5494.0, 1.0), (61.0, 47.0, 2867.0, 1.0),
       (53.0, 58.0, 3074.0, 1.0), (60.0, 39.0, 2340.0, 1.0),
       (62.0, 42.0, 2604.0, 1.0), (83.0, 45.0, 3735.0, 1.0),
       (77.0, 72.0, 5544.0, 1.0), (90.0, 72.0, 6480.0, 1.0),
       (85.0, 69.0, 5865.0, 1.0), (60.0, 75.0, 4500.0, 1.0),
       (70.0, 57.0, 3990.0, 1.0), (58.0, 54.0, 3132.0, 1.0),
       (40.0, 34.0, 1360.0, 1.0), (61.0, 62.0, 3782.0, 1.0),
       (66.0, 50.0, 3300.0, 1.0), (37.0, 58.0, 2146.0, 1.0),
       (54.0, 48.0, 2592.0, 1.0), (77.0, 63.0, 4851.0, 1.0),
       (75.0, 74.0, 5550.0, 1.0), (57.0, 45.0, 2565.0, 1.0),
       (85.0, 71.0, 6035.0, 1.0), (82.0, 59.0, 4838.0, 1.0)], 
      dtype=[('x1', '<f8'), ('x3', '<f8'), ('x1*x3', '<f8'), ('1', '<f8')])
'''
from __future__ import print_function
from __future__ import absolute_import

import warnings
import itertools
from string import ascii_letters, digits

import numpy as np
from scipy.linalg import pinv

import sympy
from sympy import Dummy, default_sort_key
from sympy.utilities.lambdify import (implemented_function, lambdify)

from nipy.utils import _NoValue, VisibleDeprecationWarning
from nipy.utils.compat3 import to_str

from nipy.algorithms.utils.matrices import matrix_rank, full_rank

# Legacy repr printing from numpy.
from nipy.testing import legacy_printing as setup_module  # noqa


@np.deprecate(message = "Please use sympy.Dummy instead of this function")
def make_dummy(name):
    """ Make dummy variable of given name

    Parameters
    ----------
    name : str
        name of dummy variable

    Returns
    -------
    dum : `Dummy` instance

    Notes
    -----
    The interface to Dummy changed between 0.6.7 and 0.7.0, and we used this
    function to keep compatibility. Now we depend on sympy 0.7.0 and this
    function is obsolete.
    """
    return Dummy(name)


[docs]def define(*args, **kwargs):
    # Moved to utils module
    import warnings
    from . import utils
    warnings.warn('Please use define function from utils module',
                  DeprecationWarning,
                  stacklevel=2)
    return utils.define(*args, **kwargs)


[docs]class Term(sympy.Symbol):
    """A sympy.Symbol type to represent a term an a regression model

    Terms can be added to other sympy expressions with the single
    convention that a term plus itself returns itself.

    It is meant to emulate something on the right hand side of a formula
    in R. In particular, its name can be the name of a field in a
    recarray used to create a design matrix.

    >>> t = Term('x')
    >>> xval = np.array([(3,),(4,),(5,)], np.dtype([('x', np.float)]))
    >>> f = t.formula
    >>> d = f.design(xval)
    >>> print(d.dtype.descr)
    [('x', '<f8')]
    >>> f.design(xval, return_float=True)
    array([ 3.,  4.,  5.])
    """
    # This flag is defined to avoid using isinstance in getterms
    # and getparams.
    _term_flag = True

    def _getformula(self):
        return Formula([self])
    formula = property(_getformula,
                       doc="Return a Formula with only terms=[self].")

    def __add__(self, other):
        if self == other:
            return self
        return sympy.Symbol.__add__(self, other)


# time symbol
T = Term('t')


[docs]def terms(names, **kwargs):
    ''' Return list of terms with names given by `names`

    This is just a convenience in defining a set of terms, and is the
    equivalent of ``sympy.symbols`` for defining symbols in sympy.

    We enforce the sympy 0.7.0 behavior of returning symbol "abc" from input
    "abc", rthan than 3 symbols "a", "b", "c".

    Parameters
    ----------
    names : str or sequence of str
       If a single str, can specify multiple ``Term``s with string
       containing space or ',' as separator.
    \\**kwargs : keyword arguments
       keyword arguments as for ``sympy.symbols``

    Returns
    -------
    ts : ``Term`` or tuple
       ``Term`` instance or list of ``Term`` instance objects named from `names`

    Examples
    --------
    >>> terms(('a', 'b', 'c'))
    (a, b, c)
    >>> terms('a, b, c')
    (a, b, c)
    >>> terms('abc')
    abc
    '''
    if 'each_char' in kwargs:
        raise TypeError('deprecated "each_char" kwarg removed in sympy>0.7.3')
    syms = sympy.symbols(names, **kwargs)
    try:
        len(syms)
    except TypeError:
        return Term(syms.name)
    return tuple(Term(s.name) for s in syms)


[docs]class FactorTerm(Term):
    """ Boolean Term derived from a Factor.

    Its properties are the same as a Term except that its product with
    itself is itself.
    """
    # This flag is defined to avoid using isinstance in getterms
    _factor_term_flag = True

    def __new__(cls, name, level):
        # Names or levels can be byte strings
        new = Term.__new__(cls, "%s_%s" % (to_str(name), to_str(level)))
        new.level = level
        new.factor_name = name
        return new

    def __mul__(self, other):

        if self == other:
            return self
        else:
            return sympy.Symbol.__mul__(self, other)


[docs]class Beta(sympy.symbol.Dummy):
    ''' A symbol tied to a Term `term` '''
    def __new__(cls, name, term):
        new = sympy.symbol.Dummy.__new__(cls, name)
        new._term = term
        return new


[docs]def getparams(expression):
    """ Return the parameters of an expression that are not Term 
    instances but are instances of sympy.Symbol.

    Examples
    --------
    >>> x, y, z = [Term(l) for l in 'xyz']
    >>> f = Formula([x,y,z])
    >>> getparams(f)
    []
    >>> f.mean
    _b0*x + _b1*y + _b2*z
    >>> getparams(f.mean)
    [_b0, _b1, _b2]
    >>> th = sympy.Symbol('theta')
    >>> f.mean*sympy.exp(th)
    (_b0*x + _b1*y + _b2*z)*exp(theta)
    >>> getparams(f.mean*sympy.exp(th))
    [_b0, _b1, _b2, theta]
    """
    atoms = set([])
    expression = np.array(expression)
    if expression.shape == ():
        expression = expression.reshape((1,))
    if expression.ndim > 1:
        expression = expression.reshape((np.product(expression.shape),))
    for term in expression:
        atoms = atoms.union(sympy.sympify(term).atoms())
    params = []
    for atom in atoms:
        if isinstance(atom, sympy.Symbol) and not is_term(atom):
            params.append(atom)
    params.sort(key=default_sort_key)
    return params


[docs]def getterms(expression):
    """ Return the all instances of Term in an expression.

    Examples
    --------
    >>> x, y, z = [Term(l) for l in 'xyz']
    >>> f = Formula([x,y,z])
    >>> getterms(f)
    [x, y, z]
    >>> getterms(f.mean)
    [x, y, z]
    """
    atoms = set([])
    expression = np.array(expression)
    if expression.shape == ():
        expression = expression.reshape((1,))
    if expression.ndim > 1:
        expression = expression.reshape((np.product(expression.shape),))
    for e in expression:
        atoms = atoms.union(e.atoms())
    terms = []
    for atom in atoms:
        if is_term(atom):
            terms.append(atom)
    terms.sort(key=default_sort_key)
    return terms


def _recarray_from_array(arr, names, drop_name_dim=_NoValue):
    """ Create recarray from input array `arr`, field names `names`
    """
    if not arr.dtype.isbuiltin:  # Structured array as input
        # Rename fields
        dtype = np.dtype([(n, d[1]) for n, d in zip(names, arr.dtype.descr)])
        return arr.view(dtype)
    # Can drop name axis for > 1D arrays or row vectors (scalar per name).
    can_name_drop = arr.ndim > 1 or len(names) > 1
    if can_name_drop and drop_name_dim is _NoValue:
        warnings.warn(
            'Default behavior of make_recarray and > 1D arrays will '
            'change in next Nipy release.  Current default returns\n'
            'array with same number of dimensions as input, with '
            'axis corresponding to the field names having length 1\n; '
            'Future default will be to drop this length 1 axis. Please '
            'change your code to use explicit True or False for\n'
            'compatibility with future Nipy.',
            VisibleDeprecationWarning,
            stacklevel=2)
        # This default will change to True in next version of Nipy
        drop_name_dim = False
    dtype = np.dtype([(n, arr.dtype) for n in names])
    # At least for numpy <= 1.7.1, the dimension that numpy applies the names
    # to depends on the memory layout (C or F).  Ensure C layout for consistent
    # application of names to last dimension.
    rec_arr = np.ascontiguousarray(arr).view(dtype)
    if can_name_drop and drop_name_dim:
        rec_arr.shape = arr.shape[:-1]
    return rec_arr


[docs]def make_recarray(rows, names, dtypes=None, drop_name_dim=_NoValue):
    """ Create recarray from `rows` with field names `names`

    Create a recarray with named columns from a list or ndarray of `rows` and
    sequence of `names` for the columns. If `rows` is an ndarray, `dtypes` must
    be None, otherwise we raise a ValueError. Otherwise, if `dtypes` is None,
    we cast the data in all columns in `rows` as np.float. If `dtypes` is not
    None, the routine uses `dtypes` as a dtype specifier for the output
    structured array.

    Parameters
    ----------
    rows: list or array
        Rows that will be turned into an recarray.
    names: sequence
        Sequence of strings - names for the columns.
    dtypes: None or sequence of str or sequence of np.dtype, optional
        Used to create a np.dtype, can be sequence of np.dtype or string.
    drop_name_dim : {_NoValue, False, True}, optional
        Flag for compatibility with future default behavior.  Current default
        is False.  If True, drops the length 1 dimension corresponding to the
        axis transformed into fields when converting into a recarray.  If
        _NoValue specified, gives default.  Default will change to True in the
        next version of Nipy.

    Returns
    -------
    v : np.ndarray
        Structured array with field names given by `names`.

    Examples
    --------
    The following tests depend on machine byte order for their exact output.

    >>> arr = np.array([[3, 4], [4, 6], [6, 8]])
    >>> make_recarray(arr, ['x', 'y'],
    ...               drop_name_dim=True) #doctest: +STRUCTARR_EQUAL
    array([(3, 4), (4, 6), (6, 8)],
          dtype=[('x', '<i8'), ('y', '<i8')])
    >>> make_recarray(arr, ['x', 'y'],
    ...               drop_name_dim=False) #doctest: +STRUCTARR_EQUAL
    array([[(3, 4)],
           [(4, 6)],
           [(6, 8)]],
          dtype=[('x', '<i8'), ('y', '<i8')])
    >>> r = make_recarray(arr, ['w', 'u'], drop_name_dim=True)
    >>> make_recarray(r, ['x', 'y'],
    ...               drop_name_dim=True) #doctest: +STRUCTARR_EQUAL
    array([(3, 4), (4, 6), (6, 8)],
          dtype=[('x', '<i8'), ('y', '<i8')])
    >>> make_recarray([[3, 4], [4, 6], [7, 9]], 'wv',
    ...               [np.float, np.int])  #doctest: +STRUCTARR_EQUAL
    array([(3.0, 4), (4.0, 6), (7.0, 9)],
          dtype=[('w', '<f8'), ('v', '<i8')])

    Raises
    ------
    ValueError
        `dtypes` not None when `rows` is array.
    """
    # XXX This function is sort of one of convenience
    # Would be nice to use DataArray or something like that
    # to add axis names.
    if isinstance(rows, np.ndarray):
        if dtypes is not None:
            raise ValueError('dtypes not used if rows is an ndarray')
        return _recarray_from_array(rows, names, drop_name_dim)
    # Structured array from list
    if dtypes is None:
        dtype = np.dtype([(n, np.float) for n in names])
    else:
        dtype = np.dtype([(n, d) for n, d in zip(names, dtypes)])
    # Peek at first value in iterable
    irows = iter(rows)
    row0 = next(irows)
    irows = itertools.chain([row0], irows)
    if np.array(row0).shape == ():  # a vector
        if len(names) != 1:  # a 'row vector'
            return np.array(tuple(irows), dtype)
        return np.array([(r,) for r in irows], dtype)
    return np.array([tuple(r) for r in irows], dtype)


[docs]class Formula(object):
    """ A Formula is a model for a mean in a regression model.

    It is often given by a sequence of sympy expressions, with the mean
    model being the sum of each term multiplied by a linear regression
    coefficient.

    The expressions may depend on additional Symbol instances, giving a
    non-linear regression model.
    """
    # This flag is defined for test isformula(obj) instead of isinstance
    _formula_flag = True

[docs]    def __init__(self, seq, char = 'b'):
        """
        Parameters
        ----------
        seq : sequence of ``sympy.Basic``
        char : str, optional
            character for regression coefficient
        """
        self._terms = np.asarray(seq)
        self._counter = 0
        self.char = char

    # Properties
    def _getcoefs(self):
        if not hasattr(self, '_coefs'):
            self._coefs = {}
            for term in self.terms:
                self._coefs.setdefault(term, Beta("%s%d" % (self.char, self._counter), term))
                self._counter += 1
        return self._coefs
    coefs = property(_getcoefs, doc='Coefficients in the linear regression formula.')

    def _getterms(self):
        t = self._terms
        # The Rmode flag is meant to emulate R's implicit addition of an 
        # intercept to every formula. It currently cannot be changed.
        Rmode = False
        if Rmode:
            if sympy.Number(1) not in self._terms:
                t = np.array(list(t) + [sympy.Number(1)])
        return t
    terms = property(_getterms, doc='Terms in the linear regression formula.')

    def _getmean(self):
        """ Expression for mean

        Expression for the mean, expressed as a linear combination of
        terms, each with dummy variables in front.
        """
        b = [self.coefs[term] for term in self.terms]
        return np.sum(np.array(b)*self.terms)
    mean = property(_getmean, doc="Expression for the mean, expressed "
                    "as a linear combination of terms, each with dummy "
                    "variables in front.")

    def _getdiff(self):
        params = sorted(list(set(getparams(self.mean))), key=default_sort_key)
        return [sympy.diff(self.mean, p).doit() for p in params]
    design_expr = property(_getdiff)

    def _getdtype(self):
        vnames = [str(s) for s in self.design_expr]
        return np.dtype([(n, np.float) for n in vnames])
    dtype = property(_getdtype, doc='The dtype of the design matrix of the Formula.')

    def __repr__(self):
        return "Formula(%r)" % (list(self.terms),)

    def __getitem__(self, key):
        """ Return the term such that str(term) == key.

        Parameters
        ----------
        key : str
            name of term to retrieve

        Returns
        -------
        term : sympy.Expression
        """
        names = [str(t) for t in self.terms]
        try:
            idx = names.index(key)
        except ValueError:
            raise ValueError('term %s not found' % key)
        return self.terms[idx]

[docs]    @staticmethod
    def fromrec(rec, keep=[], drop=[]):
        """ Construct Formula from recarray

        For fields with a string-dtype, it is assumed that these are
        qualtiatitve regressors, i.e. Factors.

        Parameters
        ----------
        rec: recarray
            Recarray whose field names will be used to create a formula.
        keep: []
            Field names to explicitly keep, dropping all others.
        drop: []
            Field names to drop.
        """
        f = {}
        for n in rec.dtype.names:
            if rec[n].dtype.kind in 'SOU':
                f[n] = Factor.fromcol(rec[n], n)
            else:
                f[n] = Term(n).formula
        for d in drop:
            del(f[d])
        if keep:
            return np.sum([t for n, t in f.items() if n in keep])
        else:
            return np.sum(list(f.values()))

[docs]    def subs(self, old, new):
        """ Perform a sympy substitution on all terms in the Formula

        Returns a new instance of the same class

        Parameters
        ----------
        old : sympy.Basic
           The expression to be changed
        new : sympy.Basic
           The value to change it to.

        Returns
        -------
        newf : Formula

        Examples
        --------
        >>> s, t = [Term(l) for l in 'st']
        >>> f, g = [sympy.Function(l) for l in 'fg']
        >>> form = Formula([f(t),g(s)])
        >>> newform = form.subs(g, sympy.Function('h'))
        >>> newform.terms
        array([f(t), h(s)], dtype=object)
        >>> form.terms
        array([f(t), g(s)], dtype=object)
        """
        return self.__class__([term.subs(old, new) for term in self.terms])

    def __add__(self, other):
        """ New Formula combining terms of `self` with those of `other`.

        Parameters
        ----------
        other : Formula instance
            Object for which ``is_formula(other)`` is True

        Returns
        -------
        added : Formula instance
            Formula combining terms of `self` with terms of `other`

        Examples
        --------
        >>> x, y, z = [Term(l) for l in 'xyz']
        >>> f1 = Formula([x,y,z])
        >>> f2 = Formula([y])+I
        >>> f3=f1+f2
        >>> sorted(f1.terms, key=default_sort_key)
        [x, y, z]
        >>> sorted(f2.terms, key=default_sort_key)
        [1, y]
        >>> sorted(f3.terms, key=default_sort_key)
        [1, x, y, y, z]
        """
        if not is_formula(other):
            raise ValueError('only Formula objects can be added to a Formula')
        f = Formula(np.hstack([self.terms, other.terms]))
        return f

    def __sub__(self, other):
        """ New Formula by deleting terms in `other` from those in `self`

        Create and return a new Formula by deleting terms in `other` from those
        in `self`.

        No exceptions are raised for terms in `other` that do not appear in
        `self`.

        Parameters
        ----------
        other : Formula instance
            Object for which ``is_formula(other)`` is True

        Returns
        -------
        subbed : Formula instance
            Formula with terms of `other` removed from terms of `self`

        Examples
        --------
        >>> x, y, z = [Term(l) for l in 'xyz']
        >>> f1 = Formula([x, y, z])
        >>> f2 = Formula([y]) + I
        >>> f1.mean
        _b0*x + _b1*y + _b2*z
        >>> f2.mean
        _b0*y + _b1
        >>> f3 = f2 - f1
        >>> f3.mean
        _b0
        >>> f4 = f1 - f2
        >>> f4.mean
        _b0*x + _b1*z
        """
        if not is_formula(other):
            raise ValueError(
                'only Formula objects can be subtracted from a Formula')
        # Preserve order of terms in subtraction
        unwanted = set(other.terms)
        d = [term for term in self.terms if term not in unwanted]
        return Formula(d)

    def __array__(self):
        return self.terms

    def _getparams(self):
        return getparams(self.mean)
    params = property(_getparams, doc='The parameters in the Formula.')

    def __mul__(self, other):
        if not is_formula(other):
            raise ValueError('only two Formulas can be multiplied together')
        if is_factor(self):
            if self == other:
                return self
        v = []
        # Compute the pairwise product of each term
        # If either one is a Term, use Term's multiplication
        for sterm in self.terms:
            for oterm in other.terms:
                if is_term(sterm):
                    v.append(Term.__mul__(sterm, oterm))
                elif is_term(oterm):
                    v.append(Term.__mul__(oterm, sterm))
                else:
                    v.append(sterm*oterm)
        terms = sorted(set(v), key=default_sort_key)
        return Formula(tuple(terms))

    def __eq__(self, other):
        s = np.array(self)
        o = np.array(other)
        if s.shape != o.shape:
            return False
        return np.alltrue(np.equal(np.array(self), np.array(other)))

    def _setup_design(self):
        """ Initialize design

        Create a callable object to evaluate the design matrix at a given set
        of parameter values to be specified by a recarray and observed Term
        values, also specified by a recarray.
        """
        # the design expression is the differentiation of the expression
        # for the mean.  It is a list
        d = self.design_expr
        # Before evaluating, we recreate the formula
        # with numbered terms, and numbered parameters.

        # This renaming has no impact on the
        # final design matrix as the
        # callable, self._f below, is a lambda
        # that does not care about the names of the terms.

        # First, find all terms in the mean expression,
        # and rename them in the form "__t%d__" with a
        # random offset.
        # This may cause a possible problem
        # when there are parameters named something like "__t%d__".
        # Using the random offset will minimize the possibility
        # of this happening.

        # This renaming is here principally because of the intercept.

        random_offset = np.random.randint(low=0, high=2**30)

        terms = getterms(self.mean)

        newterms = []
        for i, t in enumerate(terms):
            newt = sympy.Symbol("__t%d__" % (i + random_offset))
            for j, _ in enumerate(d):
                d[j] = d[j].subs(t, newt)
            newterms.append(newt)

        # Next, find all parameters that remain in the design expression.
        # In a standard regression model, there will be no parameters
        # because they will all be differentiated away in computing
        # self.design_expr. In nonlinear models, parameters will remain.

        params = getparams(self.design_expr)
        newparams = []
        for i, p in enumerate(params):
            newp = Dummy("__p%d__" % (i + random_offset))
            for j, _ in enumerate(d):
                d[j] = d[j].subs(p, newp)
            newparams.append(newp)

        # If there are any aliased functions, these need to be added
        # to the name space before sympy lambdifies the expression

        # These "aliased" functions are used for things like
        # the natural splines, etc. You can represent natural splines
        # with sympy but the expression is pretty awful.  Note that
        # ``d`` here is list giving the differentiation of the
        # expression for the mean.  self._f(...) therefore also returns
        # a list
        self._f = lambdify(newparams + newterms, d, ("numpy"))

        # The input to self.design will be a recarray of that must 
        # have field names that the Formula will expect to see.
        # However, if any of self.terms are FactorTerms, then the field
        # in the recarray will not actually be in the Term.
        # 
        # For example, if there is a Factor 'f' with levels ['a','b'],
        # there will be terms 'f_a' and 'f_b', though the input to
        # design will have a field named 'f'. In this sense,
        # the recarray used in the call to self.design
        # is not really made up of terms, but "preterms".

        # In this case, the callable

        preterm = []
        for t in terms:
            if not is_factor_term(t):
                preterm.append(str(t))
            else:
                preterm.append(t.factor_name)
        preterm = list(set(preterm))

        # There is also an argument for parameters that are not
        # Terms. 

        self._dtypes = {'param':np.dtype([(str(p), np.float) for p in params]),
                        'term':np.dtype([(str(t), np.float) for t in terms]),
                        'preterm':np.dtype([(n, np.float) for n in preterm])}

        self.__terms = terms

[docs]    def design(self,
               input,
               param=None,
               return_float=False,
               contrasts=None):
        """ Construct the design matrix, and optional contrast matrices.

        Parameters
        ----------
        input : np.recarray
           Recarray including fields needed to compute the Terms in
           getparams(self.design_expr).
        param : None or np.recarray
           Recarray including fields that are not Terms in
           getparams(self.design_expr)
        return_float : bool, optional
           If True, return a np.float array rather than a np.recarray
        contrasts : None or dict, optional
           Contrasts. The items in this dictionary should be (str,
           Formula) pairs where a contrast matrix is constructed for
           each Formula by evaluating its design at the same parameters
           as self.design. If not None, then the return_float is set to True.

        Returns
        -------
        des : 2D array
            design matrix
        cmatrices : dict, optional
            Dictionary with keys from `contrasts` input, and contrast matrices
            corresponding to `des` design matrix.  Returned only if `contrasts`
            input is not None
        """
        self._setup_design()

        preterm_recarray = input
        param_recarray = param

        # The input to design should have field names for all fields in self._dtypes['preterm']
        if not set(preterm_recarray.dtype.names).issuperset(self._dtypes['preterm'].names):
            raise ValueError("for term, expecting a recarray with "
                             "dtype having the following names: %r"
                             % (self._dtypes['preterm'].names,))
        # The parameters should have field names for all fields in self._dtypes['param']
        if param_recarray is not None:
            if not set(param_recarray.dtype.names).issuperset(self._dtypes['param'].names):
                raise ValueError("for param, expecting a recarray with "
                                 "dtype having the following names: %r"
                                 % (self._dtypes['param'].names,))
        # If the only term is an intercept,
        # the return value is a matrix of 1's.
        if list(self.terms) == [sympy.Number(1)]:
            a = np.ones(preterm_recarray.shape[0], np.float)
            if not return_float:
                a = a.view(np.dtype([('intercept', np.float)]))
            return a
        elif not self._dtypes['term']:
            raise ValueError("none of the expresssions are self.terms "
                             "are Term instances; shape of resulting "
                             "undefined")
        # The term_recarray is essentially the same as preterm_recarray,
        # except that all factors in self are expanded
        # into their respective binary columns.
        term_recarray = np.zeros(preterm_recarray.shape[0],
                                 dtype=self._dtypes['term'])
        for t in self.__terms:
            if not is_factor_term(t):
                term_recarray[t.name] = preterm_recarray[t.name]
            else:
                factor_col = preterm_recarray[t.factor_name]
                # Python 3: If column type is bytes, convert to string, to allow
                # level comparison
                if factor_col.dtype.kind == 'S':
                    factor_col = factor_col.astype('U')
                fl_ind =  np.array([x == t.level
                                    for x in factor_col]).reshape(-1)
                term_recarray['%s_%s' % (t.factor_name, t.level)] = fl_ind
        # The lambda created in self._setup_design needs to take a tuple of
        # columns as argument, not an ndarray, so each column
        # is extracted and put into float_tuple.
        float_array = term_recarray.view(np.float)
        float_array.shape = (term_recarray.shape[0], -1)
        float_array = float_array.T
        float_tuple = tuple(float_array)
        # If there are any parameters, they also must be extracted
        # and put into a tuple with the order specified
        # by self._dtypes['param']
        if param_recarray is not None:
            param = tuple(float(param_recarray[n]) for n in self._dtypes['param'].names)
        else:
            param = ()
        # Evaluate the design at the parameters and tuple of arrays
        D = self._f(*(param+float_tuple))
        # TODO: check if this next stepis necessary
        # I think it is because the lambda evaluates sympy.Number(1) to 1
        # and not an array.
        D_tuple = [np.asarray(w) for w in D]

        need_to_modify_shape = []
        OK_row_shapes = []
        for i, row in enumerate(D_tuple):
            if row.shape in [(),(1,)]:
                need_to_modify_shape.append(i)
            else:
                OK_row_shapes.append(row.shape[0])
        # Make sure that each array has the correct shape.
        # The columns in need_to_modify should just be
        # the intercept column, which evaluates to have shape == ().
        # This makes sure that it has the correct number of rows.
        for i in need_to_modify_shape:
            D_tuple[i].shape = ()
            D_tuple[i] = np.multiply.outer(D_tuple[i], np.ones(preterm_recarray.shape[0]))
        # At this point, all the columns have the correct shape and the
        # design matrix is almost ready to output.
        D = np.array(D_tuple).T
        # If we will return a float matrix or any contrasts,
        # we may have some reshaping to do.
        if contrasts is None:
            contrasts = {}
        if return_float or contrasts:
            # If the design matrix is just a column of 1s
            # return a 1-dimensional array.
            D = np.squeeze(D.astype(np.float))
            # If there are contrasts, the pseudo-inverse of D
            # must be computed.
            if contrasts:
                if D.ndim == 1:
                    _D = D.reshape((D.shape[0], 1))
                else:
                    _D = D
                pinvD = np.linalg.pinv(_D)
        else:
            # Correct the dtype.
            # XXX There seems to be a lot of messing around with the dtype.
            # This would be a convenient place to just add
            # labels like a DataArray.
            D = np.array([tuple(r) for r in D], self.dtype)
        # Compute the contrast matrices, if any.
        if contrasts:
            cmatrices = {}
            for key, cf in contrasts.items():
                if not is_formula(cf):
                    cf = Formula([cf])
                L = cf.design(input, param=param_recarray, 
                              return_float=True)
                cmatrices[key] = contrast_from_cols_or_rows(L, _D, pseudo=pinvD)
            return D, cmatrices
        else:
            return D


[docs]def natural_spline(t, knots=None, order=3, intercept=False):
    """ Return a Formula containing a natural spline

    Spline for a Term with specified `knots` and `order`.

    Parameters
    ----------
    t : ``Term``
    knots : None or sequence, optional
       Sequence of float.  Default None (same as empty list)
    order : int, optional
       Order of the spline. Defaults to a cubic (==3)
    intercept : bool, optional
       If True, include a constant function in the natural
       spline. Default is False

    Returns
    -------
    formula : Formula
         A Formula with (len(knots) + order) Terms (if intercept=False,
         otherwise includes one more Term), made up of the natural spline
         functions.

    Examples
    --------
    >>> x = Term('x')
    >>> n = natural_spline(x, knots=[1,3,4], order=3)
    >>> xval = np.array([3,5,7.]).view(np.dtype([('x', np.float)]))
    >>> n.design(xval, return_float=True)
    array([[   3.,    9.,   27.,    8.,    0.,   -0.],
           [   5.,   25.,  125.,   64.,    8.,    1.],
           [   7.,   49.,  343.,  216.,   64.,   27.]])
    >>> d = n.design(xval)
    >>> print(d.dtype.descr)
    [('ns_1(x)', '<f8'), ('ns_2(x)', '<f8'), ('ns_3(x)', '<f8'), ('ns_4(x)', '<f8'), ('ns_5(x)', '<f8'), ('ns_6(x)', '<f8')]
    """
    if knots is None:
        knots = {}
    fns = []
    for i in range(order+1):
        n = 'ns_%d' % i
        def f(x, i=i):
            return x**i
        s = implemented_function(n, f)
        fns.append(s(t))

    for j, k in enumerate(knots):
        n = 'ns_%d' % (j+i+1,)
        def f(x, k=k, order=order):
            return (x-k)**order * np.greater(x, k)
        s = implemented_function(n, f)
        fns.append(s(t))

    if not intercept:
        fns.pop(0)

    ff = Formula(fns)
    return ff

# The intercept formula

I = Formula([sympy.Number(1)])


[docs]class Factor(Formula):
    """ A qualitative variable in a regression model

    A Factor is similar to R's factor. The levels of the Factor can be
    either strings or ints.
    """
    # This flag is defined to avoid using isinstance in getterms
    # and getparams.
    _factor_flag = True

[docs]    def __init__(self, name, levels, char='b'):
        """ Initialize Factor

        Parameters
        ----------
        name : str
        levels : [str or int]
            A sequence of strings or ints.
        char : str, optional
            prefix character for regression coefficients
        """
        # Check whether they can all be cast to strings or ints without
        # loss.
        levelsarr = np.asarray(levels)
        if levelsarr.ndim == 0 and levelsarr.dtype.kind in 'SOU':
            levelsarr = np.asarray(list(levels))
        if levelsarr.dtype.kind not in 'SOU': # the levels are not strings
            if not np.alltrue(np.equal(levelsarr, np.round(levelsarr))):
                raise ValueError('levels must be strings or ints')
            levelsarr = levelsarr.astype(np.int)
        elif levelsarr.dtype.kind == 'S': # Byte strings, convert
            levelsarr = levelsarr.astype('U')
        Formula.__init__(self, [FactorTerm(name, l) for l in levelsarr],
                         char=char)
        self.levels = list(levelsarr)
        self.name = name

    # TODO: allow different specifications of the contrasts
    # here.... this is like R's contr.sum

[docs]    def get_term(self, level):
        """
        Retrieve a term of the Factor...
        """
        if level not in self.levels:
            raise ValueError('level not found')
        return self["%s_%s" % (self.name, str(level))]

    def _getmaineffect(self, ref=-1):
        v = list(self._terms.copy())
        ref_term = v[ref]
        v.pop(ref)
        return Formula([vv - ref_term for vv in v])
    main_effect = property(_getmaineffect)

[docs]    def stratify(self, variable):
        """ Create a new variable, stratified by the levels of a Factor.

        Parameters
        ----------
        variable : str or simple sympy expression
            If sympy expression, then string representation must be all lower
            or upper case letters, i.e. it can be interpreted as a name.

        Returns
        -------
        formula : Formula
            Formula whose mean has one parameter named variable%d, for each
            level in self.levels

        Examples
        --------
        >>> f = Factor('a', ['x','y'])
        >>> sf = f.stratify('theta')
        >>> sf.mean
        _theta0*a_x + _theta1*a_y
        """
        if not set(str(variable)).issubset(ascii_letters + digits):
            raise ValueError('variable should be interpretable as a '
                             'name and not have anything but digits '
                             'and numbers')
        variable = sympy.sympify(variable)
        f = Formula(self._terms, char=variable)
        f.name = self.name
        return f

[docs]    @staticmethod
    def fromcol(col, name):
        """ Create a Factor from a column array.

        Parameters
        ----------
        col : ndarray
            an array with ndim==1
        name : str
            name of the Factor

        Returns
        -------
        factor : Factor

        Examples
        --------
        >>> data = np.array([(3,'a'),(4,'a'),(5,'b'),(3,'b')], np.dtype([('x', np.float), ('y', 'S1')]))
        >>> f1 = Factor.fromcol(data['y'], 'y')
        >>> f2 = Factor.fromcol(data['x'], 'x')
        >>> d = f1.design(data)
        >>> print(d.dtype.descr)
        [('y_a', '<f8'), ('y_b', '<f8')]
        >>> d = f2.design(data)
        >>> print(d.dtype.descr)
        [('x_3', '<f8'), ('x_4', '<f8'), ('x_5', '<f8')]
        """
        col = np.asarray(col)
        if col.ndim != 1 or (col.dtype.names and len(col.dtype.names) > 1):
            raise ValueError('expecting an array that can be thought '
                             'of as a column or field of a recarray')
        levels = np.unique(col)
        if not col.dtype.names and not name:
            name = 'factor'
        elif col.dtype.names:
            name = col.dtype.names[0]
        return Factor(name, levels)


[docs]def contrast_from_cols_or_rows(L, D, pseudo=None):
    """ Construct a contrast matrix from a design matrix D

    (possibly with its pseudo inverse already computed)
    and a matrix L that either specifies something in
    the column space of D or the row space of D.

    Parameters
    ----------
    L : ndarray
       Matrix used to try and construct a contrast.
    D : ndarray
       Design matrix used to create the contrast.
    pseudo : None or array-like, optional
       If not None, gives pseudo-inverse of `D`.  Allows you to pass
       this if it is already calculated.

    Returns
    -------
    C : ndarray
       Matrix with C.shape[1] == D.shape[1] representing an estimable
       contrast.

    Notes
    -----
    From an n x p design matrix D and a matrix L, tries to determine a p
    x q contrast matrix C which determines a contrast of full rank,
    i.e. the n x q matrix

    dot(transpose(C), pinv(D))

    is full rank.

    L must satisfy either L.shape[0] == n or L.shape[1] == p.

    If L.shape[0] == n, then L is thought of as representing
    columns in the column space of D.

    If L.shape[1] == p, then L is thought of as what is known
    as a contrast matrix. In this case, this function returns an estimable
    contrast corresponding to the dot(D, L.T)

    This always produces a meaningful contrast, not always
    with the intended properties because q is always non-zero unless
    L is identically 0. That is, it produces a contrast that spans
    the column space of L (after projection onto the column space of D).
    """
    L = np.asarray(L)
    D = np.asarray(D)
    n, p = D.shape
    if L.shape[0] != n and L.shape[1] != p:
        raise ValueError('shape of L and D mismatched')
    if pseudo is None:
        pseudo = pinv(D)
    if L.shape[0] == n:
        C = np.dot(pseudo, L).T
    else:
        C = np.dot(pseudo, np.dot(D, L.T)).T
    Lp = np.dot(D, C.T)
    if len(Lp.shape) == 1:
        Lp.shape = (n, 1)
    Lp_rank = matrix_rank(Lp)
    if Lp_rank != Lp.shape[1]:
        Lp = full_rank(Lp, Lp_rank)
        C = np.dot(pseudo, Lp).T
    return np.squeeze(C)


[docs]class RandomEffects(Formula):
    """ Covariance matrices for common random effects analyses.

    Examples
    --------
    Two subjects (here named 2 and 3):

    >>> subj = make_recarray([2,2,2,3,3], 's')
    >>> subj_factor = Factor('s', [2,3])

    By default the covariance matrix is symbolic.  The display differs a little
    between sympy versions (hence we don't check it in the doctests):

    >>> c = RandomEffects(subj_factor.terms)
    >>> c.cov(subj) #doctest: +IGNORE_OUTPUT
    array([[_s2_0, _s2_0, _s2_0, 0, 0],
           [_s2_0, _s2_0, _s2_0, 0, 0],
           [_s2_0, _s2_0, _s2_0, 0, 0],
           [0, 0, 0, _s2_1, _s2_1],
           [0, 0, 0, _s2_1, _s2_1]], dtype=object)

    With a numeric `sigma`, you get a numeric array:

    >>> c = RandomEffects(subj_factor.terms, sigma=np.array([[4,1],[1,6]]))
    >>> c.cov(subj)
    array([[ 4.,  4.,  4.,  1.,  1.],
           [ 4.,  4.,  4.,  1.,  1.],
           [ 4.,  4.,  4.,  1.,  1.],
           [ 1.,  1.,  1.,  6.,  6.],
           [ 1.,  1.,  1.,  6.,  6.]])
    """
[docs]    def __init__(self, seq, sigma=None, char = 'e'):
        """ Initialize random effects instance

        Parameters
        ----------
        seq : [``sympy.Basic``]
        sigma : ndarray
             Covariance of the random effects. Defaults
             to a diagonal with entries for each random
             effect.
        char : character for regression coefficient
        """

        self._terms = np.asarray(seq)
        q = self._terms.shape[0]

        self._counter = 0
        if sigma is None:
            self.sigma = np.diag([Dummy('s2_%d' % i) for i in range(q)])
        else:
            self.sigma = sigma
        if self.sigma.shape != (q,q):
            raise ValueError('incorrect shape for covariance '
                             'of random effects, '
                             'should have shape %r' % ((q,q)))
        self.char = char

[docs]    def cov(self, term, param=None):
        """
        Compute the covariance matrix for some given data.

        Parameters
        ----------
        term : np.recarray
             Recarray including fields corresponding to the Terms in
             getparams(self.design_expr).
        param : np.recarray
             Recarray including fields that are not Terms in
             getparams(self.design_expr)

        Returns
        -------
        C : ndarray
             Covariance matrix implied by design and self.sigma.
        """
        D = self.design(term, param=param, return_float=True)
        return np.dot(D, np.dot(self.sigma, D.T))


[docs]def is_term(obj):
    """ Is obj a Term?
    """
    return hasattr(obj, "_term_flag")


[docs]def is_factor_term(obj):
    """ Is obj a FactorTerm?
    """
    return hasattr(obj, "_factor_term_flag")


[docs]def is_formula(obj):
    """ Is obj a Formula?
    """
    return hasattr(obj, "_formula_flag")


[docs]def is_factor(obj):
    """ Is obj a Factor?
    """
    return hasattr(obj, "_factor_flag")