# Source code for rootpy.plotting.contrib.plot_corrcoef_matrix

from __future__ import absolute_import
from ...extern.six.moves import range
from ...extern.six import string_types


# Names exported by a star-import of this module.
__all__ = [
    'plot_corrcoef_matrix',
    'corrcoef',
    'cov',
]


def plot_corrcoef_matrix(matrix, names=None,
                         cmap=None, cmap_text=None,
                         fontsize=12, grid=False,
                         axes=None):
    """
    Draw a lower-triangular correlation matrix.

    Only the cells strictly below the diagonal are colored and labelled
    with their value as an integer percentage; the diagonal and the upper
    triangle are masked out (drawn white).

    Parameters
    ----------

    matrix : 2-dimensional numpy array/matrix
        A square correlation coefficient matrix. The input is copied and
        is not modified.

    names : list of strings, optional (default=None)
        List of the parameter names corresponding to the rows in ``matrix``.

    cmap : matplotlib color map or str, optional (default=None)
        Color map used to color the matrix cells. If None, 'jet' is used.
        A copy is made before the "bad" (NaN) color is set to white, so the
        color map passed in is left untouched.

    cmap_text : matplotlib color map or str, optional (default=None)
        Color map used to color the cell value text. If None, then
        all values will be black.

    fontsize : int, optional (default=12)
        Font size of parameter name and correlation value text.

    grid : bool, optional (default=False)
        If True, then draw dotted grid lines around the matrix elements.

    axes : matplotlib Axes instance, optional (default=None)
        The axes to plot on. If None then use the global current axes.

    Raises
    ------

    ValueError
        If ``matrix`` is not 2-dimensional, is not square, or if the length
        of ``names`` does not match the number of rows.

    Notes
    -----

    NumPy and matplotlib are required

    Examples
    --------

    >>> matrix = corrcoef(data.T, weights=weights)
    >>> plot_corrcoef_matrix(matrix, names)

    """
    import copy

    import numpy as np

    # Work on a float copy: the upper triangle is overwritten with NaN below,
    # which must neither leak into the caller's array nor fail on int input.
    matrix = np.array(matrix, dtype=float)

    if matrix.ndim != 2:
        raise ValueError("matrix is not a 2-dimensional array or matrix")
    if matrix.shape[0] != matrix.shape[1]:
        raise ValueError("matrix is not square")
    if names is not None and len(names) != matrix.shape[0]:
        raise ValueError("the number of names does not match the number of "
                         "rows/columns in the matrix")

    # Import matplotlib and grab the current axes only after validation so
    # that invalid input does not create a figure as a side effect.
    from matplotlib import pyplot as plt

    if axes is None:
        axes = plt.gca()

    size = matrix.shape[0]

    # mask out the upper triangular matrix (including the diagonal)
    matrix[np.triu_indices(size)] = np.nan

    # pyplot.get_cmap is used instead of cm.get_cmap, which has been removed
    # from recent matplotlib releases.
    if isinstance(cmap_text, string_types):
        cmap_text = plt.get_cmap(cmap_text, 201)
    if cmap is None:
        cmap = plt.get_cmap('jet', 201)
    elif isinstance(cmap, string_types):
        cmap = plt.get_cmap(cmap, 201)
    # make NaN pixels white, on a copy so a shared colormap is not altered
    cmap = copy.copy(cmap)
    cmap.set_bad('w')

    axes.imshow(matrix, interpolation='nearest',
                cmap=cmap, origin='upper',
                vmin=-1, vmax=1)

    axes.set_frame_on(False)
    plt.setp(axes.get_yticklabels(), visible=False)
    plt.setp(axes.get_yticklines(), visible=False)
    plt.setp(axes.get_xticklabels(), visible=False)
    plt.setp(axes.get_xticklines(), visible=False)

    if grid:
        # draw grid lines
        for slot in range(1, size - 1):
            # vertical
            axes.plot((slot - 0.5, slot - 0.5),
                      (slot - 0.5, size - 0.5), 'k:', linewidth=1)
            # horizontal
            axes.plot((-0.5, slot + 0.5),
                      (slot + 0.5, slot + 0.5), 'k:', linewidth=1)
        if names is not None:
            for slot in range(1, size):
                # diagonal
                axes.plot((slot - 0.5, slot + 1.5),
                          (slot - 0.5, slot - 2.5), 'k:', linewidth=1)

    # label cell values
    for row, col in zip(*np.tril_indices(size, k=-1)):
        value = matrix[row, col]
        if not np.isfinite(value):
            # an undefined coefficient cannot be formatted as a percentage;
            # leave the (white) cell unlabelled
            continue
        if cmap_text is not None:
            color = cmap_text((value + 1.) / 2.)
        else:
            color = 'black'
        axes.text(
            col, row,
            # round instead of truncating: 0.29 * 100 is 28.999... in
            # floating point and must be shown as 29%
            "{0:d}%".format(int(round(value * 100))),
            color=color,
            ha='center', va='center',
            fontsize=fontsize)

    if names is not None:
        # write parameter names along the diagonal
        for i, name in enumerate(names):
            axes.annotate(
                name, (i, i),
                rotation=45,
                ha='left', va='bottom',
                transform=axes.transData,
                fontsize=fontsize)


def cov(m, y=None, rowvar=1, bias=0, ddof=None, weights=None, repeat_weights=0):
    """
    Estimate a covariance matrix, given data and optional weights.

    Covariance indicates the level to which two variables vary together.
    If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`,
    then the covariance matrix element :math:`C_{ij}` is the covariance of
    :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance
    of :math:`x_i`.

    Parameters
    ----------
    m : array_like
        A 1-D or 2-D array containing multiple variables and observations.
        Each row of `m` represents a variable, and each column a single
        observation of all those variables. Also see `rowvar` below.
    y : array_like, optional
        An additional set of variables and observations. `y` has the same
        form as that of `m`.
    rowvar : int, optional
        If `rowvar` is non-zero (default), then each row represents a
        variable, with observations in the columns. Otherwise, the relationship
        is transposed: each column represents a variable, while the rows
        contain observations. If `m` has a single row it is always treated
        as one variable.
    bias : int, optional
        Default normalization is by ``(N - 1)``, where ``N`` is the number of
        observations given (unbiased estimate). If `bias` is non-zero, then
        normalization is by ``N``. These values can be overridden by using
        the keyword ``ddof``.
    ddof : int, optional
        If not ``None`` normalization is by ``(N - ddof)``, where ``N`` is
        the number of observations; this overrides the value implied by
        ``bias``. The default value is ``None``.
    weights : array-like, optional
        A 1-D array of weights with a length equal to the number of
        observations.
    repeat_weights : int, optional
        Selects how `weights` are interpreted. If zero (default), the
        weights are normalized to unit sum and the weighted sum of products
        is divided by ``1 - sum(w_i**2)``; ``bias`` and ``ddof`` are not
        used in this mode. If non-zero, each weight is taken as the number
        of occurrences of its observation and the normalization is
        ``sum(weights) - ddof``.

    Returns
    -------
    out : ndarray
        The covariance matrix of the variables, with length-one dimensions
        squeezed out (a single variable yields a 0-dimensional array).

    Raises
    ------
    ValueError
        If `ddof` is not integral or if the sum of `weights` is not
        positive.

    See Also
    --------
    corrcoef : Normalized covariance matrix

    Examples
    --------
    Consider two variables, :math:`x_0` and :math:`x_1`, which
    correlate perfectly, but in opposite directions:

    >>> x = np.array([[0, 2], [1, 1], [2, 0]]).T
    >>> x
    array([[0, 1, 2],
           [2, 1, 0]])

    Note how :math:`x_0` increases while :math:`x_1` decreases. The covariance
    matrix shows this clearly:

    >>> cov(x)
    array([[ 1., -1.],
           [-1.,  1.]])

    Note that element :math:`C_{0,1}`, which shows the correlation between
    :math:`x_0` and :math:`x_1`, is negative.

    Further, note how `x` and `y` are combined:

    >>> x = [-2.1, -1,  4.3]
    >>> y = [3,  1.1,  0.12]
    >>> X = np.vstack((x,y))
    >>> print(cov(X))
    [[ 11.71        -4.286     ]
     [ -4.286        2.14413333]]
    >>> print(cov(x, y))
    [[ 11.71        -4.286     ]
     [ -4.286        2.14413333]]
    >>> print(cov(x))
    11.71

    """
    import numpy as np
    # Check inputs
    if ddof is not None and ddof != int(ddof):
        raise ValueError(
            "ddof must be integer")

    # np.array copies, so the in-place centering below never touches `m`
    X = np.array(m, ndmin=2, dtype=float)
    if X.size == 0:
        # handle empty arrays
        return np.array(m)
    if X.shape[0] == 1:
        rowvar = 1
    if rowvar:
        axis = 0
        tup = (slice(None), np.newaxis)
    else:
        axis = 1
        tup = (np.newaxis, slice(None))

    if y is not None:
        # no ``copy=False`` here: NumPy 2 raises when a copy cannot be
        # avoided (e.g. list input), and `y` is only read afterwards
        y = np.array(y, ndmin=2, dtype=float)
        X = np.concatenate((X, y), axis)

    if ddof is None:
        if bias == 0:
            ddof = 1
        else:
            ddof = 0

    if weights is not None:
        # private float copy; normalized in place below
        weights = np.array(weights, dtype=float)
        weights_sum = weights.sum()
        if weights_sum <= 0:
            raise ValueError(
                "sum of weights is non-positive")
        # subtract the weighted mean of each variable
        X -= np.average(X, axis=1-axis, weights=weights)[tup]

        if repeat_weights:
            # each weight represents a number of repetitions of an observation
            # the total sample size can be determined in this case and we have
            # both an unbiased and biased weighted covariance
            fact = weights_sum - ddof
        else:
            # normalize weights so they sum to unity
            weights /= weights_sum
            # for equal weights this reduces to the (N - 1) / N correction,
            # i.e. the result matches the unweighted default
            fact = (1. - np.power(weights, 2).sum())
    else:
        weights = 1
        X -= X.mean(axis=1-axis)[tup]
        if rowvar:
            N = X.shape[1]
        else:
            N = X.shape[0]
        fact = float(N - ddof)

    if not rowvar:
        return (np.dot(weights * X.T, X.conj()) / fact).squeeze()
    else:
        return (np.dot(weights * X, X.T.conj()) / fact).squeeze()


def corrcoef(x, y=None, rowvar=1, bias=0, ddof=None, weights=None,
             repeat_weights=0):
    """
    Return (optionally weighted) correlation coefficients.

    All arguments are forwarded unchanged to `cov`; please refer to its
    documentation for more detail. The relationship between the
    correlation coefficient matrix, `P`, and the covariance matrix, `C`, is

    .. math:: P_{ij} = \\frac{ C_{ij} } { \\sqrt{ C_{ii} * C_{jj} } }

    Parameters
    ----------
    x : array_like
        A 1-D or 2-D array containing multiple variables and observations.
        Each row of `x` represents a variable, and each column a single
        observation of all those variables. Also see `rowvar` below.
    y : array_like, optional
        An additional set of variables and observations. `y` has the same
        shape as `x`.
    rowvar : int, optional
        If `rowvar` is non-zero (default), then each row represents a
        variable, with observations in the columns. Otherwise, the relationship
        is transposed: each column represents a variable, while the rows
        contain observations.
    bias : int, optional
        Passed through to `cov`, where it selects normalization by ``N``
        (non-zero) or ``(N - 1)`` (zero, the default).
    ddof : {None, int}, optional
        Passed through to `cov`. If not ``None`` it overrides the
        normalization implied by ``bias``.
    weights : array-like, optional
        A 1-D array of weights with a length equal to the number of
        observations.
    repeat_weights : int, optional
        Passed through to `cov`. If non-zero, the weights are taken to be
        integer numbers of occurrences of each observation; otherwise they
        are normalized to unit sum.

    Returns
    -------
    out : ndarray
        The correlation coefficient matrix of the variables. An empty
        covariance is returned as is, and the integer ``1`` is returned
        when the covariance is a scalar (a single variable).

    See Also
    --------
    cov : Covariance matrix

    """
    import numpy as np

    covariance = cov(x, y, rowvar, bias, ddof, weights, repeat_weights)

    # nothing to normalize for empty input
    if covariance.size == 0:
        return covariance

    try:
        variances = np.diag(covariance)
    except ValueError:
        # np.diag rejects a scalar covariance: a lone variable correlates
        # perfectly with itself
        return 1

    # element (i, j) of the denominator is sqrt(C_ii * C_jj)
    normalization = np.sqrt(np.multiply.outer(variances, variances))
    return covariance / normalization