# Source code for nlpstats.correlations.williams

import numpy as np
import numpy.typing as npt
import scipy.stats
from typing import Callable, NamedTuple, Union

from nlpstats.correlations.correlations import correlate


class WilliamsResult(NamedTuple):
    """Result of Williams' test, as returned by :func:`williams_test`."""

    pvalue: float
    """The p-value of the test"""


def williams_test(
    X: npt.ArrayLike,
    Y: npt.ArrayLike,
    Z: npt.ArrayLike,
    level: str,
    coefficient: Union[Callable, str],
    alternative: str = "two-sided",
) -> WilliamsResult:
    """Calculates a hypothesis test between two correlations using Williams' test.

    See `Graham & Baldwin (2014) <https://aclanthology.org/D14-1020.pdf>`_
    for details on Williams' test.

    The rows of :code:`X`, :code:`Y`, and :code:`Z` must correspond
    to each other, and the columns of :code:`X` and :code:`Y` must too.
    For input- and global-level correlations, the columns of :code:`X`
    and :code:`Y` must also be paired with those of :code:`Z`.

    If a value from the matrices is missing, it should be replaced with
    :code:`np.nan`. The :code:`np.nan` locations must always be identical for
    :code:`X` and :code:`Y`. The same is true for :code:`Z` for input- and
    global-level correlations.

    Parameters
    ----------
    X : npt.ArrayLike
        A two-dimensional score matrix in which :code:`X[i][j]` contains the
        :code:`X` score for the :code:`i` th system on the :code:`j` th input.
    Y : npt.ArrayLike
        A two-dimensional score matrix in which :code:`Y[i][j]` contains the
        :code:`Y` score for the :code:`i` th system on the :code:`j` th input.
    Z : npt.ArrayLike
        A two-dimensional score matrix in which :code:`Z[i][j]` contains the
        :code:`Z` score for the :code:`i` th system on the :code:`j` th input.
    level : str
        The level of correlation, either :code:`"system"`, :code:`"input"`, or :code:`"global"`.
    coefficient : Union[Callable, str]
        The correlation coefficient to use, either :code:`"pearson"`, :code:`"spearman"`,
        or :code:`"kendall"`. The value is passed through unchanged to
        :code:`correlate`.
    alternative : str
        The alternative hypothesis. :code:`"two-sided"` corresponds to an alternative
        hypothesis that :math:`r(X, Z) \\neq r(Y, Z)`, :code:`"greater"` corresponds
        to :math:`r(X, Z) > r(Y, Z)` and :code:`"less"` corresponds to
        :math:`r(X, Z) < r(Y, Z)`.

    Returns
    -------
    WilliamsResult
        A named tuple whose :code:`pvalue` field holds the p-value of the test.

    Raises
    ------
    ValueError
        If :code:`level` or :code:`alternative` is not one of the values
        listed above.

    Notes
    -----
    The sample size :code:`n` is derived from :code:`X` only. The statistic
    divides by :code:`n - 3` and uses :code:`n - 3` degrees of freedom, so
    the computation is only well defined for :code:`n > 3`.
    """

    # In the math, Z is metric 1. We take the absolute value of the correlations because
    # it does not matter whether they are positively or negatively correlated with each other. The WMT scripts
    # do the same before calling r.test
    r12 = abs(correlate(X, Z, level, coefficient))
    r13 = abs(correlate(Y, Z, level, coefficient))
    r23 = abs(correlate(X, Y, level, coefficient))

    # X is only annotated as array-like (e.g. a nested list), so coerce it to
    # an ndarray before relying on ``.shape`` or axis-wise reductions.
    X_arr = np.asarray(X)

    if level == "system":
        # The number of systems
        n = X_arr.shape[0]
    elif level == "input":
        # Assume n is the summary-correlation with the largest n.
        # We find that by counting how many non-nans are in each column,
        # then taking the max
        n = (~np.isnan(X_arr)).sum(axis=0).max()
    elif level == "global":
        # The number of non-NaN entries
        n = (~np.isnan(X_arr)).sum()
    else:
        # ValueError (a subclass of Exception) matches the error raised for
        # an unknown ``alternative`` below.
        raise ValueError(f"Unknown correlation level: {level}")

    # Implementation based on https://github.com/cran/psych/blob/master/R/r.test.R
    diff = r12 - r13
    det = 1 - (r12**2) - (r23**2) - (r13**2) + (2 * r12 * r23 * r13)
    av = (r12 + r13) / 2
    cube = (1 - r23) ** 3
    t2 = diff * np.sqrt(
        (n - 1) * (1 + r23) / ((2 * (n - 1) / (n - 3)) * det + av**2 * cube)
    )

    # r.test implicitly assumes that r12 > r13 because it takes the absolute value of the
    # t statistic. Since we don't, we have to have special handling for one-tailed tests
    # so we don't map a negative t statistic to a positive one.
    if alternative == "two-sided":
        pvalue = scipy.stats.t.sf(abs(t2), n - 3) * 2
    elif alternative == "greater":
        pvalue = scipy.stats.t.sf(t2, n - 3)
    elif alternative == "less":
        pvalue = scipy.stats.t.sf(-1 * t2, n - 3)
    else:
        raise ValueError(f"Unknown alternative: {alternative}")

    return WilliamsResult(pvalue)