Source code for nlpstats.correlations.williams

import numpy as np
import numpy.typing as npt
import scipy.stats
from typing import Callable, NamedTuple, Union

from nlpstats.correlations.correlations import correlate


class WilliamsResult(NamedTuple):
    """The result of Williams' test, as returned by :func:`williams_test`."""

    #: The p-value of the test
    pvalue: float
def williams_test(
    X: npt.ArrayLike,
    Y: npt.ArrayLike,
    Z: npt.ArrayLike,
    level: str,
    coefficient: Union[Callable, str],
    alternative: str = "two-sided",
) -> WilliamsResult:
    """Calculates a hypothesis test between two correlations using Williams' test.

    See `Graham & Baldwin (2014) <https://aclanthology.org/D14-1020.pdf>`_ for
    details on Williams' test.

    The rows of :code:`X`, :code:`Y`, and :code:`Z` must correspond to each
    other, and the columns of :code:`X` and :code:`Y` must too. For input- and
    global-level correlations, the columns of :code:`X` and :code:`Y` must also
    be paired with those of :code:`Z`.

    If a value from the matrices is missing, it should be replaced with
    :code:`np.nan`. The :code:`np.nan` locations must always be identical for
    :code:`X` and :code:`Y`. The same is true for :code:`Z` for input- and
    global-level correlations.

    Parameters
    ----------
    X : npt.ArrayLike
        A two-dimensional score matrix in which :code:`X[i][j]` contains the
        :code:`X` score for the :code:`i` th system on the :code:`j` th input.
    Y : npt.ArrayLike
        A two-dimensional score matrix in which :code:`Y[i][j]` contains the
        :code:`Y` score for the :code:`i` th system on the :code:`j` th input.
    Z : npt.ArrayLike
        A two-dimensional score matrix in which :code:`Z[i][j]` contains the
        :code:`Z` score for the :code:`i` th system on the :code:`j` th input.
    level : str
        The level of correlation, either :code:`"system"`, :code:`"input"`, or
        :code:`"global"`.
    coefficient : Union[Callable, str]
        The correlation coefficient to use, either :code:`"pearson"`,
        :code:`"spearman"`, :code:`"kendall"`. The value is passed unchanged
        to :code:`correlate`.
    alternative : str
        The alternative hypothesis. :code:`"two-sided"` corresponds to an
        alternative hypothesis that :math:`r(X, Z) \\neq r(Y, Z)`,
        :code:`"greater"` corresponds to :math:`r(X, Z) > r(Y, Z)` and
        :code:`"less"` corresponds to :math:`r(X, Z) < r(Y, Z)`.

    Returns
    -------
    WilliamsResult

    Raises
    ------
    ValueError
        If :code:`level` or :code:`alternative` is not one of the supported
        values.
    """
    # In the math, Z is metric 1. We take the absolute value of the
    # correlations because it does not matter whether they are positively or
    # negatively correlated with each other. The WMT scripts do the same
    # before calling r.test
    r12 = abs(correlate(X, Z, level, coefficient))
    r13 = abs(correlate(Y, Z, level, coefficient))
    r23 = abs(correlate(X, Y, level, coefficient))

    # X is only annotated as array-like, so coerce it before relying on
    # ndarray attributes such as ``.shape`` to determine the sample size.
    X_arr = np.asarray(X)

    if level == "system":
        # The number of systems
        n = X_arr.shape[0]
    elif level == "input":
        # Assume n is the summary-correlation with the largest n.
        # We find that by counting how many non-nans are in each column,
        # then taking the max
        n = (~np.isnan(X_arr)).sum(axis=0).max()
    elif level == "global":
        # The number of non-NaN entries
        n = (~np.isnan(X_arr)).sum()
    else:
        # ValueError (a subclass of Exception) matches the handling of an
        # unknown ``alternative`` below.
        raise ValueError(f"Unknown correlation level: {level}")

    # Implementation based on https://github.com/cran/psych/blob/master/R/r.test.R
    # NOTE: the statistic divides by (n - 3) and uses n - 3 degrees of
    # freedom, so a meaningful result requires n > 3.
    diff = r12 - r13
    det = 1 - (r12**2) - (r23**2) - (r13**2) + (2 * r12 * r23 * r13)
    av = (r12 + r13) / 2
    cube = (1 - r23) ** 3
    t2 = diff * np.sqrt(
        (n - 1) * (1 + r23) / ((2 * (n - 1) / (n - 3)) * det + av**2 * cube)
    )

    # r.test implicitly assumes that r12 > r13 because it takes the absolute
    # value of the t statistic. Since we don't, we have to have special
    # handling for one-tailed tests so we don't map a negative t statistic to
    # a positive one.
    if alternative == "two-sided":
        pvalue = scipy.stats.t.sf(abs(t2), n - 3) * 2
    elif alternative == "greater":
        pvalue = scipy.stats.t.sf(t2, n - 3)
    elif alternative == "less":
        pvalue = scipy.stats.t.sf(-1 * t2, n - 3)
    else:
        raise ValueError(f"Unknown alternative: {alternative}")

    return WilliamsResult(pvalue)