Source code for nlpstats.correlations.fisher

import numpy as np
import numpy.typing as npt
import scipy.stats
from typing import NamedTuple

from nlpstats.correlations.correlations import correlate


class FisherResult(NamedTuple):
    """The lower and upper bounds of a confidence interval returned by :func:`fisher`."""

    lower: float
    """The lower-bound"""

    upper: float
    """The upper-bound"""


def fisher(
    X: npt.ArrayLike,
    Z: npt.ArrayLike,
    level: str,
    coefficient: str,
    confidence_level: float = 0.95,
) -> FisherResult:
    """Calculates a confidence interval for a correlation via the Fisher transformation.

    The Fisher function is a parametric method for calculating the confidence interval
    for a correlation (see `Bonett & Wright (2000) <https://link.springer.com/content/pdf/10.1007/BF02294183.pdf>`_).

    The rows of :code:`X` and :code:`Z` should always correspond to each other.
    That is, :code:`X[i]` and :code:`Z[i]` contain the scores for the outputs from
    system :code:`i`. For input- and global-level correlations, the columns
    should also correspond to each other and thus :code:`X` and :code:`Z` must be
    the same shape; there is no such requirement for system-level correlations.

    If a score is missing for a specific output, that value should be equal to
    :code:`np.nan`. For input- and global-level correlations, :code:`X` and
    :code:`Z` must have :code:`np.nan` values in the same locations.

    Parameters
    ----------
    X : npt.ArrayLike
        A two-dimensional score matrix
    Z : npt.ArrayLike
        A two-dimensional score matrix
    level : str
        The level of correlation, either :code:`"system"`, :code:`"input"`, or :code:`"global"`.
    coefficient : str
        The correlation coefficient to use, either :code:`"pearson"`, :code:`"spearman"`,
        or :code:`"kendall"`.
    confidence_level : float
        The confidence level of the correlation interval, between 0 and 1 (exclusive).

    Returns
    -------
    FisherResult
        The lower and upper bounds of the confidence interval. Both bounds are
        :code:`np.nan` when the sample size is too small for the chosen
        coefficient (3 or fewer for Pearson and Spearman, 4 or fewer for Kendall).

    Raises
    ------
    ValueError
        If :code:`confidence_level` is not strictly between 0 and 1, or if
        :code:`coefficient` or :code:`level` is not one of the supported values.
        The correlation itself is computed first, so that call may reject an
        unsupported argument before these checks are reached.
    """
    _fisher_iv(confidence_level)

    r = correlate(X, Z, level, coefficient)

    # See Bonett and Wright (2000) for details. `b` is subtracted from the
    # sample size and `c` scales the interval half-width below.
    if coefficient == "pearson":
        b, c = 3, 1
    elif coefficient == "spearman":
        b, c = 3, np.sqrt(1 + r**2 / 2)
    elif coefficient == "kendall":
        b, c = 4, np.sqrt(0.437)
    else:
        raise ValueError(f"Unknown correlation coefficient: {coefficient}")

    # `X` is only promised to be array-like, so coerce it before using
    # ndarray attributes such as `.shape`.
    scores = np.asarray(X)
    if level == "system":
        # The number of systems
        n = scores.shape[0]
    elif level == "input":
        # Assume n is the summary-correlation with the largest n.
        # We find that by counting how many non-nans are in each column,
        # then taking the max
        n = (~np.isnan(scores)).sum(axis=0).max()
    elif level == "global":
        # The number of non-NaN entries
        n = (~np.isnan(scores)).sum()
    else:
        raise ValueError(f"Unknown correlation level: {level}")

    # With n <= b the term sqrt(n - b) is zero or undefined, so no interval
    # can be formed.
    if n <= b:
        return FisherResult(np.nan, np.nan)

    alpha = 1 - confidence_level
    z_r = np.arctanh(r)
    z = scipy.stats.norm.ppf(1.0 - alpha / 2)
    half_width = z * c / np.sqrt(n - b)
    # Build the interval in the transformed space, then map it back with tanh.
    r_l = np.tanh(z_r - half_width)
    r_u = np.tanh(z_r + half_width)
    return FisherResult(r_l, r_u)


def _fisher_iv(confidence_level: float) -> None:
    if confidence_level <= 0 or confidence_level >= 1:
        raise ValueError(f"`confidence_level` must be between 0 and 1 (exclusive)")