Source code for dynsight._internal.analysis.entropy

from __future__ import annotations

from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    from numpy.typing import NDArray

import infomeasure as im
import numpy as np
import numpy.typing as npt
from scipy.spatial import cKDTree
from scipy.spatial.distance import cdist
from scipy.special import digamma, gamma



[docs]
def shannon(
    data: NDArray[np.float64],
    method: Literal["histo", "kl"],
    base: float = 2.0,
    n_neigh: int = 4,
) -> float:
    """Compute the Shannon entropy of a data distribution.

    Parameters:
        data:
            The data for which the entropy is to be computed. Has shape
            (n_samples, n_features).

        method:
            How the Shannon entropy is computed. You should use "histo" for
            discrete variables, and "kl" for continuous variables. If "histo"
            is chosen, the "n_neigh" arg is irrelevant. See the documentation
            of the infomeasure package for more details (link in the notes
            below).

        base:
            The units of measure of the returned value. Use "2" for bits,
            "np.e" for nats.

        n_neigh:
            The number of neighbors considered in the KL estimator. The
            default value n_neigh = 4 is recommended in the literature.

    Returns:
        The value of the Shannon entropy of the data.

    Notes:
        This function uses the ``infomeasure.entropy()`` function, see
        https://infomeasure.readthedocs.io/en/latest/guide/entropy/.

    Example:

        .. testcode:: shannon-test

            import numpy as np
            from dynsight.analysis import shannon
            rng = np.random.default_rng(seed=42)

            ### Discrete case: fair coin. H = 1 bit. ###
            int_data = rng.integers(low=0, high=2, size=100000)
            h_int = shannon(data=int_data, method="histo")

            ### Bivariate case: 2 fair coins. H = 2 bit. ###
            int_data = rng.integers(low=0, high=2, size=(100000, 2))
            h_int_2 = shannon(data=int_data, method="histo")

            ### Continuous case: uniform distribution in [0, 10]. ###
            float_data = rng.random(200000) * 10
            h_float = shannon(data=float_data, method="kl")

        .. testcode:: shannon-test
            :hide:

            assert np.isclose(h_int, 1.0, atol=1e-3)
            assert np.isclose(h_int_2, 2.0, atol=1e-3)
            assert np.isclose(h_float, np.log2(10), atol=5e-3)
    """
    if method not in ("histo", "kl"):
        msg = "method must be 'histo' or 'kl'."
        raise ValueError(msg)
    if method == "histo":
        return im.entropy(data, approach="discrete", base=base)
    # If instead method == "kl":
    min_samples = 2
    if data.shape[0] <= min_samples:
        return 0.0  # can't compute KL entropy with only 2 samples
    return im.entropy(data, approach="metric", k=n_neigh, base=base)




[docs]
def compute_negentropy(
    data: NDArray[np.float64],
    units: Literal["bit", "nat"] = "bit",
) -> float:
    """Estimate negentropy of a dataset.

    Negentropy is a measure of non-Gaussianity representing the distance
    from a Gaussian distribution; it's used to quantify the amount of
    information in a signal, the Gaussian being the less informative
    distribution for a given variance.

    .. math::

        Neg(X) = H(X_{Gauss}) - H(X)

    Parameters:
        data:
            The dataset for which the entropy is to be computed.

        units:
            The units of measure of the output negentropy. If "bit", it is
            computed with log base 2, if "nat" with natural log.

    Returns:
        The negentropy of the dataset.


    Example:

        .. testcode:: negentropy-test

            import numpy as np
            from dynsight.analysis import compute_negentropy

            np.random.seed(1234)
            data = np.random.rand(10000)

            negentropy = compute_negentropy(data)

        .. testcode:: negentropy-test
            :hide:

            assert np.isclose(negentropy, 0.268, rtol=5e-3)

    """
    if units not in ("bit", "nat"):
        msg = "units must be bit or nat."
        raise ValueError(msg)
    base = 2 if units == "bit" else np.e
    data = data.flatten()
    rng = np.random.default_rng(seed=1234)
    data_norm = (data - np.mean(data)) / np.std(data, ddof=1)
    sigma = np.std(data_norm, ddof=1)
    data_gauss = rng.normal(loc=0.0, scale=sigma, size=data.size)
    h_gauss = shannon(data_gauss, method="kl", base=base)
    h_data = shannon(data_norm, method="kl", base=base)
    return h_gauss - h_data




[docs]
def info_gain(
    data: npt.NDArray[np.float64],
    labels: npt.NDArray[np.int64],
    method: Literal["histo", "kl"],
    base: float = 2.0,
    n_neigh: int = 4,
) -> tuple[float, float, float, float]:
    """Compute the information gained by the clustering.

    Parameters:
        data:
            The dataset over which the clustering is performed. Has shape
            (n_samples, n_features).

        labels:
            The clustering labels. Has shape (n_samples,).

        method:
            How the Shannon entropy is computed. You should use "histo" for
            discrete variables, and "kl" for continuous variables. If "histo"
            is chosen, the "n_neigh" arg is irrelevant. See the documentation
            of the infomeasure package for more details (link in the notes
            below).

        base:
            The units of measure of the returned value. Use "2" for bits,
            "np.e" for nats.

        n_neigh:
            The number of neighbors considered in the KL estimator. The
            default value n_neigh = 4 is recommended in the literature.

    Returns:
        * The absolute information gain :math:`H_0 - H_{clust}`
        * The relative information gain :math:`(H_0 - H_{clust}) / H_0`
        * The Shannon entropy of the initial data :math:`H_0`
        * The shannon entropy of the clustered data :math:`H_{clust}`

    Notes:
        This function uses the ``infomeasure.entropy()`` function, see
        https://infomeasure.readthedocs.io/en/latest/guide/entropy/.

    Example:

        .. testcode:: info-gain-test

            import numpy as np
            from dynsight.analysis import info_gain
            rng = np.random.default_rng(seed=42)

            ### Descrete case ###
            int_data = rng.integers(low=0, high=4, size=100000)
            labels = (int_data < 2).astype(int)
            delta_i, *_ = info_gain(
                data=int_data,
                labels=labels,
                method="histo",
            )

        .. testcode:: info-gain-test
            :hide:

            assert np.isclose(delta_i, 1.0)

        .. testcode:: info-gain-test

            ### Continuous case ###
            float_data = rng.random(200000)
            labels = (float_data < 0.5).astype(int)
            delta_i, *_ = info_gain(
                data=float_data,
                labels=labels,
                method="kl",
            )

        .. testcode:: info-gain-test
            :hide:

            assert np.isclose(delta_i, 1.0)
    """
    if data.shape[0] != labels.shape[0]:
        msg = (
            f"data ({data.shape}) and labels ({labels.shape}) "
            "must have same shape[0]"
        )
        raise RuntimeError(msg)
    if method not in ("histo", "kl"):
        msg = "method must be 'histo' or 'kl'."
        raise ValueError(msg)

    n_clusters = np.unique(labels).size
    frac, entr = np.zeros(n_clusters), np.zeros(n_clusters)

    if method == "histo":
        # Compute the total entropy of the data
        total_entropy = shannon(data, method="histo", base=base)

        # Compute the fraction and the entropy of the single clusters
        for i, label in enumerate(np.unique(labels)):
            mask = labels == label
            frac[i] = np.sum(mask) / labels.size
            entr[i] = shannon(data[mask], method="histo", base=base)
    else:  # method == "kl"
        # Compute the total entropy of the data
        total_entropy = shannon(data, method="kl", base=base, n_neigh=n_neigh)

        # Compute the fraction and the entropy of the single clusters
        for i, label in enumerate(np.unique(labels)):
            mask = labels == label
            frac[i] = np.sum(mask) / labels.size
            entr[i] = shannon(
                data[mask],
                method="kl",
                base=base,
                n_neigh=n_neigh,
            )

    # Compute the entropy of the clustered data
    clustered_entropy = np.dot(frac, entr)
    info_gain = total_entropy - clustered_entropy

    return (
        info_gain,
        info_gain / total_entropy,
        total_entropy,
        clustered_entropy,
    )




[docs]
def compute_shannon(
    data: NDArray[np.float64],
    data_range: tuple[float, float],
    n_bins: int,
    units: Literal["bit", "nat", "frac"] = "frac",
) -> float:
    """Compute the Shannon entropy of a univariate data distribution.

    It is normalized so that a uniform distribution has unitary entropy.

    .. deprecated:: v2025.08.27
        This function is deprecated and will be removed after June 2026.
        Use ``analysis.shannon()`` instead.

    Parameters:
        data:
            The dataset for which the entropy is to be computed.

        data_range:
            A tuple (min, max) specifying the range over which the data
            histogram must be computed.

        n_bins:
            The number of bins with which the data histogram must be computed.

        units:
            The units of measure of the output entropy. If "frac", entropy is
            normalized between 0 and 1 by dividing by log(n_bins). If "bit",
            it is computed with log base 2, if "nat" with natural log.

    Returns:
        The value of the normalized Shannon entropy of the dataset.

    Example:

        .. testcode:: shannon1-test

            import numpy as np
            from dynsight.analysis import compute_shannon

            np.random.seed(1234)
            data = np.random.rand(100, 100)
            data_range = (float(np.min(data)), float(np.max(data)))

            data_entropy = compute_shannon(
                data,
                data_range,
                n_bins=40,
            )

        .. testcode:: shannon1-test
            :hide:

            assert np.isclose(data_entropy, 0.9993954419344714)

    """
    if data.size == 0:
        msg = "data is empty"
        raise ValueError(msg)
    if units not in ("bit", "nat", "frac"):
        msg = "units must be bit, nat or frac."
        raise ValueError(msg)
    counts, _ = np.histogram(
        data,
        bins=n_bins,
        range=data_range,
    )
    probs = counts / np.sum(counts)  # Data probabilities are needed
    entropy = -np.sum([p * np.log2(p) for p in probs if p > 0.0])

    if units == "bit":
        return entropy
    if units == "nat":
        return entropy * np.log(2)
    return entropy / np.log2(n_bins)




[docs]
def compute_kl_entropy(
    data: NDArray[np.float64],
    n_neigh: int = 1,
    units: Literal["bit", "nat"] = "bit",
) -> float:
    """Estimate Shannon differential entropy using Kozachenko-Leonenko.

    The Kozachenko-Leonenko k-nearest neighbors method approximates
    differential entropy based on distances to nearest neighbors
    in the sample space. It's main advantage is being parameter-free.

    .. deprecated:: v2025.08.27
        This function is deprecated and will be removed after June 2026.
        Use ``analysis.shannon()`` instead.

    Parameters:
        data:
            The dataset for which the entropy is to be computed.
            Shape (n_data,)

        n_neigh:
            The number of neighbors considered in the KL estimator.

        units:
            The units of measure of the output entropy. If "bit", it is
            computed with log base 2, if "nat" with natural log.

    Returns:
        The Shannon differential entropy of the dataset, in bits.

    Example:

        .. testcode:: kl-entropy-test

            import numpy as np
            from dynsight.analysis import compute_kl_entropy

            np.random.seed(1234)
            data = np.random.rand(10000)

            data_entropy = compute_kl_entropy(data)

        .. testcode:: kl-entropy-test
            :hide:

            assert np.isclose(data_entropy, 0.9891067080934253)

    """
    if units not in ("bit", "nat"):
        msg = "units must be bit or nat."
        raise ValueError(msg)
    data = np.sort(data.flatten())
    n_data = len(data)
    eps = data[n_neigh:] - data[:-n_neigh]  # n_neigh-th neighbor distances
    eps = np.clip(eps, 1e-10, None)  # avoid log(0)
    const = digamma(n_data) - digamma(n_neigh) + np.log(2)  # 1D volume
    if units == "nat":
        const = digamma(n_data) - digamma(n_neigh) + np.log(2)
        return const + np.mean(np.log(eps))
    const = (digamma(n_data) - digamma(n_neigh)) / np.log(2) + 1.0
    return const + np.mean(np.log2(eps))




[docs]
def compute_shannon_multi(
    data: NDArray[np.float64],
    data_ranges: list[tuple[float, float]],
    n_bins: list[int],
    units: Literal["bit", "nat", "frac"] = "frac",
) -> float:
    """Compute the Shannon entropy of a multivariate data distribution.

    It is normalized so that a uniform distribution has unitary entropy.

    .. deprecated:: v2025.08.27
        This function is deprecated and will be removed after June 2026.
        Use ``analysis.shannon()`` instead.

    Parameters:
        data:
            shape (n_samples, n_dimensions)
            The dataset for which the entropy is to be computed.

        data_ranges:
            A list of tuples [(min1, max1), (min2, max2), ...] specifying
            the range over which the histogram must be computed for each
            dimension.

        n_bins:
            A list of integers specifying the number of bins for each
            dimension.

        units:
            The units of measure of the output entropy. If "frac", entropy is
            normalized between 0 and 1 by dividing by log(n_bins). If "bit",
            it is computed with log base 2, if "nat" with natural log.

    Returns:
        The value of the normalized Shannon entropy of the dataset.

    Example:

        .. testcode:: shannon-multi-test

            import numpy as np
            from dynsight.analysis import compute_shannon_multi

            np.random.seed(1234)
            data = np.random.rand(1000, 2)  # 2D dataset
            data_ranges = [(0.0, 1.0), (0.0, 1.0)]
            n_bins = [40, 40]

            data_entropy = compute_shannon_multi(
                data,
                data_ranges,
                n_bins,
            )

        .. testcode:: shannon-multi-test
            :hide:

            assert np.isclose(data_entropy, 0.8837924363474094)

    """
    if data.size == 0:
        msg = "data is empty"
        raise ValueError(msg)
    _, n_dims = data.shape
    if n_dims != len(data_ranges) or n_dims != len(n_bins):
        msg = "Mismatch between data dimensions, data_ranges, and n_bins"
        raise ValueError(msg)
    if units not in ("bit", "nat", "frac"):
        msg = "units must be bit, nat or frac."
        raise ValueError(msg)

    counts, _ = np.histogramdd(data, bins=n_bins, range=data_ranges)
    probs = counts / np.sum(counts)  # Probability distribution
    entropy = -np.sum(probs[probs > 0] * np.log2(probs[probs > 0]))

    if units == "bit":
        return entropy
    if units == "nat":
        return entropy * np.log(2)
    return entropy / np.log2(np.prod(n_bins))  # Normalization




[docs]
def compute_kl_entropy_multi(
    data: NDArray[np.float64],
    n_neigh: int = 1,
    units: Literal["bit", "nat"] = "bit",
) -> float:
    """Estimate Shannon differential entropy using Kozachenko-Leonenko.

    This function works for multivariate distribution.
    The Kozachenko-Leonenko k-nearest neighbors method approximates
    differential entropy based on distances to nearest neighbors
    in the sample space. It's main advantage is being parameter-free.

    .. deprecated:: v2025.08.27
        This function is deprecated and will be removed after June 2026.
        Use ``analysis.shannon()`` instead.

    Parameters:
        data:
            The dataset for which the entropy is to be computed.
            Shape (n_data, n_dims)

        n_neigh:
            The number of neighbors considered in the KL estimator.

        units:
            The units of measure of the output entropy. If "bit", it is
            computed with log base 2, if "nat" with natural log.

    Returns:
        The Shannon differential entropy of the dataset, in bits.

    Example:

        .. testcode:: klm-entropy-test

            import numpy as np
            from dynsight.analysis import compute_kl_entropy_multi

            np.random.seed(1234)
            data = np.random.rand(10000, 2)

            data_entropy = compute_kl_entropy_multi(data)

        .. testcode:: klm-entropy-test
            :hide:

            assert np.isclose(data_entropy, 0.013521446183128614)

    """
    if units not in ("bit", "nat"):
        msg = "units must be bit or nat."
        raise ValueError(msg)
    n_samples, dim = data.shape
    tree = cKDTree(data)
    eps, _ = tree.query(data, k=n_neigh + 1, p=2)
    eps = eps[:, -1]  # distance to the n_neigh-th neighbor
    eps = np.clip(eps, 1e-10, None)  # avoid log(0)
    unit_ball_volume = (np.pi ** (dim / 2)) / gamma(dim / 2 + 1)
    # --- Compute in nats ---
    entropy_nats = (
        digamma(n_samples)
        - digamma(n_neigh)
        + np.log(unit_ball_volume)
        + (dim / n_samples) * np.sum(np.log(eps))
    )

    if units == "nat":
        return entropy_nats
    # bits
    return entropy_nats / np.log(2)




[docs]
def compute_entropy_gain(
    data: npt.NDArray[np.float64],
    labels: npt.NDArray[np.int64],
    method: Literal["histo", "kl"] = "histo",
    n_bins: int = 20,
) -> tuple[float, float, float, float]:
    """Compute the relative information gained by the clustering.

    .. deprecated:: v2025.08.27
        This function is deprecated and will be removed after June 2026.
        Use ``analysis.info_gain()`` instead.

    Parameters:
        data:
            The dataset over which the clustering is performed.

        labels:
            The clustering labels. Has the same shape as "data".

        n_bins:
            The number of bins with which the data histogram must be computed.
            Default is 20.

        method:
            How the Shannon entropy is computed. You should use "histo" for
            discrete variables, and "kl" for continuous variables. If "kl" is
            chosen, the "n_bins" arg is irrelevant. See the documentation of
            ``compute_shannon()`` and ``compute_kl_entropy()`` for more
            details.

    Returns:
        * The absolute information gain :math:`H_0 - H_{clust}`
        * The relative information gain :math:`(H_0 - H_{clust}) / H_0`
        * The Shannon entropy of the initial data :math:`H_0`
        * The shannon entropy of the clustered data :math:`H_{clust}`

    Note:
        The output are expressed as fractions if method is "histo", in bit if
        method is "kl".

    Example:

        .. testcode:: shannon2-test

            import numpy as np
            from dynsight.analysis import compute_entropy_gain

            np.random.seed(1234)
            data = np.random.rand(100, 100)
            labels = np.random.randint(-1, 2, size=(100, 100))

            _, entropy_gain, *_ = compute_entropy_gain(
                data,
                labels,
                n_bins=40,
            )

        .. testcode:: shannon2-test
            :hide:

            assert np.isclose(entropy_gain, 0.0010065005804883983)

    """
    if data.shape[0] != labels.shape[0]:
        msg = (
            f"data ({data.shape}) and labels ({labels.shape}) "
            "must have same shape[0]"
        )
        raise RuntimeError(msg)
    if method not in ("histo", "kl"):
        msg = "method must be histo or kl."
        raise ValueError(msg)

    n_clusters = np.unique(labels).size
    frac, entr = np.zeros(n_clusters), np.zeros(n_clusters)

    if method == "histo":
        data_range = (float(np.min(data)), float(np.max(data)))
        # Compute the total entropy of the data
        total_entropy = compute_shannon(
            data,
            data_range,
            n_bins,
        )

        # Compute the fraction and the entropy of the single clusters
        for i, label in enumerate(np.unique(labels)):
            mask = labels == label
            frac[i] = np.sum(mask) / labels.size
            entr[i] = compute_shannon(
                data[mask],
                data_range,
                n_bins,
            )
    else:  # method == "kl"
        # Compute the total entropy of the data
        total_entropy = compute_kl_entropy(data)

        # Compute the fraction and the entropy of the single clusters
        for i, label in enumerate(np.unique(labels)):
            mask = labels == label
            frac[i] = np.sum(mask) / labels.size
            entr[i] = compute_kl_entropy(data[mask])

    # Compute the entropy of the clustered data
    clustered_entropy = np.dot(frac, entr)
    info_gain = total_entropy - clustered_entropy

    return (
        info_gain,
        info_gain / total_entropy,
        total_entropy,
        clustered_entropy,
    )




[docs]
def compute_entropy_gain_multi(
    data: npt.NDArray[np.float64],
    labels: npt.NDArray[np.int64],
    n_bins: list[int],
    method: Literal["histo", "kl"] = "histo",
) -> tuple[float, float, float, float]:
    """Compute the relative information gained by the clustering.

    .. deprecated:: v2025.08.27
        This function is deprecated and will be removed after June 2026.
        Use ``analysis.info_gain()`` instead.

    Parameters:
        data:
            shape (n_samples, n_dimensions)
            The dataset over which the clustering is performed.

        labels:
            shape (n_samples,)
            The clustering labels.

        n_bins:
            The number of bins with which the data histogram must be computed,
            one for each dimension.

        method:
            How the Shannon entropy is computed. You should use "histo" for
            discrete variables, and "kl" for continuous variables. If "kl" is
            chosen, the "n_bins" arg is irrelevant. See the documentation of
            ``compute_shannon_multi()`` and ``compute_kl_entropy_multi()`` for
            more details.

    Returns:
        * The absolute information gain :math:`H_0 - H_{clust}`
        * The relative information gain :math:`(H_0 - H_{clust}) / H_0`
        * The Shannon entropy of the initial data :math:`H_0`
        * The shannon entropy of the clustered data :math:`H_{clust}`

    Example:

        .. testcode:: shannon2-multi-test

            import numpy as np
            from dynsight.analysis import compute_entropy_gain_multi

            np.random.seed(1234)
            data = np.random.rand(1000, 2)  # 2D dataset
            n_bins = [40, 40]
            labels = np.random.randint(-1, 2, size=1000)

            _, entropy_gain, *_ = compute_entropy_gain_multi(
                data,
                labels,
                n_bins=n_bins,
            )

        .. testcode:: shannon2-multi-test
            :hide:

            assert np.isclose(entropy_gain, 0.13171418273750357)

    """
    if data.shape[0] != labels.shape[0]:
        msg = (
            f"data ({data.shape}) and labels ({labels.shape}) "
            "must have same shape[0]"
        )
        raise RuntimeError(msg)
    if method not in ("histo", "kl"):
        msg = "method must be histo or kl."
        raise ValueError(msg)

    n_clusters = np.unique(labels).size
    frac, entr = np.zeros(n_clusters), np.zeros(n_clusters)

    if method == "histo":
        data_range = [
            (float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T
        ]

        # Compute the total entropy of the data
        total_entropy = compute_shannon_multi(
            data,
            data_range,
            n_bins,
        )

        # Compute the fraction and the entropy of the single clusters
        for i, label in enumerate(np.unique(labels)):
            mask = labels == label
            frac[i] = np.sum(mask) / labels.size
            entr[i] = compute_shannon_multi(
                data[mask],
                data_range,
                n_bins,
            )
    else:  # method == "kl"
        # Compute the total entropy of the data
        total_entropy = compute_kl_entropy_multi(data)

        # Compute the fraction and the entropy of the single clusters
        for i, label in enumerate(np.unique(labels)):
            mask = labels == label
            frac[i] = np.sum(mask) / labels.size
            entr[i] = compute_kl_entropy_multi(data[mask])

    # Compute the entropy of the clustered data
    clustered_entropy = np.dot(frac, entr)
    info_gain = total_entropy - clustered_entropy

    return (
        info_gain,
        info_gain / total_entropy,
        total_entropy,
        clustered_entropy,
    )




[docs]
def sample_entropy(
    time_series: NDArray[np.float64],
    r_factor: np.float64 | float,
    m_par: int = 2,
) -> float:
    """Computes the Sample Entropy of a single time-series.

    The Chebyshev distance is used. SampEn takes values between 0 and +inf.
    If the time-series is too short for the chosen m_par, raises ValueError.
    If no matching sequences can be found, raises RuntimeError.

    Parameters:
        time_series : np.ndarray of shape (n_frames,)
            The time-series data.

        r_factor : float
            The similarity threshold between signal windows. A common choice
            is 0.2 * the standard deviation of the time-series.

        m_par : int (default 2)
            The m parameter (length of the considered overlapping windows).

    Returns:
        Sample entropy of the input time-series.

    Example:

        .. testcode:: sampen1-test

            import numpy as np
            from dynsight.analysis import sample_entropy

            np.random.seed(1234)
            data = np.random.rand(100)
            r_factor = 0.5 * np.std(data)

            sampen = sample_entropy(
                data,
                r_factor=r_factor,
                m_par=2,
            )

        .. testcode:: sampen1-test
            :hide:

            assert np.isclose(sampen, 1.6094379124341003)

    """
    n_sum = len(time_series) - m_par

    if n_sum < 1:
        msg = "Time-series too short"
        raise ValueError(msg)

    m1_seq = np.array([time_series[i : i + m_par + 1] for i in range(n_sum)])
    m2_seq = np.array(
        [time_series[i : i + m_par + 2] for i in range(n_sum - 1)]
    )

    dist_matrix_1 = cdist(m1_seq, m1_seq, metric="chebyshev")
    dist_matrix_2 = cdist(m2_seq, m2_seq, metric="chebyshev")

    np.fill_diagonal(dist_matrix_1, 2 * r_factor)
    np.fill_diagonal(dist_matrix_2, 2 * r_factor)

    pos = np.sum(dist_matrix_1 < r_factor)
    mat = np.sum(dist_matrix_2 < r_factor)

    if pos == 0 or mat == 0:
        msg = "No matching sequences found."
        raise RuntimeError(msg)

    return -np.log(mat / pos)