Source code for dynsight._internal.analysis.entropy

from __future__ import annotations

from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    from numpy.typing import NDArray

import infomeasure as im
import numpy as np
import numpy.typing as npt
from scipy.spatial import cKDTree
from scipy.spatial.distance import cdist
from scipy.special import digamma, gamma


[docs] def shannon( data: NDArray[np.float64], method: Literal["histo", "kl"], base: float = 2.0, n_neigh: int = 4, ) -> float: """Compute the Shannon entropy of a data distribution. Parameters: data: The data for which the entropy is to be computed. Has shape (n_samples, n_features). method: How the Shannon entropy is computed. You should use "histo" for discrete variables, and "kl" for continuous variables. If "histo" is chosen, the "n_neigh" arg is irrelevant. See the documentation of the infomeasure package for more details (link in the notes below). base: The units of measure of the returned value. Use "2" for bits, "np.e" for nats. n_neigh: The number of neighbors considered in the KL estimator. The default value n_neigh = 4 is recommended in the literature. Returns: The value of the Shannon entropy of the data. Notes: This function uses the ``infomeasure.entropy()`` function, see https://infomeasure.readthedocs.io/en/latest/guide/entropy/. Example: .. testcode:: shannon-test import numpy as np from dynsight.analysis import shannon rng = np.random.default_rng(seed=42) ### Discrete case: fair coin. H = 1 bit. ### int_data = rng.integers(low=0, high=2, size=100000) h_int = shannon(data=int_data, method="histo") ### Bivariate case: 2 fair coins. H = 2 bit. ### int_data = rng.integers(low=0, high=2, size=(100000, 2)) h_int_2 = shannon(data=int_data, method="histo") ### Continuous case: uniform distribution in [0, 10]. ### float_data = rng.random(200000) * 10 h_float = shannon(data=float_data, method="kl") .. testcode:: shannon-test :hide: assert np.isclose(h_int, 1.0, atol=1e-3) assert np.isclose(h_int_2, 2.0, atol=1e-3) assert np.isclose(h_float, np.log2(10), atol=5e-3) """ if method not in ("histo", "kl"): msg = "method must be 'histo' or 'kl'." raise ValueError(msg) if method == "histo": return im.entropy(data, approach="discrete", base=base) # If instead method == "kl": min_samples = 2 if data.shape[0] <= min_samples: return 0.0 # can't compute KL entropy with only 2 samples return im.entropy(data, approach="metric", k=n_neigh, base=base)
[docs] def compute_negentropy( data: NDArray[np.float64], units: Literal["bit", "nat"] = "bit", ) -> float: """Estimate negentropy of a dataset. Negentropy is a measure of non-Gaussianity representing the distance from a Gaussian distribution; it's used to quantify the amount of information in a signal, the Gaussian being the less informative distribution for a given variance. .. math:: Neg(X) = H(X_{Gauss}) - H(X) Parameters: data: The dataset for which the entropy is to be computed. units: The units of measure of the output negentropy. If "bit", it is computed with log base 2, if "nat" with natural log. Returns: The negentropy of the dataset. Example: .. testcode:: negentropy-test import numpy as np from dynsight.analysis import compute_negentropy np.random.seed(1234) data = np.random.rand(10000) negentropy = compute_negentropy(data) .. testcode:: negentropy-test :hide: assert np.isclose(negentropy, 0.268, rtol=5e-3) """ if units not in ("bit", "nat"): msg = "units must be bit or nat." raise ValueError(msg) base = 2 if units == "bit" else np.e data = data.flatten() rng = np.random.default_rng(seed=1234) data_norm = (data - np.mean(data)) / np.std(data, ddof=1) sigma = np.std(data_norm, ddof=1) data_gauss = rng.normal(loc=0.0, scale=sigma, size=data.size) h_gauss = shannon(data_gauss, method="kl", base=base) h_data = shannon(data_norm, method="kl", base=base) return h_gauss - h_data
[docs] def info_gain( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], method: Literal["histo", "kl"], base: float = 2.0, n_neigh: int = 4, ) -> tuple[float, float, float, float]: """Compute the information gained by the clustering. Parameters: data: The dataset over which the clustering is performed. Has shape (n_samples, n_features). labels: The clustering labels. Has shape (n_samples,). method: How the Shannon entropy is computed. You should use "histo" for discrete variables, and "kl" for continuous variables. If "histo" is chosen, the "n_neigh" arg is irrelevant. See the documentation of the infomeasure package for more details (link in the notes below). base: The units of measure of the returned value. Use "2" for bits, "np.e" for nats. n_neigh: The number of neighbors considered in the KL estimator. The default value n_neigh = 4 is recommended in the literature. Returns: * The absolute information gain :math:`H_0 - H_{clust}` * The relative information gain :math:`(H_0 - H_{clust}) / H_0` * The Shannon entropy of the initial data :math:`H_0` * The shannon entropy of the clustered data :math:`H_{clust}` Notes: This function uses the ``infomeasure.entropy()`` function, see https://infomeasure.readthedocs.io/en/latest/guide/entropy/. Example: .. testcode:: info-gain-test import numpy as np from dynsight.analysis import info_gain rng = np.random.default_rng(seed=42) ### Descrete case ### int_data = rng.integers(low=0, high=4, size=100000) labels = (int_data < 2).astype(int) delta_i, *_ = info_gain( data=int_data, labels=labels, method="histo", ) .. testcode:: info-gain-test :hide: assert np.isclose(delta_i, 1.0) .. testcode:: info-gain-test ### Continuous case ### float_data = rng.random(200000) labels = (float_data < 0.5).astype(int) delta_i, *_ = info_gain( data=float_data, labels=labels, method="kl", ) .. testcode:: info-gain-test :hide: assert np.isclose(delta_i, 1.0) """ if data.shape[0] != labels.shape[0]: msg = ( f"data ({data.shape}) and labels ({labels.shape}) " "must have same shape[0]" ) raise RuntimeError(msg) if method not in ("histo", "kl"): msg = "method must be 'histo' or 'kl'." raise ValueError(msg) n_clusters = np.unique(labels).size frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) if method == "histo": # Compute the total entropy of the data total_entropy = shannon(data, method="histo", base=base) # Compute the fraction and the entropy of the single clusters for i, label in enumerate(np.unique(labels)): mask = labels == label frac[i] = np.sum(mask) / labels.size entr[i] = shannon(data[mask], method="histo", base=base) else: # method == "kl" # Compute the total entropy of the data total_entropy = shannon(data, method="kl", base=base, n_neigh=n_neigh) # Compute the fraction and the entropy of the single clusters for i, label in enumerate(np.unique(labels)): mask = labels == label frac[i] = np.sum(mask) / labels.size entr[i] = shannon( data[mask], method="kl", base=base, n_neigh=n_neigh, ) # Compute the entropy of the clustered data clustered_entropy = np.dot(frac, entr) info_gain = total_entropy - clustered_entropy return ( info_gain, info_gain / total_entropy, total_entropy, clustered_entropy, )
[docs] def compute_shannon( data: NDArray[np.float64], data_range: tuple[float, float], n_bins: int, units: Literal["bit", "nat", "frac"] = "frac", ) -> float: """Compute the Shannon entropy of a univariate data distribution. It is normalized so that a uniform distribution has unitary entropy. .. deprecated:: v2025.08.27 This function is deprecated and will be removed after June 2026. Use ``analysis.shannon()`` instead. Parameters: data: The dataset for which the entropy is to be computed. data_range: A tuple (min, max) specifying the range over which the data histogram must be computed. n_bins: The number of bins with which the data histogram must be computed. units: The units of measure of the output entropy. If "frac", entropy is normalized between 0 and 1 by dividing by log(n_bins). If "bit", it is computed with log base 2, if "nat" with natural log. Returns: The value of the normalized Shannon entropy of the dataset. Example: .. testcode:: shannon1-test import numpy as np from dynsight.analysis import compute_shannon np.random.seed(1234) data = np.random.rand(100, 100) data_range = (float(np.min(data)), float(np.max(data))) data_entropy = compute_shannon( data, data_range, n_bins=40, ) .. testcode:: shannon1-test :hide: assert np.isclose(data_entropy, 0.9993954419344714) """ if data.size == 0: msg = "data is empty" raise ValueError(msg) if units not in ("bit", "nat", "frac"): msg = "units must be bit, nat or frac." raise ValueError(msg) counts, _ = np.histogram( data, bins=n_bins, range=data_range, ) probs = counts / np.sum(counts) # Data probabilities are needed entropy = -np.sum([p * np.log2(p) for p in probs if p > 0.0]) if units == "bit": return entropy if units == "nat": return entropy * np.log(2) return entropy / np.log2(n_bins)
[docs] def compute_kl_entropy( data: NDArray[np.float64], n_neigh: int = 1, units: Literal["bit", "nat"] = "bit", ) -> float: """Estimate Shannon differential entropy using Kozachenko-Leonenko. The Kozachenko-Leonenko k-nearest neighbors method approximates differential entropy based on distances to nearest neighbors in the sample space. It's main advantage is being parameter-free. .. deprecated:: v2025.08.27 This function is deprecated and will be removed after June 2026. Use ``analysis.shannon()`` instead. Parameters: data: The dataset for which the entropy is to be computed. Shape (n_data,) n_neigh: The number of neighbors considered in the KL estimator. units: The units of measure of the output entropy. If "bit", it is computed with log base 2, if "nat" with natural log. Returns: The Shannon differential entropy of the dataset, in bits. Example: .. testcode:: kl-entropy-test import numpy as np from dynsight.analysis import compute_kl_entropy np.random.seed(1234) data = np.random.rand(10000) data_entropy = compute_kl_entropy(data) .. testcode:: kl-entropy-test :hide: assert np.isclose(data_entropy, 0.9891067080934253) """ if units not in ("bit", "nat"): msg = "units must be bit or nat." raise ValueError(msg) data = np.sort(data.flatten()) n_data = len(data) eps = data[n_neigh:] - data[:-n_neigh] # n_neigh-th neighbor distances eps = np.clip(eps, 1e-10, None) # avoid log(0) const = digamma(n_data) - digamma(n_neigh) + np.log(2) # 1D volume if units == "nat": const = digamma(n_data) - digamma(n_neigh) + np.log(2) return const + np.mean(np.log(eps)) const = (digamma(n_data) - digamma(n_neigh)) / np.log(2) + 1.0 return const + np.mean(np.log2(eps))
[docs] def compute_shannon_multi( data: NDArray[np.float64], data_ranges: list[tuple[float, float]], n_bins: list[int], units: Literal["bit", "nat", "frac"] = "frac", ) -> float: """Compute the Shannon entropy of a multivariate data distribution. It is normalized so that a uniform distribution has unitary entropy. .. deprecated:: v2025.08.27 This function is deprecated and will be removed after June 2026. Use ``analysis.shannon()`` instead. Parameters: data: shape (n_samples, n_dimensions) The dataset for which the entropy is to be computed. data_ranges: A list of tuples [(min1, max1), (min2, max2), ...] specifying the range over which the histogram must be computed for each dimension. n_bins: A list of integers specifying the number of bins for each dimension. units: The units of measure of the output entropy. If "frac", entropy is normalized between 0 and 1 by dividing by log(n_bins). If "bit", it is computed with log base 2, if "nat" with natural log. Returns: The value of the normalized Shannon entropy of the dataset. Example: .. testcode:: shannon-multi-test import numpy as np from dynsight.analysis import compute_shannon_multi np.random.seed(1234) data = np.random.rand(1000, 2) # 2D dataset data_ranges = [(0.0, 1.0), (0.0, 1.0)] n_bins = [40, 40] data_entropy = compute_shannon_multi( data, data_ranges, n_bins, ) .. testcode:: shannon-multi-test :hide: assert np.isclose(data_entropy, 0.8837924363474094) """ if data.size == 0: msg = "data is empty" raise ValueError(msg) _, n_dims = data.shape if n_dims != len(data_ranges) or n_dims != len(n_bins): msg = "Mismatch between data dimensions, data_ranges, and n_bins" raise ValueError(msg) if units not in ("bit", "nat", "frac"): msg = "units must be bit, nat or frac." raise ValueError(msg) counts, _ = np.histogramdd(data, bins=n_bins, range=data_ranges) probs = counts / np.sum(counts) # Probability distribution entropy = -np.sum(probs[probs > 0] * np.log2(probs[probs > 0])) if units == "bit": return entropy if units == "nat": return entropy * np.log(2) return entropy / np.log2(np.prod(n_bins)) # Normalization
[docs] def compute_kl_entropy_multi( data: NDArray[np.float64], n_neigh: int = 1, units: Literal["bit", "nat"] = "bit", ) -> float: """Estimate Shannon differential entropy using Kozachenko-Leonenko. This function works for multivariate distribution. The Kozachenko-Leonenko k-nearest neighbors method approximates differential entropy based on distances to nearest neighbors in the sample space. It's main advantage is being parameter-free. .. deprecated:: v2025.08.27 This function is deprecated and will be removed after June 2026. Use ``analysis.shannon()`` instead. Parameters: data: The dataset for which the entropy is to be computed. Shape (n_data, n_dims) n_neigh: The number of neighbors considered in the KL estimator. units: The units of measure of the output entropy. If "bit", it is computed with log base 2, if "nat" with natural log. Returns: The Shannon differential entropy of the dataset, in bits. Example: .. testcode:: klm-entropy-test import numpy as np from dynsight.analysis import compute_kl_entropy_multi np.random.seed(1234) data = np.random.rand(10000, 2) data_entropy = compute_kl_entropy_multi(data) .. testcode:: klm-entropy-test :hide: assert np.isclose(data_entropy, 0.013521446183128614) """ if units not in ("bit", "nat"): msg = "units must be bit or nat." raise ValueError(msg) n_samples, dim = data.shape tree = cKDTree(data) eps, _ = tree.query(data, k=n_neigh + 1, p=2) eps = eps[:, -1] # distance to the n_neigh-th neighbor eps = np.clip(eps, 1e-10, None) # avoid log(0) unit_ball_volume = (np.pi ** (dim / 2)) / gamma(dim / 2 + 1) # --- Compute in nats --- entropy_nats = ( digamma(n_samples) - digamma(n_neigh) + np.log(unit_ball_volume) + (dim / n_samples) * np.sum(np.log(eps)) ) if units == "nat": return entropy_nats # bits return entropy_nats / np.log(2)
[docs] def compute_entropy_gain( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], method: Literal["histo", "kl"] = "histo", n_bins: int = 20, ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. .. deprecated:: v2025.08.27 This function is deprecated and will be removed after June 2026. Use ``analysis.info_gain()`` instead. Parameters: data: The dataset over which the clustering is performed. labels: The clustering labels. Has the same shape as "data". n_bins: The number of bins with which the data histogram must be computed. Default is 20. method: How the Shannon entropy is computed. You should use "histo" for discrete variables, and "kl" for continuous variables. If "kl" is chosen, the "n_bins" arg is irrelevant. See the documentation of ``compute_shannon()`` and ``compute_kl_entropy()`` for more details. Returns: * The absolute information gain :math:`H_0 - H_{clust}` * The relative information gain :math:`(H_0 - H_{clust}) / H_0` * The Shannon entropy of the initial data :math:`H_0` * The shannon entropy of the clustered data :math:`H_{clust}` Note: The output are expressed as fractions if method is "histo", in bit if method is "kl". Example: .. testcode:: shannon2-test import numpy as np from dynsight.analysis import compute_entropy_gain np.random.seed(1234) data = np.random.rand(100, 100) labels = np.random.randint(-1, 2, size=(100, 100)) _, entropy_gain, *_ = compute_entropy_gain( data, labels, n_bins=40, ) .. testcode:: shannon2-test :hide: assert np.isclose(entropy_gain, 0.0010065005804883983) """ if data.shape[0] != labels.shape[0]: msg = ( f"data ({data.shape}) and labels ({labels.shape}) " "must have same shape[0]" ) raise RuntimeError(msg) if method not in ("histo", "kl"): msg = "method must be histo or kl." raise ValueError(msg) n_clusters = np.unique(labels).size frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) if method == "histo": data_range = (float(np.min(data)), float(np.max(data))) # Compute the total entropy of the data total_entropy = compute_shannon( data, data_range, n_bins, ) # Compute the fraction and the entropy of the single clusters for i, label in enumerate(np.unique(labels)): mask = labels == label frac[i] = np.sum(mask) / labels.size entr[i] = compute_shannon( data[mask], data_range, n_bins, ) else: # method == "kl" # Compute the total entropy of the data total_entropy = compute_kl_entropy(data) # Compute the fraction and the entropy of the single clusters for i, label in enumerate(np.unique(labels)): mask = labels == label frac[i] = np.sum(mask) / labels.size entr[i] = compute_kl_entropy(data[mask]) # Compute the entropy of the clustered data clustered_entropy = np.dot(frac, entr) info_gain = total_entropy - clustered_entropy return ( info_gain, info_gain / total_entropy, total_entropy, clustered_entropy, )
[docs] def compute_entropy_gain_multi( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], n_bins: list[int], method: Literal["histo", "kl"] = "histo", ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. .. deprecated:: v2025.08.27 This function is deprecated and will be removed after June 2026. Use ``analysis.info_gain()`` instead. Parameters: data: shape (n_samples, n_dimensions) The dataset over which the clustering is performed. labels: shape (n_samples,) The clustering labels. n_bins: The number of bins with which the data histogram must be computed, one for each dimension. method: How the Shannon entropy is computed. You should use "histo" for discrete variables, and "kl" for continuous variables. If "kl" is chosen, the "n_bins" arg is irrelevant. See the documentation of ``compute_shannon_multi()`` and ``compute_kl_entropy_multi()`` for more details. Returns: * The absolute information gain :math:`H_0 - H_{clust}` * The relative information gain :math:`(H_0 - H_{clust}) / H_0` * The Shannon entropy of the initial data :math:`H_0` * The shannon entropy of the clustered data :math:`H_{clust}` Example: .. testcode:: shannon2-multi-test import numpy as np from dynsight.analysis import compute_entropy_gain_multi np.random.seed(1234) data = np.random.rand(1000, 2) # 2D dataset n_bins = [40, 40] labels = np.random.randint(-1, 2, size=1000) _, entropy_gain, *_ = compute_entropy_gain_multi( data, labels, n_bins=n_bins, ) .. testcode:: shannon2-multi-test :hide: assert np.isclose(entropy_gain, 0.13171418273750357) """ if data.shape[0] != labels.shape[0]: msg = ( f"data ({data.shape}) and labels ({labels.shape}) " "must have same shape[0]" ) raise RuntimeError(msg) if method not in ("histo", "kl"): msg = "method must be histo or kl." raise ValueError(msg) n_clusters = np.unique(labels).size frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) if method == "histo": data_range = [ (float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T ] # Compute the total entropy of the data total_entropy = compute_shannon_multi( data, data_range, n_bins, ) # Compute the fraction and the entropy of the single clusters for i, label in enumerate(np.unique(labels)): mask = labels == label frac[i] = np.sum(mask) / labels.size entr[i] = compute_shannon_multi( data[mask], data_range, n_bins, ) else: # method == "kl" # Compute the total entropy of the data total_entropy = compute_kl_entropy_multi(data) # Compute the fraction and the entropy of the single clusters for i, label in enumerate(np.unique(labels)): mask = labels == label frac[i] = np.sum(mask) / labels.size entr[i] = compute_kl_entropy_multi(data[mask]) # Compute the entropy of the clustered data clustered_entropy = np.dot(frac, entr) info_gain = total_entropy - clustered_entropy return ( info_gain, info_gain / total_entropy, total_entropy, clustered_entropy, )
[docs] def sample_entropy( time_series: NDArray[np.float64], r_factor: np.float64 | float, m_par: int = 2, ) -> float: """Computes the Sample Entropy of a single time-series. The Chebyshev distance is used. SampEn takes values between 0 and +inf. If the time-series is too short for the chosen m_par, raises ValueError. If no matching sequences can be found, raises RuntimeError. Parameters: time_series : np.ndarray of shape (n_frames,) The time-series data. r_factor : float The similarity threshold between signal windows. A common choice is 0.2 * the standard deviation of the time-series. m_par : int (default 2) The m parameter (length of the considered overlapping windows). Returns: Sample entropy of the input time-series. Example: .. testcode:: sampen1-test import numpy as np from dynsight.analysis import sample_entropy np.random.seed(1234) data = np.random.rand(100) r_factor = 0.5 * np.std(data) sampen = sample_entropy( data, r_factor=r_factor, m_par=2, ) .. testcode:: sampen1-test :hide: assert np.isclose(sampen, 1.6094379124341003) """ n_sum = len(time_series) - m_par if n_sum < 1: msg = "Time-series too short" raise ValueError(msg) m1_seq = np.array([time_series[i : i + m_par + 1] for i in range(n_sum)]) m2_seq = np.array( [time_series[i : i + m_par + 2] for i in range(n_sum - 1)] ) dist_matrix_1 = cdist(m1_seq, m1_seq, metric="chebyshev") dist_matrix_2 = cdist(m2_seq, m2_seq, metric="chebyshev") np.fill_diagonal(dist_matrix_1, 2 * r_factor) np.fill_diagonal(dist_matrix_2, 2 * r_factor) pos = np.sum(dist_matrix_1 < r_factor) mat = np.sum(dist_matrix_2 < r_factor) if pos == 0 or mat == 0: msg = "No matching sequences found." raise RuntimeError(msg) return -np.log(mat / pos)