Source code for dynsight._internal.data_processing.clusters

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    from numpy.typing import NDArray

from dynsight.logs import logger



[docs]
def cleaning_cluster_population(
    labels: NDArray[np.int64],
    threshold: float,
    assigned_env: int,
    excluded_env: int | list[int] | None = None,
) -> NDArray[np.int64]:
    """Replace labels of low-population clusters with a reference label.

    This function identifies clusters whose relative population is below a
    given threshold and reassigns their labels to a specified environment.
    The population of each cluster is computed as the fraction of elements
    belonging to that label, either for 2D inputs (`(n_atoms, n_frames)`)
    or for 3D inputs (`(n_atoms, n_frames, n_dims)`, where n_dims can
    correspond to the different ∆t from Onion clustering).
    Clusters with a population smaller than or equal to the `threshold` are
    considered negligible and are replaced by the `assigned_env` label,
    while all other labels are preserved.
    `excluded_env` give the possibility to exclude some clusters from
    the re-labeling.

    Parameters:
        labels:
            NumPy array containing the label values.
            The array should have dimensions corresponding
            to either (n_atoms, n_frames) for 2D inputs,
            or (n_atoms, n_frames, n_dims) for 3D inputs.
        threshold:
            A float value from 0 to 1 that defines the threshold at which
            small clusters are neglected.
        assigned_env:
            The label at which smaller clusters are assigned to, if the label
            already exists the population extracted will be merged to the
            existing one.
        excluded_env:
            Clusters that need to be preserved even if their population is
            under the threshold.

    Returns:
        A NumPy array of the same shape as the input descriptor array,
        containing the updated labels. If the input
        array is 2D (n_atoms, n_frames), the output will be a 2D array of
        the same shape. Otherwise, if the input is 3D
        (n_atoms, n_frames, n_dims), the output will also be a 3D array
        of the same shape.
        The labels of bigger clusters are uneffected by the re-labeling.

    Raises:
        ValueError:
            If the input descriptor array does not have 2 or 3 dimensions,
            an error is raised.

    Example:

        .. code-block:: python

            from dynsight.data_processing import cleaning_cluster_population
            import numpy as np

            original_labels = np.load('labels_array.npy')

            cleaned_labels = cleaning_cluster_population(
                labels=original_labels,
                threshold=0.1,
                assigned_env=99,
            )

        In this example, the labels of the smaller clusters (lower than 10%)
        from `original_labels` are replaced with label 99. The result is
        stored in `cleaned_labels`, a NumPy array.
    """
    dimension = 2
    if labels.ndim not in (dimension, dimension + 1):
        msg = "descriptor_array must be 2D or 3D."
        raise ValueError(msg)

    if excluded_env is None:
        excluded_arr: NDArray[np.int64] = np.array([], dtype=np.int64)
    elif isinstance(excluded_env, int):
        excluded_arr = np.array([excluded_env], dtype=np.int64)
    else:
        excluded_arr = np.array(excluded_env, dtype=np.int64)

    missing = np.setdiff1d(excluded_arr, np.unique(labels))

    if missing.size > 0:
        logger.warning(f"Excluded value(s) not found in labels: {missing}")

    if labels.ndim == dimension:
        flat = labels.ravel()
        unique, counts = np.unique(flat, return_counts=True)

        populations = counts / flat.size
        small_clusters = unique[populations <= threshold]

        small_clusters = small_clusters[~np.isin(small_clusters, excluded_arr)]

        new_labels = labels.copy()
        if small_clusters.size > 0:
            new_labels[np.isin(labels, small_clusters)] = assigned_env

    elif labels.ndim == dimension + 1:
        new_labels = labels.copy()
        for i in range(labels.shape[2]):
            lab = labels[:, :, i]
            flat = lab.ravel()
            unique, counts = np.unique(flat, return_counts=True)

            populations = counts / flat.size
            small_clusters = unique[populations <= threshold]

            small_clusters = small_clusters[
                ~np.isin(small_clusters, excluded_arr)
            ]

            if small_clusters.size > 0:
                mask = np.isin(lab, small_clusters)
                new_labels[:, :, i][mask] = assigned_env

    return new_labels