# Source code for glass_box_umap.core

from __future__ import annotations
from dataclasses import dataclass
from typing import Literal

import numpy as np
import torch
from numpy.typing import NDArray

from .jacobian import compute_jacobian, project_jacobian, reduce_contributions
from .parametric_umap import ParametricUMAP
from .parametric_umap.core import _to_numpy_float32



# [docs]
@dataclass(eq=False, kw_only=True)
class GlassBoxUMAP(ParametricUMAP):
    """Parametric UMAP model that can attribute its embedding to input features.

    On top of the inherited :class:`ParametricUMAP` behaviour, this class adds
    :meth:`compute_contributions` (Gradient x Input attributions) and
    :meth:`compute_jacobian` (Jacobian of the fitted encoder).

    Attributes:
        n_neighbors: How many nearest neighbors are used when building the
            high-dimensional graph.
        min_dist: Minimum distance allowed between points in the
            low-dimensional embedding.
        metric: Distance metric for the nearest-neighbor computation.
        n_components: Number of dimensions of the learned embedding.
        random_state: Seed used for reproducibility. With ``None``, no seed
            is set.
        encoder_kwargs: Extra keyword arguments forwarded to the encoder
            constructor.
        pca_components: Number of PCA components used to preprocess the input.
            With ``None``, PCA is skipped. PCA needs 2D input
            ``(n_samples, n_features)``, so keep this ``None`` when fitting on
            multi-dimensional data (e.g. images for a convolutional encoder).
        lr: Optimizer learning rate.
        epochs: Number of training epochs.
        batch_size: Batch size used for training and, by default, inference.
        negative_sample_rate: Negative samples drawn per positive edge in the
            UMAP loss.
        repulsion_strength: Weight of the repulsive term in the UMAP loss.
        num_workers: Number of data loading workers.
        checkpoint_dir: Where training checkpoints are written. With ``None``,
            a temporary directory is used.
    """

    def compute_contributions(
        self,
        X: NDArray[np.floating] | torch.Tensor,
        batch_size: int | None = None,
        reduction: Literal["l2"] | None = None,
    ) -> NDArray[np.float32]:
        """Attribute the embedding to input features using Gradient x Input.

        The fitted model is switched to eval mode and moved to ``self._device``.
        ``X`` is centered by subtracting ``self._mean``; the encoder Jacobian is
        then multiplied elementwise by the centered input. When PCA
        preprocessing was fitted, the encoder sees the PCA-transformed input
        and the Jacobian is projected back to raw feature space through the
        PCA components before the multiplication.

        Args:
            X:
                Input data, in the same format given to fit/transform.
                Shape: (n_samples, n_features).
            batch_size:
                Batch size used for the Jacobian computation. Falls back to
                ``self.batch_size`` when ``None``.
            reduction:
                Reduction over embedding dimensions. ``"l2"`` takes the L2 norm
                across components and yields shape (n_samples, n_features).
                ``None`` keeps the full
                (n_samples, n_components, n_features) array.

        Returns:
            The feature contributions: (n_samples, n_components, n_features)
            when ``reduction`` is ``None``, otherwise (n_samples, n_features).
        """
        model = self._fitted_model
        model.eval()
        model.to(self._device)

        effective_batch = self.batch_size if batch_size is None else batch_size

        assert self._mean is not None
        centered = _to_numpy_float32(X) - self._mean

        # The encoder consumes PCA-space input when PCA preprocessing is active,
        # and the centered raw input otherwise.
        pca = self._pca
        if pca is None:
            encoder_array = centered
        else:
            encoder_array = pca.transform(centered).astype(np.float32)
        encoder_input = torch.from_numpy(encoder_array).to(self._device)

        jac = self.compute_jacobian(encoder_input, batch_size=effective_batch)

        if pca is not None:
            # Map the PCA-space Jacobian back onto the raw input features.
            components = torch.tensor(
                pca.components_,
                dtype=torch.float32,
                device=self._device,
            )
            jac = project_jacobian(jac, components)

        # A singleton axis at position 1 lets the centered input broadcast
        # across the Jacobian's embedding-component axis.
        centered_t = torch.from_numpy(centered).unsqueeze(1).to(self._device)
        contributions = (jac * centered_t).cpu().numpy()

        if reduction is None:
            return contributions
        return reduce_contributions(contributions, method=reduction)

    def compute_jacobian(self, x: torch.Tensor, batch_size: int = 1024) -> torch.Tensor:
        """Return the Jacobian of the fitted model's encoder evaluated at ``x``.

        Thin wrapper that forwards ``self._fitted_model.encoder``, ``x`` and
        ``batch_size`` to :func:`glass_box_umap.jacobian.compute_jacobian`
        (``vmap`` + ``jacrev`` with ``functional_call``); see that function for
        details.
        """
        encoder = self._fitted_model.encoder
        return compute_jacobian(encoder, x, batch_size)