Source code for att.llm.loader
"""Standardized loader for LLM hidden-state archives."""
from __future__ import annotations
import numpy as np
[docs]
class HiddenStateLoader:
    """Load and query LLM hidden-state archives produced by extract_hidden_states.py.

    Every array is read from the archive while it is open and the archive
    handle is closed before ``__init__`` returns, so an instance only holds
    in-memory arrays and never keeps a file descriptor alive.

    Parameters
    ----------
    path : str
        Path to .npz archive containing hidden states.
    """

    def __init__(self, path: str):
        self._path = path
        # NOTE(security): allow_pickle=True lets np.load unpickle object
        # arrays (needed for the object-dtype ``token_trajectories`` entry).
        # Unpickling can execute arbitrary code, so only open trusted archives.
        #
        # The ``with`` block closes the underlying zip/file handle
        # deterministically; each ``data[...]`` lookup materializes the array
        # in memory, so nothing below depends on the archive staying open.
        with np.load(path, allow_pickle=True) as data:
            self._last_hidden: np.ndarray = data["last_hidden_states"]  # (N, d)
            self._levels: np.ndarray = data["difficulty_levels"]  # (N,)
            self._layer_hidden: np.ndarray = data["layer_hidden_states"]  # (N, L+1, d)
            self._token_trajectories: np.ndarray = data["token_trajectories"]  # (N,) object
            self._seq_lengths: np.ndarray = data["seq_lengths"]  # (N,)
            self._model_name: str = str(data["model_name"])
            self._hidden_dim: int = int(data["hidden_dim"])
            self._num_layers: int = int(data["num_layers"])  # L+1 (includes embedding)

    @property
    def last_hidden(self) -> np.ndarray:
        """(N, d) last-token hidden states at the final transformer layer."""
        return self._last_hidden

    @property
    def levels(self) -> np.ndarray:
        """(N,) difficulty levels (1-5)."""
        return self._levels

    @property
    def layer_hidden(self) -> np.ndarray:
        """(N, L+1, d) hidden states at the final token across all layers.

        Index 0 is the embedding layer output; index -1 is the final
        transformer layer.
        """
        return self._layer_hidden

    @property
    def token_trajectories(self) -> np.ndarray:
        """(N,) object array where each element is (T_i, d) token-position hidden states."""
        return self._token_trajectories

    @property
    def seq_lengths(self) -> np.ndarray:
        """(N,) sequence lengths per problem."""
        return self._seq_lengths

    @property
    def model_name(self) -> str:
        """HuggingFace model ID used for extraction."""
        return self._model_name

    @property
    def hidden_dim(self) -> int:
        """Dimensionality of hidden-state vectors."""
        return self._hidden_dim

    @property
    def num_layers(self) -> int:
        """Number of layers including embedding layer (L+1)."""
        return self._num_layers

    @property
    def n_problems(self) -> int:
        """Total number of problems (length of the difficulty-level array)."""
        return len(self._levels)

    @property
    def unique_levels(self) -> np.ndarray:
        """Sorted array of unique difficulty levels."""
        return np.unique(self._levels)

    def get_level_mask(self, level: int) -> np.ndarray:
        """Boolean mask for problems at a given difficulty level."""
        return self._levels == level

    def get_level_cloud(
        self,
        level: int,
        layer: int | None = None,
    ) -> np.ndarray:
        """Point cloud of hidden states for a difficulty level.

        Parameters
        ----------
        level : int
            Difficulty level (1-5).
        layer : int or None
            If None, uses last_hidden (final layer, last token).
            If int, extracts from layer_hidden at that layer index.

        Returns
        -------
        (n_level, d) array of hidden-state vectors.
        """
        mask = self.get_level_mask(level)
        if layer is None:
            return self._last_hidden[mask]
        return self._layer_hidden[mask, layer, :]

    def get_layer_cloud(
        self,
        layer: int,
        levels: list[int] | None = None,
    ) -> np.ndarray:
        """Point cloud of hidden states at a specific layer across problems.

        Parameters
        ----------
        layer : int
            Layer index (0=embedding, -1=final transformer layer).
        levels : list of int or None
            If None, includes all problems. Otherwise filters to specified levels.

        Returns
        -------
        (n_problems, d) array of hidden-state vectors.
        """
        if levels is None:
            return self._layer_hidden[:, layer, :]
        mask = np.isin(self._levels, levels)
        return self._layer_hidden[mask, layer, :]

    def level_counts(self) -> dict[int, int]:
        """Number of problems per difficulty level.

        Returns
        -------
        dict mapping each level (as a Python int, in ascending order) to the
        number of problems at that level.
        """
        # One pass over the levels instead of one full comparison per level.
        values, counts = np.unique(self._levels, return_counts=True)
        return {int(lv): int(ct) for lv, ct in zip(values, counts)}

    def __repr__(self) -> str:
        counts = self.level_counts()
        levels_str = ", ".join(f"L{k}={v}" for k, v in sorted(counts.items()))
        return (
            f"HiddenStateLoader(model={self._model_name!r}, "
            f"n={self.n_problems}, d={self._hidden_dim}, "
            f"layers={self._num_layers}, {levels_str})"
        )