Commit 98d3b14

Update from rebase

1 parent ce949e3

3 files changed: +20 -91 lines

bergson/attributor.py

Lines changed: 0 additions & 61 deletions

@@ -121,67 +121,6 @@ def search(
 
         return torch.topk(scores, k)
 
-    def search_module(
-        self, queries: Tensor, k: int, module: str
-    ) -> tuple[Tensor, Tensor]:
-        """
-        Search for the `k` nearest examples in the index based on the query or queries.
-        If fewer than `k` examples are found FAISS will return items with the index -1
-        and the maximum negative distance.
-
-        Args:
-            queries: The query tensor of shape [..., d].
-            k: The number of nearest examples to return for each query.
-            nprobe: The number of FAISS vector clusters to search if using ANN.
-
-        Returns:
-            A namedtuple containing the top `k` indices and inner products for each
-            query. Both have shape [..., k].
-        """
-        assert isinstance(
-            self.grads, dict
-        ), "Gradients must be a dictionary of tensors."
-        assert module in self.grads, f"Module {module} not found in gradients."
-
-        k = min(k, self.grads[module].shape[0])
-
-        q = queries
-
-        if self.unit_norm:
-            q /= q.norm(dim=1, keepdim=True)
-
-        if not self.faiss_cfg:
-            return torch.topk(q.to(self.device) @ self.grads[module].mT, k)
-
-        q = q.cpu().numpy()
-
-        shard_distances = []
-        shard_indices = []
-        offset = 0
-
-        for index in self.faiss_shards:
-            index.nprobe = self.faiss_cfg.nprobe
-            distances, indices = index.search(q, k)
-
-            indices += offset
-            offset += index.ntotal
-
-            shard_distances.append(distances)
-            shard_indices.append(indices)
-
-        distances = np.concatenate(shard_distances, axis=1)
-        indices = np.concatenate(shard_indices, axis=1)
-
-        # Rerank results overfetched from multiple shards
-        if len(self.faiss_shards) > 1:
-            topk_indices = np.argsort(distances, axis=1)[:, :k]
-            indices = indices[np.arange(indices.shape[0])[:, None], topk_indices]
-            distances = distances[np.arange(distances.shape[0])[:, None], topk_indices]
-
-        return torch.from_numpy(distances.squeeze()), torch.from_numpy(
-            indices.squeeze()
-        )
-
     @contextmanager
     def trace(
         self, module: nn.Module, k: int, *, precondition: bool = False
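For context, the removed `search_module` method implemented an overfetch-and-rerank lookup across multiple FAISS shards: each shard is queried for its own top-k, local ids are shifted by a running `ntotal` offset, and the concatenated candidates are reranked to one global top-k per query. A minimal standalone sketch of that pattern, assuming plain `faiss.Index` shards and float32 query arrays (the `search_shards` helper is hypothetical and not part of bergson; the surviving `search` method remains the supported entry point):

import faiss
import numpy as np
import torch


def search_shards(shards: list[faiss.Index], queries: np.ndarray, k: int):
    """Overfetch k hits from every shard, then keep the global top-k per query."""
    shard_distances, shard_indices, offset = [], [], 0
    for index in shards:
        distances, indices = index.search(queries, k)  # (n_queries, k) per shard
        shard_indices.append(indices + offset)         # shift local ids into a global id space
        shard_distances.append(distances)
        offset += index.ntotal

    distances = np.concatenate(shard_distances, axis=1)
    indices = np.concatenate(shard_indices, axis=1)

    # Rerank the len(shards) * k overfetched candidates, largest inner product first
    top = np.argsort(-distances, axis=1)[:, :k]
    rows = np.arange(distances.shape[0])[:, None]
    return torch.from_numpy(distances[rows, top]), torch.from_numpy(indices[rows, top])

Sorting the concatenated scores in descending order keeps the largest similarities, which matches FAISS's inner-product convention for this kind of merged search.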

bergson/huggingface.py

Lines changed: 4 additions & 26 deletions

@@ -31,23 +31,25 @@ def __init__(
         path: str,
         head_cfgs: dict[str, HeadConfig],
         projection_dim: int = 16,
+        dtype: DTypeLike = np.float16,
         accumulate_grads: bool = False,
         use_optimizer_state: bool = True,
         track_order: bool = False,
     ):
         """
         Args:
             path: The path to save the gradients
+            head_cfgs: Information used to split matrix-valued parameters into
+                per-head matrices before down projection.
             projection_dim: The dimension to project the gradients onto
+            dtype: The dtype of the on-disk gradient store
             accumulate_grads: Whether to take the sum of the gradients
                 of the same example across epochs. If `False`, the
                 gradients for each epoch are stored separately.
             use_optimizer_state: Whether to use the optimizer state to
                 normalize the gradients. If `False`, no normalization is
                 applied.
             track_order: Whether to record the shuffled order of training data.
-            head_cfgs: Information used to split matrix-valued parameters into
-                per-head matrices before down projection.
         """
         super().__init__()
 

@@ -73,9 +75,6 @@ def __init__(
         # TODO: Handle this more elegantly
         self.torch_dtype = torch.float32 if self.dtype == np.float32 else torch.float16
 
-        # TODO: Handle this more elegantly
-        self.torch_dtype = torch.float32 if self.dtype == np.float32 else torch.float16
-
     def write_grads(self, grad_buffer: np.memmap):
         # Ensure the nonblocking copies are all finished
         torch.cuda.synchronize()

@@ -84,12 +83,6 @@ def write_grads(self, grad_buffer: np.memmap):
 
         self.mod_grads.clear()
 
-    def on_step_begin(self, args, state, control, **kwargs):
-        """Track the current step and epoch for training order recording."""
-        if self.order:
-            self._current_step = state.global_step
-            self._current_epoch = int(state.epoch or 0)
-
     def on_train_begin(
         self,
         args: TrainingArguments,

@@ -266,21 +259,6 @@ def on_step_end(
         if not self.use_optimizer_state:
             return
 
-        # Record training order if enabled
-        if self.order:
-            assert (
-                self.batch_indices is not None
-            ), "Batch indices are not available for training order tracking"
-
-            self.order.extend(
-                {
-                    "_idx": int(idx),
-                    "global_step": getattr(self, "_current_step", 0),
-                    "epoch": getattr(self, "_current_epoch", 0),
-                }
-                for idx in self.batch_indices.tolist()
-            )
-
         # The optimizer doesn't actually know the names of the parameters
         model = getattr(model, "base_model", model)
         param_to_name = {
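For orientation, here is a hedged sketch of constructing the callback with the revised signature. The values are illustrative only; the GPT-2-style module name and the `HeadConfig(12, 192, 2)` arguments simply mirror the example script updated below, not a required configuration:

import numpy as np

from bergson import HeadConfig
from bergson.huggingface import GradientCollectorCallback

callback = GradientCollectorCallback(
    path="output/gradients",
    head_cfgs={
        # split this matrix-valued parameter into per-head blocks before projection
        "h.0.attn.c_attn": HeadConfig(12, 192, 2),
    },
    projection_dim=16,         # default from the signature above
    dtype=np.float16,          # dtype of the on-disk gradient store (new argument)
    accumulate_grads=False,    # store per-epoch gradients instead of summing them
    use_optimizer_state=True,  # normalize gradients with the optimizer state
    track_order=True,          # record the shuffled order of training data
)

The callback would typically be registered with a `transformers` `Trainer` through its `callbacks` argument.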

examples/find_induction_heads.py

Lines changed: 16 additions & 4 deletions

@@ -39,9 +39,10 @@
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
 import wandb
-from bergson.attributor import Attributor
 
 # from bergson.data import load_gradient_dataset
+from bergson import HeadConfig
+from bergson.attributor import Attributor
 from bergson.collection import collect_gradients
 from bergson.gradients import GradientProcessor
 from bergson.huggingface import (

@@ -349,9 +350,11 @@ def create_transformer():
     return model, tokenizer
 
 
-def load_tinystories_data(tokenizer, max_length=512, N=10000):
+def load_tinystories_data(tokenizer, max_length=512, N: int | None = 10_000):
     """Load and preprocess TinyStories dataset."""
     dataset = load_dataset("EleutherAI/SmolLM2-135M-10B", split="train")
+    if N is not None:
+        dataset = dataset.select(range(min(N, len(dataset))))
     # dataset = load_dataset("roneneldan/TinyStories", split="train")
     # dataset = dataset.select(range(min(N, len(dataset))))
 

@@ -552,9 +555,14 @@ def setup_training(
 
     bergson_callback = GradientCollectorCallback(
         path=f"{output_dir}/gradients",
+        head_cfgs={
+            "h.0.attn.c_attn": HeadConfig(12, 192, 2),
+            "h.0.attn.c_proj": HeadConfig(12, 64, 2),
+            "h.1.attn.c_attn": HeadConfig(12, 192, 2),
+            "h.1.attn.c_proj": HeadConfig(12, 64, 2),
+        },
         projection_dim=projection_dim,
         dtype=np.float32,
-        torch_dtype=torch.float32,
         accumulate_grads=False,
         track_order=True,
     )

@@ -683,7 +691,10 @@ def main(args):
     model, tokenizer = create_transformer()
 
     # # Load TinyStories data
-    train_dataset, eval_dataset = load_tinystories_data(tokenizer)
+    if args.small:
+        train_dataset, eval_dataset = load_tinystories_data(tokenizer, N=1000)
+    else:
+        train_dataset, eval_dataset = load_tinystories_data(tokenizer)
 
     # # Create induction head dataset
     test_induction_head_labels()

@@ -899,6 +910,7 @@ def main(args):
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--train", action="store_true")
     parser.add_argument("--unit_norm", action="store_true")
+    parser.add_argument("--small", action="store_true")
     parser.add_argument("--tag", type=str, default="")
     args = parser.parse_args()
     main(args)
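Based on the argparse flags visible above, a small smoke-test run of the example could look like the following (a hypothetical invocation; the script may accept flags not shown in this diff):

python examples/find_induction_heads.py --train --small --seed 0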
