Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions brainscore_language/models/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Shared pytest fixtures for all model tests.

Some HuggingFace models (e.g. Gemma) are "gated" -- they require an authenticated
token to download weights. This conftest provides a session-scoped fixture that
automatically fetches a HuggingFace read token from AWS Secrets Manager and logs in
before any model test runs. If the HF_TOKEN environment variable is already set
(e.g. via `huggingface-cli login` or manual export), the AWS lookup is skipped.

Because this file lives in brainscore_language/models/, pytest automatically applies
its fixtures to every test in this directory and all model subdirectories (gemma/,
gpt/, etc.), so individual test files don't need any auth boilerplate.
"""

import json
import os

import boto3
import pytest
from huggingface_hub import login


@pytest.fixture(autouse=True, scope="session")
def set_hf_token():
    """Ensure a HuggingFace auth token is available before any model test runs.

    If HF_TOKEN is already set in the environment (e.g. via `huggingface-cli
    login` or a manual export), nothing is done -- huggingface_hub reads the
    variable itself. Otherwise the token is fetched from AWS Secrets Manager
    and `login()` is called. On any failure the session's tests are skipped
    rather than erroring later on gated-model downloads.
    """
    if os.environ.get("HF_TOKEN"):
        return
    # Keep the try narrow: only the AWS fetch should produce the
    # "unable to fetch from AWS" skip message.
    try:
        client = boto3.client("secretsmanager", region_name="us-east-2")
        secret = client.get_secret_value(SecretId="hugging_face_read_token")["SecretString"]
    except Exception as e:  # boto3 raises many exception classes; treat all as "unavailable"
        pytest.skip(f"HF_TOKEN not set and unable to fetch from AWS Secrets Manager: {e}")
    token = _extract_token(secret)
    if not token:
        pytest.skip("HF_TOKEN not set and the AWS secret contained no usable token")
    os.environ["HF_TOKEN"] = token
    try:
        login(token=token)
    except Exception as e:
        pytest.skip(f"HF_TOKEN fetched from AWS but HuggingFace login failed: {e}")


def _extract_token(secret: str) -> str:
    """Return the token contained in *secret*, stripped of whitespace.

    Handles both storage formats: a plain string ("hf_...") and a JSON object
    ({"key": "hf_..."}). For a JSON object the first value is used (secrets are
    expected to hold a single entry); an empty object yields "".
    """
    try:
        parsed = json.loads(secret)
    except json.JSONDecodeError:
        # Plain-string secrets ("hf_...") are not valid JSON -- use them as-is.
        return secret.strip()
    if isinstance(parsed, dict):
        return str(next(iter(parsed.values()), "")).strip()
    return str(parsed).strip()
12 changes: 12 additions & 0 deletions brainscore_language/models/gemma/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from brainscore_language import model_registry
from brainscore_language import ArtificialSubject
from brainscore_language.model_helpers.huggingface import HuggingfaceSubject

# Gemma 2B: 18 transformer layers, hidden size 2048.
# Layer 17 (last) chosen as default mapping pending benchmark-driven selection.
def _gemma_2b() -> HuggingfaceSubject:
    """Build the Gemma 2B subject with its default region-layer mapping."""
    return HuggingfaceSubject(
        model_id='google/gemma-2b',
        region_layer_mapping={
            ArtificialSubject.RecordingTarget.language_system: 'model.layers.17'
        },
    )


model_registry['gemma-2b'] = _gemma_2b
15 changes: 15 additions & 0 deletions brainscore_language/models/gemma/metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
models:
  # Key matches the registry identifier declared in gemma/__init__.py.
  gemma-2b:
    # NOTE(review): 'DCNN' looks wrong for a transformer LM -- confirm the
    # allowed architecture values in the metadata schema before changing.
    architecture: DCNN
    model_family: gemma
    # Placeholder counts (see extra_notes) -- replace with real values.
    total_parameter_count: 1234567
    trainable_parameter_count: 1234567
    # NOTE(review): __init__.py documents 18 transformer layers; 55 may count
    # sublayers or be a placeholder -- verify.
    total_layers: 55
    trainable_layers: 40
    model_size_mb: 1202
    training_dataset: null
    task_specialization: null
    brainscore_link: https://github.com/brain-score/language/tree/master/brainscore_language/models/gemma
    # Derived from the model_id used in gemma/__init__.py.
    huggingface_link: https://huggingface.co/google/gemma-2b
    extra_notes: Temporary hardcoded metadata - will be replaced with actual generation
    runnable: true
46 changes: 46 additions & 0 deletions brainscore_language/models/gemma/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import numpy as np
import pytest

from brainscore_language import load_model
from brainscore_language.artificial_subject import ArtificialSubject


@pytest.mark.memory_intense
def test_load_model():
    """Loading 'gemma-2b' from the registry succeeds and yields a subject."""
    assert load_model('gemma-2b') is not None


@pytest.mark.memory_intense
def test_identifier():
    """The registered subject reports the underlying HuggingFace model id."""
    subject = load_model('gemma-2b')
    expected_id = 'google/gemma-2b'
    assert subject.identifier() == expected_id


@pytest.mark.memory_intense
def test_neural():
    """Recorded representations have one row per stimulus and 2048 neuroids."""
    subject = load_model('gemma-2b')
    stimuli = ['the quick brown fox', 'jumps over', 'the lazy dog']
    subject.start_neural_recording(
        recording_target=ArtificialSubject.RecordingTarget.language_system,
        recording_type=ArtificialSubject.RecordingType.fMRI,
    )
    recordings = subject.digest_text(stimuli)['neural']
    assert len(recordings['presentation']) == len(stimuli)
    np.testing.assert_array_equal(recordings['stimulus'], stimuli)
    assert len(recordings['neuroid']) == 2048  # Gemma 2B hidden size


@pytest.mark.memory_intense
def test_next_word():
    """Next-word prediction returns one non-empty string per text part."""
    subject = load_model('gemma-2b')
    stimuli = ['the quick brown fox', 'jumps over', 'the lazy']
    subject.start_behavioral_task(task=ArtificialSubject.Task.next_word)
    predictions = subject.digest_text(stimuli)['behavior']
    assert len(predictions) == 3
    for predicted_word in predictions.values:
        assert isinstance(predicted_word, str)
        assert predicted_word.strip() != ''
Loading