EvolvingLMMs-Lab · XingruiWang · Feb 27, 2026 · Mar 1, 2026 · May 13, 2026 · May 13, 2026
diff --git a/debug_xmod_bench_lite.slurm b/debug_xmod_bench_lite.slurm
@@ -0,0 +1,49 @@
+#!/bin/bash
+#SBATCH --job-name=xmod_debug
+#SBATCH --gres=gpu:6000_ada:4
+#SBATCH --mem=40G
+#SBATCH --cpus-per-task=8
+#SBATCH --time=00:30:00
+#SBATCH --output=/home/xwang378/scratch/2025/lmms-eval/logs/xmod_bench_lite/debug_%x_%j.log
+#SBATCH --error=/home/xwang378/scratch/2025/lmms-eval/logs/xmod_bench_lite/debug_%x_%j.log
+# Smoke test: runs 8 samples of one task. MODEL, PRETRAINED, ENV, TASK and MODEL_ARGS_EXTRA can be overridden from the submitting environment.
+set -e
+MODEL=${MODEL:-qwen2_5_omni}
+PRETRAINED=${PRETRAINED:-Qwen/Qwen2.5-Omni-7B}
+ENV=${ENV:-qwenomni3}
+TASK=${TASK:-xmod_bench_lite_a2t}
+MODEL_ARGS_EXTRA=${MODEL_ARGS_EXTRA:-device_map=auto,attn_implementation=flash_attention_2}
+# User/cluster-specific locations: edit these when running as another user or on another cluster.
+REPO=/home/xwang378/scratch/2025/lmms-eval
+export XMODBENCH=/home/xwang378/scratch/2025/AudioBench_data
+# Keep all Hugging Face caches under one scratch directory; the numba cache is per-job under /tmp.
+export HF_HOME=/scratch/xwang378/hf_cache
+export HUGGINGFACE_HUB_CACHE="$HF_HOME/hub"
+export HF_DATASETS_CACHE="$HF_HOME/datasets"
+export TRANSFORMERS_CACHE="$HF_HOME/hub"
+export NUMBA_CACHE_DIR=/tmp/numba_${SLURM_JOB_ID}
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+mkdir -p "$NUMBA_CACHE_DIR" "$HF_HOME/hub" "$HF_HOME/datasets"
+
+echo "=== Debug job $SLURM_JOB_ID  model=$MODEL  env=$ENV  task=$TASK ==="
+echo "Node: $(hostname)  GPU: $CUDA_VISIBLE_DEVICES"
+date
+# NOTE: the --output/--error directory above must already exist at submission time; this mkdir runs too late to create it for the job log.
+mkdir -p "$REPO/logs/xmod_bench_lite/debug_results"
+cd "$REPO"
+
+module load conda
+conda activate "$ENV"
+# model_args is quoted so a PRETRAINED path containing spaces or glob characters stays a single argument.
+python -m lmms_eval \
+    --model "$MODEL" \
+    --model_args "pretrained=$PRETRAINED,$MODEL_ARGS_EXTRA" \
+    --tasks "$TASK" \
+    --limit 8 \
+    --batch_size 1 \
+    --output_path "$REPO/logs/xmod_bench_lite/debug_results" \
+    --log_samples \
+    --log_samples_suffix "debug_${MODEL}_${TASK}"
+
+echo "=== Debug done ==="
+date
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
@@ -122,6 +122,10 @@
     "qwen3_vl": "Qwen3_VL",
     "qwen3_5": "Qwen3_5",
     "qwen2_5_vl": "Qwen2_5_VL",
+    "qwen2_5_omni_interleave": "Qwen2_5_OmniInterleave",
+    "qwen3_omni_interleave": "Qwen3_OmniInterleave",
+    "omnivinci_interleave": "OmniVinciInterleave",
+    "baichuan_omni_interleave": "BaichuanOmniInterleave",
     "thyme": "Thyme",
     "openai": "OpenAICompatible",
     "vllm": "VLLM",

diff --git a/lmms_eval/models/chat/_interleave_base.py b/lmms_eval/models/chat/_interleave_base.py
@@ -0,0 +1,100 @@
+"""Shared scaffolding for interleaved-multimedia chat wrappers.
+
+Many lmms-eval *simple* models (qwen2.5-omni, qwen3-omni, omnivinci,
+baichuan-omni, vita, ...) only attach one media object per request via
+`doc_to_visual`. Tasks like XModBench put media in the question stem AND
+every answer option (up to 5 per item), so the simple path drops most of it.
+
+The fix is the same for every model: become a *chat-style* model
+(`is_simple = False`), read the task's `doc_to_messages` output, and feed the
+full interleaved prompt to the underlying model. Only the final
+"messages -> model output string" step is model-specific.
+
+`InterleaveChatMixin` implements the common request loop once. A concrete
+wrapper subclasses it (plus the model's simple class) and implements a single
+method:
+
+    def _infer_one(self, messages: list, gen_kwargs: dict) -> str: ...
+
+where `messages` is the raw `doc_to_messages` output (a list of
+{"role", "content":[{type,url|text}]} dicts).
+
+Per-media size/frame caps (deliberately tighter than the upstream AudioBench
+runner's, to fit smaller GPUs) are module constants so wrappers stay consistent.
+"""
+
+import traceback
+from typing import List
+
+from loguru import logger as eval_logger
+from tqdm import tqdm
+
+from lmms_eval import utils
+from lmms_eval.api.instance import Instance
+
+# Default per-media budgets. The upstream AudioBench runner's fps=12 /
+# max_frames=60 / 512px setting presumes ~80 GB GPUs; on 24 GB a5000s one
+# video-condition item with 4 audio options OOMs (about half of the v2a/v2t
+# video samples were silently dropped). The smaller video budget below keeps
+# every sample on-GPU for a minor loss in frame density — XModBench's video
+# tasks (emotion, spatial, temporal) do not need 60 frames.
+VIDEO_KWARGS = dict(fps=2, max_frames=16, max_pixels=384 * 384)
+IMAGE_KWARGS = dict(max_pixels=512 * 512)
+
+
+class InterleaveChatMixin:
+    """Mixin providing a chat-style generate_until over doc_to_messages.
+
+    Subclasses may override `video_kwargs` / `image_kwargs` class attributes
+    to use a tighter media budget (e.g. Baichuan-Omni's video path needs far
+    more memory than Qwen-Omni's for the same clip).
+
+    Subclasses must define `_infer_one(self, messages, gen_kwargs) -> str`.
+    """
+
+    is_simple = False
+
+    # Per-model media budget; override in a subclass if it OOMs.
+    video_kwargs = VIDEO_KWARGS
+    image_kwargs = IMAGE_KWARGS
+
+    def _infer_one(self, messages: list, gen_kwargs: dict) -> str:  # pragma: no cover
+        """Return the model's answer for one raw `doc_to_messages` output."""
+        raise NotImplementedError
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        """Answer every request one sample at a time and return the answers in request order.
+
+        Each request's `args` is unpacked as `(ctx, doc_to_messages, gen_kwargs,
+        doc_id, task, split)`. If `_infer_one` raises, the error is logged and
+        that request is answered with "" so one bad sample does not abort the run.
+        """
+        res = []
+
+        # The Collator key is the context string; requests are grouped by gen_kwargs (element 2 of args).
+        re_ords = utils.Collator([req.args for req in requests], lambda x: (x[0], x[0]), group_fn=lambda x: x[2], grouping=True)
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        for chunk in chunks:
+            # A chunk holds up to `batch_size` requests and `_infer_one` takes a
+            # single sample, so answer each request in turn; this keeps `res`
+            # the same length as `requests` for any batch size.
+            for ctx, doc_to_messages, gen_kwargs, doc_id, task_name, split_name in chunk:
+                gen_kwargs = dict(gen_kwargs)  # copy so `_infer_one` cannot mutate the shared dict
+                doc = self.task_dict[task_name][split_name][doc_id]
+                messages = doc_to_messages(doc)
+
+                try:
+                    answer = self._infer_one(messages, gen_kwargs)
+                except Exception as e:
+                    eval_logger.error(f"Error in generating: {e}\n{traceback.format_exc()}")
+                    answer = ""
+
+                res.append(answer)
+                self.cache_hook.add_partial("generate_until", (ctx, gen_kwargs), answer)
+                pbar.update(1)
+
+        res = re_ords.get_original(res)
+        pbar.close()
+        return res
diff --git a/lmms_eval/models/chat/baichuan_omni_interleave.py b/lmms_eval/models/chat/baichuan_omni_interleave.py
@@ -0,0 +1,105 @@
+"""baichuan_omni_interleave — Baichuan-Omni over interleaved doc_to_messages.
+
+See `_interleave_base.InterleaveChatMixin` for the shared rationale and
+request loop. Only the Baichuan-Omni-specific inference step lives here.
+
+Baichuan-Omni's prompt is a single string with media encoded as
+`<start>{"local"|"path": ...}<end>` segments. We emit those segments in the
+exact order of the doc_to_messages blocks so the question stem and every
+option keep their positions.
+"""
+
+import torch
+import ujson
+from loguru import logger as eval_logger
+
+from lmms_eval.api.registry import register_model
+from lmms_eval.models.chat._interleave_base import InterleaveChatMixin
+from lmms_eval.models.simple.baichuan_omni import BaichuanOmni
+
+
+@register_model("baichuan_omni_interleave")
+class BaichuanOmniInterleave(InterleaveChatMixin, BaichuanOmni):
+    """Baichuan-Omni that consumes interleaved doc_to_messages prompts."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # XModBench items carry 4 vision options; at Baichuan's default
+        # ~1 MP/image (and 768*28*28 video) the a2v/t2v configs OOM even on
+        # 4x48GB. OmniImageProcessor caches max_pixels at __init__
+        # (processor_omni.py:164), so setting config.max_pixels afterwards is
+        # too late — set the cached attribute on the already-built
+        # visual/video sub-processors directly. No upstream change.
+        proc = getattr(self.model, "processor", None)
+        for sub in ("visual_processor", "video_processor"):
+            ip = getattr(proc, sub, None)
+            if ip is not None and hasattr(ip, "max_pixels"):
+                ip.max_pixels = 256 * 28 * 28  # ~0.2 MP (vs ~1 MP / 0.6 MP)
+
+    def _interleaved_content(self, messages: list) -> str:
+        """Flatten every message's content blocks, in order, into one prompt string.
+
+        Text is appended as-is; each image/video/audio block becomes a
+        start-token + JSON + end-token segment. A plain-string `content` is kept
+        as text, and a block of any other type is skipped with a warning.
+        """
+        parts = []
+        for msg in messages:
+            content = msg.get("content")
+            if isinstance(content, str):
+                content = [{"type": "text", "text": content}]
+            elif not isinstance(content, list):
+                continue
+            for c in content:
+                t = c.get("type")
+                if t == "text":
+                    parts.append(c.get("text", ""))
+                elif t == "image":
+                    parts.append(self.image_start_token + ujson.dumps({"local": c["url"]}, ensure_ascii=False) + self.image_end_token)
+                elif t == "video":
+                    parts.append(self.video_start_token + ujson.dumps({"local": c["url"]}, ensure_ascii=False) + self.video_end_token)
+                elif t == "audio":
+                    parts.append(self.audio_start_token + ujson.dumps({"path": c["url"]}, ensure_ascii=False) + self.audio_end_token)
+                else:
+                    eval_logger.warning(f"baichuan_omni_interleave: skipping unsupported content block type {t!r}")
+        return "".join(parts)
+
+    def _infer_one(self, messages: list, gen_kwargs: dict) -> str:
+        """Run one interleaved prompt through the model and return only the newly generated text, stripped."""
+        user_content = self._interleaved_content(messages)
+        prompt = self._format_prompt(user_content)
+
+        inputs = self.model.processor([prompt])
+        input_ids = inputs.input_ids.cuda()
+        attention_mask = inputs.attention_mask.cuda() if inputs.attention_mask is not None else None
+        model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "tokenizer": self.tokenizer}
+        # Forward only the optional tensors the processor actually produced.
+        for name in ("audios", "encoder_length", "bridge_length"):
+            if getattr(inputs, name) is not None:
+                model_inputs[name] = getattr(inputs, name).cuda()
+        for media, extras in (("images", ("patch_nums", "images_grid")), ("videos", ("videos_patch_nums", "videos_grid"))):
+            if getattr(inputs, media) is None:
+                continue
+            model_inputs[media] = [torch.tensor(item, dtype=torch.float32).cuda() for item in getattr(inputs, media)]
+            for name in extras:
+                if getattr(inputs, name) is not None:
+                    model_inputs[name] = getattr(inputs, name)
+
+        max_new_tokens = gen_kwargs.get("max_new_tokens", 1024)
+        temperature = gen_kwargs.get("temperature") or 0.0  # `or` also maps an explicit None to greedy decoding
+        do_sample = temperature > 0
+
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **model_inputs,
+                max_new_tokens=max_new_tokens,
+                stop_strings=["<|endoftext|>"],
+                temperature=temperature if do_sample else None,
+                do_sample=do_sample,
+                use_cache=self.use_cache,
+                eos_token_id=self.tokenizer.eos_token_id,
+                pad_token_id=self.tokenizer.pad_token_id,
+            )
+        output_ids = outputs[0] if isinstance(outputs, tuple) else outputs
+        generated_ids = output_ids[0, input_ids.shape[1] :]
+        return self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
diff --git a/lmms_eval/models/chat/omnivinci_interleave.py b/lmms_eval/models/chat/omnivinci_interleave.py
@@ -0,0 +1,104 @@
+"""omnivinci_interleave — OmniVinci over interleaved doc_to_messages.
+
+See `_interleave_base.InterleaveChatMixin` for the shared rationale and
+request loop. Only the OmniVinci/VILA-specific inference step lives here.
+"""
+
+from lmms_eval.api.registry import register_model
+from lmms_eval.models.chat._interleave_base import InterleaveChatMixin
+from lmms_eval.models.simple.omnivinci import OmniVinci
+
+
+@register_model("omnivinci_interleave")
+class OmniVinciInterleave(InterleaveChatMixin, OmniVinci):
+    """OmniVinci that consumes interleaved doc_to_messages prompts."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # The upstream omnivinci wrapper only sets processor.config
+        # .load_audio_in_video. The official AudioBench OmniVinci runner also
+        # sets audio_chunk_length and num_video_frames; without
+        # audio_chunk_length the processor builds an incomplete
+        # mm_info["audio_info"] and __embed_media_tokens raises IndexError on
+        # the "sound" branch. Mirror the official settings here (no change to
+        # the upstream model file).
+        cfg = getattr(self.processor, "config", None)
+        if cfg is not None:
+            cfg.audio_chunk_length = "max_3600"
+            cfg.num_video_frames = getattr(self, "num_video_frames", 128)
+            cfg.load_audio_in_video = getattr(self, "load_audio_in_video", True)
+        mcfg = getattr(self._model, "config", None)
+        if mcfg is not None:
+            mcfg.audio_chunk_length = "max_3600"
+        # Note: a2v/t2v (4 vision options) and t2a/v2a (4 audio options) hit
+        # VILA-internal limits under interleaved prompts — dynamic_s2 tiling
+        # OOMs, image_aspect_ratio=resize degenerates to empty output, and
+        # multi-audio raises IndexError in __embed_media_tokens. These are
+        # not resolvable without upstream model edits, so OmniVinci is
+        # reported best-effort on its 2 clean configs (a2t, v2t).
+
+    def _build_message(self, messages: list) -> list:
+        """doc_to_messages blocks -> OmniVinci message.
+
+        OmniVinci/VILA's mm_info builder indexes media by *sample*; an extra
+        system turn shifts that indexing and triggers an IndexError in
+        __embed_media_tokens. The upstream AudioBench OmniVinci runner uses a
+        single user message with no system role — mirror it exactly.
+
+        Blocks from every turn are merged in order; a plain-string `content` is
+        kept as a text block, and blocks of other types are ignored.
+        """
+        user_content = []
+        for msg in messages:
+            content = msg.get("content")
+            if isinstance(content, str):
+                content = [{"type": "text", "text": content}]
+            elif not isinstance(content, list):
+                continue
+            for c in content:
+                t = c.get("type")
+                if t == "text":
+                    user_content.append({"type": "text", "text": c.get("text", "")})
+                elif t in ("image", "audio", "video"):
+                    user_content.append({"type": t, t: c["url"]})  # the media key is named after the block type
+        return [{"role": "user", "content": user_content}]
+
+    def _infer_one(self, messages: list, gen_kwargs: dict) -> str:
+        """Generate for one interleaved prompt and return the decoded answer, stripped of surrounding whitespace."""
+        message = self._build_message(messages)
+
+        vila_text = self.processor.apply_chat_template(message, add_generation_prompt=True, tokenize=False)
+        inputs = self.processor([vila_text])
+        if hasattr(inputs, "input_ids") and inputs.input_ids is not None:
+            target = "cuda" if self.device_map == "auto" else self.model.device
+            inputs.input_ids = inputs.input_ids.to(target)
+
+        temperature = gen_kwargs.get("temperature") or 0  # `or` also maps an explicit None to greedy decoding
+        gen_params = {
+            "max_new_tokens": gen_kwargs.get("max_new_tokens", 1024),
+            "do_sample": temperature > 0,
+            "use_cache": self.use_cache,
+        }
+        if temperature > 0:
+            gen_params["temperature"] = temperature
+            gen_params["top_p"] = gen_kwargs.get("top_p", None)
+        if self.eot_token_id is not None:
+            gen_params["eos_token_id"] = self.eot_token_id
+            gen_params["pad_token_id"] = self.tokenizer.pad_token_id
+
+        generate_kwargs = {
+            "input_ids": inputs.input_ids,
+            "media": getattr(inputs, "media", None),
+            "media_config": getattr(inputs, "media_config", None),
+        }
+        if self.generation_config is not None:
+            # With a generation config, the settings go through it rather than as loose keyword arguments.
+            self.generation_config.update(**gen_params)
+            generate_kwargs["generation_config"] = self.generation_config
+        else:
+            generate_kwargs.update(gen_params)
+
+        outputs = self.model.generate(**generate_kwargs)
+        if isinstance(outputs, tuple):
+            outputs = outputs[0]
+        return self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()