Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
4e75022
feat: add XModBench cross-modal multiple-choice benchmark
XingruiWang Feb 27, 2026
dea96d7
fix: update XModBench sample counts and environment variable references
XingruiWang Mar 1, 2026
726132b
feat(xmod_bench): load data from HuggingFace dataset RyanWW/XModBench
XingruiWang May 13, 2026
9c25ff5
Merge branch 'main' into feat/xmod-bench
XingruiWang May 13, 2026
991c8b9
feat(xmod_bench): add Lite (6k) split, canonical metrics, Level-2 sum…
XingruiWang May 13, 2026
8f913d7
feat(models): add interleaved-multimedia chat wrappers for omni models
XingruiWang May 15, 2026
81d1280
fix(interleave): decode full sequence instead of trimming by input_ids
XingruiWang May 15, 2026
95c4c1c
fix(interleave): lower video frame budget to fit 24GB GPUs
XingruiWang May 15, 2026
78fb853
fix(submit_lite): move t2a to heavy GPU profile
XingruiWang May 16, 2026
a3f9859
feat(interleave): per-model media budget + omnivinci processor config
XingruiWang May 16, 2026
2de65ab
fix(baichuan_interleave): cap processor max_pixels for 4-option configs
XingruiWang May 16, 2026
6b194da
fix(baichuan_interleave): set max_pixels on cached sub-processors
XingruiWang May 16, 2026
6a4142b
docs(xmod_bench): add Lite reproduction results vs paper
XingruiWang May 16, 2026
cfc31b7
fix(omnivinci_interleave): force fixed image tile for 4-option configs
XingruiWang May 16, 2026
0d05610
docs+omnivinci: finalize results; revert resize (degenerate output)
XingruiWang May 16, 2026
b04e983
Merge branch 'EvolvingLMMs-Lab:main' into feat/xmod-bench
XingruiWang May 18, 2026
152efbf
style: auto-fix lint (black + isort)
github-actions[bot] May 18, 2026
654e844
chore(xmod_bench): drop stray Untitled, restore upstream qwen2_5_omni.py
XingruiWang May 18, 2026
cbb4f20
docs(xmod_bench): add PR.md (pull-request description)
XingruiWang May 18, 2026
5ca073e
style: auto-fix lint (black + isort)
github-actions[bot] May 18, 2026
3657db4
Merge pull request #1 from XingruiWang/feat/xmod-bench
XingruiWang May 18, 2026
aef4cb8
Merge branch 'EvolvingLMMs-Lab:main' into main
XingruiWang May 29, 2026
03b68e3
style: auto-fix lint (black + isort)
github-actions[bot] May 29, 2026
0ced2d4
Merge branch 'EvolvingLMMs-Lab:main' into main
XingruiWang Jun 10, 2026
14c715e
Merge branch 'EvolvingLMMs-Lab:main' into main
XingruiWang Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions debug_xmod_bench_lite.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
#SBATCH --job-name=xmod_debug
#SBATCH --gres=gpu:6000_ada:4
#SBATCH --mem=40G
#SBATCH --cpus-per-task=8
#SBATCH --time=00:30:00
#SBATCH --output=/home/xwang378/scratch/2025/lmms-eval/logs/xmod_bench_lite/debug_%x_%j.log
#SBATCH --error=/home/xwang378/scratch/2025/lmms-eval/logs/xmod_bench_lite/debug_%x_%j.log

# Smoke test for one XModBench-Lite task: runs lmms_eval on 8 samples with
# batch size 1 and writes per-sample logs under $REPO/logs/xmod_bench_lite.
#
# Overridable from the submitting environment:
#   MODEL, PRETRAINED, ENV (conda env name), TASK, MODEL_ARGS_EXTRA
#
# NOTE: the --output/--error directory above must already exist when the job
# is submitted; the mkdir calls below run too late to create it for Slurm.

set -e
MODEL=${MODEL:-qwen2_5_omni}
PRETRAINED=${PRETRAINED:-Qwen/Qwen2.5-Omni-7B}
ENV=${ENV:-qwenomni3}
TASK=${TASK:-xmod_bench_lite_a2t}
MODEL_ARGS_EXTRA=${MODEL_ARGS_EXTRA:-device_map=auto,attn_implementation=flash_attention_2}

# Fall back to the shell PID so the script also works when run outside Slurm
# (otherwise the numba cache dir would collapse to the shared "/tmp/numba_").
JOB_ID=${SLURM_JOB_ID:-$$}

REPO=/home/xwang378/scratch/2025/lmms-eval
export XMODBENCH=/home/xwang378/scratch/2025/AudioBench_data

export HF_HOME=/scratch/xwang378/hf_cache
export HUGGINGFACE_HUB_CACHE="$HF_HOME/hub"
export HF_DATASETS_CACHE="$HF_HOME/datasets"
export TRANSFORMERS_CACHE="$HF_HOME/hub"
export NUMBA_CACHE_DIR="/tmp/numba_${JOB_ID}"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
mkdir -p "$NUMBA_CACHE_DIR" "$HF_HOME/hub" "$HF_HOME/datasets"

echo "=== Debug job $JOB_ID model=$MODEL env=$ENV task=$TASK ==="
echo "Node: $(hostname) GPU: ${CUDA_VISIBLE_DEVICES:-}"
date

mkdir -p "$REPO/logs/xmod_bench_lite/debug_results"
cd "$REPO"

module load conda
conda activate "$ENV"

# --model_args is quoted so a value containing spaces or glob characters is
# passed to lmms_eval as a single argument.
python -m lmms_eval \
    --model "$MODEL" \
    --model_args "pretrained=${PRETRAINED},${MODEL_ARGS_EXTRA}" \
    --tasks "$TASK" \
    --limit 8 \
    --batch_size 1 \
    --output_path "$REPO/logs/xmod_bench_lite/debug_results" \
    --log_samples \
    --log_samples_suffix "debug_${MODEL}_${TASK}"

echo "=== Debug done ==="
date
4 changes: 4 additions & 0 deletions lmms_eval/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@
"qwen3_vl": "Qwen3_VL",
"qwen3_5": "Qwen3_5",
"qwen2_5_vl": "Qwen2_5_VL",
"qwen2_5_omni_interleave": "Qwen2_5_OmniInterleave",
"qwen3_omni_interleave": "Qwen3_OmniInterleave",
"omnivinci_interleave": "OmniVinciInterleave",
"baichuan_omni_interleave": "BaichuanOmniInterleave",
"thyme": "Thyme",
"openai": "OpenAICompatible",
"vllm": "VLLM",
Expand Down
100 changes: 100 additions & 0 deletions lmms_eval/models/chat/_interleave_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Shared scaffolding for interleaved-multimedia chat wrappers.

Many lmms-eval *simple* models (qwen2.5-omni, qwen3-omni, omnivinci,
baichuan-omni, vita, ...) only attach one media object per request via
`doc_to_visual`. Tasks like XModBench put media in the question stem AND
every answer option (up to 5 per item), so the simple path drops most of it.

The fix is the same for every model: become a *chat-style* model
(`is_simple = False`), read the task's `doc_to_messages` output, and feed the
full interleaved prompt to the underlying model. Only the final
"messages -> model output string" step is model-specific.

`InterleaveChatMixin` implements the common request loop once. A concrete
wrapper subclasses it (plus the model's simple class) and implements a single
method:

def _infer_one(self, messages: list, gen_kwargs: dict) -> str: ...

where `messages` is the raw `doc_to_messages` output (a list of
{"role", "content":[{type,url|text}]} dicts).

Per-media size/frame caps that match the upstream XModBench/AudioBench
runners are exposed as module constants so wrappers stay consistent.
"""

import traceback
from typing import List

from loguru import logger as eval_logger
from tqdm import tqdm

from lmms_eval import utils
from lmms_eval.api.instance import Instance

# Per-media caps. The upstream AudioBench runner used fps=12/max_frames=60/
# 512px, but that assumes ~80 GB GPUs; on 24 GB a5000s a single
# video-condition item plus 4 audio options OOMs (~50% of v2a/v2t video
# samples were silently dropped). A tighter video budget keeps every sample
# on-GPU at minor frame-density cost (XModBench video tasks — emotion,
# spatial, temporal — don't need 60 frames).
#
# These are the module-level defaults: InterleaveChatMixin binds them to its
# `video_kwargs` / `image_kwargs` class attributes, which a subclass may
# override with a tighter budget.
VIDEO_KWARGS = {"fps": 2, "max_frames": 16, "max_pixels": 384 * 384}
IMAGE_KWARGS = {"max_pixels": 512 * 512}


class InterleaveChatMixin:
    """Mixin providing a chat-style generate_until over doc_to_messages.

    Subclasses may override `video_kwargs` / `image_kwargs` class attributes
    to use a tighter media budget (e.g. Baichuan-Omni's video path needs far
    more memory than Qwen-Omni's for the same clip).

    Subclasses must define `_infer_one(self, messages, gen_kwargs) -> str`.
    """

    is_simple = False

    # Per-model media budget; override in a subclass if it OOMs.
    video_kwargs = VIDEO_KWARGS
    image_kwargs = IMAGE_KWARGS

    def _infer_one(self, messages: list, gen_kwargs: dict) -> str:  # pragma: no cover
        """Turn one `doc_to_messages` output into the model's answer string.

        Args:
            messages: Raw `doc_to_messages` output for a single document.
            gen_kwargs: Generation options attached to the request.

        Returns:
            The decoded model output.

        Raises:
            NotImplementedError: Always; concrete wrappers must override this.
        """
        raise NotImplementedError

    def generate_until(self, requests: List[Instance]) -> List[str]:
        """Answer every request by running `_infer_one` on its messages.

        Requests are reordered and grouped by `utils.Collator` and handed
        back in chunks of `self.batch_size`. Inference itself is sequential,
        so each request in a chunk is processed individually; this keeps one
        answer per request even when `batch_size > 1`.

        A failure inside `_infer_one` is logged and recorded as an empty
        answer so one bad sample (e.g. an OOM) does not abort the whole run.

        Args:
            requests: Instances whose `args` unpack to
                `(ctx, doc_to_messages, gen_kwargs, doc_id, task, split)`.

        Returns:
            One answer string per request, in the original request order.
        """
        res = []

        def _collate(x):
            # Sort key and payload are both the context string.
            return x[0], x[0]

        re_ords = utils.Collator(
            [reg.args for reg in requests],
            _collate,
            group_fn=lambda x: x[2],
            grouping=True,
        )
        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        for chunk in chunks:
            # Walk every request in the chunk: reading only element 0 would
            # drop the rest and leave `res` shorter than `requests`.
            for ctx, doc_to_messages, request_gen_kwargs, doc_id, task_name, split_name in chunk:
                # Copy so a wrapper mutating gen_kwargs cannot leak into
                # other requests that share the same dict.
                gen_kwargs = dict(request_gen_kwargs)

                doc = self.task_dict[task_name][split_name][doc_id]
                messages = doc_to_messages(doc)

                try:
                    answer = self._infer_one(messages, gen_kwargs)
                except Exception as e:
                    # Deliberate best-effort: log with traceback, score as empty.
                    eval_logger.error(f"Error in generating: {e}\n{traceback.format_exc()}")
                    answer = ""

                res.append(answer)
                self.cache_hook.add_partial("generate_until", (ctx, gen_kwargs), answer)
                pbar.update(1)

        # NOTE(review): assumes Collator.get_original expects exactly one
        # result per submitted request — confirm against utils.Collator.
        res = re_ords.get_original(res)
        pbar.close()
        return res
105 changes: 105 additions & 0 deletions lmms_eval/models/chat/baichuan_omni_interleave.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""baichuan_omni_interleave — Baichuan-Omni over interleaved doc_to_messages.

See `_interleave_base.InterleaveChatMixin` for the shared rationale and
request loop. Only the Baichuan-Omni-specific inference step lives here.

Baichuan-Omni's prompt is a single string with media encoded as
`<start>{"local"|"path": ...}<end>` segments. We emit those segments in the
exact order of the doc_to_messages blocks so the question stem and every
option keep their positions.
"""

import torch
import ujson
from loguru import logger as eval_logger

from lmms_eval.api.registry import register_model
from lmms_eval.models.chat._interleave_base import InterleaveChatMixin
from lmms_eval.models.simple.baichuan_omni import BaichuanOmni


@register_model("baichuan_omni_interleave")
class BaichuanOmniInterleave(InterleaveChatMixin, BaichuanOmni):
    """Baichuan-Omni wrapper fed by interleaved doc_to_messages prompts."""

    def __init__(self, *args, **kwargs):
        """Build the base model, then shrink the vision pixel budget.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructors.
        """
        super().__init__(*args, **kwargs)
        # XModBench items carry 4 vision options; at Baichuan's default
        # ~1 MP/image (and 768*28*28 video) the a2v/t2v configs OOM even on
        # 4x48GB. OmniImageProcessor caches max_pixels at __init__
        # (processor_omni.py:164), so setting config.max_pixels afterwards is
        # too late — patch the cached attribute on the already-built
        # visual/video sub-processors directly. No upstream change.
        processor = getattr(self.model, "processor", None)
        for attr_name in ("visual_processor", "video_processor"):
            sub_processor = getattr(processor, attr_name, None)
            if sub_processor is None:
                continue
            if not hasattr(sub_processor, "max_pixels"):
                continue
            sub_processor.max_pixels = 256 * 28 * 28  # ~0.2 MP (vs ~1 MP / 0.6 MP)

    def _interleaved_content(self, messages: list) -> str:
        """Flatten message blocks into one Baichuan-Omni prompt string.

        Text blocks are copied verbatim; image, video and audio blocks become
        `<start>{json}<end>` segments, emitted in their original order.
        Messages whose content is not a list, and blocks of any other type,
        are skipped.
        """

        def media_segment(kind: str, block: dict) -> str:
            # Audio is addressed by "path"; image and video by "local".
            json_key = "path" if kind == "audio" else "local"
            opening = getattr(self, f"{kind}_start_token")
            payload = ujson.dumps({json_key: block["url"]}, ensure_ascii=False)
            closing = getattr(self, f"{kind}_end_token")
            return opening + payload + closing

        segments = []
        for message in messages:
            blocks = message.get("content")
            if not isinstance(blocks, list):
                continue
            for block in blocks:
                kind = block.get("type")
                if kind == "text":
                    segments.append(block.get("text", ""))
                elif kind in ("image", "video", "audio"):
                    segments.append(media_segment(kind, block))
        return "".join(segments)

    def _infer_one(self, messages: list, gen_kwargs: dict) -> str:
        """Run one generation pass and return the newly generated text.

        Args:
            messages: Raw `doc_to_messages` output for a single document.
            gen_kwargs: Generation options; `max_new_tokens` (default 1024)
                and `temperature` (default 0.0) are read.

        Returns:
            The decoded continuation, without the prompt tokens, stripped of
            surrounding whitespace.
        """
        prompt = self._format_prompt(self._interleaved_content(messages))
        inputs = self.model.processor([prompt])

        input_ids = inputs.input_ids.cuda()
        if inputs.attention_mask is not None:
            attention_mask = inputs.attention_mask.cuda()
        else:
            attention_mask = None

        def to_cuda(tensor):
            return tensor.cuda()

        def to_cuda_float_list(arrays):
            return [torch.tensor(item, dtype=torch.float32).cuda() for item in arrays]

        def unchanged(value):
            return value

        model_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "tokenizer": self.tokenizer,
        }
        # Optional processor outputs, each with the conversion it needs
        # before being handed to generate(); absent (None) fields are omitted.
        optional_fields = (
            ("audios", to_cuda),
            ("encoder_length", to_cuda),
            ("bridge_length", to_cuda),
            ("images", to_cuda_float_list),
            ("patch_nums", unchanged),
            ("images_grid", unchanged),
            ("videos", to_cuda_float_list),
            ("videos_patch_nums", unchanged),
            ("videos_grid", unchanged),
        )
        for field_name, convert in optional_fields:
            value = getattr(inputs, field_name)
            if value is not None:
                model_inputs[field_name] = convert(value)

        max_new_tokens = gen_kwargs.get("max_new_tokens", 1024)
        temperature = gen_kwargs.get("temperature", 0.0)
        do_sample = temperature > 0
        # Temperature is only forwarded when sampling is actually enabled.
        effective_temperature = temperature if do_sample else None

        with torch.no_grad():
            outputs = self.model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                stop_strings=["<|endoftext|>"],
                temperature=effective_temperature,
                do_sample=do_sample,
                use_cache=self.use_cache,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
            )

        if isinstance(outputs, tuple):
            output_ids = outputs[0]
        else:
            output_ids = outputs
        # Drop the echoed prompt: keep only tokens past the input length.
        prompt_length = input_ids.shape[1]
        generated_ids = output_ids[0, prompt_length:]
        decoded = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        return decoded.strip()
104 changes: 104 additions & 0 deletions lmms_eval/models/chat/omnivinci_interleave.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""omnivinci_interleave — OmniVinci over interleaved doc_to_messages.

See `_interleave_base.InterleaveChatMixin` for the shared rationale and
request loop. Only the OmniVinci/VILA-specific inference step lives here.
"""

from lmms_eval.api.registry import register_model
from lmms_eval.models.chat._interleave_base import InterleaveChatMixin
from lmms_eval.models.simple.omnivinci import OmniVinci


@register_model("omnivinci_interleave")
class OmniVinciInterleave(InterleaveChatMixin, OmniVinci):
    """OmniVinci wrapper fed by interleaved doc_to_messages prompts."""

    def __init__(self, *args, **kwargs):
        """Build the base model, then mirror the official runner's config.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructors.
        """
        super().__init__(*args, **kwargs)
        # The upstream omnivinci wrapper only sets processor.config
        # .load_audio_in_video. The official AudioBench OmniVinci runner also
        # sets audio_chunk_length and num_video_frames; without
        # audio_chunk_length the processor builds an incomplete
        # mm_info["audio_info"] and __embed_media_tokens raises IndexError on
        # the "sound" branch. Mirror the official settings here (no change to
        # the upstream model file).
        chunk_length = "max_3600"

        processor_config = getattr(self.processor, "config", None)
        if processor_config is not None:
            processor_config.audio_chunk_length = chunk_length
            processor_config.num_video_frames = getattr(self, "num_video_frames", 128)
            processor_config.load_audio_in_video = getattr(self, "load_audio_in_video", True)

        model_config = getattr(self._model, "config", None)
        if model_config is not None:
            model_config.audio_chunk_length = chunk_length
        # Note: a2v/t2v (4 vision options) and t2a/v2a (4 audio options) hit
        # VILA-internal limits under interleaved prompts — dynamic_s2 tiling
        # OOMs, image_aspect_ratio=resize degenerates to empty output, and
        # multi-audio raises IndexError in __embed_media_tokens. These are
        # not resolvable without upstream model edits, so OmniVinci is
        # reported best-effort on its 2 clean configs (a2t, v2t).

    def _build_message(self, messages: list) -> list:
        """Collapse doc_to_messages blocks into a single user turn.

        OmniVinci/VILA's mm_info builder indexes media by *sample*; an extra
        system turn shifts that indexing and triggers an IndexError in
        __embed_media_tokens. The upstream AudioBench OmniVinci runner uses a
        single user message with no system role, so every block from every
        incoming message is merged, in order, into one user message.
        Messages whose content is not a list, and blocks of an unknown type,
        are skipped.
        """
        merged_blocks = []
        for message in messages:
            blocks = message.get("content")
            if not isinstance(blocks, list):
                continue
            for block in blocks:
                kind = block.get("type")
                if kind == "text":
                    payload = block.get("text", "")
                elif kind in ("image", "audio", "video"):
                    payload = block["url"]
                else:
                    continue
                # The payload key carries the same name as the block type.
                merged_blocks.append({"type": kind, kind: payload})
        return [{"role": "user", "content": merged_blocks}]

    def _infer_one(self, messages: list, gen_kwargs: dict) -> str:
        """Run one generation pass and return the decoded output.

        Args:
            messages: Raw `doc_to_messages` output for a single document.
            gen_kwargs: Generation options; `temperature` (default 0),
                `max_new_tokens` (default 1024) and `top_p` are read.

        Returns:
            The first decoded sequence with special tokens removed.
        """
        chat = self._build_message(messages)
        vila_text = self.processor.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
        inputs = self.processor([vila_text])

        if getattr(inputs, "input_ids", None) is not None:
            target_device = "cuda" if self.device_map == "auto" else self.model.device
            inputs.input_ids = inputs.input_ids.to(target_device)

        temperature = gen_kwargs.get("temperature", 0)
        sampling = temperature > 0
        gen_params = {
            "max_new_tokens": gen_kwargs.get("max_new_tokens", 1024),
            "do_sample": sampling,
            "use_cache": self.use_cache,
        }
        if sampling:
            gen_params.update(temperature=temperature, top_p=gen_kwargs.get("top_p", None))
        if self.eot_token_id is not None:
            gen_params["eos_token_id"] = self.eot_token_id
        gen_params["pad_token_id"] = self.tokenizer.pad_token_id

        generate_kwargs = {
            "input_ids": inputs.input_ids,
            "media": getattr(inputs, "media", None),
            "media_config": getattr(inputs, "media_config", None),
        }
        if self.generation_config is None:
            # No shared config object: pass the options as plain kwargs.
            generate_kwargs.update(gen_params)
        else:
            # Fold the options into the shared config and pass only that.
            self.generation_config.update(**gen_params)
            generate_kwargs["generation_config"] = self.generation_config

        outputs = self.model.generate(**generate_kwargs)
        if isinstance(outputs, tuple):
            outputs = outputs[0]
        decoded = self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return decoded[0]
Loading