README.md (1 addition & 1 deletion)
@@ -298,7 +298,7 @@ Read technical notes:
| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
-| [LFM 2.5 (VL)](https://huggingface.co/LiquidAI) | 1.2B/1.6B | lfm2/lfm2_vl |
+| [LFM 2.5 (VL/Audio)](https://huggingface.co/LiquidAI) | 1.2B/1.5B/1.6B | lfm2/lfm2_vl/lfm2_audio |
| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
README_zh.md (1 addition & 1 deletion)
@@ -300,7 +300,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
-| [LFM 2.5 (VL)](https://huggingface.co/LiquidAI) | 1.2B/1.6B | lfm2/lfm2_vl |
+| [LFM 2.5 (VL/Audio)](https://huggingface.co/LiquidAI) | 1.2B/1.5B/1.6B | lfm2/lfm2_vl/lfm2_audio |
| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
src/llamafactory/data/mm_plugin.py (149 additions)
@@ -2159,6 +2159,154 @@ def process_messages(
return messages


@dataclass
class LFM2AudioPlugin(BasePlugin):
r"""Plugin for LFM2.5-Audio models.

LFM2.5-Audio Architecture:
- FastConformer audio encoder (16kHz input, 8x subsampling)
- Audio markers: <|audio_start|> ... <|text_start|>
- Uses liquid_audio package for feature extraction (optional)

Token Structure:
- <|audio_start|> (token 128): Audio region start
- <|text_start|> (token 129): Audio region end / text start
- audio_token: Placeholder token repeated for sequence length
"""

audio_bos_token: str = "<|audio_start|>"
audio_eos_token: str = "<|text_start|>"

@override
def _validate_input(
self,
processor: Optional["MMProcessor"],
images: list["ImageInput"],
videos: list["VideoInput"],
audios: list["AudioInput"],
) -> None:
r"""Validate inputs. Allow audio without standard HF feature_extractor.

        LFM2.5-Audio uses the liquid_audio package for audio processing rather than the
        standard HuggingFace feature_extractor, so the audio validation is skipped here.
"""
# Only validate images/videos, skip audio feature_extractor check
if len(images) != 0 or len(videos) != 0:
super()._validate_input(processor, images, videos, [])

@override
def _get_mm_inputs(
self,
images: list["ImageInput"],
videos: list["VideoInput"],
audios: list["AudioInput"],
processor: Optional["MMProcessor"],
) -> dict[str, "torch.Tensor"]:
r"""Extract audio features using liquid_audio or HF processor.

        LFM2.5-Audio uses a custom liquid_audio processor, not a standard HuggingFace one.
        This method tries to extract features if a compatible processor is available.
"""
mm_inputs: dict[str, torch.Tensor] = {}

if len(audios) == 0 or processor is None:
return mm_inputs

# Try liquid_audio processor first (has audio_processor attribute)
if hasattr(processor, "audio_processor") and processor.audio_processor is not None:
audio_processor = processor.audio_processor
audios_regularized = self._regularize_audios(audios, sampling_rate=16000)["audios"]
# liquid_audio returns log-mel features
features = audio_processor(audios_regularized, sampling_rate=16000)
mm_inputs["audio_features"] = features
# Calculate sequence lengths from feature shapes (8x subsampling in FastConformer)
if hasattr(features, "shape"):
seq_len = (features.shape[-1] - 1) // 8 + 1
mm_inputs["audio_seq_lengths"] = [seq_len] * len(audios)
> **Contributor review comment (severity: high):** The current implementation for calculating `audio_seq_lengths` assumes all audios in a batch have the same length: it computes a single `seq_len` from the padded feature length and applies it to every audio file. This can lead to an incorrect number of placeholder tokens for shorter audio files in a batch with variable-length audios.
>
> The fallback path for Hugging Face's `feature_extractor` is more robust, as it uses the `attention_mask` to determine the actual length of each audio. I recommend a similar approach here. Please check whether the `liquid_audio` processor can return an attention mask or a list of lengths; if not, you might need to compute the sequence lengths from the lengths of the `audios_regularized` list before they are padded and passed to the `audio_processor`.
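A minimal sketch of that suggestion, deriving per-audio lengths from the unpadded waveforms before they reach the processor. The 10 ms hop (160 samples at 16 kHz) for the log-mel front end is an assumption to verify against liquid_audio; the 8x subsampling factor matches the formula already used above:

```python
# Sketch only: per-audio sequence lengths from unpadded waveforms.
# ASSUMPTION: the log-mel front end uses a 10 ms hop (160 samples @ 16 kHz);
# verify against the actual liquid_audio feature extractor.
HOP_LENGTH = 160

def per_audio_seq_lengths(waveforms: list) -> list[int]:
    lengths = []
    for waveform in waveforms:
        n_frames = len(waveform) // HOP_LENGTH + 1  # assumed mel-frame count
        lengths.append((n_frames - 1) // 8 + 1)     # 8x FastConformer subsampling
    return lengths

# mm_inputs["audio_seq_lengths"] = per_audio_seq_lengths(audios_regularized)
```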

# Fallback: standard HF feature_extractor
elif hasattr(processor, "feature_extractor") and processor.feature_extractor is not None:
feature_extractor: SequenceFeatureExtractor = processor.feature_extractor
audios_regularized = self._regularize_audios(audios, sampling_rate=16000)["audios"]
mm_inputs.update(
feature_extractor(
audios_regularized,
sampling_rate=16000,
return_attention_mask=True,
padding="max_length",
return_tensors="pt",
)
)
mm_inputs["feature_attention_mask"] = mm_inputs.pop("attention_mask", None)

return mm_inputs

@override
def process_messages(
self,
messages: list[dict[str, str]],
images: list["ImageInput"],
videos: list["VideoInput"],
audios: list["AudioInput"],
processor: Optional["MMProcessor"],
) -> list[dict[str, str]]:
r"""Replace audio placeholders with boundary-wrapped tokens.

Produces: <|audio_start|>{audio_token * seqlen}<|text_start|>
"""
self._validate_input(processor, images, videos, audios)
self._validate_messages(messages, images, videos, audios)

num_audio_tokens = 0
messages = deepcopy(messages)

# Calculate audio sequence lengths if processor is available
audio_seqlens: list[int] = []
if self.expand_mm_tokens and processor is not None:
mm_inputs = self._get_mm_inputs([], [], audios, processor)
if "audio_seq_lengths" in mm_inputs:
# liquid_audio path
audio_seqlens = mm_inputs["audio_seq_lengths"]
elif "feature_attention_mask" in mm_inputs and mm_inputs["feature_attention_mask"] is not None:
# HF path - calculate from attention mask (8x subsampling)
input_lengths = mm_inputs["feature_attention_mask"].sum(-1).numpy()
audio_seqlens = [(int(length) - 1) // 8 + 1 for length in input_lengths]

for message in messages:
content = message["content"]
while AUDIO_PLACEHOLDER in content:
# Get audio sequence length
if self.expand_mm_tokens and num_audio_tokens < len(audio_seqlens):
audio_seqlen = audio_seqlens[num_audio_tokens]
else:
audio_seqlen = 1 # Fallback: single token

# Build: <|audio_start|>{audio_token * seqlen}<|text_start|>
audio_tokens = self.audio_token * audio_seqlen if self.audio_token else ""
replacement = f"{self.audio_bos_token}{audio_tokens}{self.audio_eos_token}"

content = content.replace(AUDIO_PLACEHOLDER, replacement, 1)
num_audio_tokens += 1

message["content"] = content

return messages
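For illustration, a standalone sketch of the replacement this method performs, assuming `AUDIO_PLACEHOLDER` is `<audio>` and `audio_token` is `<|reserved_1|>` as registered in the template below:

```python
# Standalone illustration of the placeholder expansion (values assumed).
audio_token = "<|reserved_1|>"
bos, eos = "<|audio_start|>", "<|text_start|>"

content = "Transcribe this clip: <audio>"
seqlen = 3  # would come from audio_seq_lengths in practice

replacement = f"{bos}{audio_token * seqlen}{eos}"
print(content.replace("<audio>", replacement, 1))
# Transcribe this clip: <|audio_start|><|reserved_1|><|reserved_1|><|reserved_1|><|text_start|>
```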

@override
def get_mm_inputs(
self,
images: list["ImageInput"],
videos: list["VideoInput"],
audios: list["AudioInput"],
imglens: list[int],
vidlens: list[int],
audlens: list[int],
batch_ids: list[list[int]],
processor: Optional["MMProcessor"],
) -> dict[str, Union[list[int], "torch.Tensor"]]:
self._validate_input(processor, images, videos, audios)
return self._get_mm_inputs(images, videos, audios, processor)


PLUGINS = {
"base": BasePlugin,
"ernie_vl": ErnieVLPlugin,
@@ -2172,6 +2320,7 @@ def process_messages(
"llava_next": LlavaNextPlugin,
"llava_next_video": LlavaNextVideoPlugin,
"lfm2_vl": LFMVLPlugin,
"lfm2_audio": LFM2AudioPlugin,
"minicpm_v": MiniCPMVPlugin,
"mllama": MllamaPlugin,
"paligemma": PaliGemmaPlugin,
src/llamafactory/data/template.py (26 additions)
@@ -1371,6 +1371,32 @@ def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args:
)


register_template(
name="lfm2_audio",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2"),
format_observation=StringFormatter(
slots=[
"<|im_start|>tool\n<|tool_response_start|>{{content}}<|tool_response_end|><|im_end|>\n"
"<|im_start|>assistant\n"
]
),
format_tools=ToolFormatter(tool_format="lfm2"),
default_system="You are a helpful audio assistant by Liquid AI.",
stop_words=["<|im_end|>"],
tool_call_words=("<|tool_call_start|>", "<|tool_call_end|>"),
replace_eos=True,
mm_plugin=get_mm_plugin(
name="lfm2_audio",
audio_token="<|reserved_1|>", # Token ID 17 - placeholder between markers
audio_bos_token="<|audio_start|>", # Token ID 128
audio_eos_token="<|text_start|>", # Token ID 129
),
)
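Putting these formatters together, a single exchange renders roughly as follows (a sketch assembled from the slots above, with the expanded audio span elided):

```python
# Sketch: one system + user + assistant turn under the lfm2_audio template.
system = "<|im_start|>system\nYou are a helpful audio assistant by Liquid AI.<|im_end|>\n"
user = (
    "<|im_start|>user\nWhat is said in this clip? "
    "<|audio_start|>...<|text_start|><|im_end|>\n"
    "<|im_start|>assistant\n"
)
assistant = "The speaker says hello.<|im_end|>\n"
print(system + user + assistant)
```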


register_template(
name="llama2",
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
src/llamafactory/extras/constants.py (11 additions)
@@ -1517,6 +1517,17 @@ def register_model_group(
)


register_model_group(
models={
"LFM2.5-Audio-1.5B": {
DownloadSource.DEFAULT: "LiquidAI/LFM2.5-Audio-1.5B",
},
},
template="lfm2_audio",
multimodal=True,
)


register_model_group(
models={
"Llama-7B": {
src/llamafactory/extras/packages.py (4 additions)
@@ -122,3 +122,7 @@ def is_uvicorn_available():

def is_vllm_available():
return _is_package_available("vllm")


def is_liquid_audio_available():
return _is_package_available("liquid_audio")
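A typical call site for this helper might look like the following (a sketch; the error message is illustrative):

```python
from llamafactory.extras.packages import is_liquid_audio_available

# Guard the optional dependency up front, so users get a clear message
# instead of an ImportError deep inside audio feature extraction.
if not is_liquid_audio_available():
    raise ImportError("LFM2.5-Audio support requires the optional liquid_audio package.")

import liquid_audio  # noqa: F401  -- safe now that the check has passed
```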
src/llamafactory/model/loader.py (13 additions)
@@ -33,6 +33,7 @@
from ..extras.packages import is_torch_version_greater_than
from .adapter import init_adapter
from .model_utils.ktransformers import load_kt_pretrained_model
from .model_utils.lfm2_audio import is_lfm2_audio_model, load_lfm2_audio_pretrained_model
from .model_utils.liger_kernel import apply_liger_kernel
from .model_utils.misc import register_autoclass
from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
@@ -127,6 +128,14 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule":
def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
r"""Load model config."""
init_kwargs = _get_init_kwargs(model_args)

# Special handling for LFM2.5-Audio models
if is_lfm2_audio_model(model_args.model_name_or_path):
from .model_utils.lfm2_audio import LFM2AudioConfig

logger.info_rank0("Detected LFM2.5-Audio model, using custom config loader.")
return LFM2AudioConfig()

return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
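The detector itself lives in `model_utils/lfm2_audio.py`, which is not shown in this diff. A hypothetical sketch of what such a check might do, assuming detection via `model_type` in `config.json` with a name-based fallback:

```python
import json
import os

def is_lfm2_audio_model_sketch(model_name_or_path: str) -> bool:
    # Hypothetical: the real is_lfm2_audio_model may differ.
    config_path = os.path.join(model_name_or_path, "config.json")
    if os.path.isfile(config_path):
        with open(config_path) as f:
            return json.load(f).get("model_type", "") == "lfm2_audio"
    # Fallback for hub IDs that have not been downloaded yet.
    return "lfm2.5-audio" in model_name_or_path.lower()
```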


@@ -155,6 +164,10 @@ def load_model(
lazy_load = True
elif is_trainable:
model = load_unsloth_pretrained_model(config, model_args, finetuning_args)
elif is_lfm2_audio_model(model_args.model_name_or_path):
# Load LFM2.5-Audio model using liquid_audio package
logger.info_rank0("Loading LFM2.5-Audio model with liquid_audio package...")
model = load_lfm2_audio_pretrained_model(model_args, **init_kwargs)

if model is None and not lazy_load:
init_kwargs["config"] = config