Commits (23)
485308a  [QEff]: Add gpt_oss (vbaddi, Aug 6, 2025)
59e2115  nit: update transforms (vbaddi, Aug 6, 2025)
a6c2812  nit: add header to __init__ (vbaddi, Aug 6, 2025)
8e5783e  apirunner change (ochougul, Aug 7, 2025)
5c3c971  added test along with simplified Hybridcache (ochougul, Aug 7, 2025)
ce53d3c  added test assert (ochougul, Aug 7, 2025)
e0bd90f  nit: update modeling and make transform uniform (vbaddi, Aug 7, 2025)
18795f2  nit: add changes from gpt_oss_swa branch (vbaddi, Aug 7, 2025)
30ed222  nit: update test gpt file (vbaddi, Aug 8, 2025)
14afedb  MOE optimized (ochougul, Aug 8, 2025)
99f4795  nit: update modeling with new decode moe forward (vbaddi, Aug 11, 2025)
8637f1f  simplified slidingwindow KV gather and attention is permutation invar… (ochougul, Aug 19, 2025)
87414a2  nit: seperate gate, up projections for MoE (vbaddi, Aug 20, 2025)
3adccf6  added MXFP4 quantizer support to directly load GPT-OSS models via QEF… (ochougul, Oct 8, 2025)
890eed7  nit: rebase to mainline and resolve conflicts (Oct 14, 2025)
402f8cb  nit: add license details to mxfp4 quantizer (Oct 14, 2025)
e557472  nit: rebase to mainline (vbaddi, Oct 15, 2025)
05c599d  nit: remove test file and add sample test in config (Oct 15, 2025)
dfddd55  nit: remove streamer from .generate() api in example file (vbaddi, Oct 15, 2025)
04ae7d8  nit: device_ids typo in example script (vbaddi, Oct 15, 2025)
3cadfba  nit: fix model_name in tests (vbaddi, Oct 15, 2025)
21a6620  Enable CB for GptOssModel (mamtsing, Nov 3, 2025)
1bd5d83  Merge branch 'main' into add_gpt_oss (quic-mamta, Nov 3, 2025)
75 changes: 74 additions & 1 deletion QEfficient/base/pytorch_transforms.py
@@ -177,4 +177,77 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
        return model, transformed


VLM_SPLIT_GATE_UP_WEIGHTS = {"QEffLlama4ForConditionalGeneration", "QEffLlama4ForCausalLM"}
class SplitGateUpWeightsTransformGPTOSS(PytorchTransform):
    """
    Split the fused Gate+Up expert weights and copy them into the model.

    For every transformer layer inside `model`:
      • expects <PREFIX>.experts.gate_up_proj in the *source* `sd`
      • de-interleaves it and copies the two halves into
            <PREFIX>.experts.gate_proj <-- Gate [E, H, I]
            <PREFIX>.experts.up_proj   <-- Up   [E, H, I]
    """

    @classmethod
    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
        transformed = False
        model_class = model.__class__.__name__

        if model_class not in VLM_SPLIT_GATE_UP_WEIGHTS:
            return model, transformed

        model_tmp = model.language_model if hasattr(model, "language_model") else model
        num_layers = len(model_tmp.model.layers)
        delete_fused_key = True
        sd = model_tmp.state_dict()

        for layer_idx in range(num_layers):
            # ---- build the textual prefix once per layer ----------
            prefix = f"model.layers.{layer_idx}.mlp.experts."
            fused_key = prefix + "gate_up_proj"
            fused_bias_key = prefix + "gate_up_proj_bias"
            gate_key = prefix + "gate_proj"
            up_key = prefix + "up_proj"
            gate_bias_key = prefix + "gate_proj_bias"
            up_bias_key = prefix + "up_proj_bias"

            # ---- split [E, H, 2I] → two [E, H, I] tensors ----------------------
            fused = sd[fused_key]  # [E, H, 2I]
            fused_bias = sd[fused_bias_key]  # [E, 2I]
            E, H, two_I = fused.shape

            # For GptOss, gate/up are interleaved: [gate0, up0, gate1, up1, ...]
            gate = fused[..., ::2]  # [E, H, I] - even indices
            up = fused[..., 1::2]  # [E, H, I] - odd indices
            gate_bias = fused_bias[..., ::2]  # [E, I] - even indices
            up_bias = fused_bias[..., 1::2]  # [E, I] - odd indices

            experts = model_tmp.model.layers[layer_idx].mlp.experts
            experts.gate_proj.data.copy_(gate)
            experts.up_proj.data.copy_(up)
            experts.gate_proj_bias.data.copy_(gate_bias)
            experts.up_proj_bias.data.copy_(up_bias)

            # ---- update the state-dict so load_state_dict sees the right keys
            sd[gate_key] = gate
            sd[up_key] = up
            sd[gate_bias_key] = gate_bias
            sd[up_bias_key] = up_bias

            if delete_fused_key:
                del sd[fused_key]
                del sd[fused_bias_key]

            logger.info(f"[layer {layer_idx:02d}] loaded gate_proj & up_proj from fused tensor (shape {fused.shape})")
            transformed = True

        if hasattr(model, "language_model"):
            model.language_model = model_tmp
        else:
            model = model_tmp

        return model, transformed


VLM_SPLIT_GATE_UP_WEIGHTS = {"QEffLlama4ForConditionalGeneration", "QEffLlama4ForCausalLM", "QEffGptOssForCausalLM"}
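The even/odd indexing is the key detail of this transform: GptOss checkpoints store each expert's gate and up projections interleaved along the last axis rather than as two contiguous halves, which is why a plain split-in-half would scramble the weights. A minimal, self-contained sketch (toy shapes and values, not from a real checkpoint) showing that the de-interleave is lossless:

# Toy demonstration of the interleaved gate/up split used by the transform above.
import torch

E, H, I = 2, 4, 3  # experts, hidden size, intermediate size (illustrative)
fused = torch.arange(E * H * 2 * I, dtype=torch.float32).reshape(E, H, 2 * I)

# GptOss lays out the last axis as [gate0, up0, gate1, up1, ...]
gate = fused[..., ::2]   # [E, H, I] - even indices
up = fused[..., 1::2]    # [E, H, I] - odd indices

# Re-interleaving recovers the fused tensor, so no information is lost.
recombined = torch.stack((gate, up), dim=-1).reshape(E, H, 2 * I)
assert torch.equal(recombined, fused)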
6 changes: 6 additions & 0 deletions QEfficient/transformers/models/gpt_oss/__init__.py
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------
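As a closing note, a hedged sketch of how the new transform would be invoked on an already-constructed model (the import path follows the file edited above; the model object itself is assumed and elided here):

# Hypothetical usage: split fused expert weights on a loaded model.
from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransformGPTOSS

def split_fused_expert_weights(model):
    # apply() returns the model plus a flag that is True only when the model's
    # class name appears in VLM_SPLIT_GATE_UP_WEIGHTS (e.g. QEffGptOssForCausalLM).
    model, transformed = SplitGateUpWeightsTransformGPTOSS.apply(model)
    return model, transformed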