Fix sharding issue

jrplatin · jrplatin · commit d5ed33f87a6c · 2025-11-26T01:50:14.000Z
Signed-off-by: Jacob Platin <jacobplatin@google.com>
diff --git a/tpu_inference/models/common/model_loader.py b/tpu_inference/models/common/model_loader.py
@@ -96,7 +96,9 @@ def create_jit_model(
             The jitted model.
         """
         state = nnx.state(model)
-        nnx.update(model, state)
+        pspecs = nnx.get_partition_spec(state)
+        sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
+        nnx.update(model, sharded_state)
         if not use_qwix_on_abstract_model:
             # NOTE: if Qwix is not configured, this will be a no-op
             model = apply_qwix_quantization(vllm_config,
@@ -142,7 +144,7 @@ def create_sharded_model():
             # NOTE: we don't support quantization for the old Qwen2ForCausalLM implementation
             return model
 
-        with mesh:
+        with jax.set_mesh(mesh):
             jit_model = create_sharded_model()
             # In this case, we are applying Qwix quantization to the true, concrete model
             jit_model = apply_qwix_quantization(vllm_config,
@@ -179,7 +181,7 @@ def create_sharded_model():
         # Although the created model can already work, we still need to jit
         # the model creation again, otherwise the model forward will have
         # non-trivial overhead in PjitFunction.
-        with mesh:
+        with jax.set_mesh(mesh):
             loader = get_model_loader(vllm_config.load_config)
             if isinstance(loader, RunaiModelStreamerLoader):
                 model_weights = vllm_config.model_config.model