From ba786adc40ff5de968db21b040463f0764048107 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 20:21:15 +0000
Subject: [PATCH 01/11] Add position_ids bounds checking to WebGPU
 RotaryEmbedding shaders

Add shader-side bounds checks to the WebGPU RotaryEmbedding and
FusedQKRotaryEmbedding GPU shaders to prevent out-of-bounds reads
from cos_cache/sin_cache when position_ids values exceed the cache
dimensions.

For RotaryEmbeddingProgram:
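- Check raw_pos < 0 before the u32 conversion to catch negative
  position_ids (the shader reads the int64 value truncated to i32; a
  negative value cast to u32 and then offset could otherwise wrap
  around and slip past the upper-bound check)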
- Check position_id >= cos_cache_shape[0] after u32 conversion
  and sequence offset addition
- On OOB, pass through input unchanged (matches CUDA kernel behavior)

For FusedQKRotaryEmbeddingProgram:
- Check position_id >= cos_cache_shape[0] before accessing cos/sin
  cache
- On OOB, pass through both Q and K inputs unchanged

This complements the CPU and CUDA fixes from PR #27597 (commit
056bab35e7), which did not cover the WebGPU execution provider.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Agent-signed-off: Developer (4fe56e20) [claude-opus-4.6]
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../webgpu/bert/rotary_embedding.cc           | 69 +++++++++++++------
 1 file changed, 49 insertions(+), 20 deletions(-)
diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
index 9f81e490971cd..38015ce2e6e20 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
@@ -35,13 +35,28 @@ Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const {
                                "  if (global_idx >= size) { return; }\n"
                                "  if (bsnh[3] < half_rotary_emb_dim) {\n"
                             << "    let position_ids_idx = " << position_ids.BroadcastedIndicesToOffset("bsnh.xy", output_indices) << ";\n"
-                            << "    let position_id = u32(" << position_ids.GetByOffset("position_ids_idx") << ") + select(0, bsnh[1], position_ids_idx == 0);\n"
+                            << "    let raw_pos = " << position_ids.GetByOffset("position_ids_idx") << ";\n"
                             << "    let i = dot(bsnh, uniforms.input_output_stride) + select(0, bsnh[3], " << interleaved_str << ");\n"
                             << "    let j = i + select(half_rotary_emb_dim, 1, " << interleaved_str << ");\n"
-                            << "    let re = " << input.GetByOffset("i") << " * " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << " - " << input.GetByOffset("j") << " * " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
-                            << "    " << output.SetByOffset("i", "re") << "\n"
-                            << "    let im = " << input.GetByOffset("i") << " * " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << " + " << input.GetByOffset("j") << " * " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
-                            << "    " << output.SetByOffset("j", "im") << "\n"
+                               "    let max_position = uniforms.cos_cache_shape[0];\n"
+                               // Bounds check: raw_pos < 0 catches negative position_ids (i32 from truncated int64).
+                               // After u32 conversion + offset, check >= max_position catches too-large values.
+                               // On OOB, pass through input unchanged (same as CUDA kernel behavior).
+                               "    if (raw_pos < 0) {\n"
+                            << "      " << output.SetByOffset("i", input.GetByOffset("i")) << "\n"
+                            << "      " << output.SetByOffset("j", input.GetByOffset("j")) << "\n"
+                               "    } else {\n"
+                               "      let position_id = u32(raw_pos) + select(0, bsnh[1], position_ids_idx == 0);\n"
+                               "      if (position_id >= max_position) {\n"
+                            << "        " << output.SetByOffset("i", input.GetByOffset("i")) << "\n"
+                            << "        " << output.SetByOffset("j", input.GetByOffset("j")) << "\n"
+                               "      } else {\n"
+                            << "        let re = " << input.GetByOffset("i") << " * " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << " - " << input.GetByOffset("j") << " * " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
+                            << "        " << output.SetByOffset("i", "re") << "\n"
+                            << "        let im = " << input.GetByOffset("i") << " * " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << " + " << input.GetByOffset("j") << " * " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
+                            << "        " << output.SetByOffset("j", "im") << "\n"
+                               "      }\n"
+                               "    }\n"
                             << "  } else { \n"
                                "    let k = dot(bsnh, uniforms.input_output_stride) + half_rotary_emb_dim;\n"
                             << "    " << output.SetByOffset("k", input.GetByOffset("k")) << "\n"
@@ -75,23 +90,37 @@ Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) c
       << "    let total_seqlen = seqlen + 1u;\n"
       << "    let past_seqlen = total_seqlen - uniforms.q_global_shape[1];\n"
       << "    let position_id = past_seqlen + sequence_idx;\n"
-      << "    let cos_v = " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
-      << "    let sin_v = " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
       << "    let qi = dot(bsnh, uniforms.q_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n"
       << "    let qj = qi + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
-      << "    let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n"
-      << "    " << q_output.SetByOffset("qi", "q_re") << "\n"
-      << "    let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n"
-      << "    " << q_output.SetByOffset("qj", "q_im") << "\n"
-      // Conditionally process Key (only for heads that exist in K domain)
-      << "    if (bsnh[2] < uniforms.k_global_shape[2]) {\n"
-      << "      let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n"
-      << "      let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
-      << "      let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n"
-      << "      " << k_output.SetByOffset("ki", "k_re") << "\n"
-      << "      let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n"
-      << "      " << k_output.SetByOffset("kj", "k_im") << "\n"
-      << "    }\n"
+         // Bounds check: position_id must be within cos/sin cache range.
+         // On OOB, pass through input unchanged (same as CUDA kernel behavior).
+         "    let max_position = uniforms.cos_cache_shape[0];\n"
+         "    if (position_id >= max_position) {\n"
+      << "      " << q_output.SetByOffset("qi", q_input.GetByOffset("qi")) << "\n"
+      << "      " << q_output.SetByOffset("qj", q_input.GetByOffset("qj")) << "\n"
+      << "      if (bsnh[2] < uniforms.k_global_shape[2]) {\n"
+      << "        let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n"
+      << "        let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
+      << "        " << k_output.SetByOffset("ki", k_input.GetByOffset("ki")) << "\n"
+      << "        " << k_output.SetByOffset("kj", k_input.GetByOffset("kj")) << "\n"
+         "      }\n"
+         "    } else {\n"
+      << "      let cos_v = " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
+      << "      let sin_v = " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
+      << "      let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n"
+      << "      " << q_output.SetByOffset("qi", "q_re") << "\n"
+      << "      let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n"
+      << "      " << q_output.SetByOffset("qj", "q_im") << "\n"
+         // Conditionally process Key (only for heads that exist in K domain)
+      << "      if (bsnh[2] < uniforms.k_global_shape[2]) {\n"
+      << "        let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n"
+      << "        let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
+      << "        let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n"
+      << "        " << k_output.SetByOffset("ki", "k_re") << "\n"
+      << "        let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n"
+      << "        " << k_output.SetByOffset("kj", "k_im") << "\n"
+         "      }\n"
+         "    }\n"
       << "  } else {\n"
       << "    let qk = dot(bsnh, uniforms.q_input_output_stride) + half_rotary_dim;\n"
       << "    " << q_output.SetByOffset("qk", q_input.GetByOffset("qk")) << "\n"

From 7b0e94a61c6a7df2c15934ebb70d830fb53fb5d4 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 20:25:42 +0000
Subject: [PATCH 02/11] Add host-side position_ids bounds validation to WebGPU
 RotaryEmbedding

Add host-side validation of position_ids values before shader dispatch
in all three WebGPU RotaryEmbedding implementations. This prevents
out-of-bounds reads from cos_cache/sin_cache when position_ids values
exceed the cache dimensions.

Changes:

1. contrib_ops/webgpu/bert/rotary_embedding.cc:
   - Add InputMemoryType(OrtMemTypeCPUInput, 1) to keep position_ids
     on CPU for validation
   - Add bounds checking in ComputeInternal() before shader dispatch:
     format 0 (scalar): base_pos in
       [0, max_sequence_length - sequence_length]
     format 1 (2D array): each value in [0, max_sequence_length)
   - Returns INVALID_ARGUMENT error on violation
   - Shader-side bounds checks remain as defense-in-depth

2. core/providers/webgpu/llm/rotary_embedding.cc:
   - Add InputMemoryType(OrtMemTypeCPUInput, 3) for optional
     position_ids input
   - Add bounds checking in the position_ids != nullptr branch
   - Returns INVALID_ARGUMENT error on violation

3. js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts:
   - Add value validation in validateInputs() using getBigInt64Array()
   - Validates both format 0 (scalar offset) and format 1 (2D array)
   - Throws Error with descriptive message on violation

All three implementations follow the same validation pattern as the
CPU contrib fix (PR #27597), returning errors rather than silently
passing through.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Agent-signed-off: Developer (4fe56e20) [claude-opus-4.6]
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../wasm/jsep/webgpu/ops/rotary-embedding.ts  | 25 ++++++++++++++++
 .../webgpu/bert/rotary_embedding.cc           | 29 ++++++++++++++++++-
 .../providers/webgpu/llm/rotary_embedding.cc  | 16 +++++++++-
 3 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
index fe2567e71d49a..de9a596cafe57 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
@@ -62,6 +62,31 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi
     }
   }
 
+  // Validate position_ids values are within cos/sin cache bounds.
+  const positionIdsData = positionIds.getBigInt64Array();
+  if (positionIdsData.length === 1) {
+    // Format 0: single base offset. Effective positions are [base_pos, base_pos + sequence_length - 1].
+    const basePos = positionIdsData[0];
+    const maxValidBase = BigInt(maxSequenceLength) - BigInt(sequenceLength);
+    if (basePos < 0n || basePos > maxValidBase) {
+      throw new Error(
+        `position_ids base value ${basePos} with sequence_length ${sequenceLength}` +
+          ` exceeds cos/sin cache range [0, ${maxSequenceLength})`,
+      );
+    }
+  } else {
+    // Format 1: 2D array (batch_size, sequence_length). Each value must be in [0, max_sequence_length).
+    const maxSeqBigInt = BigInt(maxSequenceLength);
+    for (let i = 0; i < positionIdsData.length; i++) {
+      const pos = positionIdsData[i];
+      if (pos < 0n || pos >= maxSeqBigInt) {
+        throw new Error(
+          `position_ids value ${pos} at index ${i} is out of range [0, ${maxSequenceLength})`,
+        );
+      }
+    }
+  }
+
   if (headSize / 2 !== cosCache.dims[1] && rotaryEmbeddingDim / 2 !== cosCache.dims[1]) {
     throw new Error(
       `Input 'cos_cache' dimension 1 should be same as head_size / 2 or rotary_embedding_dim / 2, got ${
diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
index 38015ce2e6e20..5d3ba9690ba51 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
@@ -17,7 +17,8 @@ ONNX_OPERATOR_KERNEL_EX(
     kWebGpuExecutionProvider,
     (*KernelDefBuilder::Create())
         .TypeConstraint("T", WebGpuSupportedFloatTypes())
-        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>()),
+        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>())
+        .InputMemoryType(OrtMemTypeCPUInput, 1),  // position_ids on CPU for bounds validation
     RotaryEmbedding);
 
 Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const {
@@ -155,6 +156,32 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con
   const auto hidden_size = batch_stride / sequence_length;
   const auto half_rotary_embedding_dim = onnxruntime::narrow<uint32_t>(cos_cache->Shape()[1]);
   const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_;
+  const auto max_sequence_length = static_cast<int64_t>(cos_cache->Shape()[0]);
+
+  // Validate position_ids values are within cos/sin cache bounds (position_ids kept on CPU via InputMemoryType).
+  const auto* pos_ids_data = position_ids->Data<int64_t>();
+  const auto pos_ids_size = position_ids->Shape().Size();
+  if (pos_ids_size == 1) {
+    // Format 0: single base offset. Effective positions are [base_pos, base_pos + sequence_length - 1].
+    int64_t base_pos = pos_ids_data[0];
+    int64_t max_valid_base = max_sequence_length - static_cast<int64_t>(sequence_length);
+    if (base_pos < 0 || base_pos > max_valid_base) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "position_ids base value ", base_pos,
+                             " with sequence_length ", sequence_length,
+                             " exceeds cos/sin cache range [0, ", max_sequence_length, ")");
+    }
+  } else {
+    // Format 1: 2D array (batch_size, sequence_length). Each value must be in [0, max_sequence_length).
+    for (int64_t i = 0; i < pos_ids_size; ++i) {
+      int64_t pos = pos_ids_data[i];
+      if (pos < 0 || pos >= max_sequence_length) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "position_ids value ", pos, " at index ", i,
+                               " is out of range [0, ", max_sequence_length, ")");
+      }
+    }
+  }
 
   // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape
   // [batch size, sequence length, num of heads, num of pairs to rotate + num of dims to copy]
diff --git a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
index ee46c76f1ea54..013fbe4b48bed 100644
--- a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
+++ b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
@@ -16,7 +16,8 @@ ONNX_OPERATOR_KERNEL_EX(
     kWebGpuExecutionProvider,
     (*KernelDefBuilder::Create())
         .TypeConstraint("T", WebGpuSupportedFloatTypes())
-        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>()),
+        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>())
+        .InputMemoryType(OrtMemTypeCPUInput, 3),  // position_ids on CPU for bounds validation
     RotaryEmbedding);
 
 RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : WebGpuKernel(info) {
@@ -83,6 +84,19 @@ Status RotaryEmbedding::ComputeInternal(ComputeContext& context) const {
 
   if (position_ids != nullptr) {
     // position_ids provided: cos/sin cache is 2D (max_pos, D/2)
+    // Validate position_ids values are within cache bounds (position_ids kept on CPU via InputMemoryType).
+    const auto max_sequence_length = static_cast<int64_t>(cos_cache->Shape()[0]);
+    const auto* pos_ids_data = position_ids->Data<int64_t>();
+    const auto pos_ids_size = position_ids->Shape().Size();
+    for (int64_t i = 0; i < pos_ids_size; ++i) {
+      int64_t pos = pos_ids_data[i];
+      if (pos < 0 || pos >= max_sequence_length) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "position_ids value ", pos, " at index ", i,
+                               " is out of range [0, ", max_sequence_length, ")");
+      }
+    }
+
     contrib::webgpu::RotaryEmbeddingProgram program{interleaved_};
     program
         .CacheHint(interleaved_)

From 296a44d34add2d882a069f1b54a6fcd257a10400 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 20:30:28 +0000
Subject: [PATCH 03/11] Add WebGPU OOB tests for RotaryEmbedding position_ids
 bounds validation

Add WebGPU-targeted OOB unit tests to both contrib (kMSDomain) and ONNX
domain test files. Tests verify that out-of-bounds, negative, and
format-0 overflow position_ids values are rejected with
INVALID_ARGUMENT, matching the host-side validation added to the
WebGPU RotaryEmbedding kernels.

Tests gracefully skip via GTEST_SKIP() when WebGPU EP is not available.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Agent-signed-off: Developer (4fe56e20) [claude-opus-4.6]
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../contrib_ops/rotary_embedding_op_test.cc   | 101 +++++++++++++++++
 .../cpu/llm/rotary_embedding_op_test.cc       | 103 ++++++++++++++++++
 2 files changed, 204 insertions(+)

diff --git a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
index 1fc410c37da14..31787b7a32dbd 100644
--- a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
@@ -1051,6 +1051,107 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_RejectsRank4MalformedCacheWidth
   execution_providers.push_back(DefaultCpuExecutionProvider());
   test.Run(OpTester::ExpectResult::kExpectFailure,
            "Input 'cos_cache' dimension 1 should be same as head_size / 2 or rotary_embedding_dim / 2, got 8",
+// Test that OOB position_ids (format 1) are rejected on WebGPU (host-side validation).
+TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_WebGPU) {
+  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  }
+
+  int batch_size = 1;
+  int sequence_length = 2;
+  int num_heads = 2;
+  int head_size = 4;
+  int max_sequence_length = 8;
+  int hidden_size = num_heads * head_size;
+
+  OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain);
+  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
+                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
+  // position_id = 999 exceeds max_sequence_length = 8
+  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {0, 999});
+  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
+                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value 999 at index 1 is out of range",
+           {}, nullptr, &execution_providers);
+}
+
+// Test that format-0 OOB position_ids base offset is rejected on WebGPU.
+TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Format0_OOB_WebGPU) {
+  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  }
+
+  int batch_size = 1;
+  int sequence_length = 2;
+  int num_heads = 2;
+  int head_size = 4;
+  int max_sequence_length = 8;
+  int hidden_size = num_heads * head_size;
+
+  OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain);
+  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
+                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
+  // Format 0: single value. Effective positions = [7, 8] — position 8 is out of range [0, 8).
+  test.AddInput<int64_t>("position_ids", {1}, {7});
+  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
+                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure,
+           "position_ids base value 7 with sequence_length 2 exceeds cos/sin cache range",
+           {}, nullptr, &execution_providers);
+}
+
+// Test that negative position_ids are rejected on WebGPU.
+TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Negative_WebGPU) {
+  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  }
+
+  int batch_size = 1;
+  int sequence_length = 1;
+  int num_heads = 2;
+  int head_size = 4;
+  int max_sequence_length = 8;
+  int hidden_size = num_heads * head_size;
+
+  OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain);
+  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
+                       std::vector<float>(hidden_size, 1.0f));
+  // Format 0: negative base offset
+  test.AddInput<int64_t>("position_ids", {1}, {-5});
+  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
+                        std::vector<float>(hidden_size, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure,
+           "position_ids base value -5 with sequence_length 1 exceeds cos/sin cache range",
            {}, nullptr, &execution_providers);
 }
 
diff --git a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
index 6a3b0d8160d53..1dfccf8f90ab6 100644
--- a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
@@ -1289,6 +1289,109 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_RejectsRank3HiddenSizeNotDivisibleByNu
   execution_providers.push_back(DefaultCpuExecutionProvider());
   test.Run(OpTester::ExpectResult::kExpectFailure,
            "hidden_size=5 must be divisible by num_heads=2 for rank-3 input", {}, nullptr, &execution_providers);
+// Test that OOB position_ids are rejected on WebGPU (host-side validation).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_WebGPU) {
+  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  }
+
+  int batch_size = 1;
+  int sequence_length = 1;
+  int num_heads = 2;
+  int head_size = 4;
+  int max_sequence_length = 8;
+  int hidden_size = num_heads * head_size;
+
+  OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
+  test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
+                       std::vector<float>(hidden_size, 1.0f));
+  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+  // position_id = 2048 exceeds max_sequence_length = 8
+  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {2048});
+
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
+                        std::vector<float>(hidden_size, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value 2048 at index 0 is out of range",
+           {}, nullptr, &execution_providers);
+}
+
+// Test that negative position_ids are rejected on WebGPU.
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Negative_WebGPU) {
+  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  }
+
+  int batch_size = 1;
+  int sequence_length = 1;
+  int num_heads = 2;
+  int head_size = 4;
+  int max_sequence_length = 8;
+  int hidden_size = num_heads * head_size;
+
+  OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
+  test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
+                       std::vector<float>(hidden_size, 1.0f));
+  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+  // position_id = -1 is negative
+  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {-1});
+
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
+                        std::vector<float>(hidden_size, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value -1 at index 0 is out of range",
+           {}, nullptr, &execution_providers);
+}
+
+// Test that OOB position_ids in a batch are rejected on WebGPU.
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU) {
+  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  }
+
+  int batch_size = 2;
+  int sequence_length = 2;
+  int num_heads = 2;
+  int head_size = 4;
+  int max_sequence_length = 8;
+  int hidden_size = num_heads * head_size;
+
+  OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
+  test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
+                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
+  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+  // Second batch has position_id = 100 which exceeds max_sequence_length = 8
+  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {0, 1, 2, 100});
+
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
+                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value 100 at index 3 is out of range",
+           {}, nullptr, &execution_providers);
 }
 
 }  // namespace test

From a7e1f57fa0fe1f16ff2a7878339c874b708e97c3 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 20:34:31 +0000
Subject: [PATCH 04/11] Add readability comments to WebGPU RotaryEmbedding
 bounds checks

Address readability review feedback:
- FusedQK shader: clarify why there is no negative check (position_id
  is derived from past_seqlen + sequence_idx, so it is always
  non-negative)
- ONNX domain kernel: clarify why no format 0 check (ONNX RotaryEmbedding
  always uses explicit position_ids, no base-offset mode)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Agent-signed-off: Developer (4fe56e20) [claude-opus-4.6]
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc   | 1 +
 onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
index 5d3ba9690ba51..9aba2282f7387 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
@@ -90,6 +90,7 @@ Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) c
       << "    let seqlen = u32(seqlen_i);\n"
       << "    let total_seqlen = seqlen + 1u;\n"
       << "    let past_seqlen = total_seqlen - uniforms.q_global_shape[1];\n"
+      // position_id is derived from past_seqlen + sequence_idx (always non-negative).
       << "    let position_id = past_seqlen + sequence_idx;\n"
       << "    let qi = dot(bsnh, uniforms.q_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n"
       << "    let qj = qi + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
diff --git a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
index 013fbe4b48bed..26aeb1744790e 100644
--- a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
+++ b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
@@ -84,6 +84,7 @@ Status RotaryEmbedding::ComputeInternal(ComputeContext& context) const {
 
   if (position_ids != nullptr) {
     // position_ids provided: cos/sin cache is 2D (max_pos, D/2)
+    // ONNX RotaryEmbedding always uses explicit position_ids (no format 0 base-offset mode).
     // Validate position_ids values are within cache bounds (position_ids kept on CPU via InputMemoryType).
     const auto max_sequence_length = static_cast<int64_t>(cos_cache->Shape()[0]);
     const auto* pos_ids_data = position_ids->Data<int64_t>();

From 3765c8cf9136c09ed7358d7263f98baa9a2f2a7d Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 20:37:10 +0000
Subject: [PATCH 05/11] Add format-0 bounds check to ONNX domain WebGPU
 RotaryEmbedding

Address code review finding: the shared shader treats single-element
position_ids as format 0 (base offset + sequence_idx), so the ONNX
domain host-side validation must also check that base_pos +
sequence_length - 1 < max_sequence_length.

Also add corresponding format-0 OOB WebGPU test case.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Agent-signed-off: Developer (4fe56e20) [claude-opus-4.6]
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../providers/webgpu/llm/rotary_embedding.cc  | 25 +++++++++----
 .../cpu/llm/rotary_embedding_op_test.cc       | 36 +++++++++++++++++++
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
index 26aeb1744790e..bb921edfbe348 100644
--- a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
+++ b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
@@ -84,17 +84,30 @@ Status RotaryEmbedding::ComputeInternal(ComputeContext& context) const {
 
   if (position_ids != nullptr) {
     // position_ids provided: cos/sin cache is 2D (max_pos, D/2)
-    // ONNX RotaryEmbedding always uses explicit position_ids (no format 0 base-offset mode).
     // Validate position_ids values are within cache bounds (position_ids kept on CPU via InputMemoryType).
     const auto max_sequence_length = static_cast<int64_t>(cos_cache->Shape()[0]);
     const auto* pos_ids_data = position_ids->Data<int64_t>();
     const auto pos_ids_size = position_ids->Shape().Size();
-    for (int64_t i = 0; i < pos_ids_size; ++i) {
-      int64_t pos = pos_ids_data[i];
-      if (pos < 0 || pos >= max_sequence_length) {
+    if (pos_ids_size == 1) {
+      // Format 0: single base offset. Shader adds sequence_idx, so effective range is
+      // [base_pos, base_pos + sequence_length - 1]. All must be < max_sequence_length.
+      int64_t base_pos = pos_ids_data[0];
+      int64_t max_valid_base = max_sequence_length - static_cast<int64_t>(sequence_length);
+      if (base_pos < 0 || base_pos > max_valid_base) {
         return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                               "position_ids value ", pos, " at index ", i,
-                               " is out of range [0, ", max_sequence_length, ")");
+                               "position_ids base value ", base_pos,
+                               " with sequence_length ", sequence_length,
+                               " exceeds cos/sin cache range [0, ", max_sequence_length, ")");
+      }
+    } else {
+      // Format 1: 2D array (batch_size, sequence_length). Each value must be in [0, max_sequence_length).
+      for (int64_t i = 0; i < pos_ids_size; ++i) {
+        int64_t pos = pos_ids_data[i];
+        if (pos < 0 || pos >= max_sequence_length) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                                 "position_ids value ", pos, " at index ", i,
+                                 " is out of range [0, ", max_sequence_length, ")");
+        }
       }
     }
 
diff --git a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
index 1dfccf8f90ab6..e954889e3fe97 100644
--- a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
@@ -1394,5 +1394,41 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU) {
            {}, nullptr, &execution_providers);
 }
 
+// Test that format-0 (single position_id) OOB is rejected on WebGPU.
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Format0_OOB_WebGPU) {
+  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  }
+
+  int batch_size = 1;
+  int sequence_length = 2;
+  int num_heads = 2;
+  int head_size = 4;
+  int max_sequence_length = 8;
+  int hidden_size = num_heads * head_size;
+
+  OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
+  test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
+                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
+  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+  // Format 0: single value. Effective positions = [7, 8] — position 8 is out of range [0, 8).
+  test.AddInput<int64_t>("position_ids", {1}, {7});
+
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
+                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultWebGpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure,
+           "position_ids base value 7 with sequence_length 2 exceeds cos/sin cache range",
+           {}, nullptr, &execution_providers);
+}
+
 }  // namespace test
 }  // namespace onnxruntime

From d38b5e30040b4d617dab9dc92c67f349ada90056 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 20:56:44 +0000
Subject: [PATCH 06/11] Apply lintrunner clang-format auto-fixes

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Agent-signed-off: Developer (4fe56e20) [claude-opus-4.6]
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../webgpu/bert/rotary_embedding.cc           | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
index 9aba2282f7387..62243a6f218f8 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
@@ -39,25 +39,25 @@ Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const {
                             << "    let raw_pos = " << position_ids.GetByOffset("position_ids_idx") << ";\n"
                             << "    let i = dot(bsnh, uniforms.input_output_stride) + select(0, bsnh[3], " << interleaved_str << ");\n"
                             << "    let j = i + select(half_rotary_emb_dim, 1, " << interleaved_str << ");\n"
-                               "    let max_position = uniforms.cos_cache_shape[0];\n"
-                               // Bounds check: raw_pos < 0 catches negative position_ids (i32 from truncated int64).
-                               // After u32 conversion + offset, check >= max_position catches too-large values.
-                               // On OOB, pass through input unchanged (same as CUDA kernel behavior).
-                               "    if (raw_pos < 0) {\n"
+                                                                                                       "    let max_position = uniforms.cos_cache_shape[0];\n"
+                                                                                                       // Bounds check: raw_pos < 0 catches negative position_ids (i32 from truncated int64).
+                                                                                                       // After u32 conversion + offset, check >= max_position catches too-large values.
+                                                                                                       // On OOB, pass through input unchanged (same as CUDA kernel behavior).
+                                                                                                       "    if (raw_pos < 0) {\n"
                             << "      " << output.SetByOffset("i", input.GetByOffset("i")) << "\n"
                             << "      " << output.SetByOffset("j", input.GetByOffset("j")) << "\n"
-                               "    } else {\n"
-                               "      let position_id = u32(raw_pos) + select(0, bsnh[1], position_ids_idx == 0);\n"
-                               "      if (position_id >= max_position) {\n"
+                                                                                              "    } else {\n"
+                                                                                              "      let position_id = u32(raw_pos) + select(0, bsnh[1], position_ids_idx == 0);\n"
+                                                                                              "      if (position_id >= max_position) {\n"
                             << "        " << output.SetByOffset("i", input.GetByOffset("i")) << "\n"
                             << "        " << output.SetByOffset("j", input.GetByOffset("j")) << "\n"
-                               "      } else {\n"
+                                                                                                "      } else {\n"
                             << "        let re = " << input.GetByOffset("i") << " * " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << " - " << input.GetByOffset("j") << " * " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
                             << "        " << output.SetByOffset("i", "re") << "\n"
                             << "        let im = " << input.GetByOffset("i") << " * " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << " + " << input.GetByOffset("j") << " * " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
                             << "        " << output.SetByOffset("j", "im") << "\n"
-                               "      }\n"
-                               "    }\n"
+                                                                              "      }\n"
+                                                                              "    }\n"
                             << "  } else { \n"
                                "    let k = dot(bsnh, uniforms.input_output_stride) + half_rotary_emb_dim;\n"
                             << "    " << output.SetByOffset("k", input.GetByOffset("k")) << "\n"
@@ -94,10 +94,10 @@ Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) c
       << "    let position_id = past_seqlen + sequence_idx;\n"
       << "    let qi = dot(bsnh, uniforms.q_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n"
       << "    let qj = qi + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
-         // Bounds check: position_id must be within cos/sin cache range.
-         // On OOB, pass through input unchanged (same as CUDA kernel behavior).
-         "    let max_position = uniforms.cos_cache_shape[0];\n"
-         "    if (position_id >= max_position) {\n"
+                                                                                // Bounds check: position_id must be within cos/sin cache range.
+                                                                                // On OOB, pass through input unchanged (same as CUDA kernel behavior).
+                                                                                "    let max_position = uniforms.cos_cache_shape[0];\n"
+                                                                                "    if (position_id >= max_position) {\n"
       << "      " << q_output.SetByOffset("qi", q_input.GetByOffset("qi")) << "\n"
       << "      " << q_output.SetByOffset("qj", q_input.GetByOffset("qj")) << "\n"
       << "      if (bsnh[2] < uniforms.k_global_shape[2]) {\n"
@@ -105,15 +105,15 @@ Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) c
       << "        let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
       << "        " << k_output.SetByOffset("ki", k_input.GetByOffset("ki")) << "\n"
       << "        " << k_output.SetByOffset("kj", k_input.GetByOffset("kj")) << "\n"
-         "      }\n"
-         "    } else {\n"
+                                                                                "      }\n"
+                                                                                "    } else {\n"
       << "      let cos_v = " << cos_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
       << "      let sin_v = " << sin_cache.GetByIndices("vec2<u32>(position_id, bsnh[3])") << ";\n"
       << "      let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n"
       << "      " << q_output.SetByOffset("qi", "q_re") << "\n"
       << "      let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n"
       << "      " << q_output.SetByOffset("qj", "q_im") << "\n"
-         // Conditionally process Key (only for heads that exist in K domain)
+      // Conditionally process Key (only for heads that exist in K domain)
       << "      if (bsnh[2] < uniforms.k_global_shape[2]) {\n"
       << "        let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n"
       << "        let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n"
@@ -121,8 +121,8 @@ Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) c
       << "        " << k_output.SetByOffset("ki", "k_re") << "\n"
       << "        let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n"
       << "        " << k_output.SetByOffset("kj", "k_im") << "\n"
-         "      }\n"
-         "    }\n"
+                                                             "      }\n"
+                                                             "    }\n"
       << "  } else {\n"
       << "    let qk = dot(bsnh, uniforms.q_input_output_stride) + half_rotary_dim;\n"
       << "    " << q_output.SetByOffset("qk", q_input.GetByOffset("qk")) << "\n"

From 7fdb83e7259d6dc636229dddc6f6963175a8a569 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 21:05:40 +0000
Subject: [PATCH 07/11] Apply prettier formatting to rotary-embedding.ts

Fix Web CI precheck: run prettier on the TypeScript file to match
the project's JS/TS formatting requirements.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Agent-signed-off: Developer (4fe56e20) [claude-opus-4.6]
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
index de9a596cafe57..3122327bf2efb 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
@@ -80,9 +80,7 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi
     for (let i = 0; i < positionIdsData.length; i++) {
       const pos = positionIdsData[i];
       if (pos < 0n || pos >= maxSeqBigInt) {
-        throw new Error(
-          `position_ids value ${pos} at index ${i} is out of range [0, ${maxSequenceLength})`,
-        );
+        throw new Error(`position_ids value ${pos} at index ${i} is out of range [0, ${maxSequenceLength})`);
       }
     }
   }

From c7c420989ee6828bd3b849b2de145a236d600e63 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 22:14:35 +0000
Subject: [PATCH 08/11] Fix BigInt64Array alignment error in WASM JSEP path

TensorViewImpl.getBigInt64Array() threw RangeError when the WASM heap
offset was not 8-byte aligned. Fix by detecting unaligned offsets and
copying bytes into an aligned buffer before creating the BigInt64Array.

This fixes the Web CI failure where all RotaryEmbedding tests failed
with 'start offset of BigInt64Array should be a multiple of 8'.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 js/web/lib/wasm/jsep/init.ts                      | 15 ++++++++++++---
 .../lib/wasm/jsep/webgpu/ops/rotary-embedding.ts  | 11 ++++++-----
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
index 50fb26fef1d41..295e0f23c113f 100644
--- a/js/web/lib/wasm/jsep/init.ts
+++ b/js/web/lib/wasm/jsep/init.ts
@@ -39,9 +39,18 @@ class TensorViewImpl implements TensorView {
       throw new Error('Invalid data type');
     }
     const elementCount = ShapeUtil.size(this.dims);
-    return elementCount === 0
-      ? new BigInt64Array()
-      : new BigInt64Array(this.module.HEAP8.buffer, this.data, elementCount);
+    if (elementCount === 0) {
+      return new BigInt64Array();
+    }
+    // BigInt64Array requires the byte offset to be a multiple of 8. WASM allocators may return
+    // offsets that are not 8-byte aligned, so fall back to copying bytes into an aligned buffer.
+    if (this.data % 8 === 0) {
+      return new BigInt64Array(this.module.HEAP8.buffer, this.data, elementCount);
+    }
+    const byteLength = elementCount * 8;
+    const alignedBuffer = new ArrayBuffer(byteLength);
+    new Uint8Array(alignedBuffer).set(new Uint8Array(this.module.HEAP8.buffer, this.data, byteLength));
+    return new BigInt64Array(alignedBuffer);
   }
 
   getInt32Array(): Int32Array {
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
index 3122327bf2efb..8a2b4e3847b95 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
@@ -63,10 +63,11 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi
   }
 
   // Validate position_ids values are within cos/sin cache bounds.
-  const positionIdsData = positionIds.getBigInt64Array();
-  if (positionIdsData.length === 1) {
+  const positionIdsElementCount = ShapeUtil.size(positionIds.dims);
+  const positionIdsBigInt = positionIds.getBigInt64Array();
+  if (positionIdsElementCount === 1) {
     // Format 0: single base offset. Effective positions are [base_pos, base_pos + sequence_length - 1].
-    const basePos = positionIdsData[0];
+    const basePos = positionIdsBigInt[0];
     const maxValidBase = BigInt(maxSequenceLength) - BigInt(sequenceLength);
     if (basePos < 0n || basePos > maxValidBase) {
       throw new Error(
@@ -77,8 +78,8 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi
   } else {
     // Format 1: 2D array (batch_size, sequence_length). Each value must be in [0, max_sequence_length).
     const maxSeqBigInt = BigInt(maxSequenceLength);
-    for (let i = 0; i < positionIdsData.length; i++) {
-      const pos = positionIdsData[i];
+    for (let i = 0; i < positionIdsElementCount; i++) {
+      const pos = positionIdsBigInt[i];
       if (pos < 0n || pos >= maxSeqBigInt) {
         throw new Error(`position_ids value ${pos} at index ${i} is out of range [0, ${maxSequenceLength})`);
       }

From e1df1e713f484932561bfe364a5571a1b93b23a5 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Apr 2026 22:22:21 +0000
Subject: [PATCH 09/11] Address review: remove InputMemoryType, use shader-side
 defense-in-depth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove InputMemoryType(OrtMemTypeCPUInput) from both WebGPU kernel
registrations (contrib and ONNX domain) and the associated host-side
position_ids value scanning. InputMemoryType is incompatible with
AddInputs() — a CPU tensor's DataRaw() would be cast to WGPUBuffer,
causing a crash at dispatch time.

Defense strategy is now:
- Shader-side: WGSL bounds checks pass through input unchanged on OOB
  (same as CUDA kernel behavior)
- JSEP/browser: TypeScript validation in rotary-embedding.ts catches
  OOB before shader dispatch
- init.ts: getBigInt64Array() handles unaligned WASM heap offsets

WebGPU OOB tests changed from kExpectFailure to kExpectSuccess,
verifying pass-through behavior (output equals input on OOB).
ONNX domain tests updated to use rank-2 position_ids for cross-EP
consistency. TS validation reordered per Copilot review: sequence_length
check before per-value bounds validation.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 js/web/lib/wasm/jsep/init.ts                  |   1 +
 .../wasm/jsep/webgpu/ops/rotary-embedding.ts  |   8 +-
 .../webgpu/bert/rotary_embedding.cc           |  34 ++----
 .../providers/webgpu/llm/rotary_embedding.cc  |  35 ++----
 .../contrib_ops/rotary_embedding_op_test.cc   |  77 +++++++------
 .../cpu/llm/rotary_embedding_op_test.cc       | 102 +++++++-----------
 6 files changed, 103 insertions(+), 154 deletions(-)

diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
index 295e0f23c113f..c1590213b10c3 100644
--- a/js/web/lib/wasm/jsep/init.ts
+++ b/js/web/lib/wasm/jsep/init.ts
@@ -44,6 +44,7 @@ class TensorViewImpl implements TensorView {
     }
     // BigInt64Array requires the byte offset to be a multiple of 8. WASM allocators may return
     // offsets that are not 8-byte aligned, so fall back to copying bytes into an aligned buffer.
+    // Note: when unaligned, the returned array is a detached copy (writes do not propagate to the WASM heap).
     if (this.data % 8 === 0) {
       return new BigInt64Array(this.module.HEAP8.buffer, this.data, elementCount);
     }
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
index 8a2b4e3847b95..fd9708ce23d51 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
@@ -63,6 +63,10 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi
   }
 
   // Validate position_ids values are within cos/sin cache bounds.
+  if (sequenceLength > maxSequenceLength) {
+    throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported');
+  }
+
   const positionIdsElementCount = ShapeUtil.size(positionIds.dims);
   const positionIdsBigInt = positionIds.getBigInt64Array();
   if (positionIdsElementCount === 1) {
@@ -93,10 +97,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi
       }`,
     );
   }
-
-  if (sequenceLength > maxSequenceLength) {
-    throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported');
-  }
 };
 
 export const createRotaryEmbeddingProgramInfo = (
diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
index 62243a6f218f8..69d2db391ce3c 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
@@ -17,8 +17,7 @@ ONNX_OPERATOR_KERNEL_EX(
     kWebGpuExecutionProvider,
     (*KernelDefBuilder::Create())
         .TypeConstraint("T", WebGpuSupportedFloatTypes())
-        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>())
-        .InputMemoryType(OrtMemTypeCPUInput, 1),  // position_ids on CPU for bounds validation
+        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>()),
     RotaryEmbedding);
 
 Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const {
@@ -157,32 +156,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con
   const auto hidden_size = batch_stride / sequence_length;
   const auto half_rotary_embedding_dim = onnxruntime::narrow<uint32_t>(cos_cache->Shape()[1]);
   const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_;
-  const auto max_sequence_length = static_cast<int64_t>(cos_cache->Shape()[0]);
-
-  // Validate position_ids values are within cos/sin cache bounds (position_ids kept on CPU via InputMemoryType).
-  const auto* pos_ids_data = position_ids->Data<int64_t>();
-  const auto pos_ids_size = position_ids->Shape().Size();
-  if (pos_ids_size == 1) {
-    // Format 0: single base offset. Effective positions are [base_pos, base_pos + sequence_length - 1].
-    int64_t base_pos = pos_ids_data[0];
-    int64_t max_valid_base = max_sequence_length - static_cast<int64_t>(sequence_length);
-    if (base_pos < 0 || base_pos > max_valid_base) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                             "position_ids base value ", base_pos,
-                             " with sequence_length ", sequence_length,
-                             " exceeds cos/sin cache range [0, ", max_sequence_length, ")");
-    }
-  } else {
-    // Format 1: 2D array (batch_size, sequence_length). Each value must be in [0, max_sequence_length).
-    for (int64_t i = 0; i < pos_ids_size; ++i) {
-      int64_t pos = pos_ids_data[i];
-      if (pos < 0 || pos >= max_sequence_length) {
-        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                               "position_ids value ", pos, " at index ", i,
-                               " is out of range [0, ", max_sequence_length, ")");
-      }
-    }
-  }
+
+  // position_ids bounds validation is handled by shader-side defense-in-depth checks
+  // (OOB position_ids → pass-through input unchanged). Host-side value scanning is not possible
+  // because WebGPU program inputs must be GPU buffers (InputMemoryType(OrtMemTypeCPUInput) is
+  // incompatible with AddInputs).
 
   // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape
   // [batch size, sequence length, num of heads, num of pairs to rotate + num of dims to copy]
diff --git a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
index bb921edfbe348..234b1d54e69c5 100644
--- a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
+++ b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc
@@ -16,8 +16,7 @@ ONNX_OPERATOR_KERNEL_EX(
     kWebGpuExecutionProvider,
     (*KernelDefBuilder::Create())
         .TypeConstraint("T", WebGpuSupportedFloatTypes())
-        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>())
-        .InputMemoryType(OrtMemTypeCPUInput, 3),  // position_ids on CPU for bounds validation
+        .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>()),
     RotaryEmbedding);
 
 RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : WebGpuKernel(info) {
@@ -84,32 +83,12 @@ Status RotaryEmbedding::ComputeInternal(ComputeContext& context) const {
 
   if (position_ids != nullptr) {
     // position_ids provided: cos/sin cache is 2D (max_pos, D/2)
-    // Validate position_ids values are within cache bounds (position_ids kept on CPU via InputMemoryType).
-    const auto max_sequence_length = static_cast<int64_t>(cos_cache->Shape()[0]);
-    const auto* pos_ids_data = position_ids->Data<int64_t>();
-    const auto pos_ids_size = position_ids->Shape().Size();
-    if (pos_ids_size == 1) {
-      // Format 0: single base offset. Shader adds sequence_idx, so effective range is
-      // [base_pos, base_pos + sequence_length - 1]. All must be < max_sequence_length.
-      int64_t base_pos = pos_ids_data[0];
-      int64_t max_valid_base = max_sequence_length - static_cast<int64_t>(sequence_length);
-      if (base_pos < 0 || base_pos > max_valid_base) {
-        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                               "position_ids base value ", base_pos,
-                               " with sequence_length ", sequence_length,
-                               " exceeds cos/sin cache range [0, ", max_sequence_length, ")");
-      }
-    } else {
-      // Format 1: 2D array (batch_size, sequence_length). Each value must be in [0, max_sequence_length).
-      for (int64_t i = 0; i < pos_ids_size; ++i) {
-        int64_t pos = pos_ids_data[i];
-        if (pos < 0 || pos >= max_sequence_length) {
-          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                                 "position_ids value ", pos, " at index ", i,
-                                 " is out of range [0, ", max_sequence_length, ")");
-        }
-      }
-    }
+    // position_ids bounds validation is handled by shader-side defense-in-depth checks
+    // (OOB position_ids → pass-through input unchanged). Host-side value scanning is not possible
+    // because WebGPU program inputs must be GPU buffers (InputMemoryType(OrtMemTypeCPUInput) is
+    // incompatible with AddInputs).
+    // Note: ONNX RotaryEmbedding has no base-offset mode (format 0) — position_ids is always
+    // a 2D tensor (batch_size, sequence_length) when provided.
 
     contrib::webgpu::RotaryEmbeddingProgram program{interleaved_};
     program
diff --git a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
index 31787b7a32dbd..96d1b8cd26ca7 100644
--- a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
@@ -1051,8 +1051,11 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_RejectsRank4MalformedCacheWidth
   execution_providers.push_back(DefaultCpuExecutionProvider());
   test.Run(OpTester::ExpectResult::kExpectFailure,
            "Input 'cos_cache' dimension 1 should be same as head_size / 2 or rotary_embedding_dim / 2, got 8",
-// Test that OOB position_ids (format 1) are rejected on WebGPU (host-side validation).
-TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_WebGPU) {
+           {}, nullptr, &execution_providers);
+}
+
+// Test that OOB position_ids on WebGPU (format 1) pass through input unchanged (shader-side defense).
+TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_WebGPU_Passthrough) {
   if (nullptr == DefaultWebGpuExecutionProvider().get()) {
     GTEST_SKIP() << "WebGPU execution provider is not available.";
   }
@@ -1067,26 +1070,30 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_WebGPU) {
   OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain);
   test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
 
-  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
-                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
-  // position_id = 999 exceeds max_sequence_length = 8
-  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {0, 999});
+  std::vector<float> input_data(batch_size * sequence_length * hidden_size);
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    input_data[i] = static_cast<float>(i + 1);
+  }
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+  // Both position_ids exceed max_sequence_length = 8 — shader passes through input unchanged.
+  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {999, 999});
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
 
-  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
-                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
+  // Output should equal input when position_id is OOB (pass-through).
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+  test.SetOutputAbsErr("output", 0.0f);
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.push_back(DefaultWebGpuExecutionProvider());
-  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value 999 at index 1 is out of range",
-           {}, nullptr, &execution_providers);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
-// Test that format-0 OOB position_ids base offset is rejected on WebGPU.
-TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Format0_OOB_WebGPU) {
+// Test that format-0 OOB position_ids base offset passes through on WebGPU (shader-side defense).
+TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Format0_OOB_WebGPU_Passthrough) {
   if (nullptr == DefaultWebGpuExecutionProvider().get()) {
     GTEST_SKIP() << "WebGPU execution provider is not available.";
   }
@@ -1101,27 +1108,30 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Format0_OOB_WebGPU)
   OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain);
   test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
 
-  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
-                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
-  // Format 0: single value. Effective positions = [7, 8] — position 8 is out of range [0, 8).
-  test.AddInput<int64_t>("position_ids", {1}, {7});
+  std::vector<float> input_data(batch_size * sequence_length * hidden_size);
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    input_data[i] = static_cast<float>(i + 1);
+  }
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+  // Format 0: base offset 8, effective positions = [8, 9] — both OOB for max_sequence_length = 8.
+  test.AddInput<int64_t>("position_ids", {1}, {8});
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
 
-  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
-                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
+  // Output should equal input when all positions are OOB (pass-through).
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+  test.SetOutputAbsErr("output", 0.0f);
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.push_back(DefaultWebGpuExecutionProvider());
-  test.Run(OpTester::ExpectResult::kExpectFailure,
-           "position_ids base value 7 with sequence_length 2 exceeds cos/sin cache range",
-           {}, nullptr, &execution_providers);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
-// Test that negative position_ids are rejected on WebGPU.
-TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Negative_WebGPU) {
+// Test that negative position_ids pass through on WebGPU (shader-side defense catches raw_pos < 0).
+TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Negative_WebGPU_Passthrough) {
   if (nullptr == DefaultWebGpuExecutionProvider().get()) {
     GTEST_SKIP() << "WebGPU execution provider is not available.";
   }
@@ -1136,23 +1146,26 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Negative_WebGPU) {
   OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain);
   test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
 
-  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
-                       std::vector<float>(hidden_size, 1.0f));
-  // Format 0: negative base offset
-  test.AddInput<int64_t>("position_ids", {1}, {-5});
+  std::vector<float> input_data(hidden_size);
+  for (int i = 0; i < hidden_size; ++i) {
+    input_data[i] = static_cast<float>(i + 1);
+  }
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+  // Negative position_id — shader checks raw_pos < 0 and passes through.
+  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {-5});
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
 
-  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
-                        std::vector<float>(hidden_size, 0.0f));
+  // Output should equal input when position_id is negative (pass-through).
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+  test.SetOutputAbsErr("output", 0.0f);
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.push_back(DefaultWebGpuExecutionProvider());
-  test.Run(OpTester::ExpectResult::kExpectFailure,
-           "position_ids base value -5 with sequence_length 1 exceeds cos/sin cache range",
-           {}, nullptr, &execution_providers);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
index e954889e3fe97..7e74e46f1632b 100644
--- a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
@@ -1289,8 +1289,10 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_RejectsRank3HiddenSizeNotDivisibleByNu
   execution_providers.push_back(DefaultCpuExecutionProvider());
   test.Run(OpTester::ExpectResult::kExpectFailure,
            "hidden_size=5 must be divisible by num_heads=2 for rank-3 input", {}, nullptr, &execution_providers);
-// Test that OOB position_ids are rejected on WebGPU (host-side validation).
-TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_WebGPU) {
+}
+
+// Test that OOB position_ids on WebGPU pass through input unchanged (shader-side defense).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_WebGPU_Passthrough) {
   if (nullptr == DefaultWebGpuExecutionProvider().get()) {
     GTEST_SKIP() << "WebGPU execution provider is not available.";
   }
@@ -1306,26 +1308,30 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_WebGPU) {
   test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
   test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
 
-  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
-                       std::vector<float>(hidden_size, 1.0f));
+  std::vector<float> input_data(hidden_size);
+  for (int i = 0; i < hidden_size; ++i) {
+    input_data[i] = static_cast<float>(i + 1);
+  }
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
-  // position_id = 2048 exceeds max_sequence_length = 8
+  // position_id = 2048 exceeds max_sequence_length = 8 — shader passes through input unchanged.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {2048});
 
-  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
-                        std::vector<float>(hidden_size, 0.0f));
+  // Output should equal input when position_id is OOB (pass-through).
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+  test.SetOutputAbsErr("output", 0.0f);
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.push_back(DefaultWebGpuExecutionProvider());
-  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value 2048 at index 0 is out of range",
-           {}, nullptr, &execution_providers);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
-// Test that negative position_ids are rejected on WebGPU.
-TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Negative_WebGPU) {
+// Test that negative position_ids pass through on WebGPU (shader-side defense catches raw_pos < 0).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Negative_WebGPU_Passthrough) {
   if (nullptr == DefaultWebGpuExecutionProvider().get()) {
     GTEST_SKIP() << "WebGPU execution provider is not available.";
   }
@@ -1341,26 +1347,30 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Negative_WebGPU) {
   test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
   test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
 
-  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
-                       std::vector<float>(hidden_size, 1.0f));
+  std::vector<float> input_data(hidden_size);
+  for (int i = 0; i < hidden_size; ++i) {
+    input_data[i] = static_cast<float>(i + 1);
+  }
+
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
-  // position_id = -1 is negative
+  // Negative position_id — shader checks raw_pos < 0 and passes through.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {-1});
 
-  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
-                        std::vector<float>(hidden_size, 0.0f));
+  // Output should equal input when position_id is negative (pass-through).
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+  test.SetOutputAbsErr("output", 0.0f);
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.push_back(DefaultWebGpuExecutionProvider());
-  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value -1 at index 0 is out of range",
-           {}, nullptr, &execution_providers);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
-// Test that OOB position_ids in a batch are rejected on WebGPU.
-TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU) {
+// Test that OOB position_ids in a batch pass through on WebGPU (shader-side defense).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU_Passthrough) {
   if (nullptr == DefaultWebGpuExecutionProvider().get()) {
     GTEST_SKIP() << "WebGPU execution provider is not available.";
   }
@@ -1376,58 +1386,26 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU) {
   test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
   test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
 
-  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
-                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
-  test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
-  test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
-  // Second batch has position_id = 100 which exceeds max_sequence_length = 8
-  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {0, 1, 2, 100});
-
-  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
-                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
-
-  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-  execution_providers.push_back(DefaultWebGpuExecutionProvider());
-  test.Run(OpTester::ExpectResult::kExpectFailure, "position_ids value 100 at index 3 is out of range",
-           {}, nullptr, &execution_providers);
-}
-
-// Test that format-0 (single position_id) OOB is rejected on WebGPU.
-TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Format0_OOB_WebGPU) {
-  if (nullptr == DefaultWebGpuExecutionProvider().get()) {
-    GTEST_SKIP() << "WebGPU execution provider is not available.";
+  std::vector<float> input_data(batch_size * sequence_length * hidden_size);
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    input_data[i] = static_cast<float>(i + 1);
   }
 
-  int batch_size = 1;
-  int sequence_length = 2;
-  int num_heads = 2;
-  int head_size = 4;
-  int max_sequence_length = 8;
-  int hidden_size = num_heads * head_size;
-
-  OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
-  test.AddAttribute<int64_t>("interleaved", static_cast<int64_t>(0));
-  test.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
-
-  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size},
-                       std::vector<float>(batch_size * sequence_length * hidden_size, 1.0f));
+  test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
                        std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
-  // Format 0: single value. Effective positions = [7, 8] — position 8 is out of range [0, 8).
-  test.AddInput<int64_t>("position_ids", {1}, {7});
+  // All OOB position_ids — shader passes through input unchanged.
+  test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {100, 200, 300, 400});
 
-  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
-                        std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));
+  // Output should equal input when all position_ids are OOB (pass-through).
+  test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+  test.SetOutputAbsErr("output", 0.0f);
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.push_back(DefaultWebGpuExecutionProvider());
-  test.Run(OpTester::ExpectResult::kExpectFailure,
-           "position_ids base value 7 with sequence_length 2 exceeds cos/sin cache range",
-           {}, nullptr, &execution_providers);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
 }  // namespace test

From e5a824e7257d62c6637f35bf0f8189abcc455558 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Mon, 27 Apr 2026 20:24:53 +0000
Subject: [PATCH 10/11] Remove TS position_ids value validation (GPU-resident
 tensor)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

position_ids tensor data is GPU-resident in the WebGPU/JSEP path — its
TensorView.data field is a GPU buffer ID, not a WASM heap pointer.
Calling getBigInt64Array() treats that buffer ID as a heap byte offset
and reads unrelated WASM memory, returning garbage values that cause
ALL existing RotaryEmbedding JSEP tests to fail validation.

Other ops (pad, split, tile, etc.) can safely call getBigInt64Array()
because their int64 inputs use InputMemoryType(OrtMemTypeCPU) which
keeps data on CPU with valid WASM heap pointers. RotaryEmbedding cannot
use InputMemoryType for position_ids because AddInputs() requires GPU
buffers.

Defense strategy: shader-side WGSL bounds checks handle OOB position_ids
(pass-through behavior), which works regardless of CPU/GPU residency.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../wasm/jsep/webgpu/ops/rotary-embedding.ts  | 26 +++----------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
index fd9708ce23d51..9bbad9839d616 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts
@@ -62,33 +62,13 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi
     }
   }
 
-  // Validate position_ids values are within cos/sin cache bounds.
   if (sequenceLength > maxSequenceLength) {
     throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported');
   }
 
-  const positionIdsElementCount = ShapeUtil.size(positionIds.dims);
-  const positionIdsBigInt = positionIds.getBigInt64Array();
-  if (positionIdsElementCount === 1) {
-    // Format 0: single base offset. Effective positions are [base_pos, base_pos + sequence_length - 1].
-    const basePos = positionIdsBigInt[0];
-    const maxValidBase = BigInt(maxSequenceLength) - BigInt(sequenceLength);
-    if (basePos < 0n || basePos > maxValidBase) {
-      throw new Error(
-        `position_ids base value ${basePos} with sequence_length ${sequenceLength}` +
-          ` exceeds cos/sin cache range [0, ${maxSequenceLength})`,
-      );
-    }
-  } else {
-    // Format 1: 2D array (batch_size, sequence_length). Each value must be in [0, max_sequence_length).
-    const maxSeqBigInt = BigInt(maxSequenceLength);
-    for (let i = 0; i < positionIdsElementCount; i++) {
-      const pos = positionIdsBigInt[i];
-      if (pos < 0n || pos >= maxSeqBigInt) {
-        throw new Error(`position_ids value ${pos} at index ${i} is out of range [0, ${maxSequenceLength})`);
-      }
-    }
-  }
+  // Note: position_ids value validation is handled by shader-side bounds checks (defense-in-depth).
+  // We cannot validate position_ids values here because the tensor is GPU-resident — its data field
+  // is a GPU buffer ID, not a WASM heap pointer, so getBigInt64Array() would read garbage.
 
   if (headSize / 2 !== cosCache.dims[1] && rotaryEmbeddingDim / 2 !== cosCache.dims[1]) {
     throw new Error(

From 50926db1546bf8baf377993de82d67b16951be7f Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Tue, 28 Apr 2026 23:15:33 +0000
Subject: [PATCH 11/11] Address review: non-trivial cache values in OOB tests,
 revert init.ts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Use cos=0.5, sin=0.866 in all 8 pass-through OOB tests (both CUDA
   and WebGPU, both contrib and ONNX domain). With identity cache values
   (cos=1, sin=0), the rotary embedding is a no-op, making pass-through
   output identical to valid output — the test cannot verify bounds
   checking. Non-trivial values ensure pass-through (output=input) is
   distinguishable from valid rotary output.

2. Revert getBigInt64Array() alignment fix in init.ts — no longer needed
   since TS value validation for position_ids was removed. Keeps PR
   scope minimal.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 js/web/lib/wasm/jsep/init.ts                  | 16 +++------------
 .../contrib_ops/rotary_embedding_op_test.cc   | 20 +++++++++++--------
 .../cpu/llm/rotary_embedding_op_test.cc       | 20 +++++++++++--------
 3 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
index c1590213b10c3..50fb26fef1d41 100644
--- a/js/web/lib/wasm/jsep/init.ts
+++ b/js/web/lib/wasm/jsep/init.ts
@@ -39,19 +39,9 @@ class TensorViewImpl implements TensorView {
       throw new Error('Invalid data type');
     }
     const elementCount = ShapeUtil.size(this.dims);
-    if (elementCount === 0) {
-      return new BigInt64Array();
-    }
-    // BigInt64Array requires the byte offset to be a multiple of 8. WASM allocators may return
-    // offsets that are not 8-byte aligned, so fall back to copying bytes into an aligned buffer.
-    // Note: the returned array is a read-only copy when unaligned (mutations won't propagate to WASM heap).
-    if (this.data % 8 === 0) {
-      return new BigInt64Array(this.module.HEAP8.buffer, this.data, elementCount);
-    }
-    const byteLength = elementCount * 8;
-    const alignedBuffer = new ArrayBuffer(byteLength);
-    new Uint8Array(alignedBuffer).set(new Uint8Array(this.module.HEAP8.buffer, this.data, byteLength));
-    return new BigInt64Array(alignedBuffer);
+    return elementCount === 0
+      ? new BigInt64Array()
+      : new BigInt64Array(this.module.HEAP8.buffer, this.data, elementCount);
   }
 
   getInt32Array(): Int32Array {
diff --git a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
index 96d1b8cd26ca7..880c10137f3fe 100644
--- a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc
@@ -937,10 +937,11 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_CUDA_Passthroug
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
   // position_id = 2048 exceeds max_sequence_length = 8 — CUDA should pass through input unchanged.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {2048});
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
 
   // Output should equal input when position_id is OOB (pass-through).
   test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
@@ -1078,10 +1079,11 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_WebGPU_Passthro
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
   // Both position_ids exceed max_sequence_length = 8 — shader passes through input unchanged.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {999, 999});
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
 
   // Output should equal input when position_id is OOB (pass-through).
   test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
@@ -1116,10 +1118,11 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Format0_OOB_WebGPU_
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
   // Format 0: base offset 8, effective positions = [8, 9] — both OOB for max_sequence_length = 8.
   test.AddInput<int64_t>("position_ids", {1}, {8});
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
 
   // Output should equal input when all positions are OOB (pass-through).
   test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
@@ -1154,10 +1157,11 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Negative_WebGPU_Pas
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
   // Negative position_id — shader checks raw_pos < 0 and passes through.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {-5});
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
 
   // Output should equal input when position_id is negative (pass-through).
   test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
diff --git a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
index 7e74e46f1632b..2f51b8a7a5690 100644
--- a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc
@@ -1208,10 +1208,11 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_CUDA_Passthrough) {
   }
 
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
   // position_id = 2048 exceeds max_sequence_length = 8 — CUDA should pass through input unchanged.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {2048});
 
@@ -1314,10 +1315,11 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_WebGPU_Passthrough) {
   }
 
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
   // position_id = 2048 exceeds max_sequence_length = 8 — shader passes through input unchanged.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {2048});
 
@@ -1353,10 +1355,11 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Negative_WebGPU_Passthroug
   }
 
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
   // Negative position_id — shader checks raw_pos < 0 and passes through.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {-1});
 
@@ -1392,10 +1395,11 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU_Passthr
   }
 
   test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+  // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
   test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 1.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
   test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
-                       std::vector<float>(max_sequence_length * head_size / 2, 0.0f));
+                       std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
   // All OOB position_ids — shader passes through input unchanged.
   test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {100, 200, 300, 400});