Commit 9b82627

Merge branch 'main' into support-dynamically-quantized-convolutions
2 parents d82e080 + 7c150d4

File tree: 22 files changed, +238 -401 lines

.github/workflows/doc-build.yml (+2, -2)

@@ -21,12 +21,12 @@ jobs:
       - name: Check URLs
         run: bash ./scripts/check_urls.sh

-  check-links:
+  check-xrefs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
       - name: Check Links
-        run: bash ./scripts/check_links.sh
+        run: bash ./scripts/check_xrefs.sh

   build:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

backends/apple/mps/mps_preprocess.py (+15, -1)

@@ -6,6 +6,7 @@
 from typing import ClassVar, Dict, final, List, Tuple

 import torch
+from executorch import exir

 from executorch.backends.apple.mps.operators.node_visitor import (
     get_node_visitors,
@@ -35,6 +36,7 @@
 from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
 from executorch.exir.program._program import _transform
+from executorch.exir.verification.verifier import EXIREdgeDialectVerifier
 from torch.export.exported_program import ExportedProgram

 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -87,7 +89,19 @@ def preprocess(
     # the `output_ids` array in the schema.

     # TODO: Remove this once we have a better support for the dim-order ops.
-    edge_program = _transform(edge_program, DimOrderOpsRevertPass())
+    # Need to override the verifier to skip the non dim-order ops from tripping the default verifier.
+    edge_program = _transform(
+        edge_program,
+        DimOrderOpsRevertPass(),
+        override_verifiers=[
+            EXIREdgeDialectVerifier(
+                edge_compile_config=exir.EdgeCompileConfig(
+                    _check_ir_validity=False,  # Disable the edge dialect verifier, since we are in the MPS backend.
+                ),
+                class_only=True,
+            )
+        ],
+    )

     mps_graph = MPSGraph(
         version="0",

examples/demo-apps/android/LlamaDemo/README.md (+2, -2)

@@ -135,8 +135,8 @@ Ensure you have the following functions in your callback class that you provided
     }

     @Override
-    public void onStats(float tps) {
-        //...tps (tokens per second) stats is provided by framework
+    public void onStats(String stats) {
+        //... will be a json. See extension/llm/stats.h for the field definitions
     }
 ```

examples/models/llama/export_llama_lib.py (+13, -1)

@@ -1227,10 +1227,22 @@ def _get_source_transforms(  # noqa
     if args.expand_rope_table:
         transforms.append(materialze_broadcast_of_rope_freq_cis)

+    use_attention_mask_for_custom_sdpa = False
+    if isinstance(args, argparse.Namespace):
+        if getattr(args, "use_custom_sdpa_with_attention_mask", None):
+            use_attention_mask_for_custom_sdpa = True
+
     if args.use_sdpa_with_kv_cache:
         transforms.append(replace_kv_cache_with_custom_kv_cache)
         # todo: do this optionally
-        transforms.append(replace_sdpa_with_custom_op)
+        # if use attention mask instead of causal attention
+        # then create partial function that sets use_attention_mask=True
+        if use_attention_mask_for_custom_sdpa:
+            transforms.append(
+                partial(replace_sdpa_with_custom_op, use_attention_mask=True)
+            )
+        else:
+            transforms.append(replace_sdpa_with_custom_op)

     if args.quantize_kv_cache:
         assert args.use_kv_cache, "quantize_kv_cache requires use_kv_cache=True"
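For context, a minimal sketch of how the partial-bound transform keeps the pipeline uniform: functools.partial pre-binds use_attention_mask=True, so the entry appended to `transforms` still has the same one-argument, module-in/module-out shape as the other source transforms. The import path and the driver loop below are assumptions for illustration, not part of this commit.

from functools import partial

import torch

# Import path assumed from the file layout shown in this commit.
from executorch.examples.models.llama.source_transformation.sdpa import (
    replace_sdpa_with_custom_op,
)

# partial(...) pre-binds the keyword; the result is still a plain callable that
# takes a module and returns a module.
mask_aware_transform = partial(replace_sdpa_with_custom_op, use_attention_mask=True)


def apply_source_transforms(model: torch.nn.Module, transforms) -> torch.nn.Module:
    # Hypothetical driver loop: each transform takes and returns an nn.Module,
    # which is why the conditional above appends callables of identical shape.
    for transform in transforms:
        model = transform(model)
    return model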

examples/models/llama/source_transformation/sdpa.py (+50, -14)

@@ -22,9 +22,15 @@ class SDPACustom(torch.nn.Module):
     def __init__(
         self,
         dim: int,
+        max_context_len,
+        enable_dynamic_shape,
+        use_attention_mask: bool = False,
     ):
         super().__init__()
         self.dim = dim
+        self.max_context_len = max_context_len
+        self.use_attention_mask = use_attention_mask
+        self.enable_dynamic_shape = enable_dynamic_shape

     def forward(
         self,
@@ -36,6 +42,16 @@ def forward(
         seqlen,
         mask,
     ):
+        if self.use_attention_mask:
+            if self.enable_dynamic_shape:
+                start_pos = input_pos[-1].item()
+                torch._check_is_size(start_pos)
+                torch._check(start_pos < self.max_context_len)
+                seq_length = q.size(2)
+                mask = mask.narrow(0, start_pos, seq_length)
+            else:
+                mask = mask[input_pos]
+
         q = q.transpose(1, 2)  # (bs, seqlen, n_local_heads, head_dim)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
@@ -47,34 +63,54 @@ def forward(
         k = k.to(dtype=torch.float)
         v = v.to(dtype=torch.float)

-        output = torch.ops.llama.custom_sdpa(
-            q,
-            k,
-            v,
-            input_pos[0].item(),
-            None,  # Attention mask
-            0,  # dropout probability. Ignored by the code
-            True,  # is_causal
-        )
+        if self.use_attention_mask:
+            output = torch.ops.llama.custom_sdpa(
+                q,
+                k,
+                v,
+                input_pos[0].item(),
+                mask,  # Attention mask
+                0,  # dropout probability. Ignored by the code
+                False,  # is_causal
+            )
+        else:
+            output = torch.ops.llama.custom_sdpa(
+                q,
+                k,
+                v,
+                input_pos[0].item(),
+                None,  # Attention mask
+                0,  # dropout probability. Ignored by the code
+                True,  # is_causal
+            )
         return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)


-def _replace_sdpa_with_custom_op(module: torch.nn.Module):
+def _replace_sdpa_with_custom_op(
+    module: torch.nn.Module, use_attention_mask: bool = False
+):
     for name, child in module.named_children():
         if isinstance(child, SDPA):
             setattr(
                 module,
                 name,
-                SDPACustom(child.dim),
+                SDPACustom(
+                    child.dim,
+                    child.max_context_len,
+                    child.enable_dynamic_shape,
+                    use_attention_mask=use_attention_mask,
+                ),
             )
         else:
-            _replace_sdpa_with_custom_op(child)
+            _replace_sdpa_with_custom_op(child, use_attention_mask=use_attention_mask)


-def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module:
+def replace_sdpa_with_custom_op(
+    module: torch.nn.Module, use_attention_mask: bool = False
+) -> torch.nn.Module:
     from executorch.extension.llm.custom_ops import custom_ops  # noqa

-    _replace_sdpa_with_custom_op(module)
+    _replace_sdpa_with_custom_op(module, use_attention_mask=use_attention_mask)
     return module
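As a quick aside on the two new mask-selection branches in SDPACustom.forward: with dynamic shapes enabled the module narrows a contiguous block of rows starting at start_pos, otherwise it indexes the rows at input_pos; for contiguous positions both pick the same slice, which the module then hands to torch.ops.llama.custom_sdpa with is_causal=False. A standalone sketch, with the mask construction and sizes assumed purely for illustration:

import torch

# Illustrative causal mask and positions; the real mask and sizes come from the model.
max_context_len, seq_len, start_pos = 128, 4, 10
mask = torch.triu(
    torch.full((max_context_len, max_context_len), float("-inf")), diagonal=1
)
input_pos = torch.arange(start_pos, start_pos + seq_len)

# Dynamic-shape branch: take seq_len contiguous rows starting at start_pos.
dynamic_mask = mask.narrow(0, start_pos, seq_len)

# Static branch: index the rows for the current positions directly.
static_mask = mask[input_pos]

# For contiguous positions the two branches select the same rows.
assert torch.equal(dynamic_mask, static_mask)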

examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py (+2, -2)

@@ -71,8 +71,8 @@ def test_simple(self, is_dynamic_shape=False):
         self.seq_len = 3
         self._init_cache()
         q, k_val, v_val = self._init_kv()
-        self.float_sdpa = SDPACustom(self.dim)
-        self.quantized_sdpa = SDPACustom(self.dim)
+        self.float_sdpa = SDPACustom(self.dim, self.max_context_len, True)
+        self.quantized_sdpa = SDPACustom(self.dim, self.max_context_len, True)
         k, v = self.custom_kv_cache.update(input_pos, k_val, v_val)
         float_out = self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None)
         k, v = self.quantized_kv_cache.update(input_pos, k_val, v_val)

examples/qualcomm/scripts/mobilebert_fine_tune.py (+1, -3)

@@ -102,9 +102,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size):
     from transformers import get_linear_schedule_with_warmup

     # grab dataset
-    url = (
-        "https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv"
-    )
+    url = "https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv"
     content = requests.get(url, allow_redirects=True).content
     data = pd.read_csv(BytesIO(content))

exir/program/_program.py (+27, -2)

@@ -212,7 +212,30 @@ def _get_updated_graph_signature(
     return new_signature


-def _transform(self, *passes: PassType) -> "ExportedProgram":
+def _transform(
+    self,
+    *passes: PassType,
+    override_verifiers: None | list[Type[Verifier]] = None,
+) -> "ExportedProgram":
+    """
+    Transforms the program according to the provided passes.
+
+    Args:
+        self: The ExportedProgram instance to transform
+        *passes: A sequence of passes to apply to the program
+        override_verifiers: Optional list of verifier classes to use instead of the default verifiers.
+            This is needed if the transforms yields illegal graph that the default verifier cannot handle.
+
+    Returns:
+        ExportedProgram: A new ExportedProgram with the transformations applied, or self if no changes were made
+    """
+    # A user friendly check to avoid vararg surprises, PEP 3102
+    assert not any(
+        isinstance(p, (list, Verifier)) for p in passes
+    ), f"Expected all passes to be of PassType, not list or Verifier. Use override_verifiers kwarg instead. Got: {list(passes)}"
+
+    for p in list(passes):
+        print(type(p))
     pm = PassManager(list(passes))
     res = pm(self.graph_module)
     transformed_gm = res.graph_module if res is not None else self.graph_module
@@ -221,7 +244,9 @@ def _transform(self, *passes: PassType) -> "ExportedProgram":
     if transformed_gm is self.graph_module and not res.modified:
         return self

-    return _update_exported_program_graph_module(self, transformed_gm)
+    return _update_exported_program_graph_module(
+        self, transformed_gm, override_verifiers
+    )


 def _update_exported_program_graph_module(
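Two call styles for the new keyword-only argument appear in this commit: the MPS preprocess above passes the class produced by EXIREdgeDialectVerifier(..., class_only=True), while the unit test below passes a plain Verifier subclass. A condensed sketch tying the two together; the pass and program names in the commented calls are placeholders, not part of the commit.

from executorch import exir
from executorch.exir.program._program import _transform
from executorch.exir.verification.verifier import EXIREdgeDialectVerifier
from torch._export.verifier import Verifier

# Style 1 (MPS backend): an edge-dialect verifier with IR-validity checks disabled,
# returned as a class via class_only=True.
lenient_verifier = EXIREdgeDialectVerifier(
    edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
    class_only=True,
)


# Style 2 (unit test): any Verifier subclass.
class MyVerifier(Verifier):
    dialect: str = "MY_DIALECT"


# override_verifiers is keyword-only; passing a verifier positionally would trip the
# new assert in _transform.
# program = _transform(program, SomePass(), override_verifiers=[lenient_verifier])
# program = _transform(program, SomePass(), override_verifiers=[MyVerifier])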

exir/program/test/test_program.py (+22, -1)

@@ -22,6 +22,7 @@
 from executorch.exir.pass_base import ExportPass
 from executorch.exir.passes import MemoryPlanningPass
 from executorch.exir.program._program import (
+    _transform,
     EdgeProgramManager,
     ExecutorchProgramManager,
     to_edge,
@@ -34,6 +35,7 @@
 from executorch.extension.pybindings.portable_lib import (
     _load_for_executorch_from_buffer,
 )
+from torch._export.verifier import Verifier
 from torch.export import Dim, export, ExportedProgram
 from torch.export._trace import _export

@@ -273,7 +275,6 @@ def get_executorch_memory_planning_passes() -> Dict[str, MemoryPlanningPass]:
                 for output_val in method.outputs:
                     evalue = method.values[output_val]
                     self.assertNotEqual(evalue.val.allocation_info, None)
-            else:
                 for input_val in method.inputs:
                     evalue = method.values[input_val]
                     self.assertEqual(evalue.val.allocation_info, None)
@@ -847,3 +848,23 @@ def test_save_fails(self):
         et = edge.to_executorch()
         with self.assertRaises(ValueError):
             _ = et.save("/tmp/test_save.pt")
+
+    def test__transform_override_verifiers(self):
+        """Test that _transform can override verifiers in the exported program."""
+
+        class MyVerifier(Verifier):
+            dialect: str = "MY_DIALECT"
+
+            def __init__(self):
+                super().__init__()
+
+        model = TestLinear()
+        program = torch.export.export(model, model._get_random_inputs(), strict=True)
+        self.assertFalse(issubclass(program.verifiers[0], MyVerifier))
+
+        # Apply transformation with custom verifier
+        transformed = _transform(
+            program, AddToMulPassEdge(), override_verifiers=[MyVerifier]
+        )
+        self.assertTrue(issubclass(transformed.verifiers[0], MyVerifier))
+        self.assertFalse(issubclass(program.verifiers[0], MyVerifier))

extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java (+12, -12)

@@ -97,16 +97,7 @@ public void onResult(String result) {

           @Override
           public void onStats(String stats) {
-            float tps = 0;
-            try {
-              JSONObject jsonObject = new JSONObject(stats);
-              int numGeneratedTokens = jsonObject.getInt("generated_tokens");
-              int inferenceEndMs = jsonObject.getInt("inference_end_ms");
-              int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms");
-              tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000;
-              LlmModuleInstrumentationTest.this.onStats(tps);
-            } catch (JSONException e) {
-            }
+            LlmModuleInstrumentationTest.this.onStats(stats);
           }
         });

@@ -120,7 +111,16 @@ public void onResult(String result) {
   }

   @Override
-  public void onStats(float tps) {
-    tokensPerSecond.add(tps);
+  public void onStats(String stats) {
+    float tps = 0;
+    try {
+      JSONObject jsonObject = new JSONObject(stats);
+      int numGeneratedTokens = jsonObject.getInt("generated_tokens");
+      int inferenceEndMs = jsonObject.getInt("inference_end_ms");
+      int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms");
+      tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000;
+      tokensPerSecond.add(tps);
+    } catch (JSONException e) {
+    }
   }
 }

extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java (+1, -12)

@@ -28,17 +28,6 @@ public interface LlmCallback {
   @DoNotStrip
   public void onResult(String result);

-  /**
-   * Called when the statistics for the generate() is available.
-   *
-   * Note: This is a deprecated API and will be removed in the future. Please use onStats(String stats)
-   *
-   * @param tps Tokens/second for generated tokens.
-   */
-  @Deprecated
-  @DoNotStrip
-  default public void onStats(float tps) {}
-
   /**
    * Called when the statistics for the generate() is available.
    *
@@ -48,5 +37,5 @@ default public void onStats(float tps) {}
    * @param stats JSON string containing the statistics for the generate()
    */
   @DoNotStrip
-  default public void onStats(String stats) {}
+  default void onStats(String stats) {}
 }

extension/android/jni/jni_layer_llama.cpp (-8)

@@ -100,14 +100,6 @@ class ExecuTorchLlmCallbackJni

   void onStats(const llm::Stats& result) const {
     static auto cls = ExecuTorchLlmCallbackJni::javaClassStatic();
-    static const auto tps_method = cls->getMethod<void(jfloat)>("onStats");
-    double eval_time =
-        (double)(result.inference_end_ms - result.prompt_eval_end_ms);
-
-    float tps = result.num_generated_tokens / eval_time *
-        result.SCALING_FACTOR_UNITS_PER_SECOND;
-    tps_method(self(), tps);
-
     static const auto on_stats_method =
         cls->getMethod<void(facebook::jni::local_ref<jstring>)>("onStats");
     on_stats_method(

extension/llm/custom_ops/op_sdpa.cpp (+2, -1)

@@ -400,7 +400,8 @@ Tensor& custom_sdpa_out_impl(

   ET_CHECK_MSG(q.dim() == 4, "query must be a 4D tensor");

-  const int64_t num_keys_for_causal_attention = start_pos + seq_len;
+  const int64_t num_keys_for_causal_attention =
+      attn_mask.has_value() ? -1 : start_pos + seq_len;

   ET_KERNEL_CHECK(
       ctx,
