diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index b93b7b0283c..ff0277c2056 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -17,6 +17,7 @@ parameterized==0.9.0 # Doc build requirements, same as https://github.com/pytorch/pytorch/blob/main/.ci/docker/requirements-docs.txt sphinx==5.3.0 +sphinx-reredirects==0.1.4 sphinx-gallery==0.14.0 breathe==4.34.0 exhale==0.2.3 diff --git a/.ci/scripts/build_android_instrumentation.sh b/.ci/scripts/build_android_instrumentation.sh deleted file mode 100644 index 5e074d9e215..00000000000 --- a/.ci/scripts/build_android_instrumentation.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ex - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi -which "${PYTHON_EXECUTABLE}" - -mkdir -p "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources -cp extension/module/test/resources/add.pte "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources - -pushd "${BUILD_AAR_DIR}" -ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest -ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest -popd diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py index b2126f84e78..9a4723d7e56 100755 --- a/.ci/scripts/gather_benchmark_configs.py +++ b/.ci/scripts/gather_benchmark_configs.py @@ -23,6 +23,7 @@ "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa", "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db", "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a", + "google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d", } # Predefined benchmark configurations diff --git a/.ci/scripts/test_ios_ci.sh b/.ci/scripts/test_ios_ci.sh index 50c6448d4b2..e87dbec8444 100755 --- a/.ci/scripts/test_ios_ci.sh +++ b/.ci/scripts/test_ios_ci.sh @@ -7,7 +7,7 @@ set -e -APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" +APP_PATH="executorch-examples/apple/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" @@ -34,6 +34,10 @@ say() { echo -e "\033[1m\n\t** $1 **\n\033[0m" } +say "Cloning the Demo App" + +git clone --depth 1 https://github.com/pytorch-labs/executorch-examples.git + say "Installing CoreML Backend Requirements" ./backends/apple/coreml/scripts/install_requirements.sh diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 15df725f9c1..8a1d5683b33 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -154,7 +154,7 @@ run_and_verify() { EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various" else # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. 
- EXPECTED_PREFIX="ASSISTANT:" + EXPECTED_PREFIX="ASSISTANT: image" fi if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then echo "Expected result prefix: ${EXPECTED_PREFIX}" diff --git a/.github/release.yml b/.github/release.yml index 8caa4ede084..fc4accd252a 100644 --- a/.github/release.yml +++ b/.github/release.yml @@ -15,57 +15,82 @@ changelog: - title: ARM labels: - "release notes: arm" + - "module: arm" + - "partner: arm" - title: NXP - labels: + labels: - "release notes: nxp" + - "module: nxp" - title: Exir - labels: + labels: - "release notes: exir" + - "module: exir" - title: Misc - labels: + labels: - "release notes: misc" - title: Apple - labels: + labels: - "release notes: apple" + - "module: coreml" + - "module: mps" + - title: Android + labels: + - "module: android" + - title: IOS + labels: + - "module: ios" - title: Build - labels: + labels: - "release notes: build" - title: Vulkan - labels: + labels: - "release notes: vulkan" + - "module: vulkan" - title: Cadence - labels: + labels: - "release notes: cadence" + - "module: cadence" - title: Runtime - labels: + labels: - "release notes: runtime" + - "module: runtime" - title: XNNPACK - labels: + labels: - "release notes: xnnpack" + - "module: xnnpack" - title: Devtools - labels: + labels: - "release notes: devtools" + - "module: devtools" - title: Examples - labels: + labels: - "release notes: examples" + - title: LLM + labels: + - "module: llm" - title: Mediatek - labels: + labels: - "release notes: mediatek" + - "partner: mediatek" - title: Openvino - labels: + labels: - "release notes: openvino" - title: Qualcomm - labels: + labels: - "release notes: qualcomm" + - "partner: qualcomm" + - "module: qnn" - title: Training - labels: + labels: - "release notes: training" + - "module: training" - title: Quantization - labels: + labels: - "release notes: quantization" - title: Ops & kernels - labels: - - "release notes: ops & kernels" + labels: + - "release notes: ops & kernels" + - "module: kernels" - title: Other Changes labels: - "*" diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index e29833015d3..630ae2747bf 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -14,7 +14,7 @@ jobs: with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' + submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps @@ -22,6 +22,10 @@ jobs: script: | set -eux + # Use sccache for NDK compiler as well + export CMAKE_CXX_COMPILER_LAUNCHER=sccache + export CMAKE_C_COMPILER_LAUNCHER=sccache + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" @@ -36,8 +40,9 @@ jobs: cp ${BUILD_AAR_DIR}/executorch.aar $ARTIFACTS_DIR_NAME mkdir -p ${ARTIFACTS_DIR_NAME}/library_test_dir - bash .ci/scripts/build_android_instrumentation.sh - cp ${BUILD_AAR_DIR}/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir" + bash extension/android/executorch_android/android_test_setup.sh + (cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest) + cp extension/android/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir" mkdir 
-p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom bash examples/models/llama/install_requirements.sh @@ -130,7 +135,8 @@ jobs: # https://github.com/ReactiveCircus/android-emulator-runner. The max number # of cores we can set is 6, any higher number will be reduced to 6. cores: 6 - ram-size: 12288M + ram-size: 16384M + heap-size: 12288M force-avd-creation: false disable-animations: true emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none diff --git a/.github/workflows/android-perf-private-device-experiment.yml b/.github/workflows/android-perf-private-device-experiment.yml new file mode 100644 index 00000000000..0108ab119ca --- /dev/null +++ b/.github/workflows/android-perf-private-device-experiment.yml @@ -0,0 +1,62 @@ +name: android-perf (private devices) + +on: + schedule: + - cron: 0 0,4,8,12,16,20 * * * + pull_request: + paths: + - .github/workflows/android-perf-private-device-experiment.yml + push: + branches: + - main + paths: + - .github/workflows/android-perf-private-device-experiment.yml + # Note: GitHub has an upper limit of 10 inputs + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + devices: + description: Target devices to run benchmark + required: false + type: string + default: google_pixel_3_private_rooted + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + devices: + description: Target devices to run benchmark + required: false + type: string + default: google_pixel_3_private_rooted + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + +concurrency: + group: android-perf-private-devices-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + android: + uses: ./.github/workflows/android-perf.yml + secrets: inherit + permissions: + id-token: write + contents: read + with: + models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }} + devices: google_pixel_3_private_rooted + benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 09a6453094f..5245d2f4f12 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -345,7 +345,7 @@ jobs: with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' + submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps @@ -353,6 +353,10 @@ jobs: script: | set -eux + # Use sccache for NDK compiler as well + export CMAKE_CXX_COMPILER_LAUNCHER=sccache + export CMAKE_C_COMPILER_LAUNCHER=sccache + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" @@ 
-392,7 +396,7 @@ jobs: fail-fast: false with: # Due to scheduling a job may be pushed beyond the default 60m threshold - timeout: 120 + timeout: 240 device-type: android runner: linux.2xlarge test-infra-ref: '' diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml index 24aa6c1ad27..b31ff644d94 100644 --- a/.github/workflows/android-release-artifacts.yml +++ b/.github/workflows/android-release-artifacts.yml @@ -7,6 +7,17 @@ on: description: Version name to be uploaded for AAR release required: false type: string + upload_to_maven: + description: Upload the AAR to maven staging repository + required: false + type: boolean + flavor: + type: choice + options: + - "xnnpack" + - "vulkan+xnnpack" + schedule: + - cron: 0 10 * * * concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -22,6 +33,10 @@ jobs: shell: bash run: | VERSION="${{ inputs.version }}" + if [ -z "$VERSION" ]; then + echo "No version name specified. Will create a snapshot AAR" + exit 0 + fi if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" | grep "200 OK"; then echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" echo "Will skip build/upload" @@ -31,14 +46,18 @@ jobs: build-aar: name: build-aar needs: check-if-aar-exists - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.7 + secrets: inherit permissions: id-token: write contents: read with: - runner: linux.2xlarge + secrets-env: EXECUTORCH_MAVEN_SIGNING_KEYID EXECUTORCH_MAVEN_SIGNING_PASSWORD EXECUTORCH_MAVEN_CENTRAL_PASSWORD EXECUTORCH_MAVEN_CENTRAL_USERNAME EXECUTORCH_MAVEN_SIGNING_GPG_KEY_CONTENTS + # As this job has access to Maven credential, run this on a fresh ephemeral runner + runner: ephemeral.linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' + submodules: 'recursive' ref: ${{ github.sha }} timeout: 90 upload-artifact: android-apps @@ -46,12 +65,37 @@ jobs: script: | set -eux + # Use sccache for NDK compiler as well + export CMAKE_CXX_COMPILER_LAUNCHER=sccache + export CMAKE_C_COMPILER_LAUNCHER=sccache + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool buck2 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + mkdir -p ~/.gradle + touch ~/.gradle/gradle.properties + echo "signing.keyId=${SECRET_EXECUTORCH_MAVEN_SIGNING_KEYID}" >> ~/.gradle/gradle.properties + echo "signing.password=${SECRET_EXECUTORCH_MAVEN_SIGNING_PASSWORD}" >> ~/.gradle/gradle.properties + echo "mavenCentralUsername=${SECRET_EXECUTORCH_MAVEN_CENTRAL_USERNAME}" >> ~/.gradle/gradle.properties + echo "mavenCentralPassword=${SECRET_EXECUTORCH_MAVEN_CENTRAL_PASSWORD}" >> ~/.gradle/gradle.properties + echo "signing.secretKeyRingFile=/tmp/secring.gpg" >> ~/.gradle/gradle.properties + + echo -n "$SECRET_EXECUTORCH_MAVEN_SIGNING_GPG_KEY_CONTENTS" | base64 -d > /tmp/secring.gpg + + # Update the version name in build.gradle in case of maven publish + VERSION="${{ inputs.version }}" + if [ ! 
-z "$VERSION" ]; then + sed -i "s/\(coordinates(\"org.pytorch\", \"executorch-android\", \"\)\([0-9]\+.[0-9]\+.[0-9]\+\)\(\")\)/\1$VERSION\3/" extension/android/executorch_android/build.gradle + fi + + FLAVOR="${{ inputs.flavor }}" + if [[ "$FLAVOR" == "vulkan+xnnpack" ]]; then + export EXECUTORCH_BUILD_VULKAN=ON + fi + # Build AAR Package mkdir aar-out export BUILD_AAR_DIR=aar-out @@ -61,6 +105,12 @@ jobs: shasum -a 256 "${ARTIFACTS_DIR_NAME}/executorch.aar" + # Publish to maven staging + UPLOAD_TO_MAVEN="${{ inputs.upload_to_maven }}" + if [[ "$UPLOAD_TO_MAVEN" == "true" ]]; then + (cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:publishToMavenCentral) + fi + upload-release-aar: name: upload-release-aar needs: build-aar @@ -84,6 +134,8 @@ jobs: pip install awscli==1.32.18 AWS_CMD="aws s3 cp" VERSION="${{ inputs.version }}" - VERSION_NAME="${VERSION:-temp_snapshot}" - ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar --acl public-read - ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar.sha256sums --acl public-read + if [ -z "$VERSION" ]; then + VERSION="snapshot-$(date +"%Y%m%d")" + fi + ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}/executorch.aar --acl public-read + ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}/executorch.aar.sha256sums --acl public-read diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index b8b63078643..7fa40e3ea75 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -14,6 +14,20 @@ on: - cron: '0 0 * * *' jobs: + check-urls: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check URLs + run: bash ./scripts/check_urls.sh + + check-xrefs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check Links + run: bash ./scripts/check_xrefs.sh + build: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: diff --git a/.mypy.ini b/.mypy.ini index 8c1c9dbcadc..5ee07ddb2bf 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -80,6 +80,9 @@ ignore_missing_imports = True [mypy-serializer.*] ignore_missing_imports = True +[mypy-tosa_tools.*] +ignore_missing_imports = True + [mypy-setuptools.*] ignore_missing_imports = True diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dbb66afdaa..34538d1e5ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -761,12 +761,16 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor) endif() +if(EXECUTORCH_BUILD_EXTENSION_MODULE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) +endif() + if(EXECUTORCH_BUILD_EXTENSION_LLM) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizers) endif() -if(EXECUTORCH_BUILD_EXTENSION_MODULE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) endif() if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) @@ -810,6 +814,10 @@ if(EXECUTORCH_BUILD_PYBIND) torch ) + if(EXECUTORCH_BUILD_TESTS) + list(APPEND _dep_libs test_backend_compiler_lib) + endif() + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) list(APPEND _dep_libs optimized_native_cpu_ops_lib) else() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ed1e2b30323..c0df9cefebe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ 
-1,19 +1,109 @@ Thank you for your interest in contributing to ExecuTorch! We want to make it easy to contribute to this project. -  ## Dev Install Set up your environment by following the instructions at -https://pytorch.org/executorch/stable/getting-started-setup.html to clone +https://pytorch.org/executorch/main/getting-started-setup to clone the repo and install the necessary requirements. +Refer to this [document](docs/source/using-executorch-building-from-source.md) to build ExecuTorch from source. + +### Dev Setup for Android +For Android, please refer to the [Android documentation](docs/source/using-executorch-android.md). + +### Dev Setup for Apple +For Apple, please refer to the [iOS documentation](docs/source/using-executorch-ios.md). +  + +## Codebase structure + +
+
+executorch
+├── backends - Backend delegate implementations for various hardware targets. Each backend uses a partitioner to split the graph into subgraphs that can be executed on specific hardware, a quantizer to optimize model precision, and runtime components to execute the graph on the target hardware. Refer to the backend documentation and the Export and Lowering tutorial for more information.
+│   ├── apple - Apple-specific backends.
+│   │   ├── coreml - CoreML backend for Apple devices. See doc.
+│   │   └── mps - Metal Performance Shaders backend for Apple devices. See doc.
+│   ├── arm - ARM architecture backends. See doc.
+│   ├── cadence - Cadence-specific backends. See doc.
+│   ├── example - Example backend implementations.
+│   ├── mediatek - MediaTek-specific backends. See doc.
+│   ├── openvino - OpenVINO backend for Intel hardware.
+│   ├── qualcomm - Qualcomm-specific backends. See doc.
+│   ├── transforms - Transformations for backend optimization.
+│   ├── vulkan - Vulkan backend for cross-platform GPU support. See doc.
+│   └── xnnpack - XNNPACK backend for optimized neural network operations. See doc.
+├── codegen - Tooling to autogenerate bindings between kernels and the runtime.
+├── configurations - Configuration files.
+├── devtools - Model profiling, debugging, and inspection. Please refer to the tools documentation for more information.
+│   ├── bundled_program - A tool for validating ExecuTorch models. See doc.
+│   ├── etdump - ETDump, a format for saving profiling and debugging data from the runtime. See doc.
+│   ├── etrecord - ETRecord, the AOT debug artifact for ExecuTorch. See doc.
+│   ├── inspector - Python API to inspect ETDump and ETRecord. See doc.
+│   └── visualization - Visualization tools for representing model structure and performance metrics.
+├── docs - Static docs tooling and documentation source files.
+├── examples - Examples of various user flows, such as model export, delegates, and runtime execution.
+├── exir - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of torch.export. This directory contains utilities and passes for lowering EXIR graphs into different dialects until they are suitable to run on target hardware (a minimal export sketch follows this tree).
+│   ├── _serialize - Serialize final export artifact.
+│   ├── backend - Backend delegate ahead of time APIs.
+│   ├── capture - Program capture.
+│   ├── dialects - Op sets for various dialects in the export process. Please refer to the EXIR spec and the backend dialect doc for more details.
+│   ├── emit - Conversion from ExportedProgram to ExecuTorch execution instructions.
+│   ├── operator - Operator node manipulation utilities.
+│   ├── passes - Built-in compiler passes.
+│   ├── program - Export artifacts.
+│   ├── serde - Graph module serialization/deserialization.
+│   └── verification - IR verification.
+├── extension - Extensions built on top of the runtime.
+│   ├── android - ExecuTorch wrappers for Android apps. Please refer to the Android documentation and Javadoc for more information.
+│   ├── apple - ExecuTorch wrappers for iOS apps. Please refer to the iOS documentation for how to integrate with Apple platforms.
+│   ├── aten_util - Converts to and from PyTorch ATen types.
+│   ├── data_loader - 1st party data loader implementations.
+│   ├── evalue_util - Helpers for working with EValue objects.
+│   ├── gguf_util - Tools to convert from the GGUF format.
+│   ├── kernel_util - Helpers for registering kernels.
+│   ├── llm - Library to run LLMs on ExecuTorch, including common optimization passes and runtime C++ components. Please refer to the LLM documentation for more information.
+│   ├── memory_allocator - 1st party memory allocator implementations.
+│   ├── module - A simplified C++ wrapper for the runtime that deserializes and executes an ExecuTorch artifact (.pte file). Refer to the module documentation for more information.
+│   ├── parallel - C++ threadpool integration.
+│   ├── pybindings - Python bindings for the ExecuTorch runtime; these power the runtime Python API.
+│   ├── pytree - C++ and Python flattening and unflattening lib for pytrees.
+│   ├── runner_util - Helpers for writing C++ PTE-execution tools.
+│   ├── tensor - Tensor maker and TensorPtr; see this documentation for details. For how to use TensorPtr and Module, please refer to the "Using ExecuTorch with C++" doc.
+│   ├── testing_util - Helpers for writing C++ tests.
+│   ├── threadpool - Threadpool.
+│   └── training - Experimental libraries for on-device training.
+├── kernels - 1st party kernel implementations.
+│   ├── aten - ATen kernel implementations.
+│   ├── optimized - Optimized kernel implementations.
+│   ├── portable - Reference implementations of ATen operators.
+│   ├── prim_ops - Special ops used in the ExecuTorch runtime for control flow and symbolic primitives.
+│   └── quantized - Quantized kernel implementations.
+├── profiler - Utilities for profiling runtime execution.
+├── runtime - Core C++ runtime. These components are used to execute the ExecuTorch program. Please refer to the runtime documentation for more information.
+│   ├── backend - Backend delegate runtime APIs.
+│   ├── core - Core structures used across all levels of the runtime: basic components such as Tensor, EValue, Error, and Result.
+│   ├── executor - Model loading, initialization, and execution. Runtime components that execute the ExecuTorch program, such as Program and Method. Refer to the runtime API documentation for more information.
+│   ├── kernel - Kernel registration and management.
+│   └── platform - Layer between architecture specific code and portable C++.
+├── schema - ExecuTorch PTE file format flatbuffer schemas.
+├── scripts - Utility scripts for building libs, size management, dependency management, etc.
+├── shim_et - Compatibility layer between OSS and Internal builds.
+├── test - Broad scoped end-to-end tests.
+├── third-party - Third-party dependencies.
+├── tools - Tools for building ExecuTorch from source with different build systems (CMake, Buck).
+└── util - Various helpers and scripts.
+
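
To make the ahead-of-time flow described for `exir` above concrete, here is a minimal export sketch. It mirrors the export/lowering pattern used elsewhere in the ExecuTorch examples in this change; the tiny model, input shape, and output file name are illustrative placeholders rather than anything prescribed by this tree.

```python
# Minimal sketch: capture a PyTorch module with torch.export, lower it through
# EXIR, and serialize the result as a .pte artifact. Model, inputs, and file
# name are illustrative placeholders.
import torch
import executorch.exir


class TinyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.sin(x)


example_inputs = (torch.ones(1),)

# Capture the model into an ExportedProgram (ATen dialect).
aten_program = torch.export.export(TinyModel(), example_inputs)

# Lower into the EXIR Edge dialect; backend delegation (to_backend with a
# partitioner) would happen at this stage when targeting a specific backend.
edge_program_manager = executorch.exir.to_edge(aten_program)

# Convert to the ExecuTorch dialect and serialize the PTE file.
executorch_program = edge_program_manager.to_executorch()
with open("model.pte", "wb") as f:
    f.write(executorch_program.buffer)
```

The resulting `model.pte` is what the runtime pieces listed above load and execute, for example through `extension/module` in C++ or `extension/pybindings` in Python.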
+   ## Contributing workflow We actively welcome your pull requests (PRs). +If you're completely new to open-source projects, GitHub, or ExecuTorch, please see our [New Contributor Guide](docs/source/new-contributor-guide.md) for a step-by-step walkthrough on making your first contribution. Otherwise, read on. + 1. [Claim an issue](#claiming-issues), if present, before starting work. If an issue doesn't cover the work you plan to do, consider creating one to provide context about it, and to build consensus about the scope and solution. @@ -24,7 +114,7 @@ We actively welcome your pull requests (PRs). 1. If you've changed APIs or added a new tool or feature, [update the documentation](#updating-documentation). 1. If you added an experimental API or deprecated an existing API, follow the - [API Life Cycle and Deprecation Policy](/docs/source/api-life-cycle.md). + [API Life Cycle and Deprecation Policy](docs/source/api-life-cycle.md). 1. Make sure your code follows the [style guides](#coding-style) and passes the [lint checks](#lintrunner). 1. If you haven't already, complete the [Contributor License Agreement ("CLA")](#contributor-license-agreement-cla). @@ -103,9 +193,6 @@ in the Github repo. ## Coding Style -Goal: Encourage standards that make it easier to read, edit, maintain, and debug -the ExecuTorch code. - ### lintrunner We use [`lintrunner`](https://pypi.org/project/lintrunner/) to help make sure the @@ -158,7 +245,7 @@ modifications to the Google C++ style guide. ### C++ Portability Guidelines -See also [Portable C++ Programming](/docs/source/portable-cpp-programming.md) +See also [Portable C++ Programming](docs/source/portable-cpp-programming.md) for detailed advice. #### C++ language version @@ -170,7 +257,7 @@ toolchains, and having access to relatively modern C++ features. #### C/C++ standard library usage -**Restricted usage of the C++ standard library.** +**Restricted usage of the C++ standard library** Rationale: ExecuTorch is intended to be portable to bare-metal systems that lack certain features, like dynamic memory, threading, and locking, required by parts @@ -191,7 +278,7 @@ careful to also manually destroy objects initialized in this way. #### C++ language features -**Exceptions: Do not use.** +**Exceptions: Do not use** - Rationale: Exceptions are not widely supported on some classes of microcontrollers and DSPs, and they can significantly increase binary size. @@ -200,12 +287,12 @@ must work with threading** - Rationale: The core runtime must work on systems that do not have threading support. -**RTTI, dynamic_cast, and ``: Do not use.** +**RTTI, dynamic_cast, and ``: Do not use** - Rationale: RTTI adds extra data to every virtual class. ExecuTorch doesn't have a strong need for `dynamic_cast` and friends, so it's better to reduce the binary size. -**Templates and template metaprogramming: Be careful and avoid if possible.** +**Templates and template metaprogramming: Be careful and avoid if possible** - Rationale: Most templating results in code generation, and is one of the most common sources of binary bloat. Some use of templates is fine (e.g. an `ArrayRef`, or code that handles multiple `ScalarType` types), but for the @@ -221,7 +308,7 @@ CI is run automatically on all pull requests. However, if you want to run tests - The `sh test/build_size_test.sh` script will compile the C++runtime along with portable kernels. - The `test/run_oss_cpp_tests.sh` script will build and run C++ tests locally -- Running `pytest` from the root directory will run Python tests locally. 
+- Running `pytest` from the root directory will run Python tests locally. Make sure to run this after finishing [Dev Install](#dev-install). ### Writing Tests To help keep code quality high, ExecuTorch uses a combination of unit tests and @@ -270,7 +357,7 @@ docs](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/ for basics. 1. Push your branch to your fork of `pytorch/executorch`. Most people do not - have permission to push a branch directoy to the upstream repo. + have permission to push a branch directory to the upstream repo. 1. Create your PR - Use the `main` branch as the base. - Give the PR a clear and descriptive title. It will become the title of the @@ -279,7 +366,8 @@ for basics. - Good title: "Add XYZ method to ABC" - Give the PR a clear and thorough description. Don't just describe what the PR does: the diff will do that. Explain *why* you are making this change, in a - way that will make sense to someone years from now. + way that will make sense to someone years from now. If the PR is a bug fix, + include the issue number at the beginning of the description: "Fixes #1234" - Explain how you have tested your changes by including repeatable instructions for testing the PR. - If you added tests, this can be as simple as the command you used to run the @@ -321,26 +409,17 @@ for basics. - If the reviewers have requests or questions, follow up with them. - The goal of the reviewer is to ensure that the code in the `main` branch of the repo is consistent, maintainable, and of high quality. -1. Once the PR has been approved, - - If you have the "write permission" in this repo, you can merge it yourself - by clicking the "Squash and merge" button once it is green and all CI - signals are passing. - - If you don't have "write permission" in this repo, the reviewer will take - care of the PR. The reviewer may import the PR into Meta's internal system - to validate it against internal CI. - - If the PR is approved but not merged within 5 business days, please comment - on the PR to ask about its status. - - Note that if the `main` [CI](#continuous-integration) jobs are broken, we - will only merge PRs that fix the broken jobs until all critical jobs are - fixed. +1. Once the PR has been approved, you can merge it yourself + by clicking the "Squash and merge" button once it is + green and all CI signals are passing.   ## For Backend Delegate Authors -- Use [this](/docs/source/backend-delegates-integration.md) guide when +- Use [this](docs/source/backend-delegates-integration.md) guide when integrating your delegate with ExecuTorch. -- Refer to [this](/docs/source/backend-delegates-dependencies.md) set of +- Refer to [this](docs/source/backend-delegates-dependencies.md) set of guidelines when including a third-party depenency for your delegate.   diff --git a/Package.swift b/Package.swift index 1322b918c07..b8a8b7d064b 100644 --- a/Package.swift +++ b/Package.swift @@ -15,7 +15,7 @@ // // For details on building frameworks locally or using prebuilt binaries, // see the documentation: -// https://pytorch.org/executorch/main/using-executorch-ios.html +// https://pytorch.org/executorch/main/using-executorch-ios import PackageDescription diff --git a/README-wheel.md b/README-wheel.md index 9f074ab5ee3..12906bfd382 100644 --- a/README-wheel.md +++ b/README-wheel.md @@ -10,32 +10,21 @@ The `executorch` pip package is in beta. 
The prebuilt `executorch.runtime` module included in this package provides a way to run ExecuTorch `.pte` files, with some restrictions: -* Only [core ATen - operators](https://pytorch.org/executorch/stable/ir-ops-set-definition.html) - are linked into the prebuilt module -* Only the [XNNPACK backend - delegate](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html) - is linked into the prebuilt module. -* \[macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) - and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend - delegates are also linked into the prebuilt module. +* Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module +* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module. +* \[macOS only] [Core ML](docs/source/backends-coreml.md) and [MPS](docs/source/backends-mps.md) backend + are also linked into the prebuilt module. -Please visit the [ExecuTorch website](https://pytorch.org/executorch/) for +Please visit the [ExecuTorch website](https://pytorch.org/executorch) for tutorials and documentation. Here are some starting points: -* [Getting - Started](https://pytorch.org/executorch/stable/getting-started-setup.html) +* [Getting Started](https://pytorch.org/executorch/main/getting-started-setup) * Set up the ExecuTorch environment and run PyTorch models locally. -* [Working with - local LLMs](https://pytorch.org/executorch/stable/llm/getting-started.html) +* [Working with local LLMs](docs/source/llm/getting-started.md) * Learn how to use ExecuTorch to export and accelerate a large-language model from scratch. -* [Exporting to - ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial.html) +* [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and optimizing its performance using quantization and hardware delegation. -* Running LLaMA on - [iOS](https://pytorch.org/executorch/stable/llm/llama-demo-ios.html) and - [Android](https://pytorch.org/executorch/stable/llm/llama-demo-android.html) - devices. +* Running LLaMA on [iOS](docs/source/llm/llama-demo-ios.md) and [Android](docs/source/llm/llama-demo-android.md) devices. * Build and run LLaMA in a demo mobile app, and learn how to integrate models with your own apps. diff --git a/README.md b/README.md index dd1fafe715b..c0d594e7733 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- Logo + Logo

ExecuTorch: A powerful on-device AI Framework

@@ -8,7 +8,7 @@ Contributors Stargazers Join our Discord community - Check out the documentation + Check out the documentation
@@ -49,9 +49,9 @@ Key value propositions of ExecuTorch are: ## Getting Started To get started you can: -- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index.html) on getting things running locally and deploy a model to a device -- Use this [Colab Notebook](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away -- Jump straight into LLMs use cases by following specific instructions for [Llama](./examples/models/llama/README.md) and [Llava](./examples/models/llava/README.md) +- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index) to get things running locally and deploy a model to a device +- Use this [Colab Notebook](https://pytorch.org/executorch/main/getting-started-setup#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away +- Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md) ## Feedback and Engagement @@ -65,62 +65,7 @@ We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md ## Directory Structure -``` -executorch -├── backends # Backend delegate implementations. -├── codegen # Tooling to autogenerate bindings between kernels and the runtime. -├── configurations -├── docs # Static docs tooling. -├── examples # Examples of various user flows, such as model export, delegates, and runtime execution. -├── exir # Ahead-of-time library: model capture and lowering APIs. -| ├── _serialize # Serialize final export artifact. -| ├── backend # Backend delegate ahead of time APIs -| ├── capture # Program capture. -| ├── dialects # Op sets for various dialects in the export process. -| ├── emit # Conversion from ExportedProgram to ExecuTorch execution instructions. -| ├── operator # Operator node manipulation utilities. -| ├── passes # Built-in compiler passes. -| ├── program # Export artifacts. -| ├── serde # Graph module serialization/deserialization. -| ├── verification # IR verification. -├── extension # Extensions built on top of the runtime. -| ├── android # ExecuTorch wrappers for Android apps. -| ├── apple # ExecuTorch wrappers for iOS apps. -| ├── aten_util # Converts to and from PyTorch ATen types. -| ├── data_loader # 1st party data loader implementations. -| ├── evalue_util # Helpers for working with EValue objects. -| ├── gguf_util # Tools to convert from the GGUF format. -| ├── kernel_util # Helpers for registering kernels. -| ├── memory_allocator # 1st party memory allocator implementations. -| ├── module # A simplified C++ wrapper for the runtime. -| ├── parallel # C++ threadpool integration. -| ├── pybindings # Python API for executorch runtime. -| ├── pytree # C++ and Python flattening and unflattening lib for pytrees. -| ├── runner_util # Helpers for writing C++ PTE-execution tools. -| ├── testing_util # Helpers for writing C++ tests. -| ├── training # Experimental libraries for on-device training -├── kernels # 1st party kernel implementations. -| ├── aten -| ├── optimized -| ├── portable # Reference implementations of ATen operators. -| ├── prim_ops # Special ops used in executorch runtime for control flow and symbolic primitives. -| ├── quantized -├── profiler # Utilities for profiling runtime execution. -├── runtime # Core C++ runtime. -| ├── backend # Backend delegate runtime APIs. -| ├── core # Core structures used across all levels of the runtime. 
-| ├── executor # Model loading, initialization, and execution. -| ├── kernel # Kernel registration and management. -| ├── platform # Layer between architecture specific code and portable C++. -├── schema # ExecuTorch PTE file format flatbuffer schemas. -├── scripts # Utility scripts for building libs, size management, dependency management, etc. -├── tools # Development tool management. -├── devtools # Model profiling, debugging, and introspection. -├── shim # Compatibility layer between OSS and Internal builds -├── test # Broad scoped end-to-end tests. -├── third-party # Third-party dependencies. -├── util # Various helpers and scripts. -``` +Please refer to the [Codebase structure](CONTRIBUTING.md#codebase-structure) section of the [Contributing Guidelines](CONTRIBUTING.md) for more details. ## License ExecuTorch is BSD licensed, as found in the LICENSE file. diff --git a/backends/apple/coreml/README.md b/backends/apple/coreml/README.md index e8a062774d0..d063dfc8b71 100644 --- a/backends/apple/coreml/README.md +++ b/backends/apple/coreml/README.md @@ -1,8 +1,7 @@ # ExecuTorch Core ML Delegate - This subtree contains the Core ML Delegate implementation for ExecuTorch. -Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. +Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends-coreml.md). ## Layout - `compiler/` : Lowers a module to Core ML backend. @@ -19,110 +18,6 @@ Core ML is an optimized framework for running machine learning models on Apple d - `workspace` : Xcode workspace for the runtime. - `third-party/`: External dependencies. -## Partition and Delegation - -To delegate a Program to the **Core ML** backend, the client must call `to_backend` with the **CoreMLPartitioner**. - -```python -import torch -import executorch.exir - -from executorch.backends.apple.coreml.compiler import CoreMLBackend -from executorch.backends.apple.coreml.partition import CoreMLPartitioner - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sin(x) - -source_model = Model() -example_inputs = (torch.ones(1), ) - -# Export the source model to Edge IR representation -aten_program = torch.export.export(source_model, example_inputs) -edge_program_manager = executorch.exir.to_edge(aten_program) - -# Delegate to Core ML backend -delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) - -# Serialize delegated program -executorch_program = delegated_program_manager.to_executorch() -with open("model.pte", "wb") as f: - f.write(executorch_program.buffer) -``` - -The module will be fully or partially delegated to **Core ML**, depending on whether all or part of ops are supported by the **Core ML** backend. User may force skip certain ops by `CoreMLPartitioner(skip_ops_for_coreml_delegation=...)` - -The `to_backend` implementation is a thin wrapper over [coremltools](https://apple.github.io/coremltools/docs-guides/), `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**. 
- -## Quantization - -To quantize a Program in a Core ML favored way, the client may utilize **CoreMLQuantizer**. - -```python -import torch -import executorch.exir - -from torch.export import export_for_training -from torch.ao.quantization.quantize_pt2e import ( - convert_pt2e, - prepare_pt2e, - prepare_qat_pt2e, -) - -from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer -from coremltools.optimize.torch.quantization.quantization_config import ( - LinearQuantizerConfig, - QuantizationScheme, -) - -class Model(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=3, out_channels=16, kernel_size=3, padding=1 - ) - self.relu = torch.nn.ReLU() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - a = self.conv(x) - return self.relu(a) - -source_model = Model() -example_inputs = (torch.randn((1, 3, 256, 256)), ) - -pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module() - -quantization_config = LinearQuantizerConfig.from_dict( - { - "global_config": { - "quantization_scheme": QuantizationScheme.symmetric, - "activation_dtype": torch.quint8, - "weight_dtype": torch.qint8, - "weight_per_channel": True, - } - } -) -quantizer = CoreMLQuantizer(quantization_config) - -# For post-training quantization, use `prepare_pt2e` -# For quantization-aware trainin,g use `prepare_qat_pt2e` -prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer) - -prepared_graph(*example_inputs) -converted_graph = convert_pt2e(prepared_graph) -``` - -The `converted_graph` is the quantized torch model, and can be delegated to **Core ML** similarly through **CoreMLPartitioner** - -## Runtime - -To execute a Core ML delegated program, the application must link to the `coremldelegate` library. Once linked there are no additional steps required, ExecuTorch when running the program would call the Core ML runtime to execute the Core ML delegated part of the program. - -Please follow the instructions described in the [Core ML setup](/backends/apple/coreml/setup.md) to link the `coremldelegate` library. - ## Help & Improvements If you have problems or questions or have suggestions for ways to make implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues). diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h index d97b3cf9b76..01655ca06c1 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h @@ -7,7 +7,7 @@ #import -#import +#import "asset.h" NS_ASSUME_NONNULL_BEGIN diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm index 6b1723f7113..455edf89480 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm @@ -5,15 +5,15 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "ETCoreMLAsset.h" + +#import "ETCoreMLLogging.h" +#import "objc_safe_cast.h" #import #import #import #import - -#import - namespace { using namespace executorchcoreml; @@ -85,6 +85,10 @@ - (void)dealloc { - (BOOL)_keepAliveAndReturnError:(NSError * __autoreleasing *)error { if (!_isValid) { + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "The asset with identifier = %@ is invalid. 
Some required asset files appear to be missing.", + _identifier); return NO; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h index 04fef204e1a..11d957044e9 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h @@ -7,7 +7,7 @@ #import -#import +#import "database.hpp" @class ETCoreMLAsset; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm index 73e9cc0f33b..256026e1f09 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm @@ -6,12 +6,14 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. #import "ETCoreMLAssetManager.h" -#import -#import -#import + +#import "ETCoreMLAsset.h" +#import "ETCoreMLLogging.h" +#import "database.hpp" +#import "json_key_value_store.hpp" +#import "serde_json.h" + #import -#import -#import #import namespace { @@ -365,8 +367,7 @@ - (void)cleanupAssetIfNeeded:(ETCoreMLAsset *)asset { NSError *cleanupError = nil; if (![self _removeAssetWithIdentifier:asset.identifier error:&cleanupError]) { ETCoreMLLogError(cleanupError, - "%@: Failed to remove asset with identifier = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to remove asset with identifier = %@", identifier); } }); @@ -440,9 +441,7 @@ - (void)triggerCompaction { dispatch_async(self.syncQueue, ^{ NSError *localError = nil; if (![weakSelf _compact:self.maxAssetsSizeInBytes error:&localError]) { - ETCoreMLLogError(localError, - "%@: Failed to compact asset store.", - NSStringFromClass(ETCoreMLAssetManager.class)); + ETCoreMLLogError(localError, "Failed to compact asset store."); } }); } @@ -486,11 +485,11 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier if ([result keepAliveAndReturnError:error]) { [self.assetsInUseMap setObject:result forKey:identifier]; - } else { - [self cleanupAssetIfNeeded:result]; - } + return result; + } - return result; + [self cleanupAssetIfNeeded:result]; + return nil; } - (BOOL)_containsAssetWithIdentifier:(NSString *)identifier @@ -587,8 +586,7 @@ - (BOOL)removeAssetWithIdentifier:(NSString *)identifier [assets addObject:asset]; } else if (localError) { ETCoreMLLogError(localError, - "%@: Failed to retrieve asset with identifier = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to retrieve asset with identifier = %@.", identifier); } @@ -647,8 +645,7 @@ - (NSUInteger)_compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing NSString *identifier = @(asset.identifier.c_str()); if (![self _removeAssetWithIdentifier:identifier error:&cleanupError] && cleanupError) { ETCoreMLLogError(cleanupError, - "%@: Failed to remove asset with identifier = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to remove asset with identifier = %@.", identifier); } } @@ -689,8 +686,7 @@ - (void)removeFilesInTrashDirectory { for (NSURL *itemURL in enumerator) { if (![fileManager removeItemAtURL:itemURL error:&localError]) { ETCoreMLLogError(localError, - "%@: Failed to remove item in trash directory with name = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to remove item in trash directory with name = %@", itemURL.lastPathComponent); } } @@ -720,9 +716,7 @@ - (BOOL)_purge:(NSError * __autoreleasing *)error { 
NSError *localError = nil; // Create the assets directory, if we fail here it's okay. if (![self.fileManager createDirectoryAtURL:self.assetsDirectoryURL withIntermediateDirectories:NO attributes:@{} error:&localError]) { - ETCoreMLLogError(localError, - "%@: Failed to create assets directory", - NSStringFromClass(ETCoreMLAssetManager.class)); + ETCoreMLLogError(localError, "Failed to create assets directory."); } return true; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h index 13b1023bcbc..3cf9e3df5f4 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h @@ -1,13 +1,13 @@ // -// ETCoreMLDefaultModelExecutor.h -// executorchcoreml_tests +// ETCoreMLDefaultModelExecutor.h // -// Created by Gyan Sinha on 2/25/24. +// Copyright © 2024 Apple Inc. All rights reserved. // +// Please refer to the license found in the LICENSE file in the root directory of the source tree. #import -#import +#import "ETCoreMLModelExecutor.h" @class ETCoreMLModel; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 226307f3c8f..63bc60695ce 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -1,14 +1,14 @@ // -// ETCoreMLDefaultModelExecutor.m -// executorchcoreml_tests +// ETCoreMLDefaultModelExecutor.mm // -// Created by Gyan Sinha on 2/25/24. +// Copyright © 2024 Apple Inc. All rights reserved. // +// Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLDefaultModelExecutor.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" @implementation ETCoreMLDefaultModelExecutor @@ -27,7 +27,9 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable __unused)eventLogger error:(NSError * __autoreleasing *)error { if (self.ignoreOutputBackings) { - predictionOptions.outputBackings = @{}; + if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + predictionOptions.outputBackings = @{}; + } } id outputs = [self.model predictionFromFeatures:inputs @@ -44,8 +46,7 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { if (!featureValue.multiArrayValue) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorBrokenModel, - "%@: Model is broken, expected multiarray for output=%@.", - NSStringFromClass(self.class), + "Model is broken, expected multiarray for output=%@.", outputName); return nil; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h index d9c4d4ef638..d1bb7c2caa5 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h @@ -6,9 +6,9 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. #import +#import #import -#import NS_ASSUME_NONNULL_BEGIN @@ -18,15 +18,15 @@ extern NSErrorDomain const ETCoreMLErrorDomain; /// The error codes that are exposed publicly. typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { ETCoreMLErrorCorruptedData = 1, // AOT blob can't be parsed. 
- ETCoreMLErrorCorruptedMetadata, // AOT blob has incorrect or missing metadata. - ETCoreMLErrorCorruptedModel, // AOT blob has incorrect or missing CoreML model. - ETCoreMLErrorBrokenModel, // CoreML model doesn't match the input and output specification. - ETCoreMLErrorCompilationFailed, // CoreML model failed to compile. - ETCoreMLErrorModelCompilationNotSupported, // CoreML model compilation is not supported by the target. - ETCoreMLErrorModelProfilingNotSupported, // Model profiling is not supported by the target. - ETCoreMLErrorModelSaveFailed, // Failed to save CoreML model to disk. - ETCoreMLErrorModelCacheCreationFailed, // Failed to create model cache. - ETCoreMLErrorInternalError, // Internal error. + ETCoreMLErrorCorruptedMetadata = 2, // AOT blob has incorrect or missing metadata. + ETCoreMLErrorCorruptedModel = 3, // AOT blob has incorrect or missing CoreML model. + ETCoreMLErrorBrokenModel = 4, // CoreML model doesn't match the input and output specification. + ETCoreMLErrorCompilationFailed = 5, // CoreML model failed to compile. + ETCoreMLErrorModelCompilationNotSupported = 6, // CoreML model compilation is not supported by the target. + ETCoreMLErrorModelProfilingNotSupported = 7, // Model profiling is not supported by the target. + ETCoreMLErrorModelSaveFailed = 8, // Failed to save CoreML model to disk. + ETCoreMLErrorModelCacheCreationFailed = 9, // Failed to create model cache. + ETCoreMLErrorInternalError = 10, // Internal error. }; @interface ETCoreMLErrorUtils : NSObject @@ -47,47 +47,47 @@ typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#if ET_LOG_ENABLED +#define ETCoreMLLogError(error, formatString, ...) \ + do { \ + NSString* message = error.localizedDescription; \ + message = [NSString stringWithFormat:@"[Core ML] " formatString " %@", ##__VA_ARGS__, message]; \ + ET_LOG(Error, "%s", message.UTF8String); \ + } while (0) +#else +#define ETCoreMLLogError(error, formatString, ...) \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString " %@", ##__VA_ARGS__, error.localizedDescription) +#endif + +#if ET_LOG_ENABLED +#define ETCoreMLLogInfo(formatString, ...) \ + ET_LOG(Info, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String) +#else +#define ETCoreMLLogInfo(formatString, ...) os_log_info(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__) +#endif + /// Record the error with `os_log_error` and fills `*errorOut` with `NSError`. -#define ETCoreMLLogErrorAndSetNSError(errorOut, errorCode, formatString, ...) \ - if (ET_LOG_ENABLED) { \ - ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ - } else { \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__); \ - } \ - if (errorOut) { \ - *errorOut = \ - [NSError errorWithDomain:ETCoreMLErrorDomain \ - code:errorCode \ - userInfo:@{ \ - NSLocalizedDescriptionKey : [NSString stringWithFormat:@formatString, ##__VA_ARGS__] \ - }]; \ - } +#define ETCoreMLLogErrorAndSetNSError(errorOut, errorCode, formatString, ...) 
\ + do { \ + NSDictionary* userInfo = \ + @{ NSLocalizedDescriptionKey : [NSString stringWithFormat:@formatString, ##__VA_ARGS__] }; \ + NSError* localError = [NSError errorWithDomain:ETCoreMLErrorDomain code:errorCode userInfo:userInfo]; \ + ETCoreMLLogError(localError, ""); \ + if (errorOut) { \ + *errorOut = localError; \ + } \ + } while (0) /// Record the error and its underlying error with `os_log_error` and fills `*errorOut` with `NSError`. #define ETCoreMLLogUnderlyingErrorAndSetNSError(errorOut, errorCode, underlyingNSError, formatString, ...) \ - if (ET_LOG_ENABLED) { \ - ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ - } else { \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with underlying error= %@.", \ - ##__VA_ARGS__, \ - (underlyingNSError).localizedDescription); \ - } \ - if (errorOut) { \ - *errorOut = [ETCoreMLErrorUtils errorWithCode:errorCode \ - underlyingError:underlyingNSError \ - format:@formatString, ##__VA_ARGS__]; \ - } - -#define ETCoreMLLogError(error, formatString, ...) \ - if (ET_LOG_ENABLED) { \ - ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ - } else { \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with error= %@.", \ - ##__VA_ARGS__, \ - (error).localizedDescription); \ - } + do { \ + ETCoreMLLogError(underlyingNSError, formatString, ##__VA_ARGS__); \ + if (errorOut) { \ + *errorOut = [ETCoreMLErrorUtils errorWithCode:errorCode \ + underlyingError:underlyingNSError \ + format:@formatString, ##__VA_ARGS__]; \ + } \ + } while (0) #pragma clang diagnostic pop diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm index 15d60d35704..f76b86a36b3 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm @@ -5,9 +5,9 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "ETCoreMLLogging.h" -#import +#import "ETCoreMLStrings.h" const NSErrorDomain ETCoreMLErrorDomain = @"com.apple.executorchcoreml"; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index 6b39ae5f920..4201293d1c5 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -5,7 +5,7 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "ETCoreMLModel.h" #import "ETCoreMLAsset.h" #import "ETCoreMLLogging.h" @@ -256,14 +256,23 @@ - (NSString *)identifier { } if (multiArrayArg && lCopyData) { - [multiArrayArg getMutableBytesWithHandler:^(void *_Nonnull mutableBytes, - NSInteger __unused size, - NSArray *strides) { - MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type(constraint.dataType).value(), + void (^copy_data)(void *, NSArray *) = ^(void *bytes, NSArray *strides) { + MultiArray buffer(bytes, MultiArray::MemoryLayout(to_multiarray_data_type(constraint.dataType).value(), layout.shape(), to_vector(strides))); arg.copy(buffer); - }]; + }; + + + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [multiArrayArg getMutableBytesWithHandler:^(void *_Nonnull mutableBytes, + NSInteger __unused size, + NSArray *strides) { + copy_data(mutableBytes, strides); + }]; + } else { + copy_data(multiArrayArg.dataPointer, multiArrayArg.strides); + } } [result addObject:multiArrayArg]; @@ -318,8 +327,7 @@ - (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { BOOL result = [self.mlModel prewarmUsingState:self.state error:error]; if (!result) { ETCoreMLLogError(localError, - "%@: Failed to prewarm model with identifier = %@", - NSStringFromClass(self.class), + "Failed to prewarm model with identifier = %@", self.identifier); } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm index c50bf3002fa..5b2c5a225a3 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm @@ -5,8 +5,10 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import -#import +#import "ETCoreMLModelCompiler.h" + +#import "ETCoreMLLogging.h" + #import @implementation ETCoreMLModelCompiler @@ -20,8 +22,7 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL (void)error; ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelCompilationNotSupported, - "%@: Model compilation is not supported on the target, please make sure to export a compiled model.", - NSStringFromClass(ETCoreMLModelCompiler.class)); + "Model compilation is not supported on the target, please make sure to export a compiled model."); return nil; #else __block NSError *localError = nil; @@ -37,11 +38,10 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC))); if (status != 0) { - ETCoreMLLogErrorAndSetNSError(error, - ETCoreMLErrorCompilationFailed, - "%@: Failed to compile model in %f seconds.", - NSStringFromClass(ETCoreMLModelCompiler.class), - maxWaitTimeInSeconds); + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCompilationFailed, + "Failed to compile model in %f seconds.", + maxWaitTimeInSeconds); return nil; } } else { @@ -50,10 +50,9 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL if (localError) { ETCoreMLLogErrorAndSetNSError(error, - ETCoreMLErrorCompilationFailed, - "%@: Failed to compile model, error: %@", - NSStringFromClass(ETCoreMLModelCompiler.class), - localError); + ETCoreMLErrorCompilationFailed, + "Failed to compile model, error = %@.", + localError); return nil; } else { return result; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm index 11690793baa..05aa910d954 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm @@ -5,14 +5,15 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import -#import -#import -#import -#import -#import +#import "ETCoreMLModelLoader.h" + +#import "asset.h" +#import "ETCoreMLAsset.h" +#import "ETCoreMLAssetManager.h" +#import "ETCoreMLDefaultModelExecutor.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "model_metadata.h" using namespace executorchcoreml; @@ -64,8 +65,7 @@ + (nullable ETCoreMLModel *)loadModelWithContentsOfURL:(NSURL *)compiledModelURL if (localError) { ETCoreMLLogError(localError, - "%@: Failed to load model from compiled asset with identifier = %@", - NSStringFromClass(ETCoreMLModelLoader.class), + "Failed to load model from compiled asset with identifier = %@", identifier); } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 3848f7c9b3c..c6da7750a11 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -5,6 +5,8 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
+#import "ETCoreMLModelManager.h" + #import "ETCoreMLAsset.h" #import "ETCoreMLAssetManager.h" #import "ETCoreMLDefaultModelExecutor.h" @@ -13,20 +15,20 @@ #import "ETCoreMLModelCompiler.h" #import "ETCoreMLModelExecutor.h" #import "ETCoreMLModelLoader.h" -#import "ETCoreMLModelManager.h" #import "ETCoreMLStrings.h" #import "MLModel_Prewarm.h" #import "MLMultiArray_Copy.h" -#import #import "inmemory_filesystem_utils.hpp" -#import -#import #import "model_metadata.h" #import "multiarray.h" #import "objc_array_util.h" +#import "serde_json.h" + +#import +#import +#import #import #import -#import "serde_json.h" #import #import #import @@ -73,11 +75,15 @@ BOOL is_backed_by_same_buffer(MLMultiArray *array1, MLMultiArray *array2) { __block BOOL result = NO; - [array1 getBytesWithHandler:^(const void *bytes1, NSInteger __unused size1){ - [array2 getBytesWithHandler:^(const void *bytes2, NSInteger __unused size2) { - result = (bytes1 == bytes2); + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [array1 getBytesWithHandler:^(const void *bytes1, NSInteger __unused size1){ + [array2 getBytesWithHandler:^(const void *bytes2, NSInteger __unused size2) { + result = (bytes1 == bytes2); + }]; }]; - }]; + } else { + result = (array1.dataPointer == array2.dataPointer); + } return result; } @@ -86,17 +92,19 @@ BOOL is_backed_by_same_buffer(MLMultiArray *array1, MLMultiArray *array2) { NSOrderedSet *output_names, NSError * __autoreleasing *error) { MLPredictionOptions *options = [MLPredictionOptions new]; - NSMutableDictionary *output_backings = [NSMutableDictionary new]; - NSEnumerator *enumerator = [output_names objectEnumerator]; - for (MLMultiArray *output in outputs) { - NSString *output_name = [enumerator nextObject]; - if (output_name.length == 0) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@: Model is broken.", NSStringFromClass(ETCoreMLModelManager.class)); - return nil; + if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + NSMutableDictionary *output_backings = [NSMutableDictionary dictionary]; + NSEnumerator *enumerator = [output_names objectEnumerator]; + for (MLMultiArray *output in outputs) { + NSString *output_name = [enumerator nextObject]; + if (output_name.length == 0) { + ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, "Model is broken."); + return nil; + } + output_backings[output_name] = output; } - output_backings[output_name] = output; + options.outputBackings = output_backings; } - options.outputBackings = output_backings; return options; } @@ -138,14 +146,25 @@ void set_outputs(NSArray *outputs, NSArray *mode } void copy(MLMultiArray *src, executorchcoreml::MultiArray& dst) { - [src getBytesWithHandler:^(const void * _Nonnull bytes, NSInteger size) { + void (^copy_data)(void *) = ^(void *bytes) { if (bytes == dst.data()) { return; } - - MultiArray::MemoryLayout src_layout(get_data_type(src.dataType).value(), to_vector(src.shape), to_vector(src.strides)); + + MultiArray::MemoryLayout src_layout( + get_data_type(src.dataType).value(), + to_vector(src.shape), + to_vector(src.strides) + ); MultiArray(const_cast(bytes), std::move(src_layout)).copy(dst); - }]; + }; + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [src getBytesWithHandler:^(const void * _Nonnull bytes, NSInteger size) { + copy_data(const_cast(bytes)); + }]; + } else { + copy_data(src.dataPointer); + } } void set_outputs(std::vector& outputs, @@ -212,8 +231,7 @@ void set_outputs(std::vector& outputs, ETCoreMLLogUnderlyingErrorAndSetNSError(error, 
ETCoreMLErrorModelSaveFailed, local_error, - "%@: Failed to create directory when saving model with identifier = %@.", - NSStringFromClass(ETCoreMLModelManager.class), + "Failed to create directory when saving model with identifier = %@.", identifier); return nil; } @@ -236,8 +254,7 @@ void set_outputs(std::vector& outputs, if (!inmemory_fs->write_item_to_disk(file_path, model_path, true, ec)) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelSaveFailed, - "%@: Failed to write model files to disk when saving model with identifier = %@.", - NSStringFromClass(ETCoreMLModelManager.class), + "Failed to write model files to disk when saving model with identifier = %@.", identifier); return nil; } @@ -395,8 +412,7 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier { modelAsset = [self.assetManager assetWithIdentifier:identifier error:&localError]; if (localError) { ETCoreMLLogError(localError, - "%@: Failed to retrieve asset with identifier = %@", - NSStringFromClass(self.assetManager.class), + "Failed to retrieve asset with identifier = %@.", identifier); } @@ -411,8 +427,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier if (!modelAssetType) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: AOT blob is missing model file.", - NSStringFromClass(ETCoreMLModelManager.class)); + "AOT blob is missing model file."); return nil; } @@ -420,11 +435,12 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier NSURL *modelURL = ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error); switch (modelAssetType.value()) { case ModelAssetType::CompiledModel: { + // Model is already compiled. return modelURL; } case ModelAssetType::Model: { - // we need to compiled the model. + // Compile the model. NSURL *compiledModelURL = [ETCoreMLModelCompiler compileModelAtURL:modelURL maxWaitTimeInSeconds:(5 * 60) error:error]; @@ -442,6 +458,12 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier NSString *identifier = @(metadata.identifier.c_str()); // Otherwise try to retrieve the compiled asset. ETCoreMLAsset *compiledModelAsset = [self assetWithIdentifier:identifier]; + if (compiledModelAsset) { + ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); + } else { + ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); + } + // Create a unique directory for writing model files. NSURL *dstURL = [self.assetManager.trashDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; auto modelAssetType = get_model_asset_type(inMemoryFS); @@ -499,9 +521,11 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier ETCoreMLAsset *asset = [self assetWithIdentifier:identifier]; ETCoreMLModel *model = asset ? get_model_from_asset(asset, configuration, metadata, error) : nil; if (model) { + ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); return [[ETCoreMLDefaultModelExecutor alloc] initWithModel:model]; } + ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); // Compile the model. 
NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier inMemoryFS:inMemoryFS @@ -531,8 +555,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier if (!inMemoryFS) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model data is corrupted.", - NSStringFromClass(ETCoreMLModelManager.class)); + "Model data is corrupted."); return nil; } @@ -540,8 +563,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier if (!metadata) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedMetadata, - "%@: Metadata is invalid or missing.", - NSStringFromClass(ETCoreMLModelManager.class)); + "Metadata is invalid or missing."); return nil; } @@ -607,9 +629,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { NSArray *assets = [self.assetManager mostRecentlyUsedAssetsWithMaxCount:maxCount error:&localError]; if (localError) { - ETCoreMLLogError(localError, - "%@: Failed to retrieve recently used assets.", - NSStringFromClass(self.assetManager.class)); + ETCoreMLLogError(localError, "Failed to retrieve recently used assets."); } if (assets.count == 0) { @@ -627,8 +647,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { NSError *prewarmError = nil; if (![asset prewarmAndReturnError:&prewarmError]) { ETCoreMLLogError(prewarmError, - "%@: Failed to prewarm asset with identifier = %@", - NSStringFromClass(strongSelf.assetManager.class), + "Failed to prewarm asset with identifier = %@", asset.identifier); return; } @@ -664,18 +683,20 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { NSArray *modelOutputs = [executor executeModelWithInputs:inputFeatures predictionOptions:predictionOptions - loggingOptions:loggingOptions + loggingOptions:loggingOptions eventLogger:eventLogger error:&localError]; // Try without output backings. 
- if (!modelOutputs && predictionOptions.outputBackings.count > 0) { - executor.ignoreOutputBackings = YES; - localError = nil; - modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:&localError]; + if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + if (!modelOutputs && predictionOptions.outputBackings.count > 0) { + executor.ignoreOutputBackings = YES; + localError = nil; + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; + } } if (error) { @@ -693,9 +714,8 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle id executor = [self executorWithHandle:handle]; if (!executor) { ETCoreMLLogErrorAndSetNSError(error, - 0, - "%@: Model is already unloaded.", - NSStringFromClass(self.class)); + ETCoreMLErrorInternalError, + "Model is already unloaded."); return NO; } @@ -703,8 +723,7 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle if (args.count != model.orderedInputNames.count + model.orderedOutputNames.count) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model is invalid, expected args count to be %lu but got %lu.", - NSStringFromClass(self.class), + "Model is invalid, expected args count to be %lu but got %lu.", static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), args.count); return NO; @@ -741,9 +760,8 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle id executor = [self executorWithHandle:handle]; if (!executor) { ETCoreMLLogErrorAndSetNSError(error, - 0, - "%@: Model is already unloaded.", - NSStringFromClass(self.class)); + ETCoreMLErrorInternalError, + "Model is already unloaded."); return NO; } @@ -751,8 +769,7 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle if (argsVec.size() != model.orderedInputNames.count + model.orderedOutputNames.count) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model is invalid, expected args count to be %lu but got %lu.", - NSStringFromClass(self.class), + "Model is invalid, expected args count to be %lu but got %lu.", static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), argsVec.size()); return NO; diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm index d6f59666cf0..6a737d1e82b 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm @@ -5,10 +5,34 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "MLModel_Prewarm.h" +#include #import +namespace { + size_t get_number_of_bytes(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return 2; + } + case MLMultiArrayDataTypeFloat32: { + return 4; + } + case MLMultiArrayDataTypeInt32: { + return 4; + } + case MLMultiArrayDataTypeFloat64: { + return 8; + } + default: { + return 0; + } + } + } + +} + @interface MLMultiArray (Prewarm) + (nullable MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape @@ -28,11 +52,22 @@ + (MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape return nil; } - [multiArray getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { - uint8_t *start = reinterpret_cast(mutableBytes); - uint8_t *end = start + size; - std::fill(start, end, uint8_t(0)); - }]; + + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + void (^fill_zeroes)(void *, NSInteger) = ^(void *bytes, NSInteger size) { + uint8_t *start = reinterpret_cast(bytes); + uint8_t *end = start + size; + std::fill(start, end, uint8_t(0)); + }; + + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [multiArray getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { + fill_zeroes(mutableBytes, size); + }]; + } else { + fill_zeroes(multiArray.dataPointer, multiArray.count * get_number_of_bytes(multiArray.dataType)); + } + } return multiArray; } diff --git a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm index b8a10fcbbbc..313ee3edaf9 100644 --- a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm +++ b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm @@ -5,10 +5,10 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "MLMultiArray_Copy.h" -#import -#import +#import "objc_array_util.h" +#import "multiarray.h" namespace { using namespace executorchcoreml; @@ -27,13 +27,19 @@ MultiArray to_multi_array(void *data, @implementation MLMultiArray (Copy) - (void)copyInto:(MLMultiArray *)dstMultiArray { - [self getBytesWithHandler:^(const void *srcBytes, __unused NSInteger srcSize) { - [dstMultiArray getMutableBytesWithHandler:^(void *dstBytes, __unused NSInteger size, NSArray * strides) { - auto src = ::to_multi_array(const_cast(srcBytes), self.dataType, self.shape, self.strides); - auto dst = ::to_multi_array(dstBytes, dstMultiArray.dataType, dstMultiArray.shape, strides); - src.copy(dst); + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [self getBytesWithHandler:^(const void *srcBytes, __unused NSInteger srcSize) { + [dstMultiArray getMutableBytesWithHandler:^(void *dstBytes, __unused NSInteger size, NSArray * strides) { + auto src = ::to_multi_array(const_cast(srcBytes), self.dataType, self.shape, self.strides); + auto dst = ::to_multi_array(dstBytes, dstMultiArray.dataType, dstMultiArray.shape, strides); + src.copy(dst); + }]; }]; - }]; + } else { + auto src = ::to_multi_array(self.dataPointer, self.dataType, self.shape, self.strides); + auto dst = ::to_multi_array(dstMultiArray.dataPointer, dstMultiArray.dataType, dstMultiArray.shape, dstMultiArray.strides); + src.copy(dst); + } } @end diff --git a/backends/apple/coreml/runtime/delegate/asset.mm b/backends/apple/coreml/runtime/delegate/asset.mm index c9a6e16d2af..6df2dfbd3c5 100644 --- a/backends/apple/coreml/runtime/delegate/asset.mm +++ b/backends/apple/coreml/runtime/delegate/asset.mm @@ -1,16 +1,16 @@ // -// ModelAsset.cpp +// asset.cpp // // Copyright © 2024 Apple Inc. All rights reserved. // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "asset.h" #import -#import +#import "objc_safe_cast.h" namespace { diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.h b/backends/apple/coreml/runtime/delegate/backend_delegate.h index 9af3df01af2..93c420e11d2 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.h +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.h @@ -7,7 +7,8 @@ #pragma once -#include +#include "model_logging_options.h" + #include #include #include diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm index d8096e16781..2cb274f0a89 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm @@ -6,13 +6,15 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
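
The copy paths touched above (MLMultiArray_Copy and the MultiArray copies in ETCoreMLModel / ETCoreMLModelManager) route everything through a MultiArray that carries dtype, shape and strides, so the pre-iOS 15.4 fallback via `dataPointer` still performs a layout-aware element copy rather than a raw byte copy. A rough sketch of the idea; numpy is used purely as an illustration and is not part of this change:

```python
import numpy as np

# Same logical shape and dtype, different memory layouts: src is row-major
# (C order), dst is column-major (Fortran order), so their strides differ.
src = np.arange(6, dtype=np.float32).reshape(2, 3)   # strides (12, 4)
dst = np.zeros((2, 3), dtype=np.float32, order="F")  # strides (4, 8)

# A raw memcpy of the two buffers would scramble elements; an element-wise
# copy that honours both layouts (conceptually what MultiArray::copy does)
# preserves the logical contents.
np.copyto(dst, src)
assert np.array_equal(dst, src)
print(src.strides, dst.strides)  # (12, 4) (4, 8)
```
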
-#import -#import -#import -#import -#import -#import -#import +#import "backend_delegate.h" + +#import "ETCoreMLAssetManager.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "ETCoreMLModelManager.h" +#import "ETCoreMLStrings.h" +#import "model_event_logger.h" +#import "multiarray.h" namespace { using namespace executorchcoreml; @@ -282,6 +284,9 @@ explicit BackendDelegateImpl(const Config& config) noexcept ModelHandle *modelHandle = [model_manager_ loadModelFromAOTData:data configuration:configuration error:&localError]; + if (localError != nil) { + ETCoreMLLogError(localError, "Model init failed"); + } return modelHandle; } @@ -290,13 +295,16 @@ bool execute(Handle* handle, const ModelLoggingOptions& logging_options, ModelEventLogger *event_logger, std::error_code& ec) const noexcept override { - NSError *error = nil; + NSError *localError = nil; if (![model_manager_ executeModelWithHandle:handle argsVec:args loggingOptions:logging_options eventLogger:event_logger - error:&error]) { - ec = static_cast(error.code); + error:&localError]) { + if (localError != nil) { + ETCoreMLLogError(localError, "Model execution failed"); + ec = static_cast(localError.code); + } return false; } diff --git a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm index 380ec52b7d7..028191ce497 100644 --- a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm @@ -5,22 +5,25 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import -#import -#import +#import "coreml_backend/delegate.h" + +#import "backend_delegate.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "ETCoreMLStrings.h" +#import "model_event_logger.h" +#import "model_logging_options.h" +#import "multiarray.h" +#import "objc_safe_cast.h" + #import #import #import + +#include #import -#import -#import -#import -#import #import #import -#include #ifdef ET_EVENT_TRACER_ENABLED #import diff --git a/backends/apple/coreml/runtime/delegate/model_metadata.h b/backends/apple/coreml/runtime/delegate/model_metadata.h index 275aa39dd3b..8d0c1f0914d 100644 --- a/backends/apple/coreml/runtime/delegate/model_metadata.h +++ b/backends/apple/coreml/runtime/delegate/model_metadata.h @@ -10,7 +10,7 @@ #import #import -#import +#import "serde_json.h" namespace executorchcoreml { diff --git a/backends/apple/coreml/runtime/delegate/multiarray.mm b/backends/apple/coreml/runtime/delegate/multiarray.mm index de705991780..d38ac377799 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.mm +++ b/backends/apple/coreml/runtime/delegate/multiarray.mm @@ -6,13 +6,14 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "multiarray.h" + +#import "objc_array_util.h" #import #import #import #import -#import #import #import diff --git a/backends/apple/coreml/runtime/delegate/serde_json.mm b/backends/apple/coreml/runtime/delegate/serde_json.mm index 3568ffe4ce8..e39df4d734e 100644 --- a/backends/apple/coreml/runtime/delegate/serde_json.mm +++ b/backends/apple/coreml/runtime/delegate/serde_json.mm @@ -5,11 +5,11 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "serde_json.h" -#import -#import -#import +#import "asset.h" +#import "objc_json_serde.h" +#import "model_metadata.h" namespace { struct FileInfoKeys { diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index 988b5d808a0..87e086c5bbd 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -65,9 +65,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod assetManager:assetManager error:&localError]; if (!model) { - ETCoreMLLogError(localError, - "%@: Failed to create model profiler.", - NSStringFromClass(ETCoreMLAssetManager.class)); + ETCoreMLLogError(localError, "Failed to create model profiler."); } self = [super init]; @@ -98,8 +96,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod if (!self.profiler) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelProfilingNotSupported, - "%@: Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4.", - NSStringFromClass(ETCoreMLModelAnalyzer.class)); + "Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4."); return nil; } @@ -125,8 +122,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod if (!self.modelAsset) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedData, - "%@: There is no mlpackage, mlpackage is required for debugging a model. Please check the export path.", - NSStringFromClass(ETCoreMLModelAnalyzer.class)); + "The AOT blob is missing an 'mlpackage', which is required for debugging the model. Please check the export path."); return nil; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm index 3be28b56d66..1cac0de40f3 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm @@ -7,7 +7,6 @@ #import "ETCoreMLModelDebugger.h" -#import #import "ETCoreMLAsset.h" #import "ETCoreMLAssetManager.h" #import "ETCoreMLLogging.h" @@ -16,12 +15,14 @@ #import "ETCoreMLModelStructurePath.h" #import "ETCoreMLPair.h" #import "ETCoreMLStrings.h" -#import -#import -#import -#import +#import "format/MIL.pb.h" +#import "format/Model.pb.h" #import "model_package_info.h" #import "objc_json_serde.h" + +#import +#import +#import #import #import @@ -43,13 +44,19 @@ const auto& info_value = info.value(); auto it = info_value.items.find(info_value.root_model_identifier); if (it == info_value.items.end()) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@ is broken, root model info doesn't exist.", model_package_url.lastPathComponent); + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "%@ is broken, root model info doesn't exist.", + model_package_url.lastPathComponent); return nil; } auto path = it->second.path; if (path.empty()) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@ is broken, root model path doesn't exist.", model_package_url.lastPathComponent); + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "%@ is broken, root model path doesn't exist.", + model_package_url.lastPathComponent); return nil; } @@ -350,8 +357,8 @@ void set_model_outputs(id output_features, NSMutableArray *values = [NSMutableArray arrayWithCapacity:output_names.count]; for (NSString *output_name in 
output_names) { MLFeatureValue *feature_value = [output_features featureValueForName:output_name]; - NSCAssert(feature_value.multiArrayValue != nil, @"%@: Expected a multiarray value for output name=%@.", - NSStringFromClass(ETCoreMLModelDebugger.class), + NSCAssert(feature_value.multiArrayValue != nil, + @"Expected a multiarray value for output name=%@.", output_name); [values addObject:feature_value.multiArrayValue]; } @@ -570,8 +577,7 @@ - (nullable ETCoreMLAsset *)compiledModelAssetWithOutputsAtPaths:(NSArray #import diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm index 5998701eb0f..e381bbb03d1 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm @@ -14,9 +14,10 @@ #import "ETCoreMLOperationProfilingInfo.h" #import "ETCoreMLPair.h" #import "ETCoreMLStrings.h" +#import "program_path.h" + #import #import -#import "program_path.h" namespace { using namespace executorchcoreml::modelstructure; @@ -42,8 +43,7 @@ ETCoreMLLogUnderlyingErrorAndSetNSError(error, ETCoreMLErrorCompilationFailed, local_error, - "%@: Failed to get compute plan of model with name=%@.", - NSStringFromClass(ETCoreMLModelProfiler.class), + "Failed to get compute plan of model with name=%@.", model_url.lastPathComponent); return nil; } @@ -288,8 +288,7 @@ - (nullable instancetype)initWithModel:(ETCoreMLModel *)model #endif ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelProfilingNotSupported, - "%@: Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4.", - NSStringFromClass(self.class)); + "Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4."); return nil; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h index 80c49f8965e..13d4bb8e6ac 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h @@ -7,7 +7,7 @@ #import -#import +#import "ETCoreMLComputeUnits.h" NS_ASSUME_NONNULL_BEGIN diff --git a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm index 12ac8ec15a3..be34e384b72 100644 --- a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm +++ b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm @@ -9,11 +9,13 @@ #import "ETCoreMLModelStructurePath.h" #import "ETCoreMLOperationProfilingInfo.h" -#import #import "objc_array_util.h" +#import "MLMultiArray_Copy.h" + +#import + #import #import -#import "MLMultiArray_Copy.h" namespace { diff --git a/backends/apple/coreml/runtime/sdk/model_package_info.mm b/backends/apple/coreml/runtime/sdk/model_package_info.mm index b7b26178fde..f4b13048718 100644 --- a/backends/apple/coreml/runtime/sdk/model_package_info.mm +++ b/backends/apple/coreml/runtime/sdk/model_package_info.mm @@ -66,7 +66,7 @@ static void from_json(id json, ModelPackageInfo& package_info) { NSURL *manifest_url = [model_package_url URLByAppendingPathComponent:@"manifest.json"].URLByStandardizingPath; BOOL is_directory = NO; if (![fm fileExistsAtPath:manifest_url.path isDirectory:&is_directory] || is_directory) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@ is broken, manifest doesn't exist.", model_package_url.lastPathComponent); + ETCoreMLLogErrorAndSetNSError(error, 
ETCoreMLErrorCorruptedModel, "%@ is broken, manifest doesn't exist.", model_package_url.lastPathComponent); return std::nullopt; } diff --git a/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm b/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm index 3c0908201ac..50b0f2ec766 100644 --- a/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm @@ -250,16 +250,14 @@ + (BOOL)extractModelAssetAndMetadataFromAOTData:(NSData *)data if (!inMemoryFS) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model data is corrupted.", - NSStringFromClass(ETCoreMLTestUtils.class)); + "Model data is corrupted."); return NO; } if (!extract_model_metadata(*inMemoryFS, metadata) || !metadata.is_valid()) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedMetadata, - "%@: Model metadata is corrupted.", - NSStringFromClass(ETCoreMLTestUtils.class)); + "Model metadata is corrupted."); return NO; } @@ -269,8 +267,7 @@ + (BOOL)extractModelAssetAndMetadataFromAOTData:(NSData *)data if (![fileManager createDirectoryAtURL:modelURL withIntermediateDirectories:NO attributes:@{} error:error]) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelSaveFailed, - "%@: Failed to create directory when saving model with name = %@.", - NSStringFromClass(ETCoreMLTestUtils.class), + "Failed to create directory when saving model with name = %@.", modelURL.lastPathComponent); return NO; } diff --git a/backends/apple/coreml/runtime/test/setup.md b/backends/apple/coreml/runtime/test/setup.md index 9876dfc8a3d..1e038c306a4 100644 --- a/backends/apple/coreml/runtime/test/setup.md +++ b/backends/apple/coreml/runtime/test/setup.md @@ -4,18 +4,18 @@ This is a tutorial for setting up tests for the **Core ML** backend. ## Running tests -1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment. +1. Follow the instructions described in [Setting Up ExecuTorch](../../../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment. 2. Run `install_requirements.sh` to install dependencies required by the **Core ML** backend. ```bash cd executorch -sh backends/apple/coreml/scripts/install_requirements.sh +sh backends/apple/coreml/scripts/install_requirements.sh -``` +``` -3. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system. +3. Follow the instructions described in [Building with CMake](../../../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system. 4. Install [Xcode](https://developer.apple.com/xcode/). @@ -26,7 +26,7 @@ sh backends/apple/coreml/scripts/install_requirements.sh ```bash cd executorch -# Builds macOS universal test bundle. +# Builds macOS universal test bundle. sh backends/apple/coreml/srcipts/build_tests.sh @@ -40,7 +40,7 @@ cd executorch sh backends/apple/coreml/srcipts/run_tests.sh ``` - + ## Updating tests 1. Open the Xcode workspace. @@ -48,7 +48,7 @@ sh backends/apple/coreml/srcipts/run_tests.sh ```bash cd executorch -# Builds macOS universal test bundle. +# Builds macOS universal test bundle. open backends/apple/coreml/runtime/workspace/executorchcoreml.xcworkspace @@ -62,4 +62,4 @@ cd executorch # There is no need to build the tests. 
sh backends/apple/coreml/srcipts/run_tests.sh -``` \ No newline at end of file +``` diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index c6daae0d989..887873d4911 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -4,7 +4,7 @@ This is a tutorial for setting up the Core ML backend. ## AOT Setup -1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment. +1. Follow the instructions described in [Setting Up ExecuTorch](../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment. 2. Run the example script to validate that the **Core ML** backend is set up correctly. @@ -28,7 +28,7 @@ delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) ## Integrating Core ML delegate into runtime. -1. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system. +1. Follow the instructions described in [Building with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system. 2. Install [Xcode](https://developer.apple.com/xcode/). diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py index 749f32a04e5..2982ebc2e01 100644 --- a/backends/apple/mps/mps_preprocess.py +++ b/backends/apple/mps/mps_preprocess.py @@ -6,6 +6,7 @@ from typing import ClassVar, Dict, final, List, Tuple import torch +from executorch import exir from executorch.backends.apple.mps.operators.node_visitor import ( get_node_visitors, @@ -35,6 +36,7 @@ from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from executorch.exir.program._program import _transform +from executorch.exir.verification.verifier import EXIREdgeDialectVerifier from torch.export.exported_program import ExportedProgram FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" @@ -87,7 +89,19 @@ def preprocess( # the `output_ids` array in the schema. # TODO: Remove this once we have a better support for the dim-order ops. - edge_program = _transform(edge_program, DimOrderOpsRevertPass()) + # Need to override the verifier to skip the non dim-order ops from tripping the default verifier. + edge_program = _transform( + edge_program, + DimOrderOpsRevertPass(), + override_verifiers=[ + EXIREdgeDialectVerifier( + edge_compile_config=exir.EdgeCompileConfig( + _check_ir_validity=False, # Disable the edge dialect verifier, since we are in the mps backend. 
+ ), + class_only=True, + ) + ], + ) mps_graph = MPSGraph( version="0", diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index 5c14ad673df..bd688fe8b78 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -12,11 +12,11 @@ The MPS backend device maps machine learning computational graphs and primitives ::: :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) -* [ExecuTorch iOS Demo App](demo-apps-ios.md) -* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md) +* [Introduction to ExecuTorch](../../../docs/source/intro-how-it-works.md) +* [Setting up ExecuTorch](../../../docs/source/getting-started-setup.rst) +* [Building ExecuTorch with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake) +* [ExecuTorch iOS Demo App](../../../docs/source/demo-apps-ios.md) +* [ExecuTorch iOS LLaMA Demo App](../../../docs/source/llm/llama-demo-ios.md) ::: :::: @@ -40,7 +40,7 @@ In order to be able to successfully build and run a model using the MPS backend ## Setting up Developer Environment -***Step 1.*** Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup). +***Step 1.*** Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup). ***Step 2.*** Install dependencies needed to lower MPS delegate: @@ -111,12 +111,12 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp ``` ### Profiling: -1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model. +1. [Optional] Generate an [ETRecord](../../../docs/source/etrecord.rst) while you're exporting your model. ```bash cd executorch python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b ``` -2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md). +2. Run your Program on the ExecuTorch runtime and generate an [ETDump](../../../docs/source/etdump.md). 
``` ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs ``` diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index dd7f3d02518..ddca8ea4a06 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -20,13 +20,16 @@ from .convert_to_clamp import ConvertToClampPass # noqa from .decompose_batchnorm_pass import DecomposeBatchNormPass # noqa from .decompose_div_pass import DecomposeDivPass # noqa +from .decompose_gelu_pass import DecomposeGeluPass # noqa from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass # noqa from .decompose_linear_pass import DecomposeLinearPass # noqa from .decompose_meandim_pass import DecomposeMeanDimPass # noqa from .decompose_select import DecomposeSelectPass # noqa +from .decompose_silu_pass import DecomposeSiluPass # noqa from .decompose_softmax_pass import DecomposeSoftmaxPass # noqa from .decompose_softmax_unstable_pass import DecomposeSoftmaxUnstablePass # noqa +from .decompose_sqrt_pass import DecomposeSqrtPass # noqa from .decompose_var_pass import DecomposeVarPass # noqa from .fold_qdq_with_annotated_qparams_pass import ( # noqa FoldAndAnnotateQParamsPass, diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 703c6ff214c..dd4ca7ad7bd 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -25,13 +25,16 @@ ConvertToClampPass, DecomposeBatchNormPass, DecomposeDivPass, + DecomposeGeluPass, DecomposeLayerNormPass, DecomposeLeakyReLUPass, DecomposeLinearPass, DecomposeMeanDimPass, DecomposeSelectPass, + DecomposeSiluPass, DecomposeSoftmaxPass, DecomposeSoftmaxUnstablePass, + DecomposeSqrtPass, DecomposeVarPass, FoldAndAnnotateQParamsPass, FuseBatchnorm2DPass, @@ -115,6 +118,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul return self._transform(exported_program.graph_module) def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule: + self.add_pass(DecomposeSqrtPass()) self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI()) self.add_pass(FuseQuantizedActivationPass()) self.add_pass(RemoveGetItemPass()) @@ -130,6 +134,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(ConvertMeanDimToAveragePoolPass()) self.add_pass(DecomposeDivPass()) self.add_pass(DecomposeSoftmaxPass()) + self.add_pass(DecomposeGeluPass()) self.add_pass(ConvertFullLikeToFullPass()) self.add_pass(ConvertToClampPass()) self.add_pass(ConvertMinMaxPass()) @@ -162,12 +167,22 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul return self._transform(exported_program.graph_module) + def _tosa_1_0_int_quantized_pipeline(self, exported_program: ExportedProgram): + return self._tosa_080_BI_pipeline(exported_program) + + def _tosa_1_0_fp_pipeline(self, exported_program: ExportedProgram): + return self._tosa_080_MI_pipeline(exported_program) + def transform_to_backend_pipeline(self, exported_program: ExportedProgram): """Apply passes before transforming program to backend""" if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"): return self._tosa_080_BI_pipeline(exported_program) elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"): return self._tosa_080_MI_pipeline(exported_program) + elif self.tosa_spec == 
TosaSpecification.create_from_string("TOSA-1.0+FP"): + return self._tosa_1_0_fp_pipeline(exported_program) + elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-1.0+INT"): + return self._tosa_1_0_int_quantized_pipeline(exported_program) else: raise NotImplementedError( f"No pass pipeline implemented for {self.tosa_spec=}" @@ -181,6 +196,8 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(DecomposeMeanDimPass()) self.add_pass(DecomposeDivPass()) self.add_pass(DecomposeLeakyReLUPass()) + self.add_pass(DecomposeSqrtPass()) + self.add_pass(DecomposeSiluPass()) if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset: # Numerically stable softmax uses amax which is not supported on Ethos-U55 diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py index 3b97b944fd4..87512f9fb3c 100644 --- a/backends/arm/_passes/cast_int64_pass.py +++ b/backends/arm/_passes/cast_int64_pass.py @@ -12,7 +12,6 @@ from torch._export.utils import is_buffer logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) class CastInt64BuffersToInt32Pass(ExportPass): diff --git a/backends/arm/_passes/decompose_gelu_pass.py b/backends/arm/_passes/decompose_gelu_pass.py new file mode 100644 index 00000000000..6e72175e68b --- /dev/null +++ b/backends/arm/_passes/decompose_gelu_pass.py @@ -0,0 +1,149 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +torch_gelu = (torch.ops.aten.gelu.default,) + +edge_gelu = (exir_ops.edge.aten.gelu.default,) + + +def _get_gelu_ops(op) -> tuple: + """ + Returns the operators needed to decompose GELU + """ + + if op in edge_gelu: + return ( + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.tanh.default, + exir_ops.edge.aten.erf.default, + ) + if op in torch_gelu: + return ( + torch.ops.aten.full.default, + torch.ops.aten.add.Tensor, + torch.ops.aten.mul.Tensor, + torch.ops.aten.tanh.default, + torch.ops.aten.erf.default, + ) + raise RuntimeError(f"Can't get GeLU decomposition ops for op {op}") + + +class DecomposeGeluPass(ExportPass): + """ + This pass decomposes the GELU operator into primitive ops. + Aiming to adhere closely to the reference implementations built into + ExecuTorch. Including using the same pre-calculated constants. + + This operator has two formulae depending on the value of the + approximate argument. Examples below include the added full + operators necessary for the initialization for constants used in + each respective formula. 
+ + aten.gelu(x, approximate="none") becomes: + %FULL_0_5 = full() + %FULL_1 = full() + %FULL_SQRT1_2 = full() + %op1 = mul(x, %FULL_SQRT1_2) + %op2 = erf(%op1) + %op3 = add(%op2, %FULL_1) + %op4 = mul(%op3, %FULL_0_5) + %op5 = mul(%x, %op4) + + aten.gelu(x, approximate="tanh") becomes: + %FULL_0_5 = full() + %FULL_1 = full() + %FULL_SQRT2 = full() + %FULL_2_SQRTPI = full() + %FULL_CUBE_COEFF = full() + %SQRT_MUL = mul(%FULL_SQRT2, %FULL_2_SQRTPI) + %SQRT_2_PI = mul(%SQRT_MUL, %FULL_0_5) + %sqr_x = mul(x, x) + %cube_x = mul(sqr_x, x) + %op1 = mul(%cube_x, %FULL_CUBE_COEFF) + %op2 = add(%x, %op1) + %op3 = mul(%op2, %SQRT_2_PI) + %op4 = tanh(%op3) + %op5 = add(%op4, %FULL_1) + %op6 = mul(%x, %op5) + %op7 = mul(%op6, %FULL_0_5) + """ + + def call_operator(self, op, args, kwargs, meta): + if op not in torch_gelu + edge_gelu: + return super().call_operator(op, args, kwargs, meta) + + full_op, add_op, mul_op, tanh_op, erf_op = _get_gelu_ops(op) + + input = get_node_arg(args, 0) + # If approximate is default (none) it does not appear in kwargs + approximate = get_node_arg(kwargs, "approximate", "none") + + shape = meta["val"].size() + dtype = meta["val"].dtype + + FULL_0_5 = super().call_operator( + full_op, ([1] * len(shape), 0.5), {"dtype": dtype}, meta + ) + FULL_1 = super().call_operator( + full_op, ([1] * len(shape), 1), {"dtype": dtype}, meta + ) + + if approximate == "none": + # Constant mirrors ExecuTorch implementation for parity. + FULL_SQRT1_2 = super().call_operator( + full_op, ([1] * len(shape), 0.70710678118654752440), {}, meta + ) + + op1 = super().call_operator(mul_op, (input, FULL_SQRT1_2), {}, meta) + op2 = super().call_operator(erf_op, (op1,), {}, meta) + op3 = super().call_operator(add_op, (op2, FULL_1), {}, meta) + op4 = super().call_operator(mul_op, (op3, FULL_0_5), {}, meta) + return super().call_operator(mul_op, (input, op4), {}, meta) + + elif approximate == "tanh": + # Constants mirror ExecuTorch implementation for parity. + FULL_SQRT2 = super().call_operator( + full_op, + ([1] * len(shape), 1.41421356237309504880), + {"dtype": dtype}, + meta, + ) + FULL_2_SQRTPI = super().call_operator( + full_op, + ([1] * len(shape), 1.12837916709551257390), + {"dtype": dtype}, + meta, + ) + FULL_CUBE_COEFF = super().call_operator( + full_op, ([1] * len(shape), 0.044715), {"dtype": dtype}, meta + ) + + # Mirrors ExecuTorch implementations for calculating this value + SQRT_MUL = super().call_operator( + mul_op, (FULL_SQRT2, FULL_2_SQRTPI), {}, meta + ) + SQRT_2_PI = super().call_operator(mul_op, (SQRT_MUL, FULL_0_5), {}, meta) + + # Avoiding using POW in order to reduce pass order reliance. 
+ sqr_x = super().call_operator(mul_op, (input, input), {}, meta) + cube_x = super().call_operator(mul_op, (sqr_x, input), {}, meta) + op1 = super().call_operator(mul_op, (cube_x, FULL_CUBE_COEFF), {}, meta) + op2 = super().call_operator(add_op, (input, op1), {}, meta) + op3 = super().call_operator(mul_op, (op2, SQRT_2_PI), {}, meta) + op4 = super().call_operator(tanh_op, (op3,), {}, meta) + op5 = super().call_operator(add_op, (op4, FULL_1), {}, meta) + op6 = super().call_operator(mul_op, (input, op5), {}, meta) + return super().call_operator(mul_op, (op6, FULL_0_5), {}, meta) + else: + raise RuntimeError( + f"approximate argument expected 'none' or 'tanh' but got {approximate}" + ) diff --git a/backends/arm/_passes/decompose_silu_pass.py b/backends/arm/_passes/decompose_silu_pass.py new file mode 100644 index 00000000000..68ebb3f4515 --- /dev/null +++ b/backends/arm/_passes/decompose_silu_pass.py @@ -0,0 +1,34 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import torch +from executorch.exir.pass_base import ExportPass + +aten_silu_ops = (torch.ops.aten.silu.default, torch.ops.aten.silu_.default) + + +class DecomposeSiluPass(ExportPass): + """ + This pass decomposes silu into a mul and a sigmoid node. + + Example: + y = silu(a) + Becomes: + x = sigmoid(a) + y = mul(a,x) + """ + + def call_operator(self, op, args, kwargs, meta): + if op not in (aten_silu_ops): + return super().call_operator(op, args, kwargs, meta) + sigmoid_op = torch.ops.aten.sigmoid.default + mul_op = torch.ops.aten.mul.Tensor + + original = args[0] + sigmoid = super().call_operator(sigmoid_op, (original,), {}, meta) + + return super().call_operator(mul_op, (original, sigmoid), {}, meta) diff --git a/backends/arm/_passes/decompose_sqrt_pass.py b/backends/arm/_passes/decompose_sqrt_pass.py new file mode 100644 index 00000000000..d4a678affea --- /dev/null +++ b/backends/arm/_passes/decompose_sqrt_pass.py @@ -0,0 +1,39 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +edge_sqrt_ops = (exir_ops.edge.aten.sqrt.default,) +aten_sqrt_ops = ( + torch.ops.aten.sqrt.default, + torch.ops.aten.sqrt_.default, +) + + +def get_sqrt_decomposition(op) -> tuple: + # TODO : "MLETORCH-863 : Replace current sqrt -> pow.Tensor_Scalar workaround with pow.Tensor_Tensor" + if op in edge_sqrt_ops: + return exir_ops.edge.aten.pow.Tensor_Scalar + if op in aten_sqrt_ops: + return torch.ops.aten.pow.Tensor_Scalar + raise RuntimeError(f"Can't get sqrt decomposition for op {op}") + + +class DecomposeSqrtPass(ExportPass): + + def call_operator(self, op, args, kwargs, meta): + """ + Decomposes `sqrt(x)` into `pow(x, 0.5)` for backend support. 
+ """ + + if op not in (edge_sqrt_ops + aten_sqrt_ops): + return super().call_operator(op, args, kwargs, meta) + + pow_op = get_sqrt_decomposition(op) + + return super().call_operator(pow_op, (args[0], 0.5), {}, meta) diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py index 02510600d82..a5f66829da5 100644 --- a/backends/arm/_passes/insert_table_ops.py +++ b/backends/arm/_passes/insert_table_ops.py @@ -56,6 +56,7 @@ class TableOps: # Targets that must be treated explicitly special_table_ops: Set[EdgeOpOverload] = { exir_ops.edge.aten.pow.Tensor_Scalar, + exir_ops.edge.aten.gelu.default, } def __init__(self, exported_program: ExportedProgram): @@ -76,6 +77,19 @@ def __getitem__(self, node: Node): # Exponent is a constant. Embed it into a lambda. exp = cast(int, node.args[1]) return lambda x: torch.pow(x, exp).flatten() + case exir_ops.edge.aten.gelu.default: + # If kwargs not present it is default "none" + approximate = cast( + str, + ( + node.kwargs["approximate"] + if "approximate" in node.kwargs + else "none" + ), + ) + return lambda x: torch.nn.functional.gelu( + x, approximate=approximate + ).flatten() case _: # Op must be handled if it's inside self.special_ops raise AssertionError("Unhandled table operation") diff --git a/backends/arm/_passes/match_arg_ranks_pass.py b/backends/arm/_passes/match_arg_ranks_pass.py index 2cfc9b2b86a..3554fc0954c 100644 --- a/backends/arm/_passes/match_arg_ranks_pass.py +++ b/backends/arm/_passes/match_arg_ranks_pass.py @@ -48,6 +48,9 @@ def __init__(self, exported_program): exir_ops.edge.aten.bitwise_right_shift.Tensor, exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.lt.Tensor, exir_ops.edge.aten.pow.Tensor_Tensor, exir_ops.edge.aten.where.self, ] diff --git a/backends/arm/_passes/replace_scalar_with_tensor_pass.py b/backends/arm/_passes/replace_scalar_with_tensor_pass.py index 97e89132979..fed72e664f5 100644 --- a/backends/arm/_passes/replace_scalar_with_tensor_pass.py +++ b/backends/arm/_passes/replace_scalar_with_tensor_pass.py @@ -26,6 +26,9 @@ exir_ops.edge.aten.__rshift__.Scalar: exir_ops.edge.aten.bitwise_right_shift.Tensor, exir_ops.edge.aten.__lshift__.Scalar: exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.eq.Scalar: exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.gt.Scalar: exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.ge.Scalar: exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.lt.Scalar: exir_ops.edge.aten.lt.Tensor, torch.ops.aten.add.Scalar: torch.ops.aten.add.Tensor, torch.ops.aten.sub.Scalar: torch.ops.aten.sub.Tensor, torch.ops.aten.mul.Scalar: torch.ops.aten.mul.Tensor, @@ -33,6 +36,9 @@ torch.ops.aten.__rshift__.Scalar: torch.ops.aten.bitwise_right_shift.Tensor, torch.ops.aten.__lshift__.Scalar: torch.ops.aten.bitwise_left_shift.Tensor, torch.ops.aten.eq.Scalar: torch.ops.aten.eq.Tensor, + torch.ops.aten.gt.Scalar: torch.ops.aten.gt.Tensor, + torch.ops.aten.ge.Scalar: torch.ops.aten.ge.Tensor, + torch.ops.aten.lt.Scalar: torch.ops.aten.lt.Tensor, } diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index e6a885b43fa..05b101bef7d 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -11,8 +11,6 @@ # JIT compiler flows. 
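
The new Arm passes above (DecomposeSqrtPass, DecomposeSiluPass, DecomposeGeluPass) each rewrite an op into TOSA-friendly primitives, with the GELU pass reusing the pre-calculated constants spelled out in its docstring. A small standalone check of the underlying math, handy when reviewing those constants; this is an illustrative sketch using plain torch, not code from the patch:

```python
import torch

x = torch.randn(4, 8)

# sqrt(x) -> pow(x, 0.5)  (DecomposeSqrtPass)
x_pos = x.abs() + 1e-3
assert torch.allclose(torch.sqrt(x_pos), torch.pow(x_pos, 0.5))

# silu(x) -> x * sigmoid(x)  (DecomposeSiluPass)
assert torch.allclose(torch.nn.functional.silu(x), x * torch.sigmoid(x))

# gelu(x, approximate="none") -> x * 0.5 * (1 + erf(x * (1/sqrt(2))))
SQRT1_2 = 0.70710678118654752440
exact = x * (0.5 * (1.0 + torch.erf(x * SQRT1_2)))
assert torch.allclose(torch.nn.functional.gelu(x), exact, atol=1e-6)

# gelu(x, approximate="tanh"), with sqrt(2/pi) built as sqrt(2) * (2/sqrt(pi)) * 0.5,
# mirroring the FULL_SQRT2 / FULL_2_SQRTPI / FULL_0_5 constants in the pass.
SQRT2 = 1.41421356237309504880
TWO_SQRTPI = 1.12837916709551257390
CUBE_COEFF = 0.044715
sqrt_2_pi = SQRT2 * TWO_SQRTPI * 0.5
inner = (x + CUBE_COEFF * x * x * x) * sqrt_2_pi
tanh_form = x * (1.0 + torch.tanh(inner)) * 0.5
assert torch.allclose(torch.nn.functional.gelu(x, approximate="tanh"), tanh_form, atol=1e-6)
```

The same two formulas (with the `approximate` kwarg defaulting to "none") are what the InsertTableOps special case above evaluates when building the lookup table for the quantized path.
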
# -import logging - from typing import List, Optional from executorch.backends.arm.tosa_specification import TosaSpecification @@ -20,10 +18,6 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec -logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) - - class ArmCompileSpecBuilder: def __init__(self): self.compile_spec: List[CompileSpec] = [] diff --git a/backends/arm/operator_support/convolution_support.py b/backends/arm/operator_support/convolution_support.py index 9e13babe23a..75899eb7425 100644 --- a/backends/arm/operator_support/convolution_support.py +++ b/backends/arm/operator_support/convolution_support.py @@ -22,6 +22,8 @@ class ConvolutionSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/ethos_u55_support.py b/backends/arm/operator_support/ethos_u55_support.py index 69fda636423..7276e8efffe 100644 --- a/backends/arm/operator_support/ethos_u55_support.py +++ b/backends/arm/operator_support/ethos_u55_support.py @@ -134,9 +134,12 @@ class EthosU55NotSupported(OperatorSupportBase): exir_ops.edge.aten.eq.Tensor, exir_ops.edge.aten.eq.Scalar, exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.ge.Scalar, exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.gt.Scalar, exir_ops.edge.aten.le.Tensor, exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.lt.Scalar, exir_ops.edge.aten.flip.default, # REVERSE exir_ops.edge.aten.grid_sampler_2d, # GATHER exir_ops.edge.aten.scatter.src, diff --git a/backends/arm/operator_support/minmax_support.py b/backends/arm/operator_support/minmax_support.py index bdff368a5ce..86b949082eb 100644 --- a/backends/arm/operator_support/minmax_support.py +++ b/backends/arm/operator_support/minmax_support.py @@ -22,6 +22,7 @@ class MinMaxSupported(SupportedTOSAOperatorCheck): # TODO : "MLETORCH-718 : Quantization of indices in arm_quantizer" tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/pool_2d_support.py b/backends/arm/operator_support/pool_2d_support.py index 8291ede8ad9..750fab2730d 100644 --- a/backends/arm/operator_support/pool_2d_support.py +++ b/backends/arm/operator_support/pool_2d_support.py @@ -41,6 +41,8 @@ class AvgPool2dSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): @@ -94,6 +96,8 @@ class MaxPool2dSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/reduce_sum_support.py b/backends/arm/operator_support/reduce_sum_support.py index 
37a71d7264c..a50bcbceab7 100644 --- a/backends/arm/operator_support/reduce_sum_support.py +++ b/backends/arm/operator_support/reduce_sum_support.py @@ -21,6 +21,8 @@ class SumSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/right_shift_support.py b/backends/arm/operator_support/right_shift_support.py index 6c61347ba68..49976b2346f 100644 --- a/backends/arm/operator_support/right_shift_support.py +++ b/backends/arm/operator_support/right_shift_support.py @@ -17,7 +17,6 @@ from executorch.exir.dialects._ops import ops as exir_ops logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) @register_tosa_support_check @@ -30,6 +29,8 @@ class RightShiftSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/slice_copy_support.py b/backends/arm/operator_support/slice_copy_support.py index 1f5ace91cde..ea18c408149 100644 --- a/backends/arm/operator_support/slice_copy_support.py +++ b/backends/arm/operator_support/slice_copy_support.py @@ -16,7 +16,6 @@ from executorch.exir.dialects._ops import ops as exir_ops logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) @register_tosa_support_check @@ -26,6 +25,8 @@ class SliceCopySupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: # type: ignore[override, misc] diff --git a/backends/arm/operator_support/to_copy_support.py b/backends/arm/operator_support/to_copy_support.py index 7926b3dc053..aa0be8cfcd0 100644 --- a/backends/arm/operator_support/to_copy_support.py +++ b/backends/arm/operator_support/to_copy_support.py @@ -30,6 +30,8 @@ class ToCopySupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] SupportedTypeDict = dict[torch.dtype, list[torch.dtype]] diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 09230e44257..952cfb17cf0 100644 --- a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -66,6 +66,8 @@ def is_node_tosa_supported( _tosa_spec_support: dict[TosaSpecification, list[Type[SupportedTOSAOperatorCheck]]] = { TosaSpecification.create_from_string("TOSA-0.80+BI"): [], TosaSpecification.create_from_string("TOSA-0.80+MI"): [], + TosaSpecification.create_from_string("TOSA-1.0+INT"): [], + TosaSpecification.create_from_string("TOSA-1.0+FP"): [], } @@ -112,6 +114,7 
@@ def tosa_support_factory( # Negative checks: Remove nodes from partitioning negative_checks: list[OperatorSupportBase] = [ CheckInt64Inputs(exported_program, reporter), + CheckFloat64Inputs(exported_program, reporter), *[ reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}") for check in (additional_checks if additional_checks else []) @@ -175,9 +178,12 @@ def is_node_supported( exir_ops.edge.aten.full.default, exir_ops.edge.aten.full_like.default, exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.ge.Scalar, exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.gt.Scalar, exir_ops.edge.aten.le.Tensor, exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.lt.Scalar, exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten.add.Scalar, exir_ops.edge.aten.sub.Scalar, @@ -194,6 +200,7 @@ def is_node_supported( exir_ops.edge.aten.reciprocal.default, exir_ops.edge.aten.relu.default, exir_ops.edge.aten.leaky_relu.default, + exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.rsqrt.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.select_copy.int, @@ -221,6 +228,8 @@ def is_node_supported( exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.__lshift__.Scalar, torch.ops.aten.scalar_tensor.default, + exir_ops.edge.aten.gelu.default, + exir_ops.edge.aten.alias_copy.default, ] return supported @@ -256,6 +265,7 @@ def is_node_supported( exir_ops.edge.aten.var.correction, exir_ops.edge.aten.var.dim, exir_ops.edge.aten.add.Scalar, + exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.sub.Scalar, exir_ops.edge.aten.mul.Scalar, exir_ops.edge.aten.div.Scalar, @@ -356,6 +366,7 @@ def is_node_supported( exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.upsample_nearest2d.vec, + exir_ops.edge.aten.gelu.default, ): return True elif node.target in ( @@ -439,3 +450,26 @@ def is_node_supported( ) return False return True + + +class CheckFloat64Inputs(OperatorSupportBase): + + def __init__( + self, exported_program: ExportedProgram, reporter: WhyNoPartitionReporter + ): + self.reporter = reporter + super().__init__() + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + + for input_node in node.all_input_nodes: + tensor = get_first_fake_tensor(input_node) + if tensor.dtype == torch.float64: + self.reporter.report_reject( + node, + f"Had float64 input {input_node.name} that couldn't be handled.", + ) + return False + return True diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index b62e8940ed2..da050c5994e 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -21,9 +21,7 @@ op_eq, op_erf, op_exp, - op_full, op_ge, - op_get_item, op_gt, op_le, op_log, @@ -52,5 +50,6 @@ op_view, op_where, ops_binary, + ops_identity, ops_unary, ) diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py index f2c7ce9f9ce..5056c5f7f54 100644 --- a/backends/arm/operators/node_visitor.py +++ b/backends/arm/operators/node_visitor.py @@ -5,10 +5,10 @@ # pyre-unsafe -from typing import Dict, List +from typing import Any, Dict, List -import serializer.tosa_serializer as ts # type: ignore import torch + from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification from torch.export import ExportedProgram @@ -24,11 +24,18 @@ class NodeVisitor: # a specific TOSA version. 
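For context (not part of this patch): a minimal sketch, assuming torch >= 2.1 with torch.export available, of what the new CheckFloat64Inputs negative check keys on. Placeholder nodes of an exported graph carry a fake tensor in node.meta["val"]; the check walks a node's inputs (via get_first_fake_tensor) and rejects the node as soon as one of them is float64. The module M below is a placeholder for this sketch.

import torch

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

# Export with a float64 input; the resulting placeholder carries a float64
# fake tensor, which is the dtype the negative check inspects.
ep = torch.export.export(M(), (torch.ones(2, dtype=torch.float64),))
placeholder = next(n for n in ep.graph_module.graph.nodes if n.op == "placeholder")
assert placeholder.meta["val"].dtype == torch.float64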
# When all node_visitors has been refactored to target a specific # version, this list should be removed. - tosa_specs = [ + tosa_specs_1_00 = [ + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), + ] + + tosa_specs_0_80 = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), ] + tosa_specs = tosa_specs_0_80 + tosa_specs_1_00 + def __init__(self, exported_program: ExportedProgram, tosa_spec: TosaSpecification): self._exported_program = exported_program self.tosa_spec = tosa_spec @@ -36,7 +43,7 @@ def __init__(self, exported_program: ExportedProgram, tosa_spec: TosaSpecificati def define_node( self, node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, inputs: List[TosaArg], output: TosaArg, ) -> None: @@ -47,6 +54,8 @@ def define_node( _node_visitor_dicts: Dict[TosaSpecification, Dict] = { TosaSpecification.create_from_string("TOSA-0.80+BI"): {}, TosaSpecification.create_from_string("TOSA-0.80+MI"): {}, + TosaSpecification.create_from_string("TOSA-1.0+INT"): {}, + TosaSpecification.create_from_string("TOSA-1.0+FP"): {}, } diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py index 886a96fd520..648edde04f4 100644 --- a/backends/arm/operators/op_abs.py +++ b/backends/arm/operators/op_abs.py @@ -9,15 +9,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -70,7 +68,7 @@ def define_node( # Do the INT32 Abs tosa_graph.addOperator( - TosaOp.Op().ABS, + ts.TosaOp.Op().ABS, [ rescaled_inputs[0].name, ], @@ -126,7 +124,7 @@ def define_node( # MI lowering tosa_graph.addOperator( - TosaOp.Op().ABS, + ts.TosaOp.Op().ABS, [inputs[0].name], [output.name], None, diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index 1be4a218232..904a2405047 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -10,14 +10,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -82,7 +81,7 @@ def define_node( # Do the INT32 Add tosa_graph.addOperator( - TosaOp.Op().ADD, + ts.TosaOp.Op().ADD, [input1.name, input2.name], [add_output.name], None, @@ -135,7 +134,7 @@ def define_node( # MI lowering tosa_graph.addOperator( - TosaOp.Op().ADD, + ts.TosaOp.Op().ADD, [input1.name, input2.name], [output.name], None, diff --git a/backends/arm/operators/op_amax.py b/backends/arm/operators/op_amax.py index 7347648c454..059f6c1e553 100644 --- a/backends/arm/operators/op_amax.py +++ b/backends/arm/operators/op_amax.py @@ -4,14 +4,13 @@ # 
LICENSE file in the root directory of this source tree. from typing import List -import serializer.tosa_serializer as ts +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -48,5 +47,5 @@ def define_node( attr.AxisAttribute(input.dim_order.index(dim)) tosa_graph.addOperator( - TosaOp.Op().REDUCE_MAX, [input.name], [output.name], attr + ts.TosaOp.Op().REDUCE_MAX, [input.name], [output.name], attr ) diff --git a/backends/arm/operators/op_amin.py b/backends/arm/operators/op_amin.py index 37625cfcc52..85e43b76c4c 100644 --- a/backends/arm/operators/op_amin.py +++ b/backends/arm/operators/op_amin.py @@ -4,14 +4,13 @@ # LICENSE file in the root directory of this source tree. from typing import List -import serializer.tosa_serializer as ts +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -48,5 +47,5 @@ def define_node( attr.AxisAttribute(input.dim_order.index(dim)) tosa_graph.addOperator( - TosaOp.Op().REDUCE_MIN, [input.name], [output.name], attr + ts.TosaOp.Op().REDUCE_MIN, [input.name], [output.name], attr ) diff --git a/backends/arm/operators/op_any.py b/backends/arm/operators/op_any.py index ffb2e8a3c5d..b65ebb2ac5d 100644 --- a/backends/arm/operators/op_any.py +++ b/backends/arm/operators/op_any.py @@ -6,14 +6,13 @@ # pyre-unsafe from typing import cast, List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( # type: ignore NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg # type: ignore -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -49,5 +48,5 @@ def define_node( attr.AxisAttribute(inputs[0].dim_order.index(dim)) tosa_graph.addOperator( - TosaOp.Op().REDUCE_ANY, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().REDUCE_ANY, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py index 772f8353565..bdd3425fda5 100644 --- a/backends/arm/operators/op_avg_pool2d.py +++ b/backends/arm/operators/op_avg_pool2d.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py index af02fc30dd8..6dc0ec8002d 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_bmm.py @@ -7,9 +7,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from 
executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, @@ -20,7 +21,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_quant_utils import build_rescale -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -64,7 +64,7 @@ def define_node( attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp) tosa_graph.addOperator( - TosaOp.Op().MATMUL, + ts.TosaOp.Op().MATMUL, [inputs[0].name, inputs[1].name], [bmm_output_name], attr, diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py index f786395cc39..6b1710301b1 100644 --- a/backends/arm/operators/op_cat.py +++ b/backends/arm/operators/op_cat.py @@ -7,13 +7,12 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -42,5 +41,8 @@ def define_node( attr.AxisAttribute(dim) tosa_graph.addOperator( - TosaOp.Op().CONCAT, [tensor.name for tensor in tensors], [output.name], attr + ts.TosaOp.Op().CONCAT, + [tensor.name for tensor in tensors], + [output.name], + attr, ) diff --git a/backends/arm/operators/op_clamp.py b/backends/arm/operators/op_clamp.py index 7c4ad8682fa..b18ed640b5f 100644 --- a/backends/arm/operators/op_clamp.py +++ b/backends/arm/operators/op_clamp.py @@ -8,9 +8,9 @@ from typing import Any, List, Tuple -import serializer.tosa_serializer as ts # type: ignore - import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -18,7 +18,6 @@ from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -51,7 +50,7 @@ def _create_clamp_node( min_fp32, max_fp32, ) - tosa_graph.addOperator(TosaOp.Op().CLAMP, [input_name], [output_name], attr) + tosa_graph.addOperator(ts.TosaOp.Op().CLAMP, [input_name], [output_name], attr) def _get_min_max_arguments( self, node: Node, dtype_min: int | float, dtype_max: int | float @@ -64,7 +63,8 @@ def cast_type(value: Any) -> int | float: # Attempt to cast to float return float(value) - assert 2 <= len(node.args) <= 3 + if len(node.args) != 2 and len(node.args) != 3: + raise ValueError(f"Expected len(node.args) to be 2 or 3, got {node.args}") min_arg = dtype_min max_arg = dtype_max @@ -85,7 +85,10 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert len(node.all_input_nodes) == 1 + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) min_int8, max_int8 = self._get_min_max_arguments( node, @@ -123,7 +126,10 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert len(node.all_input_nodes) == 1 + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) if inputs[0].dtype == ts.DType.INT8: # Call the inherited define_node for handling integers diff --git a/backends/arm/operators/op_constant_pad_nd.py b/backends/arm/operators/op_constant_pad_nd.py index 73f6d2751c5..b2c31df96ab 
100644 --- a/backends/arm/operators/op_constant_pad_nd.py +++ b/backends/arm/operators/op_constant_pad_nd.py @@ -7,9 +7,10 @@ from typing import List -import serializer.tosa_serializer as ts import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, ) @@ -18,7 +19,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -71,4 +71,6 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.PadAttribute(tosa_graph.builder, output_pad, pad_const_qs, pad_const_fp) - tosa_graph.addOperator(TosaOp.Op().PAD, [inputs[0].name], [output.name], attr) + tosa_graph.addOperator( + ts.TosaOp.Op().PAD, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 2fe00b6758f..90475af1476 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py index 02fc89099e0..7f87fb5a81d 100644 --- a/backends/arm/operators/op_eq.py +++ b/backends/arm/operators/op_eq.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "EQ must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator EQ but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -51,7 +52,7 @@ def define_node( # Do the equal comparison tosa_graph.addOperator( - TosaOp.Op().EQUAL, + ts.TosaOp.Op().EQUAL, [input_nodes[0].name, input_nodes[1].name], output.name, None, diff --git a/backends/arm/operators/op_erf.py b/backends/arm/operators/op_erf.py index d0dc2af572f..01243716129 100644 --- a/backends/arm/operators/op_erf.py +++ b/backends/arm/operators/op_erf.py @@ -5,15 +5,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch.fx +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp @register_node_visitor diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py index 4b8232ef6e7..ca067b3b8be 100644 --- a/backends/arm/operators/op_exp.py +++ 
b/backends/arm/operators/op_exp.py @@ -6,15 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -36,7 +34,14 @@ def define_node( output: TosaArg, ) -> None: - assert len(node.all_input_nodes) == 1 - assert inputs[0].dtype == output.dtype == ts.DType.FP32 - - tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name]) + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) + if inputs[0].dtype != ts.DType.FP32 or output.dtype != ts.DType.FP32: + raise ValueError( + f"Input and output for {self.target} need to be FP32, got input dtype: " + f"{inputs[0].dtype} and output dtype: {output.dtype}" + ) + + tosa_graph.addOperator(ts.TosaOp.Op().EXP, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_full.py b/backends/arm/operators/op_full.py deleted file mode 100644 index f06b9873e63..00000000000 --- a/backends/arm/operators/op_full.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe -from typing import List - -import numpy as np - -import serializer.tosa_serializer as ts # type: ignore -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import tosa_shape -from torch.fx import Node - - -@register_node_visitor -class FullVisitor(NodeVisitor): - target = "aten.full.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - - shape = tosa_shape(inputs[0].special, output.dim_order) - - value = inputs[1].number - - if output.dtype == ts.DType.INT8: - fill_dtype = np.int8 - else: - fill_dtype = np.float32 # type: ignore[assignment] - data = np.full(shape, value, dtype=fill_dtype) - - tosa_graph.addConst(shape, output.dtype, data, node.name + "full-const") - tosa_graph.addOperator( - ts.TosaOp.Op.IDENTITY, [node.name + "full-const"], [output.name] - ) diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py index e4de12f3327..b2193a2e7ed 100644 --- a/backends/arm/operators/op_ge.py +++ b/backends/arm/operators/op_ge.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "GE must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise 
TypeError( + "All inputs need to have the same data type for operator GE but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER_EQUAL, + ts.TosaOp.Op().GREATER_EQUAL, [input_nodes[0].name, input_nodes[1].name], [output.name], None, diff --git a/backends/arm/operators/op_get_item.py b/backends/arm/operators/op_get_item.py deleted file mode 100644 index 577a8c8d2ea..00000000000 --- a/backends/arm/operators/op_get_item.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2023-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe -from typing import List - -import serializer.tosa_serializer as ts # type: ignore -import torch -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class GetItemVisitor(NodeVisitor): - target = "getitem" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - item_name = inputs[0].name - ## Simply add an identityOp - tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name]) diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py index 65cf8197bdc..06f29e4505c 100644 --- a/backends/arm/operators/op_gt.py +++ b/backends/arm/operators/op_gt.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "GT must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator GT but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER, + ts.TosaOp.Op().GREATER, [input_nodes[0].name, input_nodes[1].name], [output.name], None, diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py index 8fea2b92088..fadf4848359 100644 --- a/backends/arm/operators/op_le.py +++ b/backends/arm/operators/op_le.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "LE 
must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator LE but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER_EQUAL, + ts.TosaOp.Op().GREATER_EQUAL, [input_nodes[1].name, input_nodes[0].name], [output.name], None, diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py index d8a136e37f8..34911075065 100644 --- a/backends/arm/operators/op_log.py +++ b/backends/arm/operators/op_log.py @@ -6,15 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -38,4 +36,4 @@ def define_node( assert len(node.all_input_nodes) == 1 assert inputs[0].dtype == output.dtype == ts.DType.FP32 - tosa_graph.addOperator(TosaOp.Op().LOG, [inputs[0].name], [output.name]) + tosa_graph.addOperator(ts.TosaOp.Op().LOG, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py index da93ab41799..a261cd2db9f 100644 --- a/backends/arm/operators/op_lt.py +++ b/backends/arm/operators/op_lt.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "LT must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator LT but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER, + ts.TosaOp.Op().GREATER, [input_nodes[1].name, input_nodes[0].name], [output.name], None, diff --git a/backends/arm/operators/op_max_pool2d.py b/backends/arm/operators/op_max_pool2d.py index 9dd627a3e4f..fcf2636977d 100644 --- a/backends/arm/operators/op_max_pool2d.py +++ b/backends/arm/operators/op_max_pool2d.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, @@ -18,7 +19,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -75,7 +75,7 @@ def define_node( ) tosa_graph.addOperator( - TosaOp.Op().MAX_POOL2D, + ts.TosaOp.Op().MAX_POOL2D, [input_tensor.name], [output.name], attr, diff 
--git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py index 4eb7e47fac8..ee52e5276cd 100644 --- a/backends/arm/operators/op_maximum.py +++ b/backends/arm/operators/op_maximum.py @@ -8,7 +8,7 @@ from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, @@ -19,8 +19,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -38,20 +36,27 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == inputs[1].dtype + if inputs[0].dtype != inputs[1].dtype and inputs[0].dtype != output.dtype: + raise TypeError( + f"Data type of inputs and output must be the same. Got input 0 dtype: " + f"{inputs[0].dtype}, input 1 dtype: {inputs[1].dtype} and output " + f"dtype: {output.dtype}" + ) scale_back = 1.0 max_output = output if inputs[0].dtype == ts.DType.INT8: input_qparams = get_input_qparams(node) - assert ( - len(input_qparams) == 2 - ), f"Both inputs needs to have quantization information for {node}" - # insert RESCALEs to int32 - assert ( - input_qparams[0] == input_qparams[1] - ), "Both inputs must have same quantization for MAX" + if len(input_qparams) != 2: + raise ValueError( + f"Both inputs need to have quantization information for {node}" + ) + if input_qparams[0] != input_qparams[1]: + raise ValueError( + "Both inputs must have the same quantization parameters for MAX" + ) + # insert RESCALEs to int32 operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( tosa_graph, inputs, node ) @@ -62,7 +67,7 @@ def define_node( operand_inputs = inputs tosa_graph.addOperator( - TosaOp.Op().MAXIMUM, + ts.TosaOp.Op().MAXIMUM, [ operand_inputs[0].name, operand_inputs[1].name, diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py index 1b8c1960411..88cb8d376fe 100644 --- a/backends/arm/operators/op_minimum.py +++ b/backends/arm/operators/op_minimum.py @@ -9,7 +9,7 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, @@ -20,8 +20,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -39,20 +37,27 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == inputs[1].dtype + if inputs[0].dtype != inputs[1].dtype and inputs[0].dtype != output.dtype: + raise TypeError( + f"Data type of inputs and output must be the same. 
Got input 0 dtype: " + f"{inputs[0].dtype}, input 1 dtype: {inputs[1].dtype} and output " + f"dtype: {output.dtype}" + ) scale_back = 1.0 min_output = output if inputs[0].dtype == ts.DType.INT8: input_qparams = get_input_qparams(node) - assert ( - len(input_qparams) == 2 - ), f"Both inputs needs to have quantization information for {node}" - # insert RESCALEs to int32 - assert ( - input_qparams[0] == input_qparams[1] - ), "Both inputs must have same quantization for MIN" + if len(input_qparams) != 2: + raise ValueError( + f"Both inputs need to have quantization information for {node}" + ) + if input_qparams[0] != input_qparams[1]: + raise ValueError( + "Both inputs must have the same quantization parameters for MIN" + ) + # insert RESCALEs to int32 operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( tosa_graph, inputs, node ) @@ -63,7 +68,7 @@ def define_node( operand_inputs = inputs tosa_graph.addOperator( - TosaOp.Op().MINIMUM, + ts.TosaOp.Op().MINIMUM, [ operand_inputs[0].name, operand_inputs[1].name, diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py index 2f6c7e7130c..dcceb36b0ab 100644 --- a/backends/arm/operators/op_mul.py +++ b/backends/arm/operators/op_mul.py @@ -9,10 +9,10 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils - -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, ) @@ -24,7 +24,6 @@ from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.arm.tosa_utils import reshape_for_broadcast -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -87,7 +86,7 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.MulAttribute(shift=0) tosa_graph.addOperator( - TosaOp.Op().MUL, + ts.TosaOp.Op().MUL, [input1.name, input2.name], [mul_output.name], attr, @@ -119,5 +118,5 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.MulAttribute(shift=0) tosa_graph.addOperator( - TosaOp.Op().MUL, [input1.name, input2.name], [output.name], attr + ts.TosaOp.Op().MUL, [input1.name, input2.name], [output.name], attr ) diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index e659918baf2..c92a008a281 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -7,14 +7,14 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp def permutation_vector_to_matrix(permutation_vector: list[int]) -> torch.Tensor: @@ -117,5 +117,5 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.TransposeAttribute(permutation_vector) tosa_graph.addOperator( - TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_pow.py b/backends/arm/operators/op_pow.py index 0f251a8aa6d..d3b92feff12 100644 --- a/backends/arm/operators/op_pow.py +++ b/backends/arm/operators/op_pow.py @@ -7,14 +7,13 @@ from 
typing import List -import serializer.tosa_serializer as ts +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -47,7 +46,7 @@ def define_node( ) tosa_graph.addOperator( - TosaOp.Op().POW, + ts.TosaOp.Op().POW, [ inputs[0].name, inputs[1].name, diff --git a/backends/arm/operators/op_reciprocal.py b/backends/arm/operators/op_reciprocal.py index 5410e1dd99a..c75fb99977e 100644 --- a/backends/arm/operators/op_reciprocal.py +++ b/backends/arm/operators/op_reciprocal.py @@ -6,15 +6,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -34,5 +34,16 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == output.dtype == ts.DType.FP32 - tosa_graph.addOperator(TosaOp.Op().RECIPROCAL, [inputs[0].name], [output.name]) + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) + if inputs[0].dtype != ts.DType.FP32 or output.dtype != ts.DType.FP32: + raise ValueError( + f"Input and output for {self.target} need to be FP32, got " + f"{inputs[0].dtype=} and {output.dtype=}" + ) + + tosa_graph.addOperator( + ts.TosaOp.Op().RECIPROCAL, [inputs[0].name], [output.name] + ) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index b97d7023ef0..142ccb1d25a 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -5,15 +5,14 @@ # pyre-unsafe -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -35,4 +34,6 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.TileAttribute(tosa_shape(multiples, output.dim_order)) - tosa_graph.addOperator(TosaOp.Op().TILE, [inputs[0].name], [output.name], attr) + tosa_graph.addOperator( + ts.TosaOp.Op().TILE, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_rescale.py index 098fbeccce1..c59015dcc14 100644 --- a/backends/arm/operators/op_rescale.py +++ b/backends/arm/operators/op_rescale.py @@ -8,10 +8,10 @@ from typing import cast, List import executorch.backends.arm.tosa_quant_utils as tosa_quant_utils -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore -import tosa.Op as TosaOp # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, 
register_node_visitor, diff --git a/backends/arm/operators/op_rshift_tensor.py b/backends/arm/operators/op_rshift_tensor.py index 8ea0343faaa..125f5493a29 100644 --- a/backends/arm/operators/op_rshift_tensor.py +++ b/backends/arm/operators/op_rshift_tensor.py @@ -7,15 +7,15 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import Tosa_0_80 -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -39,7 +39,7 @@ def define_node( attr.ArithmeticRightShiftAttribute(round=round) tosa_graph.addOperator( - TosaOp.Op().ARITHMETIC_RIGHT_SHIFT, + ts.TosaOp.Op().ARITHMETIC_RIGHT_SHIFT, [inputs[0].name, inputs[1].name], [output.name], attr, diff --git a/backends/arm/operators/op_rsqrt.py b/backends/arm/operators/op_rsqrt.py index 0fbb203b081..e3937f8c44a 100644 --- a/backends/arm/operators/op_rsqrt.py +++ b/backends/arm/operators/op_rsqrt.py @@ -6,15 +6,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -34,5 +34,14 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == output.dtype == ts.DType.FP32 - tosa_graph.addOperator(TosaOp.Op().RSQRT, [inputs[0].name], [output.name]) + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) + if inputs[0].dtype != ts.DType.FP32 or output.dtype != ts.DType.FP32: + raise ValueError( + f"Input and output for {self.target} need to be FP32, got " + f"{inputs[0].dtype=} and {output.dtype=}" + ) + + tosa_graph.addOperator(ts.TosaOp.Op().RSQRT, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_sigmoid.py b/backends/arm/operators/op_sigmoid.py index abf60bf747f..9a002036fee 100644 --- a/backends/arm/operators/op_sigmoid.py +++ b/backends/arm/operators/op_sigmoid.py @@ -6,15 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -46,4 +44,4 @@ def define_node( f"{inputs[0].dtype} and output_dtype: {output.dtype}" ) - tosa_graph.addOperator(TosaOp.Op().SIGMOID, [inputs[0].name], [output.name]) + tosa_graph.addOperator(ts.TosaOp.Op().SIGMOID, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py index a3ce80c5b24..27ae977a5bc 100644 --- a/backends/arm/operators/op_slice.py +++ b/backends/arm/operators/op_slice.py @@ -7,13 +7,12 @@ from typing import List -import serializer.tosa_serializer as ts # 
type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -24,6 +23,18 @@ class SliceVisitor(NodeVisitor): def __init__(self, *args): super().__init__(*args) + def _fixup_start(self, start, shape, dim): + if start.number < 0: + return start.number % shape[dim] + else: + return start.number + + def _fixup_end(self, end, shape, dim): + if end.number < 0: + return end.number % shape[dim] + else: + return min(end.number, shape[dim]) + def define_node( self, node: Node, @@ -43,20 +54,24 @@ def define_node( # Translate and check parameters in Pytorch dim order. shape = input_node.shape dim = dim.number - if end.number < 0: - end_index = end.number % shape[dim] - else: - end_index = min(end.number, shape[dim]) - size = end_index - start.number + + start_index = self._fixup_start(start, shape, dim) + end_index = self._fixup_end(end, shape, dim) + size = end_index - start_index + assert size > 0 assert size <= shape[dim] # Convert aten args to Tosa's start and size attributes and in TOSA dim order. attr = ts.TosaSerializerAttribute() - start_attr = [start.number if i == dim else 0 for i in input_node.dim_order] + + start_attr = [ + self._fixup_start(start, shape, dim) if i == dim else 0 + for i in input_node.dim_order + ] size_attr = [size if i == dim else shape[i] for i in input_node.dim_order] attr.SliceAttribute(start_attr, size_attr) tosa_graph.addOperator( - TosaOp.Op().SLICE, [input_node.name], [output.name], attr + ts.TosaOp.Op().SLICE, [input_node.name], [output.name], attr ) diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 6cd422095ab..ef9ed31c88d 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -10,14 +10,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -41,9 +40,19 @@ def define_node( ) -> None: # Specification (0.80) states that input and output types # should all be the same - assert inputs[0].dtype == inputs[1].dtype == output.dtype + if inputs[0].dtype != inputs[1].dtype or inputs[0].dtype != output.dtype: + raise TypeError( + f"All IO needs to have the same data type, got input 1: " + f"{inputs[0].dtype}, input 2: {inputs[1].dtype} and output: " + f"{output.dtype}" + ) + # Handle int8 (quantized) and int32 - assert inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32] + supported_dtypes = [ts.DType.INT8, ts.DType.INT32] + if inputs[0].dtype not in supported_dtypes: + raise TypeError( + f'IO data type needs to be {supported_dtypes}, got "{inputs[0].dtype}"' + ) if inputs[0].dtype == ts.DType.INT8: rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( @@ -63,7 +72,7 @@ def define_node( # Do the INT32 Sub tosa_graph.addOperator( - TosaOp.Op().SUB, + ts.TosaOp.Op().SUB, [ rescaled_inputs[0].name, rescaled_inputs[1].name, @@ -98,19 +107,31 @@ def 
define_node( ) -> None: # Specification (0.80) states that input and output types # should all be the same - assert inputs[0].dtype == inputs[1].dtype == output.dtype + if inputs[0].dtype != inputs[1].dtype or inputs[0].dtype != output.dtype: + raise TypeError( + f"All IO needs to have the same data type, got input 1: " + f"{inputs[0].dtype}, input 2: {inputs[1].dtype} and output: " + f"{output.dtype}" + ) if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]: # Call the inherited define_node for handling integers super().define_node(node, tosa_graph, inputs, output) else: # FP32 Sub lowering - assert inputs[0].dtype == ts.DType.FP32 - assert output.dtype == ts.DType.FP32 + if ( + inputs[0].dtype != ts.DType.FP32 + or inputs[1].dtype != ts.DType.FP32 + or output.dtype != ts.DType.FP32 + ): + raise TypeError( + f"All IO needs to have data type fp32. Got: {inputs[0].dtype}, " + f"input 2: {inputs[1].dtype} and output: {output.dtype}" + ) # MI lowering tosa_graph.addOperator( - TosaOp.Op().SUB, + ts.TosaOp.Op().SUB, [inputs[0].name, inputs[1].name], [output.name], None, diff --git a/backends/arm/operators/op_sum.py b/backends/arm/operators/op_sum.py index b5b388b3352..135566e48ac 100644 --- a/backends/arm/operators/op_sum.py +++ b/backends/arm/operators/op_sum.py @@ -10,14 +10,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -69,7 +68,7 @@ def define_node( ) tosa_graph.addOperator( - TosaOp.Op().REDUCE_SUM, [prev_node.name], [next_node.name], attr + ts.TosaOp.Op().REDUCE_SUM, [prev_node.name], [next_node.name], attr ) prev_node = next_node @@ -120,7 +119,7 @@ def define_node( ).name tosa_graph.addOperator( - TosaOp.Op().REDUCE_SUM, [input_name], [output_name], attr + ts.TosaOp.Op().REDUCE_SUM, [input_name], [output_name], attr ) input_name = output_name diff --git a/backends/arm/operators/op_table.py b/backends/arm/operators/op_table.py index 40214b265f0..6a2053bea0d 100644 --- a/backends/arm/operators/op_table.py +++ b/backends/arm/operators/op_table.py @@ -8,15 +8,14 @@ from typing import List import numpy as np - -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -49,5 +48,5 @@ def define_node( table_attr.TableAttribute(np.array(table)) tosa_graph.addOperator( - TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ts.TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr ) diff --git a/backends/arm/operators/op_tanh.py b/backends/arm/operators/op_tanh.py index 89dd15c97d6..51cf1ee786b 100644 --- a/backends/arm/operators/op_tanh.py +++ b/backends/arm/operators/op_tanh.py @@ -6,14 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from 
executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -44,4 +43,4 @@ def define_node( f"{inputs[0].dtype} and output_dtype: {output.dtype}" ) - tosa_graph.addOperator(TosaOp.Op().TANH, [inputs[0].name], [output.name]) + tosa_graph.addOperator(ts.TosaOp.Op().TANH, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_to_copy.py b/backends/arm/operators/op_to_copy.py index feaec3a41e9..90485b71d50 100644 --- a/backends/arm/operators/op_to_copy.py +++ b/backends/arm/operators/op_to_copy.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch -import tosa.Op as TosaOp # type: ignore + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, diff --git a/backends/arm/operators/op_to_dim_order_copy.py b/backends/arm/operators/op_to_dim_order_copy.py index 397979a439d..f144beba29f 100644 --- a/backends/arm/operators/op_to_dim_order_copy.py +++ b/backends/arm/operators/op_to_dim_order_copy.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch -import tosa.Op as TosaOp # type: ignore + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_transpose.py index 54a79297dd6..b909aef2ac9 100644 --- a/backends/arm/operators/op_transpose.py +++ b/backends/arm/operators/op_transpose.py @@ -7,14 +7,14 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -39,5 +39,5 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.TransposeAttribute(perms) tosa_graph.addOperator( - TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_upsample_nearest2d.py b/backends/arm/operators/op_upsample_nearest2d.py index 38e4087d38d..23d24b78339 100644 --- a/backends/arm/operators/op_upsample_nearest2d.py +++ b/backends/arm/operators/op_upsample_nearest2d.py @@ -6,17 +6,17 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import get_resize_parameters, tosa_shape -from serializer.tosa_serializer import TosaOp -from tosa.ResizeMode import ResizeMode # type: ignore +from tosa_tools.v0_80.tosa.ResizeMode import ResizeMode # type: ignore @register_node_visitor @@ -65,5 +65,5 @@ 
def in_int16_range(x): ) tosa_graph.addOperator( - TosaOp.Op().RESIZE, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().RESIZE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 119e32fa58f..e063b8e39ec 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch -import tosa.Op as TosaOp # type: ignore + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, diff --git a/backends/arm/operators/op_where.py b/backends/arm/operators/op_where.py index c8b35e831d4..ba2469e74e1 100644 --- a/backends/arm/operators/op_where.py +++ b/backends/arm/operators/op_where.py @@ -5,7 +5,8 @@ from typing import List, Sequence -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -13,7 +14,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node diff --git a/backends/arm/operators/ops_binary.py b/backends/arm/operators/ops_binary.py index 307710e38e9..a17da41f767 100644 --- a/backends/arm/operators/ops_binary.py +++ b/backends/arm/operators/ops_binary.py @@ -7,16 +7,16 @@ from typing import List -import serializer.tosa_serializer as ts import torch import torch.fx +import tosa_tools.v0_80.serializer.tosa_serializer as ts + from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp def binary_operator_factory(bw_target: str, tosa_op): @@ -46,12 +46,12 @@ def define_node( register_node_visitor(BinaryOperator) -binary_operator_factory("aten.bitwise_and.Tensor", TosaOp.Op().BITWISE_AND) -binary_operator_factory("aten.bitwise_xor.Tensor", TosaOp.Op().BITWISE_XOR) -binary_operator_factory("aten.bitwise_or.Tensor", TosaOp.Op().BITWISE_OR) -binary_operator_factory("aten.logical_and.default", TosaOp.Op().LOGICAL_AND) -binary_operator_factory("aten.logical_xor.default", TosaOp.Op().LOGICAL_XOR) -binary_operator_factory("aten.logical_or.default", TosaOp.Op().LOGICAL_OR) +binary_operator_factory("aten.bitwise_and.Tensor", ts.TosaOp.Op().BITWISE_AND) +binary_operator_factory("aten.bitwise_xor.Tensor", ts.TosaOp.Op().BITWISE_XOR) +binary_operator_factory("aten.bitwise_or.Tensor", ts.TosaOp.Op().BITWISE_OR) +binary_operator_factory("aten.logical_and.default", ts.TosaOp.Op().LOGICAL_AND) +binary_operator_factory("aten.logical_xor.default", ts.TosaOp.Op().LOGICAL_XOR) +binary_operator_factory("aten.logical_or.default", ts.TosaOp.Op().LOGICAL_OR) binary_operator_factory( - "aten.bitwise_left_shift.Tensor", TosaOp.Op().LOGICAL_LEFT_SHIFT + "aten.bitwise_left_shift.Tensor", ts.TosaOp.Op().LOGICAL_LEFT_SHIFT ) diff --git a/backends/arm/operators/ops_identity.py b/backends/arm/operators/ops_identity.py new file mode 100644 index 00000000000..0c6527cf336 --- /dev/null +++ b/backends/arm/operators/ops_identity.py @@ -0,0 +1,47 @@ +# Copyright 2025 Arm 
Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import List + +import torch +import torch.fx + +import tosa_tools.v0_80.serializer.tosa_serializer as ts + +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + + +def identity_operator_factory(identity_target: str): + """ + Creates and registers NodeVisitors for operators that map directly + to a TOSA IDENTITY op. + """ + + class IdentityOperatorVisitor(NodeVisitor): + target = identity_target + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + # Simply add an identityOp + tosa_graph.addOperator( + ts.TosaOp.Op().IDENTITY, [inputs[0].name], [output.name] + ) + + register_node_visitor(IdentityOperatorVisitor) + + +identity_operator_factory("getitem") +identity_operator_factory("aten.alias_copy.default") diff --git a/backends/arm/operators/ops_unary.py b/backends/arm/operators/ops_unary.py index 0a7d45ffe98..3f713e086e6 100644 --- a/backends/arm/operators/ops_unary.py +++ b/backends/arm/operators/ops_unary.py @@ -6,15 +6,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch.fx + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp def unary_operator_factory(unary_target: str, tosa_op): @@ -53,6 +53,6 @@ def define_node( register_node_visitor(UnaryOperator) -unary_operator_factory("aten.ceil.default", TosaOp.Op().CEIL) -unary_operator_factory("aten.floor.default", TosaOp.Op().FLOOR) -unary_operator_factory("aten.logical_not.default", TosaOp.Op().LOGICAL_NOT) +unary_operator_factory("aten.ceil.default", ts.TosaOp.Op().CEIL) +unary_operator_factory("aten.floor.default", ts.TosaOp.Op().FLOOR) +unary_operator_factory("aten.logical_not.default", ts.TosaOp.Op().LOGICAL_NOT) diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index f9b77e28493..6692b75c892 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -5,22 +5,33 @@ # # pyre-unsafe -from typing import cast, Dict +from typing import Any, cast, Dict import numpy as np -import serializer.tosa_serializer as ts # type: ignore import torch import torch.fx from executorch.backends.arm.operators.node_visitor import NodeVisitor from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.backends.arm.tosa_specification import ( + Tosa_0_80, + Tosa_1_00, + TosaSpecification, +) from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape +from torch._export.utils import ( + get_buffer, + get_lifted_tensor_constant, + get_param, + is_buffer, + is_lifted_tensor_constant, + is_param, +) from torch.export.exported_program import ExportedProgram def process_call_function( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, node_visitors: Dict[str, NodeVisitor], tosa_spec: TosaSpecification, ): @@ -55,7 +66,7 @@ def process_call_function( def process_inputs( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: 
Any, tosa_spec: TosaSpecification, ): """Serialize an input node""" @@ -73,6 +84,14 @@ def process_inputs( f"Failed processing input placeholder: {node.name}. " "Is the original torch function supported?" ) from e + + if isinstance(tosa_spec, Tosa_0_80): + import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + elif isinstance(tosa_spec, Tosa_1_00): + import serializer.tosa_serializer as ts + else: + raise ValueError(f"Unsupported TOSA spec: {tosa_spec}") + input_shape = tosa_arg.shape input_dim_order = tosa_arg.dim_order tensor = ts.TosaSerializerTensor( @@ -87,7 +106,7 @@ def process_inputs( def process_inputs_to_parameters( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, tosa_spec: TosaSpecification, ): @@ -99,8 +118,7 @@ def process_inputs_to_parameters( f"Failed processing parameter placeholder: {node.name}. " "Is the original torch function supported?" ) from e - parameter_name = edge_program.graph_signature.inputs_to_parameters[tosa_arg.name] - parameter_data = edge_program.state_dict[parameter_name] + parameter_data = get_param(edge_program, node) assert isinstance(parameter_data, torch.Tensor), "Expect Attr to be tensor" parameter_values = parameter_data.detach().numpy() @@ -117,7 +135,7 @@ def process_inputs_to_parameters( def process_inputs_to_buffers( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, ): """Serialize quantized weights""" @@ -128,8 +146,7 @@ def process_inputs_to_buffers( f"Failed processing buffer placeholder: {node.name}. " "Is the original torch function supported?" ) from e - buffer_name = edge_program.graph_signature.inputs_to_buffers[node.name] - buffer_data = edge_program.state_dict[buffer_name] + buffer_data = get_buffer(edge_program, node) assert isinstance(buffer_data, torch.Tensor), "Expect Attr to be tensor" buffer_values = buffer_data.detach().numpy() @@ -146,7 +163,7 @@ def process_inputs_to_buffers( def process_inputs_to_lifted_tensor_constants( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, ): try: @@ -156,11 +173,8 @@ def process_inputs_to_lifted_tensor_constants( f"Failed processing lifted tensor constant placeholder: {node.name}. " "Is the original torch function supported?" 
) from e - tensor_name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[ - tosa_arg.name - ] - tensor = edge_program.tensor_constants[tensor_name] - tensor_data = tensor.detach().numpy() + tensor = get_lifted_tensor_constant(edge_program, node) + tensor_data = tensor.detach().numpy() # type: ignore[union-attr] tosa_graph.addConst( tensor_data.shape, tosa_arg.dtype, tensor_data, name=tosa_arg.name @@ -169,7 +183,7 @@ def process_inputs_to_lifted_tensor_constants( def process_placeholder( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, tosa_spec: TosaSpecification, ): @@ -179,11 +193,11 @@ def process_placeholder( if node.name in edge_program.graph_signature.user_inputs: process_inputs(node, tosa_graph, tosa_spec) - elif node.name in edge_program.graph_signature.inputs_to_parameters: + elif is_param(edge_program, node): process_inputs_to_parameters(node, tosa_graph, edge_program, tosa_spec) - elif node.name in edge_program.graph_signature.inputs_to_buffers: + elif is_buffer(edge_program, node): process_inputs_to_buffers(node, tosa_graph, edge_program) - elif node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants: + elif is_lifted_tensor_constant(edge_program, node): process_inputs_to_lifted_tensor_constants(node, tosa_graph, edge_program) elif node.name in edge_program.graph_signature.inputs_to_lifted_custom_objs: raise NotImplementedError( @@ -195,7 +209,7 @@ def process_placeholder( def process_output( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, ): for output in cast(tuple[torch.fx.Node, ...], node.args[0]): tosa_graph.addOutputTensor( diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index e76ed5fb415..ee08f8e9eec 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -286,10 +286,10 @@ def _annotate_all_static_patterns( quantization_config: Optional[QuantizationConfig], filter_fn: Optional[Callable[[Node], bool]] = None, ) -> GraphModule: - """Loops over all STATIC_OPS and runs the corresponding registred annotator. + """Loops over all STATIC_OPS and runs the corresponding registered annotator. Args: model: The model to annotate statically. - quantization_config: Specifices the QuantizationSpecs for the model's + quantization_config: Specifies the QuantizationSpecs for the model's input activations, output activations, weights and biases. filter_fn: An optional filter function that takes a node and returns whether the node should be annotated. 
Returns: diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index b0f9e90b10f..5398101fd9a 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -178,6 +178,7 @@ def _match_pattern( torch.ops.aten.hardswish_.default, torch.ops.aten.full_like.default, torch.ops.aten.pow.Tensor_Scalar, + torch.ops.aten.gelu.default, ] _one_to_one_shared_input_qspec = [ @@ -243,6 +244,11 @@ def _match_pattern( operator.getitem, ] +_one_to_one_shared_input_or_input_act_qspec = [ + torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.alias_copy.default, +] + def get_quant_properties( # noqa: C901 node: Node, gm: torch.fx.GraphModule, quantization_config @@ -331,7 +337,7 @@ def any_or_hardtanh_min_zero(n: Node): _QuantProperty(2, shared_qspec), # type: ignore[arg-type] ] quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type] - elif node.target == torch.ops.aten.adaptive_avg_pool2d.default: + elif node.target in _one_to_one_shared_input_or_input_act_qspec: input_qspec = ( SharedQuantizationSpec(node.args[0]) # type: ignore[arg-type] if arm_quantizer_utils.is_output_annotated(node.args[0]) # type: ignore diff --git a/backends/arm/scripts/install_reference_model.sh b/backends/arm/scripts/install_reference_model.sh new file mode 100755 index 00000000000..0141b195a0d --- /dev/null +++ b/backends/arm/scripts/install_reference_model.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +# Installation script to manage the transition to TOSA 1.0 + +# TOSA reference model +tosa_reference_model_url="https://git.gitlab.arm.com/tosa/tosa-reference-model.git" +tosa_reference_model_0_80_branch="v0.80" +tosa_reference_model_0_80_rev="70ed0b40fa831387e36abdb4f7fb9670a3464f5a" +tosa_serialization_lib_0_80_rev="v0.80.1" +tosa_reference_model_1_0_rev="f9b4ceb850964be03a39e965ad7a0546dc6c57ae" + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +source ${script_dir}/utils.sh + + +function setup_tosa_reference_model() { + local work_dir="$1" + + if [[ -z "$work_dir" ]]; then + echo "Error: work_dir parameter is required." + return 1 + fi + + mkdir -p "$work_dir" + pushd "$work_dir" || exit 1 + + # Install a patched version of the TOSA reference model v0.80.1 to make it co-exist with 1.0 during the transition period + if [[ ! -d "reference_model" ]]; then + git clone --recurse-submodules --branch ${tosa_reference_model_0_80_branch} "$tosa_reference_model_url" reference_model + fi + + patches_dir=${script_dir}/../third-party/reference_model/patches/v0.80 + patch_repo reference_model ${tosa_reference_model_0_80_rev} ${patches_dir} + patch_repo reference_model/thirdparty/serialization_lib ${tosa_serialization_lib_0_80_rev} ${patches_dir} + + pushd reference_model + rm -rf build + # reference_model's flatbuffers version clashes with Vela's. + # Go with Vela's since it is newer. + # Vela's flatbuffers requirement is expected to loosen; remove this workaround then. MLETORCH-565 + CMAKE_POLICY_VERSION_MINIMUM=3.5 pip install . 
--no-dependencies flatbuffers + popd + + # Install the 1.0 branch from upstream + CMAKE_POLICY_VERSION_MINIMUM=3.5 BUILD_PYBIND=1 pip install "tosa-tools@git+${tosa_reference_model_url}@${tosa_reference_model_1_0_rev}" ml_dtypes==0.5.1 --no-dependencies flatbuffers +} + +setup_tosa_reference_model $1 diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py new file mode 100644 index 00000000000..8aabf7c2c59 --- /dev/null +++ b/backends/arm/scripts/parse_test_names.py @@ -0,0 +1,102 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from executorch.exir.dialects.edge.spec.utils import SAMPLE_INPUT + +# Add edge ops which we lower but which are not included in exir/dialects/edge/edge.yaml here. +CUSTOM_EDGE_OPS = ["linspace.default", "eye.default"] +ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS + +# Add all targets and TOSA profiles we support here. +TARGETS = {"tosa_BI", "tosa_MI", "u55_BI", "u85_BI"} + + +def get_edge_ops(): + """ + Returns a set of edge_ops with names of the form used in unittests: + 1. Names are in lowercase. + 2. Overload is ignored if it is 'default', otherwise it is appended with an underscore. + 3. Overly verbose names are shortened by removing certain prefixes/suffixes. + + Examples: + abs.default -> abs + split_copy.Tensor -> split_tensor + """ + edge_ops = set() + for edge_name in ALL_EDGE_OPS: + op, overload = edge_name.split(".") + + # Normalize names + op = op.lower() + op = op.removeprefix("_") + op = op.removesuffix("_copy") + op = op.removesuffix("_with_indices") + op = op.removesuffix("_no_training") + overload = overload.lower() + + if overload == "default": + edge_ops.add(op) + else: + edge_ops.add(f"{op}_{overload}") + + return edge_ops + + +def parse_test_name(test_name: str, edge_ops: set[str]) -> tuple[str, str, bool]: + """ + Parses a test name of the form + test_OP_TARGET__ + where OP must match a string in edge_ops and TARGET must match one string in TARGETS. + The "not_delegated" suffix indicates that the test verifies that the op is not delegated. + + Examples of valid names: "test_mm_u55_BI_not_delegated" or "test_add_scalar_tosa_MI_two_inputs". + + Returns a tuple (OP, TARGET, IS_DELEGATED) if valid. 
+ """ + test_name = test_name.removeprefix("test_") + is_delegated = "not_delegated" not in test_name + assert ( + "reject" not in test_name + ), f"Use 'not_delegated' instead of 'reject' in {test_name}" + + op = "None" + target = "None" + for potential_target in TARGETS: + index = test_name.find(potential_target) + if index != -1: + op = test_name[: index - 1] + target = potential_target + break + # Special case for convolution + op = op.removesuffix("_1d") + op = op.removesuffix("_2d") + + assert target != "None", f"{test_name} does not contain one of {TARGETS}" + assert ( + op in edge_ops + ), f"Parsed invalid OP from {test_name}, {op} does not exist in edge.yaml or CUSTOM_EDGE_OPS" + + return op, target, is_delegated + + +if __name__ == "__main__": + """Parses a list of test names given on the command line.""" + import sys + + sys.tracebacklimit = 0 # Do not print stack trace + + edge_ops = get_edge_ops() + exit_code = 0 + + for test_name in sys.argv[1:]: + try: + assert test_name[:5] == "test_", f"Unexpected input: {test_name}" + parse_test_name(test_name, edge_ops) + except AssertionError as e: + print(e) + exit_code = 1 + else: + print(f"{test_name} OK") + + sys.exit(exit_code) diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push index ac0584c6f73..4eeb0f50d71 100755 --- a/backends/arm/scripts/pre-push +++ b/backends/arm/scripts/pre-push @@ -166,6 +166,44 @@ for COMMIT in ${COMMITS}; do fi fi + # Op test checks + op_test_files=$(echo $commit_files | grep -oE 'backends/arm/test/ops/\S+') + if [ "$op_test_files" ]; then + + # TODO: These checks can be removed when all unittests are refactored. + if grep -icq "SkipIfNoCorstone" $op_test_files; then + echo -e "${ERROR} @SkipIfNoCorstone300/320 is deprecated;"\ + "please use XfailIfNoCorstone300/320 instead." >&2 + FAILED=1 + fi + + if grep -icq "conftest.expectedFailureOnFVP" $op_test_files; then + echo -e "${ERROR} @conftest.expectedFailureOnFVP is deprecated;"\ + "please use XfailIfCorstone300/320 instead." >&2 + FAILED=1 + fi + + if grep -icq "unittest.TestCase" $op_test_files; then + echo -e "${ERROR} Use of the Unittest test framework is deprecated;"\ + "please use Pytest instead." >&2 + FAILED=1 + fi + + if grep -icq "on_fvp(" $op_test_files; then + echo -e "${ERROR} All unittests should run on FVP if relevant;"\ + "the on_fvp suffix can be omitted." >&2 + FAILED=1 + fi + + # Check that the tested op and target are parsed correctly from the test name + test_names=$(grep -h "def test_" $op_test_files | cut -d"(" -f1 | cut -d" " -f2) + python ./backends/arm/scripts/parse_test_names.py $test_names + if [ $? -ne 0 ]; then + echo -e "${ERROR} Failed op test name check." 
>&2 + FAILED=1 + fi + fi + echo "" # Newline to visually separate commit processing done diff --git a/backends/arm/scripts/utils.sh b/backends/arm/scripts/utils.sh index e3ed04ffa22..8b4c8d4f96f 100644 --- a/backends/arm/scripts/utils.sh +++ b/backends/arm/scripts/utils.sh @@ -46,7 +46,7 @@ function patch_repo() { local patch_dir="${3}/$name" echo -e "[${FUNCNAME[0]}] Patching ${name} repo_dir:${repo_dir} base_rev:${base_rev} patch_dir:${patch_dir}" - cd $repo_dir + pushd $repo_dir git fetch git reset --hard ${base_rev} @@ -54,4 +54,5 @@ function patch_repo() { git am -3 ${patch_dir}/*.patch echo -e "[${FUNCNAME[0]}] Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${repo_dir} dir.\n" + popd } diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py index e5d7783fea3..12220acbae9 100644 --- a/backends/arm/test/conftest.py +++ b/backends/arm/test/conftest.py @@ -15,7 +15,7 @@ import pytest try: - import tosa_reference_model + import tosa_tools.v0_80.tosa_reference_model as tosa_reference_model except ImportError: logging.warning("tosa_reference_model not found, can't run reference model tests") tosa_reference_model = None diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 945c940a20b..60bf89b6e17 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import os import shutil import tempfile @@ -15,9 +14,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - class Linear(torch.nn.Module): def __init__( @@ -205,7 +201,6 @@ def test_collate_tosa_BI_tests(self): def test_dump_tosa_ops(caplog): - caplog.set_level(logging.INFO) model = Linear(20, 30) ( ArmTester( @@ -222,7 +217,6 @@ def test_dump_tosa_ops(caplog): def test_fail_dump_tosa_ops(caplog): - caplog.set_level(logging.INFO) class Add(torch.nn.Module): def forward(self, x): diff --git a/backends/arm/test/misc/test_non_persistent_buffers.py b/backends/arm/test/misc/test_non_persistent_buffers.py new file mode 100644 index 00000000000..1b9456ae470 --- /dev/null +++ b/backends/arm/test/misc/test_non_persistent_buffers.py @@ -0,0 +1,49 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +from executorch.backends.arm.test.common import parametrize +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + + +class NonPersistentBuffer(nn.Module): + """ + Minimal module that registers a non-persistent input buffer. + """ + + def __init__(self): + super().__init__() + self.register_buffer("test_buff", torch.rand(2, 2, 2, 2), persistent=False) + + def forward(self, x): + return x - self.test_buff + + +test_input = {"input": (torch.ones(2, 2, 2, 2),)} + +input_t = tuple[torch.Tensor] + + +@parametrize("test_data", test_input) +def test_non_persistent_buffer_MI(test_data: input_t): + """ + Test validates Arm backend handling of non-persistent buffers + and ensures that there are no asserts or errors when they are used. 
+ """ + TosaPipelineMI[input_t](NonPersistentBuffer(), test_data, "").run() + + +@parametrize("test_data", test_input) +def test_non_persistent_buffer_BI(test_data: input_t): + """ + Test validates Arm backend handling of non-persistent buffers + and ensures that there are no asserts or errors when they are used. + """ + TosaPipelineBI[input_t](NonPersistentBuffer(), test_data, "").run() diff --git a/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py index f69d9d34462..49efbbb4a9c 100644 --- a/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py +++ b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py @@ -117,7 +117,12 @@ def test_softplus_tosa_BI(test_data: input_t1): # Since GELU will not be quantized by TosaQuantizer, the Dropout's input will not be quantized either. # If so, the Dropout should not be partitioned by TosaPartitioner for TOSA BI profile. This test tests that the # partitioner indeed does not partition the Dropout (clone) for TOSA BI. -@common.parametrize("test_data", test_data) +@common.parametrize( + "test_data", + test_data, + {"3d_rand": "MLETORCH-909: Partition test to not rely on unsupported ops"}, + strict=False, +) def test_linear_residaul_tosa_MI(test_data: input_t1): pipeline = TosaPipelineMI[input_t1]( LinearResidualModule(), diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py index b293555e66b..fb0d5eb75d3 100644 --- a/backends/arm/test/models/test_conformer.py +++ b/backends/arm/test/models/test_conformer.py @@ -3,7 +3,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest import torch @@ -14,10 +13,6 @@ from torchaudio.models import Conformer -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - def get_test_inputs(dim, lengths, num_examples): return (torch.rand(num_examples, int(lengths.max()), dim), lengths) diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py index 89196674c48..644ad69222c 100644 --- a/backends/arm/test/models/test_llama.py +++ b/backends/arm/test/models/test_llama.py @@ -28,7 +28,6 @@ sys.path.append(project_dir) logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) class TestLlama(unittest.TestCase): @@ -79,30 +78,12 @@ def prepare_model(self): llama_model, llama_inputs, llama_meta = get_llama_model(args) - # TODO: Remove workaround since attention mask should not be persistent, - # it only works if input shape is always the same - freqs_c = "freqs_cos" - freqs_s = "freqs_sin" - for i in range(llama_model.n_layers): - val = llama_model.layers[i].attention.get_buffer("mask") - llama_model.layers[i].attention.register_buffer( - "mask", val, persistent=True - ) - val = llama_model.layers[i].attention.rope.get_buffer(freqs_c) - llama_model.layers[i].attention.rope.register_buffer( - freqs_c, val, persistent=True - ) - val = llama_model.layers[i].attention.rope.get_buffer(freqs_s) - llama_model.layers[i].attention.rope.register_buffer( - freqs_s, val, persistent=True - ) - return llama_model, llama_inputs, llama_meta def test_llama_tosa_MI(self): llama_model, llama_inputs, llama_meta = self.prepare_model() - if llama_model is None and llama_inputs is None and llama_meta is None: + if llama_model is None or llama_inputs is None: pytest.skip("Missing model and/or input files") with torch.no_grad(): @@ 
-123,3 +104,29 @@ def test_llama_tosa_MI(self): rtol=1.1, # TODO: MLETORCH-825 decrease tolerance ) ) + + @pytest.mark.xfail(reason="KeyError: scalar_tensor_1 (MLETORCH-907)") + def test_llama_tosa_BI(self): + llama_model, llama_inputs, llama_meta = self.prepare_model() + + if llama_model is None or llama_inputs is None: + pytest.skip("Missing model and/or input files") + + with torch.no_grad(): + ( + ArmTester( + llama_model, + example_inputs=llama_inputs, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), + constant_methods=llama_meta, + ) + .quantize() + .export() + .to_edge_transform_and_lower() + .to_executorch() + .run_method_and_compare_outputs( + inputs=llama_inputs, + atol=4.3, + rtol=1.1, # TODO: Tolerance needs to be updated after MLETORCH-907 + ) + ) diff --git a/backends/arm/test/models/test_mobilenet_v3_arm.py b/backends/arm/test/models/test_mobilenet_v3_arm.py index 7e0afe4a54f..f6f8f8f3e0c 100644 --- a/backends/arm/test/models/test_mobilenet_v3_arm.py +++ b/backends/arm/test/models/test_mobilenet_v3_arm.py @@ -46,7 +46,7 @@ def test_mv3_tosa_BI(): aten_op=[], exir_op=[], use_to_edge_transform_and_lower=True, - atol=0.3, + atol=0.5, qtol=1, ) pipeline.run() @@ -63,7 +63,7 @@ def test_mv3_u55_BI(): exir_ops=[], run_on_fvp=True, use_to_edge_transform_and_lower=True, - atol=0.3, + atol=0.5, qtol=1, ) pipeline.run() @@ -80,7 +80,7 @@ def test_mv3_u85_BI(): exir_ops=[], run_on_fvp=True, use_to_edge_transform_and_lower=True, - atol=0.3, + atol=0.5, qtol=1, ) pipeline.run() diff --git a/backends/arm/test/models/test_torch_functions.py b/backends/arm/test/models/test_torch_functions.py index 19e2395adfe..5cd4bd3aaed 100644 --- a/backends/arm/test/models/test_torch_functions.py +++ b/backends/arm/test/models/test_torch_functions.py @@ -101,6 +101,7 @@ def forward(self, *args): "Requires dynamic output shape.", "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", + "norm": "An error occurred when running the 'KeepDimsFalseToSqueezePass' pass after the following passes:", }, ) def test_torch_fns_MI(test_data): @@ -129,6 +130,7 @@ def test_torch_fns_MI(test_data): "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", "t": "MLETORCH-855: Issue with Quantization folding.", + "norm": "An error occurred when running the 'KeepDimsFalseToSqueezePass' pass after the following passes:", }, strict=False, ) diff --git a/backends/arm/test/models/test_w2l_arm.py b/backends/arm/test/models/test_w2l_arm.py index fb491ca2250..8cd2ff22b75 100644 --- a/backends/arm/test/models/test_w2l_arm.py +++ b/backends/arm/test/models/test_w2l_arm.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -19,10 +18,6 @@ from torchaudio import models -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - def get_test_inputs(batch_size, num_features, input_frames): return (torch.randn(batch_size, num_features, input_frames),) diff --git a/backends/arm/test/ops/test_alias_copy.py b/backends/arm/test/ops/test_alias_copy.py new file mode 100644 index 00000000000..66fa92bc445 --- /dev/null +++ b/backends/arm/test/ops/test_alias_copy.py @@ -0,0 +1,83 @@ +# Copyright 2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] + + +class AliasCopy(torch.nn.Module): + """ + Tests proper handling of alias_copy when used directly. + + alias_copy can also appear from PyTorch/ExecuTorch optimizations + such as `x.transpose(0, 0)`. This is optimized to an alias_copy but + not before dq/q operators are added. + """ + + aten_op = "torch.ops.aten.alias_copy.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_alias_copy_default" + + test_data: dict[input_t1] = { + "1d_ramp": (torch.arange(-16, 16, 0.2),), + "2d_ones": (torch.ones(5, 5),), + "3d_rand": (torch.rand(3, 5, 5),), + "4d_zeros": (torch.zeros(1, 10, 10, 10),), + } + + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor): + return torch.alias_copy(x) + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_tosa_MI(test_data: input_t1): + TosaPipelineMI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_tosa_BI(test_data: input_t1): + TosaPipelineBI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_u55_BI(test_data: input_t1): + EthosU55PipelineBI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_u85_BI(test_data: input_t1): + EthosU85PipelineBI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py index 360429d3d6c..980ab28df64 100644 --- a/backends/arm/test/ops/test_batch_norm.py +++ b/backends/arm/test/ops/test_batch_norm.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -15,8 +14,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, [num_features, affine, track_running_stats, weight, bias, running_mean, running_var,] ) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index f6e13a2222e..0fb3c2675e9 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -18,8 +17,6 @@ from parameterized import parameterized from torch.nn.parameter import Parameter -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) """ This file contain unit tests where conv are combined with other ops. 
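As context for the AliasCopy tests above: their docstring notes that alias_copy can appear indirectly, for example from `x.transpose(0, 0)`, and not only from explicit torch.alias_copy calls. A minimal sketch for observing this outside the test suite is given below; it is not part of the patch, the module name TransposeNoOp is illustrative, and whether the exported graph actually contains aten.alias_copy (as opposed to aten.alias, or nothing at all after optimization) can vary with the PyTorch version.

import torch


class TransposeNoOp(torch.nn.Module):
    # transpose(0, 0) swaps a dimension with itself, i.e. a no-op view;
    # per the AliasCopy docstring this kind of pattern can surface as alias_copy.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x.transpose(0, 0)


# Export and print the graph to see how the aliasing is represented.
# Depending on the torch version this may show aten.alias_copy, aten.alias,
# or the op may be folded away entirely.
ep = torch.export.export(TransposeNoOp(), (torch.rand(2, 3),))
print(ep.graph)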
diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 062dbfacaef..d200a753ce5 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Optional, Tuple, Union @@ -17,8 +16,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, input, other, rounding_mode) See torch.div() for info diff --git a/backends/arm/test/ops/test_eq.py b/backends/arm/test/ops/test_eq.py index 7cbba632696..e3bcf877ffe 100644 --- a/backends/arm/test/ops/test_eq.py +++ b/backends/arm/test/ops/test_eq.py @@ -96,8 +96,16 @@ def test_eq_scalar_tosa_MI(test_module): pipeline.run() -@common.parametrize("test_module", test_data_tensor | test_data_scalar) -def test_eq_tosa_BI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_eq_tensor_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), Equal.aten_op_Tensor, Equal.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_eq_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( test_module, test_module.get_inputs(), Equal.aten_op_Tensor, Equal.exir_op ) @@ -133,15 +141,34 @@ def test_eq_scalar_u55_BI(test_module): @common.parametrize( "test_module", - test_data_tensor | test_data_scalar, + test_data_tensor, xfails={ "eq_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, + strict=False, +) +@common.XfailIfNoCorstone320 +def test_eq_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + Equal.aten_op_Tensor, + Equal.exir_op, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_scalar, + xfails={ "eq_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", }, strict=False, ) @common.XfailIfNoCorstone320 -def test_eq_u85_BI(test_module): +def test_eq_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index a6193f6ea08..7bcd2c923a4 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -5,7 +5,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -16,13 +15,14 @@ TosaPipelineMI, ) -aten_op = "torch.ops.aten.ge.Tensor" -exir_op = "executorch_exir_dialects_edge__ops_aten_ge_Tensor" - input_t = Tuple[torch.Tensor] class GreaterEqual(torch.nn.Module): + aten_op_tensor = "torch.ops.aten.ge.Tensor" + aten_op_scalar = "torch.ops.aten.ge.Scalar" + exir_op = "executorch_exir_dialects_edge__ops_aten_ge_Tensor" + def __init__(self, input, other): super().__init__() self.input_ = input @@ -31,7 +31,7 @@ def __init__(self, input, other): def forward( self, input_: torch.Tensor, - other_: torch.Tensor, + other_: torch.Tensor | int | float, ): return input_ >= other_ @@ -39,98 +39,143 @@ def get_inputs(self): return (self.input_, self.other_) -op_ge_rank1_ones = GreaterEqual( +op_ge_tensor_rank1_ones = GreaterEqual( torch.ones(5), torch.ones(5), ) -op_ge_rank2_rand = GreaterEqual( 
+op_ge_tensor_rank2_rand = GreaterEqual( torch.rand(4, 5), torch.rand(1, 5), ) -op_ge_rank3_randn = GreaterEqual( +op_ge_tensor_rank3_randn = GreaterEqual( torch.randn(10, 5, 2), torch.randn(10, 5, 2), ) -op_ge_rank4_randn = GreaterEqual( +op_ge_tensor_rank4_randn = GreaterEqual( torch.randn(3, 2, 2, 2), torch.randn(3, 2, 2, 2), ) -test_data_common = { - "ge_rank1_ones": op_ge_rank1_ones, - "ge_rank2_rand": op_ge_rank2_rand, - "ge_rank3_randn": op_ge_rank3_randn, - "ge_rank4_randn": op_ge_rank4_randn, +op_ge_scalar_rank1_ones = GreaterEqual(torch.ones(5), 1.0) +op_ge_scalar_rank2_rand = GreaterEqual(torch.rand(4, 5), 0.2) +op_ge_scalar_rank3_randn = GreaterEqual(torch.randn(10, 5, 2), -0.1) +op_ge_scalar_rank4_randn = GreaterEqual(torch.randn(3, 2, 2, 2), 0.3) + +test_data_tensor = { + "ge_tensor_rank1_ones": op_ge_tensor_rank1_ones, + "ge_tensor_rank2_rand": op_ge_tensor_rank2_rand, + "ge_tensor_rank3_randn": op_ge_tensor_rank3_randn, + "ge_tensor_rank4_randn": op_ge_tensor_rank4_randn, +} + +test_data_scalar = { + "ge_scalar_rank1_ones": op_ge_scalar_rank1_ones, + "ge_scalar_rank2_rand": op_ge_scalar_rank2_rand, + "ge_scalar_rank3_randn": op_ge_scalar_rank3_randn, + "ge_scalar_rank4_randn": op_ge_scalar_rank4_randn, } -@common.parametrize("test_module", test_data_common) -def test_ge_tosa_MI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_ge_tensor_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_ge_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_scalar, + GreaterEqual.exir_op, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_ge_tosa_BI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_ge_tensor_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_ge_u55_BI(test_module): - # GREATER_EQUAL is not supported on U55. - pipeline = OpNotSupportedPipeline[input_t]( +@common.parametrize("test_module", test_data_scalar) +def test_ge_scalar_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", - {exir_op: 1}, + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_ge_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( +@common.parametrize("test_module", test_data_tensor) +@common.XfailIfNoCorstone300 +def test_ge_tensor_u55_BI(test_module): + # GREATER_EQUAL is not supported on U55. 
+ pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - use_to_edge_transform_and_lower=True, + "TOSA-0.80+BI+u55", + {GreaterEqual.exir_op: 1}, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -@pytest.mark.skip(reason="The same as test_ge_u55_BI") -def test_ge_u55_BI_on_fvp(test_module): +@common.parametrize("test_module", test_data_scalar) +@common.XfailIfNoCorstone300 +def test_ge_scalar_u55_BI(test_module): # GREATER_EQUAL is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), "TOSA-0.80+BI+u55", - {exir_op: 1}, + {GreaterEqual.exir_op: 1}, + n_expected_delegates=1, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_tensor, + xfails={"ge_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, +) +@common.XfailIfNoCorstone320 +def test_ge_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize( "test_module", - test_data_common, - xfails={"ge_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, + test_data_scalar, + xfails={"ge_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, ) -@common.SkipIfNoCorstone320 -def test_ge_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_ge_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, run_on_fvp=True, - use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_gelu.py b/backends/arm/test/ops/test_gelu.py new file mode 100644 index 00000000000..fb1253fdb0c --- /dev/null +++ b/backends/arm/test/ops/test_gelu.py @@ -0,0 +1,125 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] + + +class Gelu(torch.nn.Module): + aten_op = "torch.ops.aten.gelu.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_gelu_default" + + test_data: dict[str, Tuple[str, input_t1]] = { + "zeros_none": ( + "none", + torch.zeros(1, 10, 10, 10), + ), + "ones_none": ( + "none", + torch.ones(10, 10, 10), + ), + "rand_none": ( + "none", + (torch.rand(10, 10) - 0.5), + ), + "randn_pos_none": ( + "none", + (torch.randn(1, 4, 4, 4) + 10), + ), + "randn_neg_none": ( + "none", + (torch.randn(1, 4, 4, 4) - 10), + ), + "ramp_none": ( + "none", + torch.arange(-16, 16, 0.2), + ), + "zeros_tanh": ( + "tanh", + torch.zeros(1, 10, 10, 10), + ), + "ones_tanh": ( + "tanh", + torch.ones(10, 10, 10), + ), + "rand_tanh": ( + "tanh", + (torch.rand(10, 10) - 0.5), + ), + "randn_pos_tanh": ( + "tanh", + (torch.randn(1, 4, 4, 4) + 10), + ), + "randn_neg_tanh": ( + "tanh", + (torch.randn(1, 4, 4, 4) - 10), + ), + "ramp_tanh": ( + "tanh", + torch.arange(-16, 16, 0.2), + ), + } + + def __init__(self, approximate: str = "none"): + super().__init__() + self.gelu = torch.nn.GELU(approximate) + + def forward(self, x: torch.Tensor): + return self.gelu(x) + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_tosa_MI(test_data: input_t1): + approximate = test_data[0] + TosaPipelineMI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + use_to_edge_transform_and_lower=False, + ).run() + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_tosa_BI(test_data: input_t1): + approximate = test_data[0] + TosaPipelineBI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + ).run() + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_u55_BI(test_data: input_t1): + approximate = test_data[0] + EthosU55PipelineBI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + ).run() + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_u85_BI(test_data: input_t1): + approximate = test_data[0] + EthosU85PipelineBI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + ).run() diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 2095f781bdb..15515958c85 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -5,7 +5,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -16,13 +15,15 @@ TosaPipelineMI, ) -aten_op = "torch.ops.aten.gt.Tensor" -exir_op = "executorch_exir_dialects_edge__ops_aten_gt_Tensor" input_t = Tuple[torch.Tensor] class Greater(torch.nn.Module): + aten_op_tensor = "torch.ops.aten.gt.Tensor" + aten_op_scalar = "torch.ops.aten.gt.Scalar" + exir_op = "executorch_exir_dialects_edge__ops_aten_gt_Tensor" + def __init__(self, input, other): super().__init__() self.input_ = input @@ -31,7 +32,7 @@ def __init__(self, input, other): def forward( self, input_: torch.Tensor, - other_: torch.Tensor, + other_: torch.Tensor | int | float, ): return input_ > other_ @@ -39,98 +40,135 @@ def get_inputs(self): return (self.input_, self.other_) -op_gt_rank1_ones = Greater( +op_gt_tensor_rank1_ones = Greater( torch.ones(5), torch.ones(5), ) -op_gt_rank2_rand = Greater( 
+op_gt_tensor_rank2_rand = Greater( torch.rand(4, 5), torch.rand(1, 5), ) -op_gt_rank3_randn = Greater( +op_gt_tensor_rank3_randn = Greater( torch.randn(10, 5, 2), torch.randn(10, 5, 2), ) -op_gt_rank4_randn = Greater( +op_gt_tensor_rank4_randn = Greater( torch.randn(3, 2, 2, 2), torch.randn(3, 2, 2, 2), ) -test_data_common = { - "gt_rank1_ones": op_gt_rank1_ones, - "gt_rank2_rand": op_gt_rank2_rand, - "gt_rank3_randn": op_gt_rank3_randn, - "gt_rank4_randn": op_gt_rank4_randn, +op_gt_scalar_rank1_ones = Greater(torch.ones(5), 1.0) +op_gt_scalar_rank2_rand = Greater(torch.rand(4, 5), 0.2) +op_gt_scalar_rank3_randn = Greater(torch.randn(10, 5, 2), -0.1) +op_gt_scalar_rank4_randn = Greater(torch.randn(3, 2, 2, 2), 0.3) + +test_data_tensor = { + "gt_tensor_rank1_ones": op_gt_tensor_rank1_ones, + "gt_tensor_rank2_rand": op_gt_tensor_rank2_rand, + "gt_tensor_rank3_randn": op_gt_tensor_rank3_randn, + "gt_tensor_rank4_randn": op_gt_tensor_rank4_randn, +} + +test_data_scalar = { + "gt_scalar_rank1_ones": op_gt_scalar_rank1_ones, + "gt_scalar_rank2_rand": op_gt_scalar_rank2_rand, + "gt_scalar_rank3_randn": op_gt_scalar_rank3_randn, + "gt_scalar_rank4_randn": op_gt_scalar_rank4_randn, } -@common.parametrize("test_module", test_data_common) -def test_gt_tosa_MI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_gt_tensor_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_gt_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), Greater.aten_op_scalar, Greater.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_tensor) +def test_gt_tensor_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_gt_tosa_BI(test_module): +@common.parametrize("test_module", test_data_scalar) +def test_gt_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_gt_u55_BI(test_module): - # GREATER is not supported on U55. +@common.parametrize("test_module", test_data_tensor) +@common.XfailIfNoCorstone300 +def test_gt_tensor_u55_BI(test_module): + # Greater is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), "TOSA-0.80+BI+u55", - {exir_op: 1}, + {Greater.exir_op: 1}, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_gt_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( +@common.parametrize("test_module", test_data_scalar) +@common.XfailIfNoCorstone300 +def test_gt_scalar_u55_BI(test_module): + # Greater is not supported on U55. 
+ pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - use_to_edge_transform_and_lower=True, + "TOSA-0.80+BI+u55", + {Greater.exir_op: 1}, + n_expected_delegates=1, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -@pytest.mark.skip(reason="The same as test_gt_u55_BI") -def test_gt_u55_BI_on_fvp(test_module): - # GREATER is not supported on U55. - pipeline = OpNotSupportedPipeline[input_t]( +@common.parametrize( + "test_module", + test_data_tensor, + xfails={ + "gt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, +) +@common.XfailIfNoCorstone320 +def test_gt_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", - {exir_op: 1}, + Greater.aten_op_tensor, + Greater.exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize( "test_module", - test_data_common, - xfails={"gt_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, + test_data_scalar, + xfails={ + "gt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, ) -@common.SkipIfNoCorstone320 -def test_gt_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_gt_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, + Greater.aten_op_tensor, + Greater.exir_op, run_on_fvp=True, - use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 33bf9932b5a..9a289909bae 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple @@ -19,9 +18,6 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - test_data_suite_rank1 = [ # (test_name, test_data, out_features, has_bias) diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index cae119cd7a8..f5664b7895d 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -5,7 +5,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -16,13 +15,15 @@ TosaPipelineMI, ) -aten_op = "torch.ops.aten.lt.Tensor" -exir_op = "executorch_exir_dialects_edge__ops_aten_lt_Tensor" input_t = Tuple[torch.Tensor] class LessThan(torch.nn.Module): + aten_op_tensor = "torch.ops.aten.lt.Tensor" + aten_op_scalar = "torch.ops.aten.lt.Scalar" + exir_op = "executorch_exir_dialects_edge__ops_aten_lt_Tensor" + def __init__(self, input, other): super().__init__() self.input_ = input @@ -31,7 +32,7 @@ def __init__(self, input, other): def forward( self, input_: torch.Tensor, - other_: torch.Tensor, + other_: torch.Tensor | int | float, ): return input_ < other_ @@ -39,98 +40,135 @@ def get_inputs(self): return (self.input_, self.other_) -op_lt_rank1_ones = LessThan( +op_lt_tensor_rank1_ones = LessThan( torch.ones(5), torch.ones(5), ) -op_lt_rank2_rand = LessThan( +op_lt_tensor_rank2_rand = LessThan( torch.rand(4, 5), torch.rand(1, 5), ) -op_lt_rank3_randn = LessThan( +op_lt_tensor_rank3_randn = LessThan( torch.randn(10, 5, 2), torch.randn(10, 5, 2), ) -op_lt_rank4_randn = LessThan( +op_lt_tensor_rank4_randn = LessThan( torch.randn(3, 2, 2, 2), torch.randn(3, 2, 2, 2), ) -test_data_common = { - "lt_rank1_ones": op_lt_rank1_ones, - "lt_rank2_rand": op_lt_rank2_rand, - "lt_rank3_randn": op_lt_rank3_randn, - "lt_rank4_randn": op_lt_rank4_randn, +op_lt_scalar_rank1_ones = LessThan(torch.ones(5), 1.0) +op_lt_scalar_rank2_rand = LessThan(torch.rand(4, 5), 0.2) +op_lt_scalar_rank3_randn = LessThan(torch.randn(10, 5, 2), -0.1) +op_lt_scalar_rank4_randn = LessThan(torch.randn(3, 2, 2, 2), 0.3) + +test_data_tensor = { + "lt_tensor_rank1_ones": op_lt_tensor_rank1_ones, + "lt_tensor_rank2_rand": op_lt_tensor_rank2_rand, + "lt_tensor_rank3_randn": op_lt_tensor_rank3_randn, + "lt_tensor_rank4_randn": op_lt_tensor_rank4_randn, +} + +test_data_scalar = { + "lt_scalar_rank1_ones": op_lt_scalar_rank1_ones, + "lt_scalar_rank2_rand": op_lt_scalar_rank2_rand, + "lt_scalar_rank3_randn": op_lt_scalar_rank3_randn, + "lt_scalar_rank4_randn": op_lt_scalar_rank4_randn, } -@common.parametrize("test_module", test_data_common) -def test_lt_tosa_MI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_lt_tensor_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_lt_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), LessThan.aten_op_scalar, LessThan.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_tensor) +def test_lt_tensor_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op ) pipeline.run() 
-@common.parametrize("test_module", test_data_common) -def test_lt_tosa_BI(test_module): +@common.parametrize("test_module", test_data_scalar) +def test_lt_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_lt_u55_BI(test_module): - # GREATER is not supported on U55. LT uses the GREATER Tosa operator. +@common.parametrize("test_module", test_data_tensor) +@common.XfailIfNoCorstone300 +def test_lt_tensor_u55_BI(test_module): + # LessThan is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), "TOSA-0.80+BI+u55", - {exir_op: 1}, + {LessThan.exir_op: 1}, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_lt_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( +@common.parametrize("test_module", test_data_scalar) +@common.XfailIfNoCorstone300 +def test_lt_scalar_u55_BI(test_module): + # LessThan is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - use_to_edge_transform_and_lower=True, + "TOSA-0.80+BI+u55", + {LessThan.exir_op: 1}, + n_expected_delegates=1, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -@pytest.mark.skip(reason="The same as test_lt_u55_BI") -def test_lt_u55_BI_on_fvp(test_module): - # GREATER is not supported on U55. LT uses the GREATER Tosa operator. - pipeline = OpNotSupportedPipeline[input_t]( +@common.parametrize( + "test_module", + test_data_tensor, + xfails={ + "lt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, +) +@common.XfailIfNoCorstone320 +def test_lt_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", - {exir_op: 1}, + LessThan.aten_op_tensor, + LessThan.exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize( "test_module", - test_data_common, - xfails={"lt_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, + test_data_scalar, + xfails={ + "lt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, ) -@common.SkipIfNoCorstone320 -def test_lt_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_lt_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, + LessThan.aten_op_tensor, + LessThan.exir_op, run_on_fvp=True, - use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index a31c12be3a0..2f3426f2dda 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple @@ -26,8 +25,6 @@ from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, [kernel_size, stride, padding]) diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index 6b906067f7b..a4503280db9 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -6,6 +6,7 @@ from typing import Callable +import pytest import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -53,6 +54,7 @@ def test_mm_tosa_u55(test_data_generator: Callable[[], tuple]): @parameterized.expand(MM.test_data_generators) +@pytest.mark.flaky # Investigate flakiness (MLETORCH-870) def test_mm_tosa_u85(test_data_generator: Callable[[], tuple]): test_data = test_data_generator() EthosU85PipelineBI[test_t](MM(), test_data, MM.aten_op, MM.exir_op).run() @@ -67,6 +69,7 @@ def test_mm_tosa_u55_on_fvp(test_data_generator: Callable[[], tuple]): @parameterized.expand(MM.test_data_generators) @common.SkipIfNoCorstone320 +@pytest.mark.flaky # Investigate flakiness (MLETORCH-870) def test_mm_tosa_u85_on_fvp(test_data_generator: Callable[[], tuple]): test_data = test_data_generator() EthosU85PipelineBI[test_t]( diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index a5c6c86c52b..43b4abd2039 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -1,24 +1,22 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple +import pytest + import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - test_data_suite = [ # (test_name, test_data) @@ -67,7 +65,7 @@ def forward(self, x, y): def _test_sigmoid_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -81,11 +79,13 @@ def _test_sigmoid_tosa_MI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -100,9 +100,11 @@ def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tup .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_sigmoid_tosa_ethos_BI_pipeline( self, compile_spec: list[CompileSpec], @@ -141,6 +143,7 @@ def _test_sigmoid_tosa_u85_BI_pipeline( ) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_sigmoid_tosa_MI( self, test_name: str, @@ -149,26 +152,33 @@ def test_sigmoid_tosa_MI( self._test_sigmoid_tosa_MI_pipeline(self.Sigmoid(), (test_data,)) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_sigmoid_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_sigmoid_tosa_BI_pipeline(self.Sigmoid(), (test_data,)) + @pytest.mark.tosa_ref_model def test_add_sigmoid_tosa_MI(self): self._test_sigmoid_tosa_MI_pipeline(self.AddSigmoid(), (test_data_suite[0][1],)) + @pytest.mark.tosa_ref_model def test_add_sigmoid_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline(self.AddSigmoid(), (test_data_suite[5][1],)) + @pytest.mark.tosa_ref_model def test_sigmoid_add_tosa_MI(self): self._test_sigmoid_tosa_MI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],)) + @pytest.mark.tosa_ref_model def test_sigmoid_add_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],)) + @pytest.mark.tosa_ref_model def test_sigmoid_add_sigmoid_tosa_MI(self): self._test_sigmoid_tosa_MI_pipeline( self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) ) + @pytest.mark.tosa_ref_model def test_sigmoid_add_sigmoid_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline( self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py index c3907887ac9..240000e6973 100644 --- a/backends/arm/test/ops/test_sigmoid_16bit.py +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -81,7 +81,7 @@ def forward(self, x): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) 
+@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op @@ -97,7 +97,7 @@ def test_sigmoid_tosa_BI(test_data): "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787" }, ) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( SigmoidAddSigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op @@ -110,6 +110,7 @@ def test_sigmoid_add_sigmoid_tosa_BI(test_data): "test_data", test_data_suite, ) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), (test_data(),), "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 1} @@ -122,6 +123,7 @@ def test_sigmoid_tosa_u55(test_data): "test_data", test_data_suite, ) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -135,7 +137,7 @@ def test_sigmoid_add_sigmoid_tosa_u55(test_data): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_tosa_u85(test_data): pipeline = EthosU85PipelineBI( @@ -152,7 +154,7 @@ def test_sigmoid_tosa_u85(test_data): "ramp": "AssertionError: Output 0 does not match reference output.", }, ) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_add_sigmoid_tosa_u85(test_data): pipeline = EthosU85PipelineBI( diff --git a/backends/arm/test/ops/test_sigmoid_32bit.py b/backends/arm/test/ops/test_sigmoid_32bit.py index 5388eae83c3..14808eedaf9 100644 --- a/backends/arm/test/ops/test_sigmoid_32bit.py +++ b/backends/arm/test/ops/test_sigmoid_32bit.py @@ -97,7 +97,7 @@ def forward(self, x): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( Sigmoid(), @@ -110,7 +110,7 @@ def test_sigmoid_tosa_BI(test_data): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( SigmoidAddSigmoid(), @@ -123,6 +123,7 @@ def test_sigmoid_add_sigmoid_tosa_BI(test_data): @common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), (test_data(),), "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 1} @@ -132,6 +133,7 @@ def test_sigmoid_tosa_u55(test_data): @common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -145,7 +147,7 @@ def test_sigmoid_add_sigmoid_tosa_u55(test_data): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_tosa_u85(test_data): pipeline = 
EthosU85PipelineBI( @@ -162,7 +164,7 @@ def test_sigmoid_tosa_u85(test_data): "ramp": "AssertionError: Output 0 does not match reference output.", }, ) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_add_sigmoid_tosa_u85(test_data): pipeline = EthosU85PipelineBI( diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py new file mode 100644 index 00000000000..51748b02450 --- /dev/null +++ b/backends/arm/test/ops/test_silu.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Optional, Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +input_t = Tuple[torch.Tensor] + + +class Silu(torch.nn.Module): + def forward( + self, + _input: torch.Tensor, + _inplace: Optional[bool] = False, + ): + return torch.nn.SiLU(inplace=_inplace)(_input) + + test_data: list[input_t] = { + "op_silu_rank1_ones": (torch.ones(5),), + "op_silu_rank1_negative_ones": (torch.ones(5) * (-1),), + "op_silu_rank1_rand": (torch.rand(5) * 5,), + "op_silu_rank4_ones": (torch.ones(1, 10, 25, 20),), + "op_silu_rank4_negative_ones": ((-1) * torch.ones(1, 10, 25, 20),), + "op_silu_rank4_large_rand": (200 * torch.rand(1, 10, 25, 20),), + "op_silu_rank4_negative_large_rand": ((-200) * torch.rand(1, 10, 25, 20),), + "op_silu_rank4_large_randn": (200 * torch.randn(1, 10, 25, 20) + 1,), + } + + aten_op_MI = "torch.ops.aten.silu.default" + aten_op_inplace_MI = "torch.ops.aten.silu_.default" + aten_op_BI = ["torch.ops.aten.sigmoid.default", "torch.ops.aten.mul.Tensor"] + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_MI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = TosaPipelineMI[input_t](Silu(), silu_data, Silu.aten_op_MI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_MI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = TosaPipelineMI[input_t](Silu(), silu_data, Silu.aten_op_inplace_MI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_BI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = TosaPipelineBI[input_t](Silu(), silu_data, Silu.aten_op_BI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_BI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = TosaPipelineBI[input_t](Silu(), silu_data, Silu.aten_op_BI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +@common.XfailIfNoCorstone300 +def test_silu_u55_BI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = EthosU55PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +@common.XfailIfNoCorstone300 +def test_silu_u55_BI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = EthosU55PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) 
+@common.XfailIfNoCorstone320 +def test_silu_u85_BI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = EthosU85PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +@common.XfailIfNoCorstone320 +def test_silu_u85_BI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = EthosU85PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index 7cb82e3a828..91ef51cc2a2 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -7,35 +7,35 @@ import unittest from typing import Tuple +import pytest + import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized +test_data_suite = [ + (torch.ones(10), [(3, -3)]), + (torch.ones(10), [(-8, 3)]), + (torch.ones(10, 10), [(1, 3), (3, None)]), + (torch.ones(10, 10, 10), [(0, 7), (0, None), (0, 8)]), + (torch.ones((1, 12, 10, 10)), [(None, None), (None, 5), (3, 5), (4, 10)]), +] + class TestSimpleSlice(unittest.TestCase): class Slice(torch.nn.Module): - - sizes = [(10), (10, 10), (10, 10, 10), ((1, 12, 10, 10))] - test_tensors = [(torch.ones(n),) for n in sizes] - - def forward(self, x: torch.Tensor): - if x.dim() == 1: - return x[3:-3] - elif x.dim() == 2: - return x[1:3, 3:] - elif x.dim() == 3: - return x[0:7, 0:, 0:8] - elif x.dim() == 4: - return x[:, :5, 3:5, 4:10] + def forward(self, x: torch.Tensor, s: list[tuple[int, int]]): + slices = [slice(*i) for i in s] + return x[slices] def _test_slice_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: torch.Tensor ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -48,14 +48,16 @@ def _test_slice_tosa_MI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_slice_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -68,9 +70,11 @@ def _test_slice_tosa_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) + def _test_slice_ethos_BI_pipeline( self, compile_spec: list[CompileSpec], @@ -106,22 +110,29 @@ def _test_slice_u85_BI_pipeline( common.get_u85_compile_spec(), module, test_data ) - @parameterized.expand(Slice.test_tensors) - def test_slice_tosa_MI(self, tensor): - self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor,)) + @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model + def test_slice_tosa_MI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): + self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors[:2]) - def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), 
(test_tensor,)) + @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model + def test_slice_nchw_tosa_BI( + self, tensor: torch.Tensor, slices: list[tuple[int, int]] + ): + self._test_slice_tosa_BI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors[2:]) - def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,)) + @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model + def test_slice_nhwc_tosa_BI( + self, tensor: torch.Tensor, slices: list[tuple[int, int]] + ): + self._test_slice_tosa_BI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors) - def test_slice_u55_BI(self, test_tensor: torch.Tensor): - self._test_slice_u55_BI_pipeline(self.Slice(), (test_tensor,)) + @parameterized.expand(test_data_suite) + def test_slice_u55_BI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): + self._test_slice_u55_BI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors) - def test_slice_u85_BI(self, test_tensor: torch.Tensor): - self._test_slice_u85_BI_pipeline(self.Slice(), (test_tensor,)) + @parameterized.expand(test_data_suite) + def test_slice_u85_BI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): + self._test_slice_u85_BI_pipeline(self.Slice(), (tensor, slices)) diff --git a/backends/arm/test/ops/test_sqrt.py b/backends/arm/test/ops/test_sqrt.py new file mode 100644 index 00000000000..53a1e79c0a8 --- /dev/null +++ b/backends/arm/test/ops/test_sqrt.py @@ -0,0 +1,78 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
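+# Tests torch.sqrt on the TOSA MI/BI and Ethos-U55/U85 pipelines; sqrt is expected to lower via aten.pow (pow.Tensor_Scalar after quantization).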
+ + +from typing import Dict, Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +class Sqrt(torch.nn.Module): + input_t = Tuple[torch.Tensor] + aten_op_MI = "torch.ops.aten.sqrt.default" + exir_op_MI = "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Tensor" + + aten_op_BI = "torch.ops.aten.pow.Tensor_Scalar" + exir_op_BI = "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar" + + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sqrt(x) + + test_data: Dict[str, input_t] = { + "sqrt_tensor_rank1_ones": (torch.ones(10),), + "sqrt_tensor_rank2_random": (torch.rand(5, 10),), + "sqrt_tensor_rank3_ones": (torch.ones(2, 3, 4),), + "sqrt_tensor_rank4_random": (torch.rand(1, 3, 8, 8),), + "sqrt_tensor_rank4_multibatch": (torch.rand(2, 3, 4, 4),), + } + + +fvp_xfails = { + "sqrt_tensor_rank4_multibatch": "MLETORCH-517 : Multiple batches not supported", +} + + +@common.parametrize("test_data", Sqrt.test_data) +def test_sqrt_tosa_MI(test_data: Sqrt.input_t): + pipeline = TosaPipelineMI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_MI, Sqrt.exir_op_MI + ) + pipeline.run() + + +@common.parametrize("test_data", Sqrt.test_data) +def test_sqrt_tosa_BI(test_data: Sqrt.input_t): + pipeline = TosaPipelineBI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI + ) + pipeline.run() + + +@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.XfailIfNoCorstone300 +def test_sqrt_u55_BI(test_data: Sqrt.input_t): + pipeline = EthosU55PipelineBI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.XfailIfNoCorstone320 +def test_sqrt_u85_BI(test_data: Sqrt.input_t): + pipeline = EthosU85PipelineBI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI, run_on_fvp=True + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_tanh.py b/backends/arm/test/ops/test_tanh.py index 060d7933ea5..8d13620dc4a 100644 --- a/backends/arm/test/ops/test_tanh.py +++ b/backends/arm/test/ops/test_tanh.py @@ -9,9 +9,11 @@ from typing import Tuple +import pytest + import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -40,7 +42,7 @@ def forward(self, x): def _test_tanh_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -54,11 +56,13 @@ def _test_tanh_tosa_MI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_tanh_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_tanh_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -73,9 +77,11 @@ def _test_tanh_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple) .check_not(["executorch_exir_dialects_edge__ops_aten_tanh_default"]) 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_tanh_tosa_ethos_BI_pipeline( self, compile_spec: list[CompileSpec], @@ -114,6 +120,7 @@ def _test_tanh_tosa_u85_BI_pipeline( ) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_tanh_tosa_MI( self, test_name: str, @@ -122,6 +129,7 @@ def test_tanh_tosa_MI( self._test_tanh_tosa_MI_pipeline(self.Tanh(), (test_data,)) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_tanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_tanh_tosa_BI_pipeline(self.Tanh(), (test_data,)) diff --git a/backends/arm/test/ops/test_where.py b/backends/arm/test/ops/test_where.py index bf127460f3e..dd4f3326f8e 100644 --- a/backends/arm/test/ops/test_where.py +++ b/backends/arm/test/ops/test_where.py @@ -173,14 +173,9 @@ def test_where_u55_BI(test_module): get_symmetric_quantization_config() ) - # If condition is tensor_condition then there will be one full_like op which will be - # delegated. - if test_module.condition == tensor_condition: - num_delegates = 1 - num_exir = 0 - else: - num_delegates = 0 - num_exir = 0 + # There will be one full_like op which will be delegated. + num_delegates = 1 + num_exir = 0 pipeline = OpNotSupportedPipeline[input_t]( test_module, @@ -223,14 +218,9 @@ def test_where_u55_BI_on_fvp(test_module): get_symmetric_quantization_config() ) - # If condition is tensor_condition then there will be one full_like op which will be - # delegated. - if test_module.condition == tensor_condition: - num_delegates = 1 - num_exir = 0 - else: - num_delegates = 0 - num_exir = 0 + # There will be one full_like op which will be delegated. + num_delegates = 1 + num_exir = 0 pipeline = OpNotSupportedPipeline[input_t]( test_module, @@ -249,18 +239,7 @@ def test_where_u55_BI_on_fvp(test_module): pipeline.run() -@common.parametrize( - "test_module", - test_modules_BI, - xfails={ - "two_dim_scalar_cond": "E [executorch:method.cpp:601] Missing operator: " - "[2] aten::gt.Scalar_out", - "three_dim_scalar_cond": "E [executorch:method.cpp:601] Missing operator: " - "[2] aten::gt.Scalar_out", - "float32_scalar_cond": "E [executorch:method.cpp:601] Missing operator: " - "[2] aten::gt.Scalar_out", - }, -) +@common.parametrize("test_module", test_modules_BI) @common.XfailIfNoCorstone320 def test_where_u85_BI_on_fvp(test_module): compile_spec = common.get_u85_compile_spec() diff --git a/backends/arm/test/passes/test_convert_expand_copy_to_repeat.py b/backends/arm/test/passes/test_convert_expand_copy_to_repeat.py new file mode 100644 index 00000000000..5d83bc82f22 --- /dev/null +++ b/backends/arm/test/passes/test_convert_expand_copy_to_repeat.py @@ -0,0 +1,51 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
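+# Verifies that ConvertExpandCopyToRepeatPass rewrites aten.expand_copy into aten.repeat.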
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( + ConvertExpandCopyToRepeatPass, +) + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Expand(torch.nn.Module): + """ + Basic expand model using torch.Tensor.expand function + """ + + def __init__(self): + super(Expand, self).__init__() + + def forward(self, x): + return x.expand(3, 4) + + def get_inputs(self) -> input_t: + return (torch.rand(3, 1),) + + +def test_expand_to_repeat_tosa_BI(): + module = Expand() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+BI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_expand_copy_default": 1, + }, + ops_not_before_pass=["executorch_exir_dialects_edge__ops_aten_repeat_default"], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_repeat_default": 1, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten_expand_copy_default" + ], + pass_list=[ConvertExpandCopyToRepeatPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_convert_split_to_slice.py b/backends/arm/test/passes/test_convert_split_to_slice.py new file mode 100644 index 00000000000..d4fdffe3b01 --- /dev/null +++ b/backends/arm/test/passes/test_convert_split_to_slice.py @@ -0,0 +1,67 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from executorch.backends.arm._passes.convert_split_to_slice import ( + ConvertSplitToSlicePass, +) + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Split(torch.nn.Module): + """ + Basic split model using torch.split function + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return torch.split(x, 2) + + +class SplitTensor(torch.nn.Module): + """ + Basic split model using torch.Tensor.split function + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return x.split(2) + + +modules = {"split_basic": Split(), "split_tensor": SplitTensor()} + + +@common.parametrize("module", modules) +def test_split_to_slice_tosa_BI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+BI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor" + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 5, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default" + ], + pass_list=[ConvertSplitToSlicePass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_div_pass.py b/backends/arm/test/passes/test_decompose_div_pass.py new file mode 100644 index 00000000000..71d586c0029 --- /dev/null +++ b/backends/arm/test/passes/test_decompose_div_pass.py @@ -0,0 +1,65 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
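+# Verifies that DecomposeDivPass rewrites aten.div into reciprocal and mul edge ops.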
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Div(torch.nn.Module): + """ + Basic div model using torch.div + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return torch.div(x, 2) + + +class DivTensor(torch.nn.Module): + """ + Basic div model using torch.Tensor.div + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return x.div(2) + + +modules = {"div_basic": Div(), "div_tensor": DivTensor()} + + +@common.parametrize("module", modules) +def test_decompose_div_tosa_MI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_div_Tensor": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_reciprocal_default", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_reciprocal_default": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_div_Tensor"], + pass_list=[DecomposeDivPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_layernorm_pass.py b/backends/arm/test/passes/test_decompose_layernorm_pass.py new file mode 100644 index 00000000000..40e49e15bc5 --- /dev/null +++ b/backends/arm/test/passes/test_decompose_layernorm_pass.py @@ -0,0 +1,69 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
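+# Verifies that DecomposeLayerNormPass expands native_layer_norm into primitive edge ops (add, view_copy, mul, full, rsqrt, var, sub, mean).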
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_layernorm_pass import ( + DecomposeLayerNormPass, +) + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class LayerNorm(torch.nn.Module): + """ + Basic layer_norm model using torch.nn.layer_norm layer + """ + + def __init__(self): + super(LayerNorm, self).__init__() + self.layer_norm = torch.nn.LayerNorm(10) + + def forward(self, x): + x = self.layer_norm(x) + return x + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + +def test_decompose_layernorm_tosa_MI(): + module = LayerNorm() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_native_layer_norm_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_add_Tensor", + "executorch_exir_dialects_edge__ops_aten_view_copy_default", + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_full_default", + "executorch_exir_dialects_edge__ops_aten_rsqrt_default", + "executorch_exir_dialects_edge__ops_aten_var_correction", + "executorch_exir_dialects_edge__ops_aten_sub_Tensor", + "executorch_exir_dialects_edge__ops_aten_mean_dim", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_rsqrt_default": 1, + "executorch_exir_dialects_edge__ops_aten_var_correction": 1, + "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten_expand_copy_default" + ], + pass_list=[DecomposeLayerNormPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_meandim_pass.py b/backends/arm/test/passes/test_decompose_meandim_pass.py new file mode 100644 index 00000000000..6ba9ceff3a7 --- /dev/null +++ b/backends/arm/test/passes/test_decompose_meandim_pass.py @@ -0,0 +1,73 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
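+# Verifies that DecomposeMeanDimPass rewrites aten.mean.dim (keepdim=True) into sum, full and mul edge ops.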
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor]  # Input x + + +class MeanDim(torch.nn.Module): + """ + Basic mean model using the torch.mean function with keepdim=True (keepdim=False does not currently work with this pass) + """ + + def __init__(self): + super(MeanDim, self).__init__() + + def forward(self, x): + return torch.mean(x, 1, True) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +class MeanDimTensor(torch.nn.Module): + """ + Basic mean model using the torch.Tensor.mean function with keepdim=True (keepdim=False does not currently work with this pass) + """ + + def __init__(self): + super(MeanDimTensor, self).__init__() + + def forward(self, x): + return x.mean(1, True) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +modules = {"meandim_basic": MeanDim(), "meandim_tensor": MeanDimTensor()} + + +@common.parametrize("module", modules) +def test_decompose_meandim_tosa_MI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_full_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_mean_dim"], + pass_list=[DecomposeMeanDimPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_softmax_pass.py b/backends/arm/test/passes/test_decompose_softmax_pass.py new file mode 100644 index 00000000000..efb911f03aa --- /dev/null +++ b/backends/arm/test/passes/test_decompose_softmax_pass.py @@ -0,0 +1,103 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
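+# Verifies that DecomposeSoftmaxPass expands softmax/log_softmax into exp, sum, reciprocal and mul edge ops.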
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_softmax_pass import DecomposeSoftmaxPass + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Softmax(torch.nn.Module): + """ + Basic torch.nn.softmax layer model + """ + + def __init__(self): + super(Softmax, self).__init__() + self.softmax = torch.nn.Softmax(dim=1) + + def forward(self, x): + x = self.softmax(x) + return x + + def get_inputs(self) -> input_t: + return (torch.rand(2, 3),) + + +class SoftmaxLog(torch.nn.Module): + """ + Basic torch.nn.log_softmax layer model + """ + + def __init__(self): + super(SoftmaxLog, self).__init__() + self.softmax = torch.nn.LogSoftmax(dim=1) + + def forward(self, x): + x = self.softmax(x) + return x + + def get_inputs(self) -> input_t: + return (torch.rand(2, 3),) + + +def test_softmax_basic_tosa_MI(): + module = Softmax() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten__softmax_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_reciprocal_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + "executorch_exir_dialects_edge__ops_aten_exp_default", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_exp_default": 1, + "executorch_exir_dialects_edge__ops_aten_reciprocal_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten__softmax_default"], + pass_list=[DecomposeSoftmaxPass], + ) + pipeline.run() + + +def test_softmax_log_tosa_MI(): + module = SoftmaxLog() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_reciprocal_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + "executorch_exir_dialects_edge__ops_aten_exp_default", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_exp_default": 1, + "executorch_exir_dialects_edge__ops_aten_reciprocal_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten__log_softmax_default" + ], + pass_list=[DecomposeSoftmaxPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_var_pass.py b/backends/arm/test/passes/test_decompose_var_pass.py new file mode 100644 index 00000000000..fe793dba14b --- /dev/null +++ b/backends/arm/test/passes/test_decompose_var_pass.py @@ -0,0 +1,84 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
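+# Verifies that DecomposeVarPass expands var/var.correction into mean, sub, mul, full and sum edge ops.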
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class VarDim(torch.nn.Module): + """ + Basic variance model using torch.Tensor.var function. + """ + + def __init__(self, keepdim): + super(VarDim, self).__init__() + self.keepdim = keepdim + + def forward(self, x): + return x.var(dim=-1, keepdim=self.keepdim) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +class VarCorrection(torch.nn.Module): + """ + Basic variance model using torch.var function. + """ + + def __init__(self, keepdim): + super(VarCorrection, self).__init__() + self.keepdim = keepdim + + def forward(self, x): + return torch.var(x, -1, keepdim=self.keepdim) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +modules = { + "vardim_keepdim": VarDim(True), + "vardim_no_keepdim": VarDim(False), + "varcorrection_keepdim": VarCorrection(True), + "varcorrection_no_keepdim": VarCorrection(False), +} + + +@common.parametrize("module", modules) +def test_decompose_var_tosa_MI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_var_correction": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_full_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_sub_Tensor", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_var_correction"], + pass_list=[DecomposeVarPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_remove_clone_pass.py b/backends/arm/test/passes/test_remove_clone_pass.py new file mode 100755 index 00000000000..e586edd323d --- /dev/null +++ b/backends/arm/test/passes/test_remove_clone_pass.py @@ -0,0 +1,43 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
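+# Verifies that RemoveClonePass removes aten.clone ops from the graph.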
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Clone(torch.nn.Module): + """ + Basic remove layer model to test RemoveClonePass + """ + + def __init__(self): + super(Clone, self).__init__() + + def forward(self, x): + return torch.clone(x) + + def get_inputs(self) -> input_t: + return (torch.rand(3, 1),) + + +def test_remove_clone_tosa_BI(): + module = Clone() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+BI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_clone_default": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_clone_default"], + pass_list=[RemoveClonePass], + ) + pipeline.run() diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 28bbee052f9..4481a9c7cc2 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -13,31 +13,26 @@ from pathlib import Path -from typing import cast, Dict, List, Literal, Optional, Tuple +from typing import Any, cast, Dict, List, Literal, Optional, Tuple import numpy as np import torch -logger = logging.getLogger(__name__) -try: - import tosa_reference_model -except ImportError: - tosa_reference_model = None from executorch.backends.arm.arm_backend import get_tosa_spec, is_tosa - from executorch.backends.arm.test.conftest import is_option_enabled -from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.backends.arm.tosa_specification import ( + Tosa_0_80, + Tosa_1_00, + TosaSpecification, +) from executorch.exir import ExecutorchProgramManager, ExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule -from packaging.version import Version from torch.fx.node import Node from torch.overrides import TorchFunctionMode -from tosa import TosaGraph logger = logging.getLogger(__name__) -logger.setLevel(logging.CRITICAL) # Copied from PyTorch. # From torch/testing/_internal/common_utils.py:torch_to_numpy_dtype_dict @@ -568,7 +563,7 @@ def arm_executor_runner_exists(target_board): def run_tosa_graph( - graph: TosaGraph, + graph: Any, tosa_version: TosaSpecification, inputs: list[torch.Tensor], ) -> list[torch.Tensor]: @@ -576,25 +571,38 @@ def run_tosa_graph( inputs_np = [input.numpy() for input in inputs] transpose_data_format(inputs_np, to="NHWC") - tosa_release = tosa_version.version - - if tosa_release > Version("0.80"): - logger.warning("The reference model is only tested for TOSA v0.80") - - # tosa_profile: 0 = Base Inference, 1 = Main Inference, 2 = Main Training. - tosa_profile = 1 if tosa_version.support_float() else 0 - debug_mode = "ALL" if logger.level <= logging.DEBUG else None - outputs_np, status = tosa_reference_model.run( - graph, - inputs_np, - verbosity=_tosa_refmodel_loglevel(logger.level), - tosa_profile=tosa_profile, - initialize_variable_tensor_from_numpy=1, # True - debug_mode=debug_mode, - ) + if isinstance(tosa_version, Tosa_0_80): + import tosa_tools.v0_80.tosa_reference_model as reference_model + + # tosa_profile: 0 = Base Inference, 1 = Main Inference, 2 = Main Training. 
+ tosa_profile = 1 if tosa_version.support_float() else 0 + debug_mode = "ALL" if logger.level <= logging.DEBUG else None + outputs_np, status = reference_model.run( + graph, + inputs_np, + verbosity=_tosa_refmodel_loglevel(logger.level), + tosa_profile=tosa_profile, + initialize_variable_tensor_from_numpy=True, + debug_mode=debug_mode, + ) + elif isinstance(tosa_version, Tosa_1_00): + import tosa_reference_model as reference_model + + debug_mode = "ALL" if logger.level <= logging.DEBUG else None + outputs_np, status = reference_model.run( + graph, + inputs_np, + verbosity=_tosa_refmodel_loglevel(logger.level), + initialize_variable_tensor_from_numpy=True, + debug_mode=debug_mode, + ) + else: + raise ValueError( + f"Unknown TOSA specification: {tosa_version}. No reference model available to run for this specification version" + ) assert ( - status == tosa_reference_model.GraphStatus.TOSA_VALID + status == reference_model.GraphStatus.TOSA_VALID ), "Non-valid TOSA given to reference model." transpose_data_format(outputs_np, to="NCHW") diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index e97b46cb977..9f5bb778e78 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -12,7 +12,12 @@ def define_arm_tests(): test_files.remove("passes/test_ioquantization_pass.py") # Operators - test_files += native.glob(["ops/test_linear.py"]) + test_files += [ + "ops/test_linear.py", + "ops/test_slice.py", + "ops/test_sigmoid.py", + "ops/test_tanh.py", + ] TESTS = {} diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index b995341a586..cc140cc9db5 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -12,10 +12,19 @@ script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) et_root_dir=$(cd ${script_dir}/../../.. && pwd) cd "${et_root_dir}" pwd +setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh +_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools." TEST_SUITE=$1 +# Source the tools +# This should be prepared by setup.sh +[[ -f ${setup_path_script} ]] \ + || { echo "Missing ${setup_path_script}."
${_setup_msg}"; exit 1; } + +source ${setup_path_script} + help() { echo "Usage:" echo " $0 " @@ -66,7 +75,6 @@ test_pytest() { # Test ops and other things ./examples/models/llama3_2_vision/install_requirements.sh cd "${et_root_dir}" - source examples/arm/ethos-u-scratch/setup_path.sh backends/arm/scripts/build_quantized_ops_aot_lib.sh # Run arm baremetal pytest tests without FVP @@ -78,7 +86,6 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using echo "${TEST_SUITE_NAME}: Run pytest with fvp" ./examples/models/llama3_2_vision/install_requirements.sh - source examples/arm/ethos-u-scratch/setup_path.sh # Prepare Corstone-3x0 FVP for pytest examples/arm/run.sh --model_name=add --build_only @@ -92,8 +99,6 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using test_run_ethosu_fvp() { # End to End model tests using run.sh echo "${TEST_SUITE_NAME}: Test ethos-u delegate examples with run.sh" - source examples/arm/ethos-u-scratch/setup_path.sh - # TOSA quantized echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA --model_name=add @@ -114,8 +119,6 @@ test_run_ethosu_fvp() { # End to End model tests using run.sh test_models_ethosu_fvp() { # End to End model tests using model_test.py echo "${TEST_SUITE_NAME}: Test ethos-u delegate models with test_model.py" - source examples/arm/ethos-u-scratch/setup_path.sh - # Build common libs once python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --build_libs diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 7b74603cfb2..6346a53edef 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -14,10 +14,10 @@ import executorch.backends.xnnpack.test.tester.tester as tester -import serializer.tosa_serializer as ts # type: ignore[import-untyped] - import torch.fx import torch.utils._pytree as pytree + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore[import-untyped] from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager from executorch.backends.arm.arm_backend import ( diff --git a/backends/arm/third-party/reference_model/patches/v0.80/reference_model/0001-Move-tosa-tools-to-be-namespaced-into-tosa-tools.v0_.patch b/backends/arm/third-party/reference_model/patches/v0.80/reference_model/0001-Move-tosa-tools-to-be-namespaced-into-tosa-tools.v0_.patch new file mode 100644 index 00000000000..512c105bda2 --- /dev/null +++ b/backends/arm/third-party/reference_model/patches/v0.80/reference_model/0001-Move-tosa-tools-to-be-namespaced-into-tosa-tools.v0_.patch @@ -0,0 +1,154 @@ +From 20c2059723d5c6952cecfb7fcde92601639ef825 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Per=20=C3=85strand?= +Date: Wed, 5 Feb 2025 12:31:47 +0100 +Subject: [PATCH 1/2] Move tosa-tools to be namespaced into tosa-tools.v0_80 + +--- + CMakeLists.txt | 4 ++- + pyproject.toml | 3 ++- + setup.cfg | 70 +++++++++++++++++++++++++------------------------- + setup.py | 3 ++- + 4 files changed, 42 insertions(+), 38 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 68e8d8a..34becd0 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1,4 +1,6 @@ +-cmake_minimum_required (VERSION 3.4) ++cmake_minimum_required (VERSION 3.19) ++ ++cmake_policy(SET CMP0077 NEW) + + set(CMAKE_INSTALL_PREFIX ".") + project(tosa_tools LANGUAGES CXX) +diff --git a/pyproject.toml b/pyproject.toml +index 7565f93..60448e7 100644 +--- 
a/pyproject.toml ++++ b/pyproject.toml +@@ -6,7 +6,8 @@ requires = [ + "setuptools>=42", + "wheel", + "setuptools_scm[toml]>=6.0", +- "cmake" ++ "cmake", ++ "ninja", + ] + build-backend = "setuptools.build_meta" + +diff --git a/setup.cfg b/setup.cfg +index 82ec9b8..c1bd1a8 100644 +--- a/setup.cfg ++++ b/setup.cfg +@@ -2,7 +2,7 @@ + # SPDX-License-Identifier: Apache-2.0 + + [metadata] +-name = tosa-tools ++name = tosa-tools-v0.80 + # version = done by setuptools_scm in pyproject.toml + author = Arm Limited + #author_email = +@@ -25,44 +25,44 @@ install_requires = + python_requires = >=3.6 + include_package_data = True + packages = +- runner +- generator +- checker +- frameworks +- tests +- conformance +- xunit +- json2fbbin +- json2numpy +- schemavalidation +- convert2conformance +- tosa +- serializer +- tosa_reference_model ++ tosa_tools.v0_80.verif.runner ++ tosa_tools.v0_80.verif.generator ++ tosa_tools.v0_80.verif.checker ++ tosa_tools.v0_80.verif.frameworks ++ tosa_tools.v0_80.verif.tests ++ tosa_tools.v0_80.verif.conformance ++ tosa_tools.v0_80.xunit ++ tosa_tools.v0_80.json2fbbin ++ tosa_tools.v0_80.json2numpy ++ tosa_tools.v0_80.schemavalidation ++ tosa_tools.v0_80.convert2conformance ++ tosa_tools.v0_80.tosa ++ tosa_tools.v0_80.serializer ++ tosa_tools.v0_80.tosa_reference_model + package_dir = +- = verif +- xunit = scripts/xunit +- json2fbbin = scripts/json2fbbin +- json2numpy = scripts/json2numpy +- convert2conformance = scripts/convert2conformance +- tosa = thirdparty/serialization_lib/python/tosa +- serializer = thirdparty/serialization_lib/python/serializer +- tosa_reference_model = py_package +- schemavalidation = scripts/schemavalidation ++ tosa_tools.v0_80.verif = verif ++ tosa_tools.v0_80.xunit = scripts/xunit ++ tosa_tools.v0_80.json2fbbin = scripts/json2fbbin ++ tosa_tools.v0_80.json2numpy = scripts/json2numpy ++ tosa_tools.v0_80.convert2conformance = scripts/convert2conformance ++ tosa_tools.v0_80.tosa = thirdparty/serialization_lib/python/tosa ++ tosa_tools.v0_80.serializer = thirdparty/serialization_lib/python/serializer ++ tosa_tools.v0_80.tosa_reference_model = py_package ++ tosa_tools.v0_80.schemavalidation = scripts/schemavalidation + + [options.entry_points] + console_scripts = +- tosa_verif_run_ref = runner.tosa_verif_run_tests:main +- tosa_verif_run_tests = runner.tosa_verif_run_tests:main +- tosa_verif_build_tests = generator.tosa_verif_build_tests:main +- tosa_json2numpy = json2numpy.json2numpy:main +- tosa_json2fbbin = json2fbbin.json2fbbin:main +- tosa_verif_result_check = checker.tosa_result_checker:main +- tosa_convert2conformance = convert2conformance.convert2conformance:main +- tosa_verif_framework_generator = frameworks.tosa_verif_framework_generator:main +- tosa_verif_framework_compiler_runner = frameworks.tosa_verif_framework_compiler_runner:main +- tosa_verif_conformance_generator = conformance.tosa_verif_conformance_generator:main +- tosa_schemavalidation = schemavalidation.schemavalidation:main ++ tosa_verif_run_ref = tosa_tools.v0_80.verif.runner.tosa_verif_run_tests:main ++ tosa_verif_run_tests = tosa_tools.v0_80.verif.runner.tosa_verif_run_tests:main ++ tosa_verif_build_tests = tosa_tools.v0_80.verif.generator.tosa_verif_build_tests:main ++ tosa_json2numpy = tosa_tools.v0_80.verif.json2numpy.json2numpy:main ++ tosa_json2fbbin = tosa_tools.v0_80.verif.json2fbbin.json2fbbin:main ++ tosa_verif_result_check = tosa_tools.v0_80.verif.checker.tosa_result_checker:main ++ tosa_convert2conformance = 
tosa_tools.v0_80.verif.convert2conformance.convert2conformance:main ++ tosa_verif_framework_generator = tosa_tools.v0_80.verif.frameworks.tosa_verif_framework_generator:main ++ tosa_verif_framework_compiler_runner = tosa_tools.v0_80.verif.frameworks.tosa_verif_framework_compiler_runner:main ++ tosa_verif_conformance_generator = tosa_tools.v0_80.verif.conformance.tosa_verif_conformance_generator:main ++ tosa_schemavalidation = tosa_tools.v0_80.verif.schemavalidation.schemavalidation:main + + [options.package_data] + schemavalidation= +diff --git a/setup.py b/setup.py +index 8c6b4cd..95896ad 100644 +--- a/setup.py ++++ b/setup.py +@@ -20,7 +20,7 @@ class CMakeBuild(build_py): + root_dir = Path(__file__).parent + build_dir = root_dir / "build" + build_dir.mkdir(exist_ok=True) +- package_dir = root_dir / "py_package" ++ package_dir = root_dir / "build/lib/tosa_tools/v0_80/tosa_reference_model/" + + cmake_cmd = [ + "cmake", +@@ -90,6 +90,7 @@ class CMakeBuild(build_py): + # Python will know which one to import + copied_so = False + so_dir = build_dir / "reference_model" ++ package_dir.mkdir(parents=True, exist_ok=True) + print(f"copying .so files from '{so_dir}' to '{package_dir}'") + for so_file in so_dir.glob("tosa_reference_model.*.so"): + shutil.copy(so_file, package_dir) +-- +2.39.5 (Apple Git-154) + diff --git a/backends/arm/third-party/reference_model/patches/v0.80/serialization_lib/0001-Make-TOSA-serializer-lib-to-be-self-contained.patch b/backends/arm/third-party/reference_model/patches/v0.80/serialization_lib/0001-Make-TOSA-serializer-lib-to-be-self-contained.patch new file mode 100644 index 00000000000..cc9cbc4edad --- /dev/null +++ b/backends/arm/third-party/reference_model/patches/v0.80/serialization_lib/0001-Make-TOSA-serializer-lib-to-be-self-contained.patch @@ -0,0 +1,283 @@ +From b3c8c3f779a7e051826f317598fb831fa9cfe923 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Per=20=C3=85strand?= +Date: Wed, 5 Feb 2025 12:30:09 +0100 +Subject: [PATCH] Make TOSA serializer lib to be self contained + +--- + CMakeLists.txt | 4 ++ + python/serializer/tosa_serializer.py | 57 ++++++++++++++-------------- + 2 files changed, 32 insertions(+), 29 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index ac34b75..5e191aa 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -19,6 +19,8 @@ + cmake_minimum_required(VERSION 3.13.4) + project(TosaSerialization) + ++cmake_policy(SET CMP0077 NEW) ++ + set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard to conform to") + set(CMAKE_CXX_STANDARD_REQUIRED YES) + +@@ -27,6 +29,8 @@ set(CMAKE_VERBOSE_MAKEFILE ON) + option(BUILD_TESTS "Build test applications" ON) + option(FLATBUFFERS_ROOT "Location where the flatbuffers 'include' and 'lib' folders to be found" Off) + ++message(STATUS "FLATBUFFERS_ROOT set to: ${FLATBUFFERS_ROOT}") ++ + include_directories(${PROJECT_SOURCE_DIR}/third_party/half/include) + + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +diff --git a/python/serializer/tosa_serializer.py b/python/serializer/tosa_serializer.py +index 7bc75f0..d191997 100644 +--- a/python/serializer/tosa_serializer.py ++++ b/python/serializer/tosa_serializer.py +@@ -14,12 +14,11 @@ + + import os + import struct +-import serializer.tosa_serializer as ts + import json + import flatbuffers + import numpy as np + from enum import IntEnum, unique +-from tosa import ( ++from ..tosa import ( + TosaGraph, + TosaRegion, + TosaBasicBlock, +@@ -27,8 +26,8 @@ from tosa import ( + TosaOperator, + Version, + ) +-import tosa.DType as TosaDType +-import tosa.Op 
as TosaOp ++from ..tosa import DType as TosaDType ++from ..tosa import Op as TosaOp + + # Keep version number in sync with the version default value with schema/tosa.fbs + TOSA_VERSION_MAJOR = 0 +@@ -159,7 +158,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + output_zp, + accum_dtype, + ): +- from tosa import PoolAttribute as a, Attribute ++ from ..tosa import PoolAttribute as a, Attribute + + self.utype = Attribute.Attribute().PoolAttribute + +@@ -172,7 +171,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + self.ints.append((a.AddAccumDtype, accum_dtype)) + + def ConvAttribute(self, pad, stride, dilation, input_zp, weight_zp, local_bound): +- from tosa import ConvAttribute as a, Attribute ++ from ..tosa import ConvAttribute as a, Attribute + + self.utype = Attribute.Attribute().ConvAttribute + self.optFcns = (a.Start, a.End) +@@ -187,7 +186,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + def TransposeConvAttribute( + self, outpad, stride, output_shape, input_zp, weight_zp, local_bound + ): +- from tosa import TransposeConvAttribute as a, Attribute ++ from ..tosa import TransposeConvAttribute as a, Attribute + + self.utype = Attribute.Attribute().TransposeConvAttribute + self.optFcns = (a.Start, a.End) +@@ -200,7 +199,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + self.bools.append((a.AddLocalBound, local_bound)) + + def PadAttribute(self, serializer_builder, padding, pad_const_int, pad_const_fp): +- from tosa import PadAttribute as a, Attribute ++ from ..tosa import PadAttribute as a, Attribute + + self.utype = Attribute.Attribute().PadAttribute + self.optFcns = (a.Start, a.End) +@@ -210,14 +209,14 @@ class TosaSerializerAttribute(TosaSerializerUnion): + + # pad_const_fp attribute serialized as uint8 vector + pad_const_float_as_bytes = struct.pack(" tuple[list[TosaSerializerTensor], float]: +) -> tuple[list[ts.TosaSerializerTensor], float]: """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. The scales are adjusted using the smallest scale of all 'nodes'. 
@@ -61,7 +61,7 @@ def insert_rescale_ops_to_int32( min_scale = min([qarg.scale for qarg in qargs]) scales = [qarg.scale / min_scale for qarg in qargs] - rescaled_nodes: list[TosaSerializerTensor] = [] + rescaled_nodes: list[ts.TosaSerializerTensor] = [] for tensor, qarg, scale in zip(tensors, qargs, scales): rescaled_nodes.append( build_rescale_to_int32( @@ -198,9 +198,9 @@ def compute_multiplier_and_shift( def build_rescale( - tosa_fb: TosaSerializer, + tosa_fb: ts.TosaSerializer, scale: list[float], - input_node: TosaSerializerTensor, + input_node: ts.TosaSerializerTensor, output_name: str, output_type: ts.DType, output_shape: List[int], @@ -233,14 +233,14 @@ def build_rescale( def build_rescale_to_int32( - tosa_fb: TosaSerializer, + tosa_fb: ts.TosaSerializer, input_arg: executorch.backends.arm.tosa_mapping.TosaArg, input_zp: int, rescale_scale: list[float], is_scale32: bool = True, is_double_round: bool = False, per_channel: bool = False, -) -> TosaSerializerTensor: +) -> ts.TosaSerializerTensor: multipliers, shifts = compute_multiplier_and_shift(rescale_scale) attr_rescale = ts.TosaSerializerAttribute() attr_rescale.RescaleAttribute( @@ -266,7 +266,7 @@ def build_rescale_to_int32( def build_rescale_from_int32( - tosa_fb: TosaSerializer, + tosa_fb: ts.TosaSerializer, input_name: str, output_name: str, output_zp: int, @@ -300,8 +300,8 @@ def build_rescale_from_int32( def build_rescale_conv_output( - tosa_fb: TosaSerializer, - op: TosaSerializerTensor, + tosa_fb: ts.TosaSerializer, + op: ts.TosaSerializerTensor, output_name: str, output_type: ts.DType, input_scale: list[float], diff --git a/backends/arm/tosa_specification.py b/backends/arm/tosa_specification.py index 94c307d440c..640361e059c 100644 --- a/backends/arm/tosa_specification.py +++ b/backends/arm/tosa_specification.py @@ -142,7 +142,7 @@ class Tosa_1_00(TosaSpecification): available_profiles = ["INT", "FP"] valid_extensions = { - "INT": ["int16", "int4", "var", "cf"], + "INT": ["int16", "int4", "var", "cf", "u55"], "FP": ["bf16", "fp8e4m3", "fp8e5m2", "fft", "var", "cf"], } diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index 5fa603ea683..4d0f33003bc 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -9,14 +9,15 @@ import os from typing import Any, Optional, Tuple -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.tosa_mapping import TosaArg from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.print_program import inspect_node -from serializer.tosa_serializer import TosaOp from torch.fx import Node +from tosa_tools.v0_80.serializer.tosa_serializer import TosaOp logger = logging.getLogger(__name__) diff --git a/backends/arm/util/arm_model_evaluator.py b/backends/arm/util/arm_model_evaluator.py index c371d794376..5fbc6f7e894 100644 --- a/backends/arm/util/arm_model_evaluator.py +++ b/backends/arm/util/arm_model_evaluator.py @@ -24,6 +24,7 @@ # Logger for outputting progress for longer running evaluation logger = logging.getLogger(__name__) +# Explicitly set logging level: MLETORCH-893 logger.setLevel(logging.INFO) diff --git a/backends/cadence/README.md b/backends/cadence/README.md index 998ac55ddf0..3cefb71d945 100644 --- a/backends/cadence/README.md +++ b/backends/cadence/README.md @@ -6,7 +6,7 @@ ## Tutorial -Please follow the [tutorial](https://pytorch.org/executorch/main/build-run-xtensa.html) for more information on how to run 
models on Cadence/Xtensa DSPs. +Please follow the [tutorial](https://pytorch.org/executorch/main/backends-cadence) for more information on how to run models on Cadence/Xtensa DSPs. ## Directory Structure diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 4d51d1fa34c..32a4427278b 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -31,11 +31,11 @@ EdgeProgramManager, ExecutorchBackendConfig, ExecutorchProgramManager, - to_edge, ) from executorch.exir.pass_base import PassResult from executorch.exir.passes import ToOutVarPass from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass +from executorch.exir.program._program import to_edge_with_preserved_ops from torch._inductor.decomposition import remove_decompositions from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -80,6 +80,7 @@ def convert_pt2( torch.ops.aten.layer_norm.default, torch.ops.aten.linear.default, torch.ops.aten.matmul.default, + torch.ops.aten.rms_norm.default, ] # Remove decompositions for the ops we want to keep # pyre-fixme[6]: For 1st argument expected `Dict[typing.Callable[..., typing.Any @@ -201,9 +202,9 @@ def lower_ep_to_edge( """ Lower an ExportedProgram to an EdgeProgramManager (in edge IR). """ - # Call to_edge to convert the graph to edge IR. + # Call to_edge_with_preserved_ops to convert the graph to edge IR. # Note: dim_order is skipped (https://github.com/pytorch/executorch/issues/3704) - edge_prog_manager = to_edge( + edge_prog_manager = to_edge_with_preserved_ops( expo_program, compile_config=EdgeCompileConfig( _skip_dim_order=True, @@ -216,9 +217,11 @@ def lower_ep_to_edge( torch.ops.aten.linalg_vector_norm.default, torch.ops.aten.unfold.default, torch.ops.aten.angle.default, + torch.ops.aten.rms_norm.default, ], ), constant_methods=constant_methods, + preserve_ops=(torch.ops.aten.rms_norm.default,), ) if dump_graphs: diff --git a/backends/cadence/aot/memory_planning.py b/backends/cadence/aot/memory_planning.py index cfe1b9ab9d8..3c6c518f16a 100644 --- a/backends/cadence/aot/memory_planning.py +++ b/backends/cadence/aot/memory_planning.py @@ -12,7 +12,7 @@ import math import typing from functools import partial -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from executorch.backends.cadence.aot.memory_constraints import ( @@ -73,11 +73,11 @@ def collect_specs_from_graph_module( # the fastest memory available # flake8: noqa 'position_based_greedy_with_hierarchy' is too complex (13) def position_based_greedy_with_hierarchy( - graph_module: torch.fx.GraphModule, alignment: int, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, graph_signature: ExportGraphSignature, - alloc_graph_input: bool, - alloc_graph_output: bool, + extra_padding: int = 0, *, memory_config: MemoryConfig, mem_constraints: MemConstraints, @@ -119,9 +119,7 @@ def memory_available(spec: TensorSpec) -> bool: # Iterate over all the specs in sorted order for spec in sorted( - collect_specs_from_graph_module( - graph_module, graph_signature, alloc_graph_input, alloc_graph_output - ), + specs, key=lambda spec: spec.allocated_memory, reverse=True, ): @@ -167,11 +165,11 @@ def memory_available(spec: TensorSpec) -> bool: # Greedy tensor placement with the heuristics from arxiv.org/pdf/2001.03288.pdf def greedy_by_size_for_offset_calculation_with_hierarchy( - graph_module: torch.fx.GraphModule, alignment: int, + specs: Set[TensorSpec], + 
graph_module: torch.fx.GraphModule, graph_signature: ExportGraphSignature, - alloc_graph_input: bool, - alloc_graph_output: bool, + extra_padding: int = 0, *, memory_config: MemoryConfig, mem_constraints: MemConstraints, @@ -199,9 +197,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( # Iterate over all the specs in sorted order for spec in sorted( - collect_specs_from_graph_module( - graph_module, graph_signature, alloc_graph_input, alloc_graph_output - ), + specs, key=lambda spec: spec.allocated_memory, reverse=True, ): diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index dec6feb1b8d..aca4965083d 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -139,7 +139,6 @@ "int in_zero_point, bool channel_last=False) -> (Tensor out)" ) lib.define("linalg_vector_norm(Tensor X) -> (Tensor Y)") -lib.define("rms_norm(Tensor X, float eps, Tensor W) -> (Tensor Y)") lib.define( "transposed_im2row(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, " "int[2] output_padding, Tensor in_zero_point, bool channel_last=False) -> (Tensor out)" @@ -211,9 +210,6 @@ "fully_connected.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define("linalg_vector_norm.out(Tensor X, *, Tensor(a!) out) -> Tensor(a!)") -lib.define( - "rms_norm.out(Tensor X, float eps, Tensor W, *, Tensor(a!) out) -> Tensor(a!)" -) lib.define( "quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, " "Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)" @@ -293,6 +289,15 @@ "attention_mask.out(Tensor input, Tensor start, Tensor stop, *, Tensor(a!) out) -> Tensor(a!)" ) +# Custom ops in aten namespace. RMSNorm is usually decomposed, so having +# an out-variant is non-standard + +lib_aten = Library("aten", "FRAGMENT") + +lib_aten.define( + "rms_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)" +) + @register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( @@ -619,15 +624,6 @@ def linalg_vector_norm_meta( return X.new_empty([], dtype=X.dtype) -@register_fake("cadence::rms_norm") -def rms_norm_meta( - X: torch.Tensor, - eps: float, - weight: torch.Tensor, -) -> torch.Tensor: - return X.new_empty(X.shape, dtype=X.dtype) - - @register_fake("cadence::requantize") def requantize_meta( input: torch.Tensor, diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py index 6b34021a20a..ca5ed017046 100644 --- a/backends/cadence/aot/pass_utils.py +++ b/backends/cadence/aot/pass_utils.py @@ -35,8 +35,8 @@ class CadencePassAttribute: ALL_CADENCE_PASSES: dict[ExportPass, CadencePassAttribute] = {} -def get_cadence_pass_attribute(p: ExportPass) -> CadencePassAttribute: - return ALL_CADENCE_PASSES[p] +def get_cadence_pass_attribute(p: ExportPass) -> Optional[CadencePassAttribute]: + return ALL_CADENCE_PASSES.get(p, None) # A decorator that registers a pass. 
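For reference, the rms_norm op preserved in compiler.py above and re-registered here as an aten-namespace out-variant normalizes by the root mean square over the normalized dimensions. A small, hedged sanity check (illustrative only, assuming a PyTorch version that ships torch.nn.functional.rms_norm):

```python
import torch

# RMSNorm: x * rsqrt(mean(x^2) + eps) * weight, computed over normalized_shape.
x = torch.randn(2, 8)
weight = torch.ones(8)
eps = 1e-6

manual = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) * weight
reference = torch.nn.functional.rms_norm(x, [8], weight, eps)
assert torch.allclose(manual, reference, atol=1e-6)
```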
@@ -61,7 +61,8 @@ def create_cadence_pass_filter( def _filter(p: ExportPass) -> bool: pass_attribute = get_cadence_pass_attribute(p) return ( - pass_attribute.opt_level is not None + pass_attribute is not None + and pass_attribute.opt_level is not None and pass_attribute.opt_level <= opt_level and (not pass_attribute.debug_pass or debug) ) diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index 62727985452..761b2bf8d31 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -43,7 +43,7 @@ from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer -act_qspec_asym8u = QuantizationSpec( +act_qspec_asym8s = QuantizationSpec( dtype=torch.int8, quant_min=-128, quant_max=127, @@ -52,7 +52,7 @@ observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), ) -wgt_qspec_asym8u = QuantizationSpec( +wgt_qspec_asym8s = QuantizationSpec( dtype=torch.int8, quant_min=-128, quant_max=127, @@ -61,7 +61,7 @@ observer_or_fake_quant_ctr=MinMaxObserver, ) -wgt_qspec_asym8s = QuantizationSpec( +wgt_qspec_sym8s = QuantizationSpec( dtype=torch.int8, quant_min=-128, quant_max=127, @@ -72,17 +72,17 @@ bias_qspec: Optional[QuantizationSpec] = None -qconfig_A8uW8u = QuantizationConfig( - act_qspec_asym8u, - act_qspec_asym8u, - wgt_qspec_asym8u, +qconfig_A8W8 = QuantizationConfig( + act_qspec_asym8s, + act_qspec_asym8s, + wgt_qspec_asym8s, None, ) -qconfig_A8uW8s = QuantizationConfig( - act_qspec_asym8u, - act_qspec_asym8u, - wgt_qspec_asym8s, +qconfig_A8W8sym = QuantizationConfig( + act_qspec_asym8s, + act_qspec_asym8s, + wgt_qspec_sym8s, None, ) @@ -189,15 +189,14 @@ def get_supported_operators(cls) -> List[OperatorConfig]: def get_cadence_default_quantizers() -> List[Quantizer]: return [ - CadenceAtenQuantizer(AddmmPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(BmmPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8uW8s), - CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s), - CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(MatmulPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(ReluPattern0(), qconfig_A8uW8u), - CadenceAtenQuantizer(ReluPattern1(), qconfig_A8uW8u), + CadenceAtenQuantizer(AddmmPattern(), qconfig_A8W8), + CadenceAtenQuantizer(BmmPattern(), qconfig_A8W8), + CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8W8sym), + CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8W8sym), + CadenceAtenQuantizer(LinearPattern(), qconfig_A8W8), + CadenceAtenQuantizer(MatmulPattern(), qconfig_A8W8), + CadenceAtenQuantizer(ReluPattern0(), qconfig_A8W8), + CadenceAtenQuantizer(ReluPattern1(), qconfig_A8W8), ] @@ -236,14 +235,26 @@ def __init__( super().__init__([]) +class CadenceWithLayerNormQuantizer(CadenceQuantizer): + """ + Quantizer including layer norm + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = get_cadence_default_quantizers() + quantizers.append(CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8W8)) + super().__init__(quantizers) + + class CadenceWakeWordQuantizer(CadenceQuantizer): """ - Quantizer for WakeWord, including add + Quantizer for WakeWord, including add and cat """ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: if quantizers is None: quantizers = get_cadence_default_quantizers() - quantizers.append(CadenceAtenQuantizer(AddPattern(), 
qconfig_A8uW8u)) - quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8uW8u)) + quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8)) + quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8)) super().__init__(quantizers) diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index 5a4922ae069..867e4ec79c6 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -1806,30 +1806,6 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, tuple(new_args), kwargs, meta) -@register_cadence_pass(CadencePassAttribute(opt_level=0)) -class ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass(ExportPass): - """ - Replace the aten.linalg_vector_norm op with a custom op. - aten.linalg_vector_norm is not supported by Jarvis, so we - need to replace it with native_batch_norm at all optimization levels. - """ - - def call_operator(self, op, args, kwargs, meta): - if op != exir_ops.edge.aten.linalg_vector_norm.default: - return super().call_operator(op, args, kwargs, meta) - - assert ( - len(args) == 1 - ), "aten.linalg_vector_norm should have 1 argument (a tensor), we do not support any custom variants" - - return super().call_operator( - exir_ops.edge.cadence.linalg_vector_norm.default, - args, - kwargs, - meta, - ) - - @register_cadence_pass(CadencePassAttribute(opt_level=1)) class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): """ @@ -2110,6 +2086,102 @@ def call_operator( return super().call_operator(op, args, kwargs, meta) +@register_cadence_pass(CadencePassAttribute(opt_level=2)) +class ReplaceGeluWithApproximateGeluPass(ExportPass): + """ + Replace the gelu op with an approximate gelu op. The approximate gelu op + is more efficient on DSP backends. + """ + + def call_operator( + self, + op, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op not in { + exir_ops.edge.aten.gelu.default, + }: + return super().call_operator(op, args, kwargs, meta) + + # compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi)) + # as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))) + + # Get 0.5 * x + half = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (args[0], 0.5), + {}, + meta, + ) + + scaled = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (args[0], 0.044715), + {}, + meta, + ) + + # Get x^2 (note that we use mul.Tensor twice instead of pow.Tensor because + # it is much more efficient on DSP backends) + scaled_square = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (scaled, args[0]), + {}, + meta, + ) + + # Get x^3 + scaled_cubed = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (scaled_square, args[0]), + {}, + meta, + ) + + # Get x + 0.044715 * x^3 + inner_sum = super().call_operator( + exir_ops.edge.aten.add.Tensor, + (scaled_cubed, args[0]), + {}, + meta, + ) + + # Get 0.7978845608028654 * ( x + 0.044715 * x^3) + scaled_sum = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (inner_sum, 0.7978845608028654), + {}, + meta, + ) + + # Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)) + tanh = super().call_operator( + exir_ops.edge.aten.tanh.default, + (scaled_sum,), + {}, + meta, + ) + + # Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3)) + # TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.) 
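```python
# Illustrative aside, not part of the pass being added here: a quick numerical
# check that the tanh-based formula assembled above tracks PyTorch's exact GELU.
# torch.nn.functional.gelu(x, approximate="tanh") implements the same formula.
import torch

x = torch.linspace(-4.0, 4.0, steps=101)
approx = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * x**3)))
exact = torch.nn.functional.gelu(x)
assert (approx - exact).abs().max() < 1e-2  # the approximation error stays small
```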
+ outer_sum = super().call_operator( + exir_ops.edge.aten.add.Tensor, + (tanh, 1.0), + {}, + meta, + ) + + # Retunr the final result + return super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (half, outer_sum), + {}, + meta, + ) + + # This class encapsulates all the functions that replace/switch one op in the # graph with another. class CadenceReplaceOpsInGraph: @@ -2147,6 +2219,6 @@ class CadenceReplaceOpsInGraph: ReplacePT2DequantWithCadenceDequantPass, ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass, ReplaceAtenAvgPoolWithJarvisAvgPoolPass, - ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass, ReplaceWhereWithFullArgsWithWhereScalar, + # ReplaceGeluWithApproximateGeluPass, ] diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index e40c26c0f4e..886550772b5 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -23,12 +23,12 @@ MakeSliceAndCatDimOutermostPass, ReplaceAddMMWithLinearPass, ReplaceAtenConvolutionWithJarvisConvolutionPass, - ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass, ReplaceConstantPadNdWithSlicePass, ReplaceConvolutionOptionalArgsWithConcreteArgsPass, ReplaceConvWithIm2RowAndLinear, ReplaceEmptyTensorsWithFullPass, ReplaceFunctionallyEquivalentOpTargets, + ReplaceGeluWithApproximateGeluPass, ReplaceIm2RowWithViewPass, ReplaceLinearWithFullyConnectedOpPass, ReplaceMMWithAddMMPass, @@ -1188,36 +1188,6 @@ def forward(self, x): count_node(graph_after_passes, exir_ops.edge.aten.transpose_copy.int), 0 ) - def test_replace_aten_linalg_vector_norm_with_cadence_linalg_vector_norm(self): - class LinalgVectorNorm(torch.nn.Module): - def forward(self, x: torch.Tensor): - return torch.linalg.vector_norm(x) - - x = torch.randn(32) - - graph_module = ( - export_to_edge(LinalgVectorNorm(), (x,)).exported_program().graph_module - ) - - p = ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass() - graph_after_passes = cast(PassResult, p(graph_module)).graph_module - - # Assert that aten.linalg_vector_norm op was replaced by a - # cadence.linalg_vector_norm op - self.assertEqual( - count_node( - graph_after_passes, - exir_ops.edge.aten.linalg_vector_norm.default, - ), - 0, - ) - self.assertEqual( - count_node( - graph_after_passes, exir_ops.edge.cadence.linalg_vector_norm.default - ), - 1, - ) - def test_replace_aten_where_with_cadence_where_Scalar(self): class WhereScalarModel(torch.nn.Module): def forward(self, cond: torch.Tensor): @@ -1301,6 +1271,41 @@ def forward(self, cond: torch.Tensor): 1, ) + def test_replace_aten_gelu_with_approximate_gelu(self): + class Gelu(torch.nn.Module): + def forward(self, input): + return torch.nn.functional.gelu(input) + + inputs = torch.randn(2, 1, 64) + + graph_module = export_to_edge(Gelu(), (inputs,)).exported_program().graph_module + + p = ReplaceGeluWithApproximateGeluPass() + graph_after_passes = cast(PassResult, p(graph_module)).graph_module + + # Assert that aten.gelu op was decomposed + self.assertEqual( + count_node( + graph_after_passes, + exir_ops.edge.aten.gelu.default, + ), + 0, + ) + + # The decomposition should have one tanh, 2 add and 6 mul + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.tanh.default), + 1, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.add.Tensor), + 2, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.mul.Tensor), + 6, + ) + class 
TestReplaceIm2rowWithViewPass(unittest.TestCase): def test_no_replacement_for_conv(self): diff --git a/backends/cadence/fusion_g3/operators/targets.bzl b/backends/cadence/fusion_g3/operators/targets.bzl index fffeee0d7b3..b878226fcb1 100644 --- a/backends/cadence/fusion_g3/operators/targets.bzl +++ b/backends/cadence/fusion_g3/operators/targets.bzl @@ -40,6 +40,7 @@ OPERATORS = [ "rsqrt", "sigmoid", "sqrt", + "hardtanh", "tanh", "transpose_copy", "where", diff --git a/backends/cadence/runtime/et_pal.cpp b/backends/cadence/runtime/et_pal.cpp index fdf058f05b3..7973e3acc5b 100644 --- a/backends/cadence/runtime/et_pal.cpp +++ b/backends/cadence/runtime/et_pal.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#if defined(XTENSA) +#if defined(__XTENSA__) #include #include diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py index 52b64dc1581..f38c1cc4154 100644 --- a/backends/cadence/utils/facto_util.py +++ b/backends/cadence/utils/facto_util.py @@ -99,6 +99,7 @@ def apply_scalar_contraints(op_name: str) -> list[ScalarDtype]: match op_name: case "add.Scalar" | "sub.Scalar" | "mul.Scalar" | "div.Scalar": return [ScalarDtype.int] + case _: return [ScalarDtype.float, ScalarDtype.int] @@ -122,6 +123,11 @@ def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, s cp.Size.Le(lambda deps, r, d: 2**2), ] ) + if in_spec.name == "max_val": # hardtanh + spec.inspec[index].deps = [0, 1] + spec.inspec[index].constraints.extend( + [cp.Value.Ge(lambda deps, _: deps[1])] + ) else: spec.inspec[index].constraints.extend( [ diff --git a/backends/example/README.md b/backends/example/README.md index e1780722904..2e5ddd1f7bb 100644 --- a/backends/example/README.md +++ b/backends/example/README.md @@ -17,16 +17,16 @@ In the following diagram, we show how to quantize a mobile net v2 model and lowe We can define patterns based on the operators supported by the backend, which will be used by the quantizer and delegate. -![](./diagrams/quantize_delegate.png) +![](diagrams/quantize_delegate.png) ### Partitioner and Backend The way partitioner and backend is, partitioner will tag the nodes to lower to the backend and backend will will receive all tagged nodes and preprocess them as a delegate. -![](./diagrams/delegate.png) +![](diagrams/delegate.png) ### Memory format permute Some operators may have better performance in the memory format other than contiguous. One way to do that is to insert `to_dim_op` to describe memory format permutation and merge if there two opposite one next to each other. -![](./diagrams/memory_permute.png) +![](diagrams/memory_permute.png) diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md index ec4c392eb46..0a756a7bf1a 100644 --- a/backends/mediatek/README.md +++ b/backends/mediatek/README.md @@ -43,7 +43,7 @@ Download [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/publ Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) guide for detailed instructions on setting up the ExecuTorch environment. +1. **Setup ExecuTorch Environment**: Refer to the [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup) guide for detailed instructions on setting up the ExecuTorch environment. 2. **Setup MediaTek Backend Environment** - Install the dependent libs. 
Ensure that you are inside backends/mediatek/ directory diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py new file mode 100644 index 00000000000..eff7f513cb9 --- /dev/null +++ b/backends/nxp/quantizer/neutron_quantizer.py @@ -0,0 +1,205 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Tuple, Union + +import torch + +from executorch.backends.nxp.quantizer.patterns import ( + AddmmPattern, + AvgPoolPattern, + Conv1dPattern, + Conv2dPattern, + LinearPattern, + MaxPoolPattern, + PadPattern, + PermutePattern, + QuantizationPattern, + ReluInPlacePattern, + ReluPattern, + ReshapePattern, + SoftMaxPattern, +) +from executorch.backends.nxp.quantizer.utils import ( + find_sequential_partitions_aten, + is_annotated, + no_outside_users, +) +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import ( + OperatorConfig, + QuantizationAnnotation, + QuantizationConfig, + QuantizationSpec, +) +from torch import fx +from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver +from torch.ao.quantization.quantizer import DerivedQuantizationSpec, Quantizer +from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer + + +class NeutronAtenQuantizer(Quantizer): + def __init__( + self, pattern: QuantizationPattern, quantization_config: QuantizationConfig + ) -> None: + super().__init__() + self.pattern = pattern + self.quantization_config = quantization_config + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + fused_partitions = find_sequential_partitions_aten( + model, + self.pattern.partition_types(), + ) + + input_act_qspec = self.quantization_config.input_activation + weight_qspec = self.quantization_config.weight + bias_qspec = self.quantization_config.bias + output_act_qspec = self.quantization_config.output_activation + + for fused_partition in fused_partitions: + if not no_outside_users(fused_partition): + continue + + anchors = self.pattern.get_anchors(model, fused_partition) + if not anchors or anchors.empty: + continue + if is_annotated( + [ + x[0] + for x in anchors.inputs + + anchors.weights + + anchors.biases + + anchors.output + ] + ): + continue + + for output, *custom_spec in anchors.output: + # pyre-ignore[16]: no attribute + output.meta["quantization_annotation"] = QuantizationAnnotation( + # pyre-ignore[6]: incompatible parameter type + output_qspec=(custom_spec[0] if custom_spec else output_act_qspec), + _annotated=True, + ) + + def annotate_inputs( + inputs: Union[ + List[Tuple[fx.Node, int]], + List[Tuple[fx.Node, int, DerivedQuantizationSpec],], + ], + spec: Optional[QuantizationSpec], + ) -> None: + for node, idx, *custom_spec in inputs: + # pyre-ignore[16]: no attribute + annotation = node.meta.get( + "quantization_annotation", + QuantizationAnnotation(_annotated=True), + ) + arg = ( + # pyre-ignore[16]: no attribute + node.args[idx] + if isinstance(idx, int) + # pyre-ignore[16]: no attribute + else node.args[idx[0]][idx[1]] + ) + annotation.input_qspec_map[arg] = ( + custom_spec[0] if custom_spec else spec + ) + # pyre-ignore[16]: no attribute + node.meta["quantization_annotation"] = annotation + + def annotate_weights_or_biases( + weights_or_biases: List[Tuple[fx.Node, int]], + spec: Optional[QuantizationSpec], + ) -> None: + for node, idx, 
*custom_spec in weights_or_biases: + annotation = node.meta.get( + "quantization_annotation", + QuantizationAnnotation(_annotated=True), + ) + annotation.input_qspec_map[node.args[idx]] = ( + custom_spec[0] if custom_spec else spec + ) + node.meta["quantization_annotation"] = annotation + + # pyre-ignore[6]: incompatible parameter type + annotate_inputs(anchors.inputs, input_act_qspec) + annotate_weights_or_biases(anchors.weights, weight_qspec) + # pyre-ignore[6]: incompatible parameter type + annotate_weights_or_biases(anchors.biases, bias_qspec) + return model + + def validate(self, model: fx.GraphModule) -> None: + pass + + @classmethod + def get_supported_operators(cls) -> List[OperatorConfig]: + return [] + + +# Quantization Specification used by Neutron NPU +act_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), +) + +wgt_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-127, + quant_max=127, + qscheme=torch.per_tensor_symmetric, + is_dynamic=False, + observer_or_fake_quant_ctr=MinMaxObserver, + ch_axis=0, +) + +wgt_fc_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-127, + quant_max=127, + qscheme=torch.per_tensor_symmetric, + is_dynamic=False, + observer_or_fake_quant_ctr=MinMaxObserver, +) + +# Is set by the *PatternQuantizer directly. +bias_qspec = None + + +class NeutronQuantizer(ComposableQuantizer): + def __init__(self): + static_qconfig = QuantizationConfig( + act_qspec, + act_qspec, + wgt_qspec, + None, + ) + static_fc_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_fc_qspec, None) + super().__init__( + [ + NeutronAtenQuantizer(AddmmPattern(), static_fc_qconfig), + NeutronAtenQuantizer(Conv1dPattern(), static_qconfig), + NeutronAtenQuantizer(Conv2dPattern(), static_qconfig), + NeutronAtenQuantizer(LinearPattern(), static_fc_qconfig), + NeutronAtenQuantizer(MaxPoolPattern(), static_qconfig), + NeutronAtenQuantizer(SoftMaxPattern(), static_qconfig), + NeutronAtenQuantizer(ReshapePattern(), static_qconfig), + NeutronAtenQuantizer(PermutePattern(), static_qconfig), + NeutronAtenQuantizer(PadPattern(), static_qconfig), + NeutronAtenQuantizer(ReluPattern(), static_qconfig), + NeutronAtenQuantizer(ReluInPlacePattern(), static_qconfig), + NeutronAtenQuantizer(AvgPoolPattern(), static_qconfig), + ] + ) + + def transform_for_annotation( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + return model diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py new file mode 100644 index 00000000000..6797447c50c --- /dev/null +++ b/backends/nxp/quantizer/patterns.py @@ -0,0 +1,342 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2025 NXP +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
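The NeutronQuantizer defined above is a plain ComposableQuantizer over per-pattern NeutronAtenQuantizer instances, so narrower variants can be assembled the same way. A hedged sketch (not part of this patch; the single-pattern composition below is only an illustration of the structure):

```python
from executorch.backends.nxp.quantizer.neutron_quantizer import (
    NeutronAtenQuantizer,
    act_qspec,
    wgt_fc_qspec,
)
from executorch.backends.nxp.quantizer.patterns import LinearPattern
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
    QuantizationConfig,
)
from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer

# Annotate only aten.linear patterns, reusing the activation and fully-connected
# weight specs defined alongside NeutronQuantizer.
linear_only_quantizer = ComposableQuantizer(
    [
        NeutronAtenQuantizer(
            LinearPattern(),
            QuantizationConfig(act_qspec, act_qspec, wgt_fc_qspec, None),
        )
    ]
)
```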
+ +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import List, Optional, Tuple, Type, Union + +import torch + +from executorch.backends.nxp.quantizer.utils import get_bias_qparams +from torch import fx +from torch._ops import OpOverload +from torch.ao.quantization.quantizer import ( + DerivedQuantizationSpec, + FixedQParamsQuantizationSpec, + SharedQuantizationSpec, +) + + +@dataclass +class PartitionAnchors: + """ + All fields except output are lists of (node, args_index) pair, where node is from + the given partition and node.args[args_index] is an input to the partition. Assumes + a single output. + + Quantizer uses inputs, weights and biases for quantization annotation. The others + field contains tensor inputs that aren't quantized, and the literals fields contains + is used for other types of input values as well as handling default parameters. + """ + + # Inputs can share quantization parameters + inputs: List[ + Union[ + Tuple[fx.Node, Union[int, Tuple[int, int]]], + Tuple[ + fx.Node, + Union[int, Tuple[int, int]], + SharedQuantizationSpec, + ], + ] + ] = field(default_factory=list) + weights: List[Tuple[fx.Node, int]] = field(default_factory=list) + biases: List[ + Union[Tuple[fx.Node, int], Tuple[fx.Node, int, DerivedQuantizationSpec]] + ] = field(default_factory=list) + others: List[Tuple[fx.Node, int]] = field(default_factory=list) + literals: List[Tuple[fx.Node, int]] = field(default_factory=list) + output: List[Union[Tuple[fx.Node], Tuple[fx.Node, SharedQuantizationSpec]]] = field( + default_factory=list + ) + empty: bool = False + + +class QuantizationPattern(ABC): + @abstractmethod + def partition_types(self) -> list[OpOverload]: + """ + List of types to be passed to find_sequential_partitions_aten. + """ + pass + + @abstractmethod + def get_anchors( + self, gm: torch.fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Optional[PartitionAnchors]: + pass + + +class SharedSpecPattern(QuantizationPattern): + """ + Quantization pattern for shared quantization. + + The quantization is derived from the previous node quantization and the input and output shares the same + quantization parameters (scale and zero-point). + """ + + def partition_types(self) -> List[Type[torch.nn.Module]]: + pass + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors | None: + node = fused_partition[0].nodes[-1] + assert len(fused_partition[0].input_nodes) == 1 + prev_node = fused_partition[0].input_nodes[0] + + # Previous node was not quantized => we are not able to share q-params + if "quantization_annotation" not in prev_node.meta: + return None + + qspec = SharedQuantizationSpec(prev_node) + + return PartitionAnchors( + inputs=[(node, 0)], + weights=[], + biases=[], + output=[ + (node, qspec), + ], + ) + + +class AddmmPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.addmm.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
+ addmm_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (addmm_node.args[1], addmm_node), + (addmm_node.args[2], addmm_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + return PartitionAnchors( + inputs=[(addmm_node, 1)], + weights=[(addmm_node, 2)], + biases=[(addmm_node, 0, bias_qspec)], + output=[(addmm_node,)], + ) + + +class AvgPoolPattern(SharedSpecPattern): + """ + Quantizer for AvgPool2D operator. + """ + + def partition_types(self): + return [torch.ops.aten.avg_pool2d.default] + + +class Conv1dPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + conv1d_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv1d_node.args[0], conv1d_node), + (conv1d_node.args[1], conv1d_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None: + bias = [(conv1d_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(conv1d_node, 0)], + weights=[(conv1d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv1d_node,)], + ) + + +class Conv2dPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv2d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + conv2d_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv2d_node.args[0], conv2d_node), + (conv2d_node.args[1], conv2d_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None: + bias = [(conv2d_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(conv2d_node, 0)], + weights=[(conv2d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv2d_node,)], + ) + + +class LinearPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.linear.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
+ linear_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (linear_node.args[0], linear_node), + (linear_node.args[1], linear_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(linear_node.args) > 2: + bias = [(linear_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(linear_node, 0)], + weights=[(linear_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(linear_node,)], + ) + + +class MaxPoolPattern(SharedSpecPattern): + """ + Quantizer for MaxPool2D operator. + """ + + def partition_types(self): + return [torch.ops.aten.max_pool2d.default] + + +class PadPattern(SharedSpecPattern): + """ + Quantizer for Pad operator. + """ + + def partition_types(self): + return [torch.ops.aten.pad.default] + + +class PermutePattern(SharedSpecPattern): + """ + Quantizer for Permute operator. + """ + + def partition_types(self): + return [torch.ops.aten.permute.default] + + +class ReluPattern(SharedSpecPattern): + """ + Quantizer for Relu operator. Shared quantization spec is selected, as ReLU usually follows computation layer. + """ + + def partition_types(self): + return [torch.ops.aten.relu.default] + + +class ReluInPlacePattern(SharedSpecPattern): + """ + Quantizer for Relu operator with param inplace=True. Shared quantization spec is selected, as ReLU usually + follows computation layer. + """ + + def partition_types(self): + return [torch.ops.aten.relu_.default] + + +class ReshapePattern(SharedSpecPattern): + """ + Quantizer for Reshape operator. + """ + + def partition_types(self): + return [torch.ops.aten.reshape.default] + + +class SoftMaxPattern(QuantizationPattern): + """ + Quantizer for Softmax operator. + + The quantization of Softmax output is fixed to scale 1/256, zero point -128, dtype int8. + """ + + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.softmax.int] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + node = fused_partition[0].nodes[-1] + assert len(fused_partition[0].input_nodes) == 1 + + qspec = FixedQParamsQuantizationSpec( + dtype=torch.int8, + scale=1.0 / 256.0, + zero_point=-128, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + ) + + return PartitionAnchors( + inputs=[(node, 0)], + weights=[], + biases=[], + output=[ + (node, qspec), + ], + ) diff --git a/backends/nxp/quantizer/utils.py b/backends/nxp/quantizer/utils.py new file mode 100644 index 00000000000..1effcdff25a --- /dev/null +++ b/backends/nxp/quantizer/utils.py @@ -0,0 +1,151 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-2025 NXP +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
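Not part of the patch, but two numeric conventions used by the patterns above are easy to sanity-check; the concrete values below are invented for illustration:

```python
import torch

# 1) Bias quantization: the conv/linear patterns derive int32 bias qparams from
#    the activation and weight observers via get_bias_qparams (added in the
#    utils module below): bias_scale = act_scale * weight_scale, zero_point = 0.
act_scale, weight_scale = 0.02, 0.005   # invented example values
bias_scale = act_scale * weight_scale   # 1e-4

# 2) Softmax output: SoftMaxPattern pins scale = 1/256 and zero_point = -128,
#    so int8 codes [-128, 127] dequantize to [0, 255/256], covering softmax's
#    output range [0, 1).
codes = torch.tensor([-128, 0, 127])
print((codes + 128) / 256.0)            # approximately [0.0, 0.5, 0.996]
```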
+ +# pyre-unsafe + +import itertools +from collections import OrderedDict +from typing import Any, Dict, List, Tuple, Type + +import torch +from torch import fx +from torch._ops import OpOverload +from torch.ao.quantization import ObserverOrFakeQuantize +from torch.fx.passes.utils.source_matcher_utils import ( + check_subgraphs_connected, + SourcePartition, +) + + +def is_annotated(nodes: List[fx.Node]) -> bool: + annotated = False + for node in nodes: + annotated = annotated or ( + "quantization_annotation" in node.meta + and node.meta["quantization_annotation"]._annotated + ) + return annotated + + +def no_outside_users(fused_partition) -> bool: + """ + Checks if each partition other than the last does not have any outside users. + """ + for source_partition in fused_partition[:-1]: + if len(source_partition.output_nodes) != 1: + return False + if len(source_partition.output_nodes[0].users) != 1: + return False + return True + + +def get_bias_qparams( + obs_or_fqs: List[ObserverOrFakeQuantize], +) -> Tuple[torch.Tensor, torch.Tensor]: + act_scale, _ = obs_or_fqs[0].calculate_qparams() + weight_scale, _ = obs_or_fqs[1].calculate_qparams() + bias_scale = act_scale * weight_scale + bias_zero_point = torch.zeros_like(bias_scale, dtype=torch.int32) + return bias_scale, bias_zero_point + + +def get_aten_node_target_partitions( + graph: torch.fx.Graph, + wanted_original_aten_op: List[OpOverload], +): + """ + Args: + graph: The graph we want to partition + wanted_original_aten_op: List of original_aten ops (OpOverload) + + Returns: + Dictionary mapping aten ops that were given to a list of SourcePartitions + that correspond to the list of nodes that were decomposed from the given + aten ops. + """ + modules: Dict[Type, Dict[str, List[torch.fx.Node]]] = {} + + for node in graph.nodes: + # The metadata source_fn should contain a tuple of a unique name for the + # source, and the source function if the node is decomposed from a + # function, or the type of module if the node is decomposed from a leaf + # module + # TODO(matthiascremon): look into ways to avoid using source_fn_stack + if (source_fn_st := node.meta.get("source_fn_stack")) is None: + continue + + source_fn = source_fn_st[-1] + if node.target not in wanted_original_aten_op: + continue + + diff_modules = modules.setdefault(source_fn[1], {}) + partition = diff_modules.setdefault(node.name, []) + partition.append(node) + + def make_partition( + nodes: List[torch.fx.Node], module_type: Type + ) -> SourcePartition: + input_nodes = set() + output_nodes = set() + params = set() + for node in nodes: + for arg in node.args: + if isinstance(arg, torch.fx.Node) and arg not in nodes: + input_nodes.add(arg) + + if node.op == "get_attr": + params.add(node) + + for user in node.users.keys(): + if user not in nodes: + output_nodes.add(node) + + return SourcePartition( + nodes, + module_type, + list(input_nodes), + list(output_nodes), + list(params), # type: ignore[arg-type] + ) + + ret: Dict[Type[Any], List[SourcePartition]] = {} + + for k, v in modules.items(): + ret[k] = [make_partition(partition, k) for partition in v.values()] + + return ret + + +def _partitions_sequential(partitions: Tuple[SourcePartition]) -> bool: + prev_partition = None + for partition in partitions: + if prev_partition is not None and not check_subgraphs_connected( + prev_partition, partition + ): + return False + prev_partition = partition + return True + + +def find_sequential_partitions_aten( + gm: torch.fx.GraphModule, + partition_types: List[Any], +): + typed_partitions: 
OrderedDict[Any, List[SourcePartition]] = OrderedDict() + for partition_type in partition_types: + partitions = get_aten_node_target_partitions(gm.graph, [partition_type]) + typed_partitions[partition_type] = list( + itertools.chain.from_iterable(partitions.values()) + ) + + typed_partitions_list = list(typed_partitions.values()) + fusion_candidates = itertools.product(*typed_partitions_list) + fused_partitions = [] + for candidate in fusion_candidates: + if _partitions_sequential(candidate): + fused_partitions.append(candidate) + return fused_partitions diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py new file mode 100644 index 00000000000..741e64a28a1 --- /dev/null +++ b/backends/nxp/tests/models.py @@ -0,0 +1,238 @@ +# Copyright 2024 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Collection, Union + +import torch + + +class Conv2dModule(torch.nn.Module): + def __init__( + self, + bias: bool = True, + dilation: Union[int, tuple[int, int]] = 1, + in_channels: int = 4, + kernel_size: Union[int, tuple[int, int]] = 3, + out_channels: int = 8, + padding: Union[str, int, Collection[int]] = 0, + stride: Union[int, tuple[int, int]] = 2, + ): + super().__init__() + + self.conv = torch.nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv(x) + + +class Conv2dAndMaxPool2DModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.conv = torch.nn.Conv2d( + in_channels=8, out_channels=32, kernel_size=5, bias=True + ) + self.maxpool = torch.nn.MaxPool2d(kernel_size=2, stride=2) + + def forward(self, x): + x = self.conv(x) + return self.maxpool(x) + + +class Conv2dConstantPadNDModule(torch.nn.Module): + def __init__(self, paddings: Collection[int], constant: float | int | None = None): + super().__init__() + self.pad = ConstantPadNDModule(paddings, constant) + self.conv = Conv2dModule() + + def forward(self, x): + x = self.conv(x) + return self.pad(x) + + +class SoftmaxModule(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + + self.softmax = torch.nn.Softmax(dim=dim) + + def forward(self, x): + return self.softmax(x) + + +class SoftmaxConvModule(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + + self.conv = Conv2dModule() + self.softmax = SoftmaxModule(dim=dim) + + def forward(self, x): + x = self.conv(x) + return self.softmax(x) + + +class LinearModule(torch.nn.Module): + def __init__(self, bias: bool): + super().__init__() + self.linear = torch.nn.Linear(32, 16, bias=bias) + + def forward(self, x): + return self.linear(x) + + +class LinearSoftmaxModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.linear = torch.nn.Linear(12, 10) + self.softmax = torch.nn.Softmax(1) + + def forward(self, x): + x = self.linear(x) + x = self.softmax(x) + + return x + + +class ConvFCSoftmaxModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.conv = torch.nn.Conv2d(4, 64, 2, bias=False) + self.fc = torch.nn.Linear(1024, 10) + self.softmax = torch.nn.Softmax(1) + + def forward(self, x): + x = self.conv(x) + x = torch.reshape(x, (-1, 1024)) + x = self.fc(x) + x = self.softmax(x) + + return x + + +class ConstantPadNDModule(torch.nn.Module): + def __init__(self, paddings: Collection[int], constant: float | int | None = 
None): + super().__init__() + self.paddings = paddings + self.constant = constant + + def forward(self, x): + if self.constant is None: + return torch.nn.functional.pad(x, tuple(self.paddings), "constant") + else: + return torch.nn.functional.pad( + x, tuple(self.paddings), "constant", self.constant + ) + + +class ConstantPadNDConvModule(torch.nn.Module): + def __init__(self, paddings: Collection[int], constant: float | int | None = None): + super().__init__() + self.pad = ConstantPadNDModule(paddings, constant) + self.conv = Conv2dModule() + + def forward(self, x): + x = self.pad(x) + return self.conv(x) + + +class MaxPool2dModule(torch.nn.Module): + def __init__(self, padding=0): + super().__init__() + + self.max_pool2d = torch.nn.MaxPool2d( + kernel_size=3, stride=2, padding=padding, dilation=1 + ) + + def forward(self, x): + return self.max_pool2d(x) + + +class MaxPool2dConvModule(torch.nn.Module): + def __init__(self, padding=0): + super().__init__() + + self.conv = Conv2dModule() + self.max_pool2d = torch.nn.MaxPool2d( + kernel_size=3, stride=2, padding=padding, dilation=1 + ) + + def forward(self, x): + x = self.conv(x) + return self.max_pool2d(x) + + +class AvgPool2dModule(torch.nn.Module): + def __init__(self, count_include_pad, padding=0): + super().__init__() + + self.avg_pool = torch.nn.AvgPool2d( + kernel_size=3, + stride=2, + padding=padding, + count_include_pad=count_include_pad, + ) + + def forward(self, x): + return self.avg_pool(x) + + +class AvgPool2dConvModule(torch.nn.Module): + def __init__(self, count_include_pad, padding=0): + super().__init__() + + self.conv = Conv2dModule() + self.avg_pool = torch.nn.AvgPool2d( + kernel_size=3, + stride=1, + padding=padding, + count_include_pad=count_include_pad, + ) + + def forward(self, x): + x = self.conv(x) + return self.avg_pool(x) + + +class ReLUModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + +class Conv2dReLUModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.conv = torch.nn.Conv2d(4, 64, 2, bias=False) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.conv(x) + return self.relu(x) + + +class Conv2dPermuteModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(4, 64, 2, bias=False) + + def forward(self, x): + x = self.conv(x) + return torch.permute(x, [0, 2, 1, 3]) diff --git a/backends/nxp/tests/test_quantizer.py b/backends/nxp/tests/test_quantizer.py new file mode 100644 index 00000000000..868a94059b5 --- /dev/null +++ b/backends/nxp/tests/test_quantizer.py @@ -0,0 +1,273 @@ +# Copyright 2024 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Tests for NeutronQuantizer. 
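The tests below all follow the same PT2E round trip, condensed here into one sketch (the model and input shape are taken from the test modules above; the qparams printout mirrors what the tests assert on):

```python
import torch
from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
from executorch.backends.nxp.tests import models
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

model = models.Conv2dModule().eval()
example_input = (torch.ones(1, 4, 32, 32),)

# Export, annotate with NeutronQuantizer, calibrate, then convert to a
# quantize/dequantize (QDQ) graph.
exported = torch.export.export_for_training(model, example_input, strict=True).module()
prepared = prepare_pt2e(exported, NeutronQuantizer())
prepared(*example_input)   # calibration run
converted = convert_pt2e(prepared)
converted(*example_input)  # quantized dry run

# The per-tensor qparams asserted in the tests live as literal node arguments.
for node in converted.graph.nodes:
    if node.target is torch.ops.quantized_decomposed.quantize_per_tensor.default:
        _, scale, zero_point, qmin, qmax, dtype = node.args
        print(node.name, scale, zero_point, dtype)
```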
+ +import executorch.backends.nxp.tests.models as models +import torch +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + + +def _get_target_name(node): + return node._pretty_print_target(node.target) + + +def test_quantizer_conv2d(): + model = models.Conv2dModule() + model.eval() + + example_input = (torch.ones(1, 4, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 11 + assert nodes[7].name == "conv2d" + # [0]: Input, [1] : weights, [2]: bias + assert ( + _get_target_name(nodes[7].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[1]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[2]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[8]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[8].args[0].name == "conv2d" + + +def test_quantizer_linear(): + model = models.LinearModule(bias=True) + model.eval() + + example_input = (torch.ones(10, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 11 + assert nodes[7].name == "linear" + # [0]: Input, [1] : weights, [2]: bias + assert ( + _get_target_name(nodes[7].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[1]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[2]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[8]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[8].args[0].name == "linear" + + +def test_quantizer_maxpool2d(): + model = models.Conv2dAndMaxPool2DModule() + model.eval() + + example_input = (torch.ones(1, 8, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 14 + # Check if QDQ pattern: + assert nodes[10].name == "max_pool2d" + assert ( + _get_target_name(nodes[10].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[11]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[11].args[0].name == "max_pool2d" + + # Check if input and output quantization is same + input_quant = nodes[10].args[0].args[1:] + output_quant = nodes[11].args[1:] + assert input_quant == output_quant + + +def test_quantizer_softmax(): + model = models.SoftmaxModule(dim=0) + model.eval() + + example_input = 
(torch.ones(1, 10),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 7 + # Check if QDQ pattern: + assert nodes[3].name == "softmax" + assert ( + _get_target_name(nodes[3].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[4]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[4].args[0].name == "softmax" + + # Check output quantization + scale, zp, _, _, dtype = nodes[4].args[1:] + assert scale == 1.0 / 256.0 + assert zp == -128 + assert dtype == torch.int8 + + +def test_quantizer_single_maxpool2d(): + model = models.MaxPool2dModule() + model.eval() + + example_input = (torch.ones(1, 4, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 3 + assert nodes[1].name == "max_pool2d" + assert "quantization_annotation" not in nodes[1].meta + + +def test_quantizer_conv2d_relu(): + model = models.Conv2dReLUModule() + model.eval() + + example_input = (torch.ones(1, 4, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 12 + assert nodes[7].name == "dequantize_per_tensor_default_2" + assert nodes[8].name == "relu" + assert nodes[9].name == "quantize_per_tensor_default_3" + + +def test_quantizer_conv2d_avg_pool2d(): + model = models.AvgPool2dConvModule(count_include_pad=False) + model.eval() + + example_input = (torch.ones(1, 4, 16, 16),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 14 + assert nodes[9].name == "dequantize_per_tensor_default_3" + assert nodes[10].name == "avg_pool2d" + assert nodes[11].name == "quantize_per_tensor_default_4" + + +def test_quantizer_conv2d_permute(): + model = models.Conv2dPermuteModule() + model.eval() + + example_input = (torch.ones(1, 4, 16, 16),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 12 + assert nodes[7].name == "dequantize_per_tensor_default_2" + assert nodes[8].name == "permute" + assert nodes[9].name == "quantize_per_tensor_default_3" diff --git a/backends/openvino/README.md b/backends/openvino/README.md index 95a5f4c364e..8adc19f828a 100644 --- a/backends/openvino/README.md +++ 
b/backends/openvino/README.md @@ -40,7 +40,9 @@ executorch ### Prerequisites -Before you begin, ensure you have openvino installed and configured on your system: +Before you begin, ensure you have openvino installed and configured on your system. + +### Build OpenVINO from Source ```bash git clone https://github.com/openvinotoolkit/openvino.git @@ -56,14 +58,26 @@ cmake --install build --prefix cd source setupvars.sh ``` -Note: The OpenVINO backend is not yet supported with the current OpenVINO release packages. It is recommended to build from source. The instructions for using OpenVINO release packages will be added soon. + +### Use OpenVINO from Release Packages + +1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform. + +2. Extract the release package from the archive and set the environment variables. + + ```bash + tar -zxf openvino_toolkit_.tgz + cd openvino_toolkit_ + source setupvars.sh + ``` + For more information about OpenVINO build, refer to the [OpenVINO Build Instructions](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/build_linux.md). ### Setup Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/stable/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. +1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. 2. **Setup OpenVINO Backend Environment** - Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory @@ -78,7 +92,7 @@ Follow the steps below to setup your build environment: ```bash ./openvino_build.sh ``` - **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `export_and_infer_openvino.py` script inside `executorch/examples/openvino` folder. + **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder. ```bash ./openvino_build.sh --enable_python diff --git a/backends/qualcomm/README.md b/backends/qualcomm/README.md index 85019add313..c3d51e7c116 100644 --- a/backends/qualcomm/README.md +++ b/backends/qualcomm/README.md @@ -6,14 +6,14 @@ we reserve the right to modify interfaces and implementations. This backend is implemented on the top of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk). 
-Please follow [tutorial](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md) to setup environment, build, and run executorch models by this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). +Please follow [tutorial](../../docs/source/backends-qualcomm.md) to set up the environment, build, and run ExecuTorch models with this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). -A website version of the tutorial is [here](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html). +A website version of the tutorial is [here](https://pytorch.org/executorch/main/backends-qualcomm). ## Delegate Options Please check `generate_qnn_executorch_compiler_spec()` in -[utils.py](./utils/utils.py) for supported SoC and inference type. +[utils.py](utils/utils.py) for supported SoC and inference type. ### Supported Chipset - Snapdragon 8 Gen 1 diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py index 9c884d7ab93..81b86992dee 100644 --- a/backends/qualcomm/_passes/__init__.py +++ b/backends/qualcomm/_passes/__init__.py @@ -9,7 +9,10 @@ from .annotate_unbind import AnnotateUnbind from .convert_bmm_to_matmul import ConvertBmmToMatmul from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d +from .convert_square_to_pow import ConvertSquareToPow +from .convert_upsample_bicubic2d import ConvertUpsampleBicubicWithBilinear from .decompose_any import DecomposeAny +from .decompose_cdist import DecomposeCDist from .decompose_einsum import DecomposeEinsum from .decompose_expm1 import DecomposeExpM1 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm @@ -26,6 +29,7 @@ from .recompose_pixel_unshuffle import RecomposePixelUnshuffle from .recompose_rms_norm import RecomposeRmsNorm from .reduce_dynamic_range import ReduceDynamicRange +from .remove_0d_tensor import Remove0DTensor from .remove_redundancy import RemoveRedundancy from .replace_arange_args import ReplaceArangeArgs from .replace_index_put_input import ReplaceIndexPutInput @@ -39,7 +43,10 @@ AnnotateUnbind, ConvertBmmToMatmul, ConvertConv1dToConv2d, + ConvertSquareToPow, + ConvertUpsampleBicubicWithBilinear, DecomposeAny, + DecomposeCDist, DecomposeEinsum, DecomposeExpM1, DecomposeLinalgVectorNorm, @@ -56,6 +63,7 @@ RecomposePixelUnshuffle, RecomposeRmsNorm, ReduceDynamicRange, + Remove0DTensor, RemoveRedundancy, ReplaceArangeArgs, ReplaceIndexPutInput, diff --git a/backends/qualcomm/_passes/annotate_stack.py b/backends/qualcomm/_passes/annotate_stack.py index c42804af2f2..5fbfde058b2 100644 --- a/backends/qualcomm/_passes/annotate_stack.py +++ b/backends/qualcomm/_passes/annotate_stack.py @@ -17,14 +17,16 @@ class AnnotateStack(ExportPass): generated after quantization process.
""" - decomp_ops = [torch.ops.aten.unbind.int] + decomp_ops = [torch.ops.aten.stack.default] def __init__(self, edge_program: torch.export.ExportedProgram): super(AnnotateStack, self).__init__() self.edge_program = edge_program def _annotate_stack(self, graph_module: torch.fx.GraphModule): - partitions = get_source_partitions(graph_module.graph, [torch.stack, "stack"]) + partitions = get_source_partitions( + graph_module.graph, [torch.stack, torch.ops.aten.stack.default, "stack"] + ) for _, src_partitions in partitions.items(): for src_partition in src_partitions: output = src_partition.output_nodes[0] diff --git a/backends/qualcomm/_passes/annotate_unbind.py b/backends/qualcomm/_passes/annotate_unbind.py index 0efa1638bc4..426285e872b 100644 --- a/backends/qualcomm/_passes/annotate_unbind.py +++ b/backends/qualcomm/_passes/annotate_unbind.py @@ -24,7 +24,9 @@ def __init__(self, edge_program: torch.export.ExportedProgram): self.edge_program = edge_program def _annotate_unbind(self, graph_module: torch.fx.GraphModule): - partitions = get_source_partitions(graph_module.graph, [torch.unbind, "unbind"]) + partitions = get_source_partitions( + graph_module.graph, [torch.unbind, torch.ops.aten.unbind.int, "unbind"] + ) for _, src_partitions in partitions.items(): for src_partition in src_partitions: if src_partition.input_nodes[0].target in dq_ops: diff --git a/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py b/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py index 947b631dbbf..72dc29c2880 100644 --- a/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py +++ b/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn from executorch.backends.qualcomm.builders.utils import get_parameter, set_parameter +from executorch.backends.qualcomm.utils.constants import QCOM_REQUANTIZE from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -43,6 +44,7 @@ def call(self, graph_module: torch.fx.GraphModule): unsqueeze_node.meta = copy_meta( input_node.meta, lambda m: {**m, "val": m["val"].unsqueeze(2)} ) + with graph_module.graph.inserting_after(unsqueeze_node): filter_node = node.args[1] @@ -92,6 +94,14 @@ def call(self, graph_module: torch.fx.GraphModule): ), ) squeeze_node.meta = copy_meta(node.meta) + + if QCOM_REQUANTIZE in input_node.meta: + input_node.meta.pop(QCOM_REQUANTIZE) + if QCOM_REQUANTIZE in node.meta: + squeeze_node.meta[QCOM_REQUANTIZE] = node.meta[ + QCOM_REQUANTIZE + ] + conv2d_node.meta.pop(QCOM_REQUANTIZE, None) for user in node.users.copy(): user.replace_input_with(node, squeeze_node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/convert_square_to_pow.py b/backends/qualcomm/_passes/convert_square_to_pow.py new file mode 100644 index 00000000000..51a74ac5f10 --- /dev/null +++ b/backends/qualcomm/_passes/convert_square_to_pow.py @@ -0,0 +1,38 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import copy_meta + + +class ConvertSquareToPow(ExportPass): + """ + Convert square to pow with a scalar value of 2. + This allows LiftConstantScalarOperands to lift the scalar into a scalar. + Otherwise, the square op will be converted to pow.tensor_scalar after to_edge. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target == torch.ops.aten.square.default: + input_node = node.args[0] + with graph_module.graph.inserting_after(input_node): + pow_op = torch.ops.aten.pow.Tensor_Scalar + pow_node = graph.create_node( + "call_function", pow_op, (input_node, 2) + ) + pow_node.meta = copy_meta(node.meta) + for user in node.users.copy(): + user.replace_input_with(node, pow_node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/convert_upsample_bicubic2d.py b/backends/qualcomm/_passes/convert_upsample_bicubic2d.py new file mode 100644 index 00000000000..367e9155c77 --- /dev/null +++ b/backends/qualcomm/_passes/convert_upsample_bicubic2d.py @@ -0,0 +1,27 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + + +class ConvertUpsampleBicubicWithBilinear(ExportPass): + """ + Qnn does not support bicubic interpolation, so we need to convert it to bilinear. + This pass will convert bicubic interpolation to bilinear interpolation. + """ + + bicubic_op_targets = { + exir_ops.edge.aten.upsample_bicubic2d.vec, + } + upsample_bilinear_op = exir_ops.edge.aten.upsample_bilinear2d.default + + def __init__(self): + super(ConvertUpsampleBicubicWithBilinear, self).__init__() + + def call_operator(self, op, args, kwargs, meta): + if op not in self.bicubic_op_targets: + return super().call_operator(op, args, kwargs, meta) + return super().call_operator(self.upsample_bilinear_op, args[:-1], kwargs, meta) diff --git a/backends/qualcomm/_passes/decompose_cdist.py b/backends/qualcomm/_passes/decompose_cdist.py new file mode 100644 index 00000000000..d18a0295ffb --- /dev/null +++ b/backends/qualcomm/_passes/decompose_cdist.py @@ -0,0 +1,81 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + + +class CDist(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + # Step 1: Compute differences + diff = x.unsqueeze(-2) - y.unsqueeze(-3) + + # Step 2: Square differences + sq_diff = diff**2 + + # Step 3: Sum of squares + sum_sq_diff = sq_diff.sum(dim=-1) + + # Step 4: Square root + distances = torch.sqrt(sum_sq_diff) + + return distances + + +class DecomposeCDist(ExportPass): + """ + Decompose for math equivalent op. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + model = CDist() + if torch.ops.aten.cdist.default == node.target: + if len(node.args) > 2: + assert ( + node.args[2] == 2 + ), "Currently only p=2 is supported for CDist Decomposition" + decomposed_module = torch.export.export( + model, + (node.args[0].meta["val"], node.args[1].meta["val"]), + strict=True, + ).module() + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": node.args[0], "y": node.args[1]} + + for decomposed_node in decomposed_module.graph.nodes: + # no need to copy existent 'output' + if decomposed_node.op == "output": + for user in node.users.copy(): + # remap + user.replace_input_with( + node, + remap[decomposed_node.args[0][0]], + ) + # no need to copy existent placeholders + elif decomposed_node.op == "placeholder": + # replace node map from string to graph node + remap[decomposed_node] = remap.pop(decomposed_node.name) + else: + remap[decomposed_node] = graph.node_copy( + decomposed_node, + arg_transform=lambda x, remap=remap: remap[x], + ) + + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_einsum.py b/backends/qualcomm/_passes/decompose_einsum.py index cbf8cbf1249..046c1598311 100644 --- a/backends/qualcomm/_passes/decompose_einsum.py +++ b/backends/qualcomm/_passes/decompose_einsum.py @@ -8,6 +8,8 @@ from executorch.exir.pass_base import ExportPass, PassResult from torch.fx.experimental.proxy_tensor import make_fx +from .utils import copy_nn_module_stack + class DecomposeEinsum(ExportPass): """ @@ -36,6 +38,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: remap[f"arg1_{i+1}"] = arg for decomposed_node in decomposed_module.graph.nodes: + copy_nn_module_stack(node, decomposed_node) # This is the arg[0] equation string, which is not required anymore after decomposition if "arg0" in decomposed_node.name: continue diff --git a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py index 7d70f5c9342..993f088da12 100644 --- a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py +++ b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py @@ -8,6 +8,8 @@ from executorch.exir import to_edge from executorch.exir.pass_base import ExportPass, PassResult +from .utils import copy_nn_module_stack + class LinalgVectorNorm(torch.nn.Module): def __init__(self, exp, dim, keepdim): @@ -62,6 +64,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: remap = {"x": node.args[0]} for decomposed_node in decomposed_module.graph.nodes: + copy_nn_module_stack(node, decomposed_node) # no need to copy existent 'output' if decomposed_node.op == "output": for user in node.users.copy(): diff --git a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py index 277fc9c6ce8..829b3757e06 100644 --- a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py +++ b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py @@ -22,12 +22,16 @@ def __init__(self): exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten.div.Tensor, + # Support if the rank of input 
tensor: {input_dims} is less than the rank of output tensor: {output_dims}. + exir_ops.edge.aten.expand_copy.default, ] def traverse_broadcast_node(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: if node.target in self.broadcast_op_targets: for arg in node.args: + if not isinstance(arg, torch.fx.Node): + continue input_rank = len(arg.meta["val"].shape) output_rank = len(node.meta["val"].shape) if input_rank != output_rank: diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py index 17960a6029b..19c5417f8f8 100644 --- a/backends/qualcomm/_passes/layout_transform.py +++ b/backends/qualcomm/_passes/layout_transform.py @@ -47,6 +47,7 @@ class LayoutTransform(ExportPass): layout_agnostic_ops = { exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.amax.default, exir_ops.edge.aten.bitwise_or.Tensor, exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.bitwise_and.Tensor, @@ -54,6 +55,7 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.clamp.default, exir_ops.edge.aten.constant_pad_nd.default, + exir_ops.edge.aten.cumsum.default, exir_ops.edge.aten.div.Tensor, exir_ops.edge.aten.elu.default, exir_ops.edge.aten.eq.Tensor, diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py index 93abfe621bc..9b3a308813e 100644 --- a/backends/qualcomm/_passes/lift_constant_scalar_operands.py +++ b/backends/qualcomm/_passes/lift_constant_scalar_operands.py @@ -53,7 +53,13 @@ class TensorOpInfo: } -SKIP_LIFT_OPS = {aten.full_like.default, aten.arange.start_step} +SKIP_LIFT_OPS = { + aten.full_like.default, + aten.arange.start_step, + aten.arange.default, + aten.scalar_tensor.default, + aten.elu.default, +} class LiftConstantScalarOperands(ExportPass): diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py index ab2c86102df..c98f27db120 100644 --- a/backends/qualcomm/_passes/qnn_pass_manager.py +++ b/backends/qualcomm/_passes/qnn_pass_manager.py @@ -14,7 +14,10 @@ AnnotateUnbind, ConvertBmmToMatmul, ConvertConv1dToConv2d, + ConvertSquareToPow, + ConvertUpsampleBicubicWithBilinear, DecomposeAny, + DecomposeCDist, DecomposeEinsum, DecomposeExpM1, DecomposeLinalgVectorNorm, @@ -31,6 +34,7 @@ RecomposePixelUnshuffle, RecomposeRmsNorm, ReduceDynamicRange, + Remove0DTensor, RemoveRedundancy, ReplaceArangeArgs, ReplaceIndexPutInput, @@ -70,10 +74,11 @@ def get_capture_program_passes(): # If a pass is activated, it will be executed by default. 
default_passes_and_setting = [ (AnnotateQuantAttrs, True), - (AnnotateStack, False), + (AnnotateStack, True), (AnnotateUnbind, True), (ConvertBmmToMatmul, True), (ConvertConv1dToConv2d, True), + (ConvertUpsampleBicubicWithBilinear, False), (DecomposeAny, True), (ExpandBroadcastTensorShape, False), (FixedLinearKeepDim, True), @@ -82,6 +87,7 @@ def get_capture_program_passes(): (LayoutTransform, True), (RecomposePixelUnshuffle, True), (RecomposeRmsNorm, False), + (Remove0DTensor, True), (RemoveRedundancy, True), (ReplaceIndexPutInput, True), (TagQuantIO, False), @@ -174,10 +180,27 @@ def transform_for_to_edge_pipeline( return exported_program + # Before quantizer + def transform_for_annotation_pipeline(self, graph_module: GraphModule): + self.add_pass(ReduceDynamicRange()) + self.add_pass(RecomposePixelUnshuffle(quantization_capture=True)) + self.add_pass(ReplaceArangeArgs()) + self.add_pass(DecomposeCDist()) + self.add_pass(DecomposeScaledDotProductAttention()) + self.add_pass(DecomposeSilu()) + self.add_pass(DecomposeEinsum()) + self.add_pass(DecomposeExpM1()) + self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) + self.add_pass(ReplaceInfValues()) + self.add_pass(LiftConstantScalarOperands()) + return self._transform(graph_module) + def transform_for_export_pipeline(self, exported_program: ExportedProgram): + self.add_pass(DecomposeCDist()) self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) self.add_pass(DecomposeExpM1()) + self.add_pass(ConvertSquareToPow()) self.add_pass(LiftConstantScalarOperands()) self._transform(exported_program.graph_module) ep = lift_constant_tensor_pass(exported_program) @@ -189,16 +212,3 @@ def transform_for_preprocess_pipeline(self, exported_program: ExportedProgram): self.add_pass(LayoutTransform(exported_program, insert_permute=True)) self.add_pass(FuseConsecutiveTranspose()) return self._transform(exported_program.graph_module) - - def transform_for_annotation_pipeline(self, graph_module: GraphModule): - self.add_pass(ReduceDynamicRange()) - self.add_pass(RecomposePixelUnshuffle(quantization_capture=True)) - self.add_pass(ReplaceArangeArgs()) - self.add_pass(DecomposeScaledDotProductAttention()) - self.add_pass(DecomposeSilu()) - self.add_pass(DecomposeEinsum()) - self.add_pass(DecomposeExpM1()) - self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) - self.add_pass(ReplaceInfValues()) - self.add_pass(LiftConstantScalarOperands()) - return self._transform(graph_module) diff --git a/backends/qualcomm/_passes/recompose_pixel_unshuffle.py b/backends/qualcomm/_passes/recompose_pixel_unshuffle.py index 7aac4fb823e..81214facc3a 100644 --- a/backends/qualcomm/_passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/_passes/recompose_pixel_unshuffle.py @@ -45,13 +45,11 @@ def call(self, graph_module: torch.fx.GraphModule): continue view_node = premute_node.args[0] - if any( - [ - view_node.op != "call_function", - view_node.target != self.view_target, - len(view_node.args[1]) != 6, - len(premute_node.args[1]) != 6, - ] + if ( + view_node.op != "call_function" + or view_node.target != self.view_target + or len(view_node.args[1]) != 6 + or len(premute_node.args[1]) != 6 ): continue diff --git a/backends/qualcomm/_passes/remove_0d_tensor.py b/backends/qualcomm/_passes/remove_0d_tensor.py new file mode 100644 index 00000000000..1e1d711c2b8 --- /dev/null +++ b/backends/qualcomm/_passes/remove_0d_tensor.py @@ -0,0 +1,36 @@ +# Copyright (c) Qualcomm Innovation Center, 
Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class Remove0DTensor(ExportPass): + """ + QNN does not allow 0D tensor, we remove the node that will output an 0D tensor. + Before adding operations to the list of nodes to be removed, please ensure that it will not change the logic. + """ + + remove_ops = { + exir_ops.edge.aten.select.int, + exir_ops.edge.aten.select_copy.int, + } + + def __init__(self, quantization_capture=False) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target in self.remove_ops and len(node.meta["val"].shape) == 0: + for user_n in list(node.users.keys()): + user_n.replace_input_with(node, node.args[0]) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index d538fe0d34f..46d9e0cde76 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -78,6 +78,7 @@ def get_passes_dependency_for_capture_program(): AnnotateUnbind, ConvertBmmToMatmul, ConvertConv1dToConv2d, + ConvertUpsampleBicubicWithBilinear, DecomposeAny, DecomposeLinalgVectorNorm, ExpandBroadcastTensorShape, @@ -96,18 +97,20 @@ def get_passes_dependency_for_capture_program(): AnnotateQuantAttrs: [ RecomposePixelUnshuffle, ConvertBmmToMatmul, + ConvertUpsampleBicubicWithBilinear, RemoveRedundancy, ], AnnotateStack: [RemoveRedundancy], AnnotateUnbind: [RemoveRedundancy], ConvertBmmToMatmul: [RecomposePixelUnshuffle], ConvertConv1dToConv2d: [FoldQDQ], + ConvertUpsampleBicubicWithBilinear: [RemoveRedundancy], DecomposeAny: [RemoveRedundancy], DecomposeLinalgVectorNorm: [RemoveRedundancy], - ExpandBroadcastTensorShape: [RemoveRedundancy], + ExpandBroadcastTensorShape: [FoldQDQ], FixedLinearKeepDim: [FoldQDQ], FoldQDQ: [AnnotateQuantAttrs, AnnotateStack, AnnotateUnbind], - I64toI32: [RemoveRedundancy], + I64toI32: [ConvertUpsampleBicubicWithBilinear, RemoveRedundancy], LayoutTransform: [ AnnotateQuantAttrs, ConvertConv1dToConv2d, @@ -121,6 +124,14 @@ def get_passes_dependency_for_capture_program(): } +def copy_nn_module_stack(src, target): + """ + Copy meta["nn_module_stack"] from src node to target node if existing. 
+ """ + if value := src.meta.get("nn_module_stack"): + target.meta["nn_module_stack"] = value + + def is_float_tensor(node: torch.fx.Node) -> bool: if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor): return False diff --git a/backends/qualcomm/aot/ir/targets.bzl b/backends/qualcomm/aot/ir/targets.bzl index 5fdcd14485c..b6ca0879dbe 100644 --- a/backends/qualcomm/aot/ir/targets.bzl +++ b/backends/qualcomm/aot/ir/targets.bzl @@ -4,7 +4,7 @@ load( ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm:targets.bzl", "generate_schema_header") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") QCIR_NAME = "qcir" INPUT_QCIR = QCIR_NAME + ".fbs" @@ -56,7 +56,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", "//executorch/backends/qualcomm/aot/wrappers:wrappers", diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl index f29c02aa593..f2eb654a10c 100644 --- a/backends/qualcomm/aot/python/targets.bzl +++ b/backends/qualcomm/aot/python/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") PYTHON_MODULE_NAME = "PyQnnManagerAdaptor" @@ -34,7 +34,7 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/pybind11:pybind11", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], external_deps = [ "libtorch_python", @@ -67,7 +67,7 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/pybind11:pybind11", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], external_deps = [ "libtorch_python", @@ -94,6 +94,6 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/pybind11:pybind11", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], ) diff --git a/backends/qualcomm/aot/wrappers/targets.bzl b/backends/qualcomm/aot/wrappers/targets.bzl index 24ceeb723eb..0c5d5b1c3e9 100644 --- a/backends/qualcomm/aot/wrappers/targets.bzl +++ b/backends/qualcomm/aot/wrappers/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") 
+load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -23,7 +23,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", ], diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md index 3a97e8d6d6a..783a53dd645 100644 --- a/backends/qualcomm/builders/README.md +++ b/backends/qualcomm/builders/README.md @@ -8,6 +8,7 @@ Thank you for contributing to Qualcomm AI Engine Direct delegate for ExecuTorch. * [Check Operator Spec](#check-operator-spec) * [Implementation](#implementation) * [Quantizer Annotation](#quantizer-annotation) +* [Operator Support Status](#operator-support-status) * [Issues](#issues) * [Pull Requests](#pull-requests) @@ -246,7 +247,7 @@ Now, we can start to fill in function body step by step: nodes_to_wrappers, ) ``` - The logic should be similar and straightforward. Please carefully set arguments `tensor_type` + The logic should be similar and straightforward. Please carefully set arguments `tensor_type` according to tensors' property. 3. Define parameters: @@ -355,6 +356,128 @@ Now, we can start to fill in function body step by step: ### Quantizer Annotation The operator now should be functional for Qualcomm backends. For operator to work in fixed-precision, we should also make `QnnQuantizer` to correctly insert observers for recording calibrated encodings. Please read more on the [Quantization Annotation Tutorial](../quantizer//README.md). 
+## Operator Support Status +Please help update following table if you are contributing new operators: + +| Operators | HTP - 77/116 Enabled | +|-----------|---------| +| Argmax | ✗ | +| Argmin | ✓ | +| BatchNorm | ✓ | +| BatchToSpace | ✗ | +| Cast | ✓ | +| ChannelShuffle | ✗ | +| Concat | ✓ | +| Conv2d | ✓ | +| Conv3d | ✗ | +| Convert | ✓ | +| CreateSparse | ✗ | +| CumulativeSum | ✓ | +| DepthToSpace | ✓ | +| DepthWiseConv2d | ✓ | +| Dequantize | ✓ | +| DetectionOutput | ✗ | +| ElementWiseAbs | ✓ | +| ElementWiseAdd | ✓ | +| ElementWiseAnd | ✓ | +| ElementWiseAsin | ✗ | +| ElementWiseAtan | ✗ | +| ElementWiseBinary | ✗ | +| ElementWiseCeil | ✓ | +| ElementWiseCos | ✓ | +| ElementWiseDivide | ✓ | +| ElementWiseEqual | ✓ | +| ElementWiseExp | ✓ | +| ElementWiseFloor | ✗ | +| ElementWiseFloorDiv | ✗ | +| ElementWiseGreater | ✓ | +| ElementWiseGreaterEqual | ✓ | +| ElementWiseLess | ✓ | +| ElementWiseLessEqual | ✓ | +| ElementWiseLog | ✓ | +| ElementWiseMaximum | ✓ | +| ElementWiseMinimum | ✓ | +| ElementWiseMultiply | ✓ | +| ElementWiseNeg | ✓ | +| ElementWiseNeuron | ✓ | +| ElementWiseNot | ✓ | +| ElementWiseNotEqual | ✓ | +| ElementWiseOr | ✓ | +| ElementWisePower | ✓ | +| ElementWiseRound | ✗ | +| ElementWiseRsqrt | ✓ | +| ElementWiseSelect | ✓ | +| ElementWiseSign | ✗ | +| ElementWiseSin | ✓ | +| ElementWiseSquaredDifference | ✗ | +| ElementWiseSquareRoot | ✓ | +| ElementWiseSubtract | ✓ | +| ElementWiseUnary | ✗ | +| ElementWiseXor | ✗ | +| Elu | ✓ | +| ExpandDims | ✓ | +| ExtractGlimpse | ✗ | +| ExtractPatches | ✗ | +| FullyConnected | ✓ | +| Gather | ✓ | +| GatherElements | ✗ | +| GatherNd | ✓ | +| Gelu | ✓ | +| GetSparseIndices | ✗ | +| GetSparseValues | ✗ | +| GridSample | ✗ | +| GroupNorm | ✓ | +| HardSwish | ✓ | +| InstanceNorm | ✓ | +| L2Norm | ✗ | +| LayerNorm | ✓ | +| LogSoftmax | ✓ | +| Lrn | ✗ | +| Lstm | ✗ | +| MatMul | ✓ | +| MultiClassNms | ✗ | +| NonMaxSuppression | ✗ | +| Nonzero | ✗ | +| OneHot | ✗ | +| Pack | ✓ | +| Pad | ✓ | +| PoolAvg2d | ✓ | +| PoolAvg3d | ✗ | +| PoolMax2d | ✓ | +| Prelu | ✓ | +| Quantize | ✓ | +| ReduceMax | ✓ | +| ReduceMean | ✓ | +| ReduceMin | ✗ | +| ReduceSum | ✓ | +| Relu | ✓ | +| Relu1 | ✗ | +| Relu6 | ✗ | +| ReluMinMax | ✓ | +| Reshape | ✓ | +| Resize | ✗ | +| ResizeBilinear | ✓ | +| ResizeNearestNeighbor | ✓ | +| RoiAlign | ✗ | +| RmsNorm | ✓ | +| ScatterElements | ✗ | +| ScatterNd | ✓ | +| Sigmoid | ✓ | +| Softmax | ✓ | +| SpaceToBatch | ✗ | +| SpaceToDepth | ✓ | +| SparseToDense | ✗ | +| Split | ✓ | +| Squeeze | ✓ | +| StridedSlice | ✓ | +| Tanh | ✓ | +| Tile | ✓ | +| TopK | ✓ | +| TransPose | ✓ | +| TransPoseConv2d | ✓ | +| TransPoseConv3d | ✗ | +| Unpack | ✓ | + ## Issues Please refer to the [issue section](../README.md#issues) for more information. 
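The ✓ rows that this patch adds builders for (CumulativeSum and ReduceMax) are reached from ordinary ATen calls. The minimal modules below are modeled on the `CumSum` and `AMax` test models added to `backends/qualcomm/tests/models.py` later in this patch; they are only meant to show which PyTorch ops exercise the new `op_cum_sum.py` and `op_amax.py` node visitors, not to prescribe a particular model structure.

```python
import torch


class CumSum(torch.nn.Module):
    # x.cumsum maps to aten.cumsum.default and is lowered through the new
    # op_cum_sum.py builder (QNN CumulativeSum).
    def forward(self, x):
        return x.cumsum(dim=0)


class AMax(torch.nn.Module):
    # torch.amax maps to aten.amax.default and is lowered through the new
    # op_amax.py builder (QNN ReduceMax).
    def __init__(self, dim=(1,), keepdim=True):
        super().__init__()
        self.dim = dim
        self.keepdim = keepdim

    def forward(self, x):
        return torch.amax(x, dim=self.dim, keepdim=self.keepdim)
```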
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index cc85333f26b..705d5d163cd 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -9,6 +9,7 @@ op_abs, op_adaptive_avg_pool2d, op_add, + op_amax, op_and, op_arange, op_argmin, @@ -20,6 +21,7 @@ op_clamp, op_conv2d, op_cos, + op_cum_sum, op_depth_to_space, op_dequantize, op_div, @@ -95,6 +97,7 @@ op_abs, op_adaptive_avg_pool2d, op_add, + op_amax, op_and, op_arange, op_argmin, @@ -106,6 +109,7 @@ op_clamp, op_conv2d, op_cos, + op_cum_sum, op_depth_to_space, op_dequantize, op_div, diff --git a/backends/qualcomm/builders/op_amax.py b/backends/qualcomm/builders/op_amax.py new file mode 100644 index 00000000000..099004a4bcf --- /dev/null +++ b/backends/qualcomm/builders/op_amax.py @@ -0,0 +1,84 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import cast, Dict, List + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np + +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpReduceMax, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class AMax(NodeVisitor): + target = ["aten.amax.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + # mean dims and keep dims + mean_dims = cast(List[int], node.args[1]) + mean_dims = [ + mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims + ] + if QCOM_AXIS_ORDER in node.meta: + mean_dims = [ + node.meta[QCOM_AXIS_ORDER].index(mean_dim) for mean_dim in mean_dims + ] + mean_dims_shape = [len(mean_dims)] + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + reduce_max_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReduceMax.op_name, + ) + reduce_max_op.AddInputTensors([input_tensor_wrapper]) + reduce_max_op.AddOutputTensors([output_tensor_wrapper]) + reduce_max_op.AddTensorParam( + OpReduceMax.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(mean_dims_shape), + mean_dims_shape, + np.array(mean_dims, dtype=np.uint32), + True, + ) + if len(node.args) > 2: + keep_dims = cast(bool, node.args[2]) + reduce_max_op.AddScalarParam( + OpReduceMax.param_keep_dims, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {QCOM_DATA: keep_dims}, + ) + + return reduce_max_op diff --git a/backends/qualcomm/builders/op_cos.py b/backends/qualcomm/builders/op_cos.py index 3858a947d93..589bf3ef88e 100644 --- a/backends/qualcomm/builders/op_cos.py +++ b/backends/qualcomm/builders/op_cos.py @@ -3,7 +3,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- from typing import Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper diff --git a/backends/qualcomm/builders/op_cum_sum.py b/backends/qualcomm/builders/op_cum_sum.py new file mode 100644 index 00000000000..f62485bc519 --- /dev/null +++ b/backends/qualcomm/builders/op_cum_sum.py @@ -0,0 +1,84 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import cast, Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpCumulativeSum, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class CumulativeSum(NodeVisitor): + target = ["aten.cumsum.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def get_param(self, node, input_tensor): + dim = node.args[1] + + if dim < 0: + dim = dim % len(input_tensor.shape) + if QCOM_AXIS_ORDER in node.meta: + dim = node.meta[QCOM_AXIS_ORDER].index(dim) + + return cast(np.uint32, dim) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + dim = self.get_param(node, input_tensor) + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + cumsum_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpCumulativeSum.op_name, + ) + cumsum_op.AddInputTensors([input_tensor_wrapper]) + cumsum_op.AddOutputTensors([output_tensor_wrapper]) + cumsum_op.AddScalarParam( + OpCumulativeSum.param_axis, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + {QCOM_DATA: dim}, + ) + cumsum_op.AddScalarParam( + OpCumulativeSum.param_exclusive, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {QCOM_DATA: False}, + ) + cumsum_op.AddScalarParam( + OpCumulativeSum.param_reverse, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {QCOM_DATA: False}, + ) + + return cumsum_op diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py index d224e34feb5..aa7f9becd98 100644 --- a/backends/qualcomm/builders/op_rms_norm.py +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -81,8 +81,9 @@ def define_node( {}, # kwargs ) if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + quant_attrs = quant_attrs.copy() + quant_attrs[QCOM_ZERO_POINT] = 0 bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - bias_node.meta[QCOM_QUANT_ATTRS][QCOM_ZERO_POINT] = 0 bias_tensor_wrapper = self.define_tensor( bias_node, node, diff --git a/backends/qualcomm/builders/op_sin.py b/backends/qualcomm/builders/op_sin.py index 89fce6bee9c..8828685ac9e 100644 --- a/backends/qualcomm/builders/op_sin.py +++ b/backends/qualcomm/builders/op_sin.py @@ -3,7 +3,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- from typing import Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper diff --git a/backends/qualcomm/builders/op_sqrt.py b/backends/qualcomm/builders/op_sqrt.py index 030e6c3e10a..5505e92ee67 100644 --- a/backends/qualcomm/builders/op_sqrt.py +++ b/backends/qualcomm/builders/op_sqrt.py @@ -10,7 +10,7 @@ import torch from .node_visitor import NodeVisitor, register_node_visitor -from .qnn_constants import OpElementWiseSqrt, QNN_OP_PACKAGE_NAME_QTI_AISW +from .qnn_constants import OpElementWiseSquareRoot, QNN_OP_PACKAGE_NAME_QTI_AISW @register_node_visitor @@ -51,7 +51,7 @@ def define_node( sqrt_op = PyQnnWrapper.PyQnnOpWrapper( node.name, QNN_OP_PACKAGE_NAME_QTI_AISW, - OpElementWiseSqrt.op_name, + OpElementWiseSquareRoot.op_name, ) sqrt_op.AddInputTensors(sqrt_input_tensors) sqrt_op.AddOutputTensors(sqrt_output_tensors) diff --git a/backends/qualcomm/builders/op_stack.py b/backends/qualcomm/builders/op_stack.py index 616d0ee0ccc..fdef148ad4d 100644 --- a/backends/qualcomm/builders/op_stack.py +++ b/backends/qualcomm/builders/op_stack.py @@ -51,7 +51,7 @@ def define_node( dim = 0 if len(node.args) == 1 else cast(int, node.args[1]) if dim < 0: - dim = dim % len(input_tensor.shape) + dim = dim % len(output_tensor.shape) if QCOM_AXIS_ORDER in node.meta: dim = node.meta[QCOM_AXIS_ORDER].index(dim) stack_op = PyQnnWrapper.PyQnnOpWrapper( diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 9613c755c7c..06e398f7c05 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -14,6 +14,13 @@ # instead of replicating them here. +@dataclass(init=False, frozen=True) +class OpArgmin: + op_name: str = "Argmin" + param_axis: str = "axis" + param_keep_dims: str = "keep_dims" + + @dataclass(init=False, frozen=True) class OpBatchnorm: op_name: str = "Batchnorm" @@ -50,6 +57,14 @@ class OpConvert: op_name: str = "Convert" +@dataclass(init=False, frozen=True) +class OpCumulativeSum: + op_name = "CumulativeSum" + param_axis = "axis" + param_exclusive = "exclusive" + param_reverse = "reverse" + + @dataclass(init=False, frozen=True) class OpDepthToSpace: op_name: str = "DepthToSpace" @@ -204,7 +219,7 @@ class OpElementWiseSelect: @dataclass(init=False, frozen=True) -class OpElementWiseSqrt: +class OpElementWiseSquareRoot: op_name = "ElementWiseSquareRoot" @@ -350,16 +365,16 @@ class OpQuantize: @dataclass(init=False, frozen=True) -class OpReduceMean: - op_name: str = "ReduceMean" +class OpReduceMax: + op_name: str = "ReduceMax" param_axes: str = "axes" param_keep_dims: str = "keep_dims" @dataclass(init=False, frozen=True) -class OpArgmin: - op_name: str = "Argmin" - param_axis: str = "axis" +class OpReduceMean: + op_name: str = "ReduceMean" + param_axes: str = "axes" param_keep_dims: str = "keep_dims" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index b427c59ce07..6326f4d1210 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -13,6 +13,7 @@ exir_ops.edge.aten.clone.default, exir_ops.edge.aten.slice_scatter.default, exir_ops.edge.aten.copy.default, + exir_ops.edge.aten.upsample_bicubic2d.vec, exir_ops.edge.quantized_decomposed.embedding_4bit.dtype, ] diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index 7b5e72d461d..d9eb188614c 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ 
b/backends/qualcomm/partition/qnn_partitioner.py @@ -34,7 +34,7 @@ not_supported_operator, to_be_implemented_operator, ) -from .utils import generate_qnn_executorch_option, get_skip_decomp_table +from .utils import filter_fn, generate_qnn_executorch_option, get_skip_decomp_table class QnnOperatorSupport(OperatorSupportBase): @@ -181,5 +181,4 @@ def ops_to_not_decompose( self, ep: ExportedProgram ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: do_not_decompose = get_skip_decomp_table() - - return do_not_decompose, None + return (do_not_decompose, filter_fn) diff --git a/backends/qualcomm/partition/utils.py b/backends/qualcomm/partition/utils.py index 1e2b17b2a69..816d1ac1d9b 100644 --- a/backends/qualcomm/partition/utils.py +++ b/backends/qualcomm/partition/utils.py @@ -24,6 +24,21 @@ def generate_qnn_executorch_option( return qnn_compile_spec_buffer +# Logic to determine whether to skip decompose and has higher priority than get_skip_decomp_table() +def filter_fn(node: torch.fx.Node) -> bool: + # QNN does not support int32/int64 IO for the following OPs. + potential_i32_i64_io_ops = [ + torch.ops.aten.stack.default, + torch.ops.aten.unbind.int, + ] + if node.target in potential_i32_i64_io_ops and node.meta["val"].dtype in [ + torch.int32, + torch.int64, + ]: + return False + return True + + def get_skip_decomp_table() -> List[torch._ops.OperatorBase]: do_not_decompose = [ torch.ops.aten.adaptive_avg_pool2d.default, @@ -39,8 +54,9 @@ def get_skip_decomp_table() -> List[torch._ops.OperatorBase]: torch.ops.aten.rms_norm.default, torch.ops.aten._safe_softmax.default, torch.ops.aten.stack.default, + torch.ops.aten.upsample_bicubic2d.vec, # This request is ignored because it is in a blocklist. Refer to exir/program/_program.py - # torch.ops.aten.unbind.int, + torch.ops.aten.unbind.int, torch.ops.pt2e_quant.quantize_affine.default, torch.ops.pt2e_quant.dequantize_affine.default, ] diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index 93af5e86c97..469a801feeb 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -97,6 +97,7 @@ def annotate_in_out_obs_sharing_op( QUANT_ANNOTATION_KEY not in input_act.meta or not input_act.meta[QUANT_ANNOTATION_KEY]._annotated or input_act.meta[QUANT_ANNOTATION_KEY].output_qspec is None + or not _is_float_tensor(input_act) ): return @@ -132,9 +133,10 @@ def annotate_single_in_single_out( return input_qspec_map = {} - input_act = node.args[0] - assert isinstance(input_act, Node) - input_qspec_map[input_act] = quantization_config.input_activation + if _is_float_tensor(node.args[0]): + input_act = node.args[0] + assert isinstance(input_act, Node) + input_qspec_map[input_act] = quantization_config.input_activation if _is_float_tensor(node): node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( @@ -177,11 +179,18 @@ def annotate_binary(node: Node, quantization_config: QuantizationConfig) -> None ) -@register_annotator([torch.ops.aten.add, torch.ops.aten.add.Tensor]) +@register_annotator( + [torch.ops.aten.add, torch.ops.aten.add.Tensor, torch.ops.aten.add_.Tensor] +) def annotate_add(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) +@register_annotator([torch.ops.aten.amax.default]) +def annotate_amax(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + @register_annotator([torch.ops.aten.argmin.default]) def 
annotate_argmin(node: Node, quantization_config: QuantizationConfig) -> None: if _is_annotated([node]): @@ -928,6 +937,11 @@ def annotate_bmm(node: Node, quantization_config: QuantizationConfig) -> None: node.meta["source_fn_stack"] = [(node, torch.bmm)] +@register_annotator([torch.ops.aten.cdist.default]) +def annotate_cdist(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + @register_annotator( [ torch.ops.aten.conv2d.default, @@ -936,7 +950,7 @@ def annotate_bmm(node: Node, quantization_config: QuantizationConfig) -> None: torch.ops.aten.conv_transpose1d.default, ] ) -def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: +def annotate_conv(node: Node, quantization_config: QuantizationConfig) -> None: if _is_annotated([node]): return @@ -971,6 +985,11 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None ) +@register_annotator([torch.ops.aten.cumsum.default]) +def annotate_cumsum(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_single_in_single_out(node, quantization_config) + + @register_annotator([torch.ops.aten.linear.default]) def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] @@ -1108,15 +1127,17 @@ def annotate_cat(node: Node, quantization_config: QuantizationConfig) -> None: input_qspec_map = {} assert isinstance(first_input_node, Node) assert isinstance(node, Node) - input_qspec_map[first_input_node] = quantization_config.input_activation - share_qparams_with_input_act0_qspec = SharedQuantizationSpec( - (first_input_node, node) - ) + if _is_float_tensor(first_input_node): + input_qspec_map[first_input_node] = quantization_config.input_activation + share_qparams_with_input_act0_qspec = SharedQuantizationSpec( + (first_input_node, node) + ) for input_node in input_nodes[1:]: if input_node not in input_qspec_map: assert isinstance(input_node, Node) - input_qspec_map[input_node] = share_qparams_with_input_act0_qspec + if _is_float_tensor(input_node): + input_qspec_map[input_node] = share_qparams_with_input_act0_qspec node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( input_qspec_map=input_qspec_map, @@ -1130,7 +1151,6 @@ def annotate_unbind(node: Node, quantization_config: QuantizationConfig) -> None # Seems like unbind.int can be either float or int. Only quant when input is float. if _is_annotated([node]) or not _is_float_tensor(node.args[0]): return - input_qspec_map = {} input_act = node.args[0] assert isinstance(input_act, Node) diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index 33237f3bebe..bda91609f1c 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -6,7 +6,10 @@ from typing import Sequence import torch -from executorch.backends.qualcomm.quantizer.annotators import QUANT_ANNOTATION_KEY +from executorch.backends.qualcomm.quantizer.annotators import ( + _is_float_tensor, + QUANT_ANNOTATION_KEY, +) from executorch.backends.qualcomm.quantizer.quantizer import ( get_16a8w_qnn_ptq_config, get_8a8w_qnn_ptq_config, @@ -23,6 +26,38 @@ from torch.fx import Node +def annotate_mimi_decoder(gm: torch.fx.GraphModule): + """ + The 1st transpose conv in mimi decoder is really sensitive to scale/offset in 16a8w, which causes execution failure. + Annotate 1st transpose conv as 8a8w to prevent execution failure. 
+ """ + quantization_config_8a8w = get_8a8w_qnn_ptq_config() + for node in gm.graph.nodes: + if not _is_float_tensor(node): + continue + elif node.target == torch.ops.aten.conv_transpose1d.default: + input_qspec_map = {} + input_act = node.args[0] + assert isinstance(input_act, Node) + input_spec = quantization_config_8a8w.input_activation + input_qspec_map[input_act] = input_spec + + weight = node.args[1] + assert isinstance(weight, Node) + input_qspec_map[weight] = quantization_config_8a8w.weight + + if len(node.args) > 2 and isinstance(node.args[2], Node): + bias = node.args[2] + input_qspec_map[bias] = quantization_config_8a8w.bias + + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config_8a8w.output_activation, + _annotated=True, + ) + break + + def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None: def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: input_qspec_map = {} diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 3620841aff9..8e65607dd84 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -3,9 +3,10 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from dataclasses import dataclass from enum import IntEnum, unique from functools import partial -from typing import Callable, Dict, Optional, Sequence, Set, Tuple +from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple import torch from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager @@ -58,7 +59,7 @@ class QuantDtype(IntEnum): use_8a8w = 4 -quant_config_dict = { +QUANT_CONFIG_DICT = { # PTQ (QuantDtype.use_16a16w, False): ( get_16a16w_qnn_ptq_config, @@ -123,6 +124,59 @@ class QuantDtype(IntEnum): } +@dataclass +class ModuleQConfig: + quant_dtype: QuantDtype = QuantDtype.use_8a8w + is_qat: bool = False + is_conv_per_channel: bool = False + is_linear_per_channel: bool = False + act_observer: Optional[ + torch.ao.quantization.observer.UniformQuantizationObserverBase + ] = None + + def __post_init__(self): + if (self.quant_dtype, self.is_qat) not in QUANT_CONFIG_DICT: + raise RuntimeError( + f"the quant config, (quant_dtype: {self.quant_dtype}, is_qat: {self.is_qat}) is not support" + ) + ( + quant_config_func, + per_channel_quant_config_func, + per_block_quant_config_func, + ) = QUANT_CONFIG_DICT[(self.quant_dtype, self.is_qat)] + self.quant_config = ( + quant_config_func(act_observer=self.act_observer) + if self.act_observer + else quant_config_func() + ) + self.per_channel_quant_config = ( + per_channel_quant_config_func(act_observer=self.act_observer) + if self.act_observer + else per_channel_quant_config_func() + ) + self.use_per_channel_weight_quant_ops = set() + if self.is_conv_per_channel: + self.use_per_channel_weight_quant_ops.update( + { + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.conv_transpose2d.input, + } + ) + if self.is_linear_per_channel: + self.use_per_channel_weight_quant_ops.update( + { + torch.ops.aten.linear.default, + } + ) + if per_block_quant_config_func: + self.per_block_quant_config = ( + per_block_quant_config_func(act_observer=self.act_observer) + if self.act_observer + else per_block_quant_config_func() + ) + + class QnnQuantizer(Quantizer): SUPPORTED_OPS: Set = set(OP_ANNOTATOR.keys()) @@ -130,14 +184,11 @@ def 
__init__(self): super().__init__() self.quant_ops: Set[OpOverload] = self.SUPPORTED_OPS.copy() - self.is_qat = False - self.quant_dtype = QuantDtype.use_8a8w - self.quant_config: QuantizationConfig = get_8a8w_qnn_ptq_config() - self.per_channel_quant_config = get_ptq_per_channel_quant_config() - self.per_block_quant_config = get_ptq_per_block_quant_config() + self.default_quant_config = ModuleQConfig() + self.submodule_qconfig_list: List[ + Tuple[Callable[[torch.fx.Node], bool], ModuleQConfig] + ] = [] self.block_size_map = {} - self.use_per_channel_weight_quant_ops: Set[OpOverload] = set() - self.use_per_block_weight_quant_ops: Set[OpOverload] = set() self.custom_quant_annotations: Sequence[Callable] = [] self.discard_nodes: Set[str] = set() @@ -155,41 +206,38 @@ def _annotate_custom_annotation(self, gm: GraphModule) -> None: for annotation_func in self.custom_quant_annotations: annotation_func(gm) - def _get_quant_config(self, op: torch.fx.Node) -> Optional[QuantizationConfig]: + def _get_submodule_qconfig(self, node: torch.fx.Node): + for func, qconfig in self.submodule_qconfig_list: + if func(node): + return qconfig + return self.default_quant_config + + def _get_quant_config(self, node: torch.fx.Node) -> Optional[QuantizationConfig]: """ - Priority: - 1. is one of use_per_block_weight_quant_ops - 2. is one of use_per_channel_weight_quant_ops - 3. quant config + How to pick: + 1. is one of per_block_quant_config + 2. Pick specific submodule config if given. + 3. Pick one if op belongs to use_per_channel_weight_quant_ops + 4. If not 3, pick normal quant config """ - target = op.target - if isinstance(target, str): + op = node.target + if isinstance(op, str): return - if target in self.use_per_block_weight_quant_ops: - if block_size := self.block_size_map.get(op.name): - self.per_block_quant_config.block_size = block_size - return self.per_block_quant_config + if block_size := self.block_size_map.get(node.name): + config = self.default_quant_config.per_block_quant_config + config.block_size = block_size + return config - if target in self.use_per_channel_weight_quant_ops: - return self.per_channel_quant_config + config = self._get_submodule_qconfig(node) - if target in self.quant_ops: - return self.quant_config + if op in config.use_per_channel_weight_quant_ops: + return config.per_channel_quant_config - print(f"No quant config is implemented for op, {op}") - - def _update_per_block_weight_quant_ops(self, ops: Set[OpOverload], enable: bool): - if enable: - self.use_per_block_weight_quant_ops.update(ops) - else: - self.use_per_block_weight_quant_ops.difference_update(ops) + if op in self.quant_ops: + return config.quant_config - def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: bool): - if enable: - self.use_per_channel_weight_quant_ops.update(ops) - else: - self.use_per_channel_weight_quant_ops.difference_update(ops) + print(f"No quant config is implemented for op, {op}") def add_custom_quant_annotations( self, custom_quant_annotations: Sequence[Callable] @@ -212,55 +260,74 @@ def annotate(self, model: GraphModule) -> GraphModule: def get_supported_ops(self) -> Set[OpOverload]: return self.SUPPORTED_OPS - def set_quant_config( - self, quant_dtype: QuantDtype, is_qat=False, act_observer=None + def set_default_quant_config( + self, + quant_dtype: QuantDtype, + is_qat=False, + is_conv_per_channel=False, + is_linear_per_channel=False, + act_observer=None, ) -> None: - self.quant_dtype = quant_dtype - self.is_qat = is_qat - if (quant_dtype, is_qat) not in 
quant_config_dict: - raise RuntimeError( - f"the quant config, (quant_dtype: {quant_dtype}, is_qat: {is_qat}) is not support" - ) - - quant_config_fuc, per_channel_quant_config_fuc, per_block_quant_config_fuc = ( - quant_config_dict[(quant_dtype, is_qat)] - ) - self.quant_config = ( - quant_config_fuc(act_observer=act_observer) - if act_observer - else quant_config_fuc() + self.default_quant_config = ModuleQConfig( + quant_dtype, + is_qat, + is_conv_per_channel, + is_linear_per_channel, + act_observer, ) - self.per_channel_quant_config = ( - per_channel_quant_config_fuc(act_observer=act_observer) - if act_observer - else per_channel_quant_config_fuc() - ) - if per_block_quant_config_fuc is not None: - self.per_block_quant_config = ( - per_block_quant_config_fuc(act_observer=act_observer) - if act_observer - else per_block_quant_config_fuc() - ) def set_block_size_map(self, block_size_map: Dict[str, Tuple]) -> None: self.block_size_map = block_size_map - def set_per_block_conv_quant(self, enable: bool) -> None: - conv_ops = {torch.ops.aten.conv2d.default} - self._update_per_block_weight_quant_ops(conv_ops, enable) - - def set_per_channel_conv_quant(self, enable: bool) -> None: - conv_ops = {torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default} - self._update_per_channel_weight_quant_ops(conv_ops, enable) - - def set_per_channel_linear_quant(self, enable: bool) -> None: - linear_ops = { - torch.ops.aten.linear.default, - } - self._update_per_channel_weight_quant_ops(linear_ops, enable) + def set_submodule_qconfig_list( + self, submodule_qconfig_list: List[Tuple[Callable, ModuleQConfig]] + ) -> None: + """ + Set specific quant config from a callback function. + If a node fits more than one callback, only apply the first one. + """ + self.submodule_qconfig_list = submodule_qconfig_list def transform_for_annotation(self, model: GraphModule) -> GraphModule: return QnnPassManager().transform_for_annotation_pipeline(model) def validate(self, model: GraphModule) -> None: pass + + +def get_submodule_type_predicate(module_type_str): + """ + An example of nn_module_stack + { + 'L__self__': ('', 'executorch.backends.qualcomm.tests.models.SubModules'), + 'L__self___add': ('add', 'executorch.backends.qualcomm.tests.models.Add') + } + """ + + def predicate(node): + if nn_module_stack := node.meta.get("nn_module_stack"): + for _, type_name in nn_module_stack.values(): + if module_type_str in type_name: + return True + return False + + return predicate + + +def get_submodule_name_predicate(module_name_str): + """ + An example of nn_module_stack + { + 'L__self__': ('', 'executorch.backends.qualcomm.tests.models.SubModules'), + 'L__self___add': ('add', 'executorch.backends.qualcomm.tests.models.Add') + } + """ + + def predicate(node): + if nn_module_stack := node.meta.get("nn_module_stack"): + for name in nn_module_stack.keys(): + if module_name_str in name: + return True + return False + + return predicate diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index 7abe4b35076..fd580867db5 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -84,6 +84,22 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( "ProfileData failed to get events: %d", QNN_GET_ERROR_CODE(error)); return error; } + + auto get_unit = [](QnnProfile_EventUnit_t unit) { + switch (unit) { + case QNN_PROFILE_EVENTUNIT_MICROSEC: + return " (us)"; + case QNN_PROFILE_EVENTUNIT_BYTES: + return " (bytes)"; + 
case QNN_PROFILE_EVENTUNIT_COUNT: + return " (count)"; + case QNN_PROFILE_EVENTUNIT_BACKEND: + // cycle unit is default appeared + case QNN_PROFILE_EVENTUNIT_CYCLES: + default: + return ""; + } + }; QnnProfile_EventData_t event_data; for (std::uint32_t i = 0; i < num_events; ++i) { error = @@ -96,6 +112,16 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( QNN_GET_ERROR_CODE(error)); return error; } + // add events for other important metrics, e.g. RPC execution time + std::string identifier = + std::string(event_data.identifier) + get_unit(event_data.unit); + executorch::runtime::event_tracer_log_profiling_delegate( + event_tracer, + identifier.c_str(), + /*delegate_debug_id=*/ + static_cast(-1), + 0, + event_data.value); // Check an event's sub events only if it relates to graph execution time // (and its sub events are the individual op executions): if (backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) { @@ -109,6 +135,7 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( QNN_GET_ERROR_CODE(error)); return error; } + QnnProfile_EventData_t sub_event_data; for (std::uint32_t j = 0; j < num_sub_events; ++j) { error = qnn_interface.qnn_profile_get_event_data( diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index a56accd7813..b9fb2cc54fd 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -24,7 +24,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", ], exported_deps = [ @@ -60,11 +60,11 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], resources = ({ - "qnn_lib": "fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs".format(get_qnn_library_verision()), + "qnn_lib": "fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs".format(get_qnn_library_version()), } if include_aot_qnn_lib else { }), deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ":logging", "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", diff --git a/backends/qualcomm/setup.md b/backends/qualcomm/setup.md index 37d8e04c210..a7adb6d006d 100644 --- a/backends/qualcomm/setup.md +++ b/backends/qualcomm/setup.md @@ -1,6 +1,6 @@ # Setting up QNN Backend -Please refer to [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). +Please refer to [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](../../docs/source/backends-qualcomm.md). 
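For reference, a minimal sketch of how the refactored quantizer API above can be exercised, mirroring the `test_qnn_backend_submodules` case added later in this diff. The `QnnQuantizer` class name and import path are assumptions (they are not shown in this hunk); the `ModuleQConfig` and predicate usage follows the test code. Per the new `_get_quant_config`, a node resolves to the per-block config when its name has an entry in the block-size map, otherwise to the first matching submodule callback, then to the per-channel weight config if its op is listed in `use_per_channel_weight_quant_ops`, and finally to the default config.

```python
# Hedged sketch (not part of this diff): driving the refactored quantizer API.
# QnnQuantizer and its import path are assumptions; the ModuleQConfig/predicate
# usage mirrors test_qnn_backend_submodules added later in this PR.
from executorch.backends.qualcomm.quantizer.quantizer import (  # assumed module path
    ModuleQConfig,
    QnnQuantizer,
    QuantDtype,
    get_submodule_type_predicate,
)

quantizer = QnnQuantizer()

# Default config for the whole graph: 8a8w PTQ with per-channel conv weights.
quantizer.set_default_quant_config(
    QuantDtype.use_8a8w,
    is_qat=False,
    is_conv_per_channel=True,
    is_linear_per_channel=False,
)

# Nodes that originate from an `Add` submodule get 16a16w instead; when several
# callbacks match a node, only the first one in the list is applied.
quantizer.set_submodule_qconfig_list(
    [(get_submodule_type_predicate("Add"), ModuleQConfig(QuantDtype.use_16a16w))]
)

# Per-block weight quantization is now driven purely by the block-size map.
quantizer.set_block_size_map({"conv2d": (1, 128, 1, 1)})
```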
That is a tutorial for building and running Qualcomm AI Engine Direct backend, including compiling a model on a x64 host and running the inference diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index fbbfa0f1925..9a44ee8b773 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") # Construct the input and output file names. All input and output files rely on scalar_type file. SCHEMA_NAME = "qc_compiler_spec" @@ -84,7 +84,7 @@ def define_common_targets(): define_static_target = True, visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", "//executorch/backends/qualcomm/runtime:runtime_android_build", diff --git a/backends/qualcomm/tests/TARGETS b/backends/qualcomm/tests/TARGETS index b6a9664dcbf..8078ca611f8 100644 --- a/backends/qualcomm/tests/TARGETS +++ b/backends/qualcomm/tests/TARGETS @@ -1,6 +1,6 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") python_library( name = "models", @@ -17,7 +17,7 @@ python_library( "utils.py", ], # env = { - # "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + # "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), # }, deps = [ ":models", diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 0857a597d88..adf6e256f54 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -72,6 +72,16 @@ def forward(self, x): return torch.any(x, dim=self.dim, keepdim=self.keepdim) +class AMax(torch.nn.Module): + def __init__(self, dim=None, keepdim=False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward(self, x): + return torch.amax(x, dim=self.dim, keepdim=self.keepdim) + + class Arange(torch.nn.Module): def __init__(self, start, end, step, dtype): super().__init__() @@ -180,6 +190,14 @@ def forward(self, x, y): return torch.cat((y, y, x, x), axis=2) +class CDist(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.cdist(x, y, p=2) + + class Ceil(torch.nn.Module): def __init__(self): super().__init__() @@ -558,6 +576,14 @@ def forward(self, x): return torch.cos(x) +class CumSum(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.cumsum(dim=0) + + class Div(torch.nn.Module): def __init__(self): super().__init__() @@ -1410,6 +1436,15 @@ def forward(self, x): return x / torch.sqrt(torch.tensor([64.0])) +class SquaredReLU(torch.nn.Module): + def __init__(self, inplace=False): + super().__init__() + self.relu = 
torch.nn.ReLU(inplace=inplace) + + def forward(self, x): + return torch.square(self.relu(x)) + + class Squeeze(torch.nn.Module): def __init__(self): super().__init__() @@ -1450,6 +1485,18 @@ def forward(self, x): return 10 - x +class SimpleSubModules(torch.nn.Module): + def __init__(self): + super().__init__() + self.add = Add() + self.sub = Sub() + + def forward(self, a, b, c, d): + lhs = self.add(a, b) + rhs = self.sub(c, d) + return torch.mul(lhs, rhs) + + class SumIntList(torch.nn.Module): def __init__(self): super().__init__() @@ -1558,3 +1605,14 @@ def forward(self, x): return torch.nn.functional.softmax( torch.where(x >= 0, 0.1, float("-inf")), dim=-1 ) + + +# Mimi Decoder has 0D tensor which QNN cannot handle. +class ZeroDimTensor(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + input1 = torch.zeros(1) + selected_element = torch.select(input1, 0, 0) + return torch.add(x, selected_element) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 795459a9f77..7d097fd45bf 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -24,6 +24,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, + ModuleQConfig, QuantDtype, TestQNN, validate_context_binary, @@ -68,7 +69,11 @@ from collections import defaultdict from typing import List -from executorch.backends.qualcomm._passes import FoldQDQ, TagQuantIO +from executorch.backends.qualcomm._passes import ( + ExpandBroadcastTensorShape, + FoldQDQ, + TagQuantIO, +) from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors from executorch.backends.qualcomm.debugger.utils import DrawGraph from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model @@ -113,6 +118,13 @@ def test_qnn_backend_adaptive_avg_pool2d(self): sample_input = (torch.randn(1, 512, 7, 7),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_amax(self): + modules = [AMax(dim=1, keepdim=False), AMax(dim=1, keepdim=True)] # noqa: F405 + sample_input = (torch.randn(4, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_any(self): modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)] # noqa: F405 sample_input = (torch.randn(3, 3, 3) > 0,) @@ -164,6 +176,14 @@ def test_qnn_backend_cat(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cdist(self): + module = CDist() # noqa: F405 + sample_input = ( + torch.randn(1, 125, 256), + torch.randn(1, 2048, 256), + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_chunk_single(self): module = Chunk() # noqa: F405 sample_input = (torch.randn(1, 1, 4, 3),) @@ -225,6 +245,11 @@ def test_qnn_backend_cos(self): sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cumsum(self): + module = CumSum() # noqa: F405 + sample_input = (torch.randn(4),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_einsum_outer_product(self): module = EinsumOuterProduct() # noqa: F405 x = torch.randn(5) @@ -422,10 +447,20 @@ def test_qnn_backend_equal(self): def test_qnn_backend_expand(self): modules = [ExpandAs(), ExpandCopy()] # noqa: F405 - sample_input = (torch.randn([3, 1]),) - for i, module in enumerate(modules): - with 
self.subTest(i=i): - self.lower_module_and_test_output(module, sample_input) + sample_inputs = [ + (torch.randn([3, 1]),), + (torch.randn([4]),), + ] + passes_job = get_capture_program_passes() + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True + index = 0 + for module in modules: + for sample_input in sample_inputs: + with self.subTest(i=index): + self.lower_module_and_test_output( + module, sample_input, passes_job=passes_job + ) + index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -808,6 +843,11 @@ def test_qnn_backend_softmax(self): sample_input = (torch.randn([1, 4, 8, 8]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_squared_relu(self): + module = SquaredReLU() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_squeeze(self): module = Squeeze() # noqa: F405 sample_input = (torch.randn([1, 3, 3]),) @@ -843,14 +883,14 @@ def test_qnn_backend_where(self): Where(), # noqa: F405 WhereConstant(torch.randn(3, 2), torch.randn(3, 2)), # noqa: F405 WhereConstantOther(), # noqa: F405 - # WhereConstantAll(), # noqa: F405 TODO: constant dtype does not propogate when doing const i64->32, causing where to fail since where does not support int64 output + WhereConstantAll(), # noqa: F405 WhereConstantInf(), # noqa: F405 ] sample_inputs = [ (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)), (torch.randn(3, 2),), (torch.randn(3, 2),), - # (torch.randn(3, 2),), + (torch.randn(3, 2),), (torch.randn(30, 20),), ] for i, module in enumerate(modules): @@ -979,6 +1019,11 @@ def test_qnn_backend_view_permute_matmul(self): sample_input = (torch.randn([1, 8, 512]), torch.randn([1, 2, 8, 256])) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_zero_dim_tensor(self): + module = ZeroDimTensor() # noqa: F405 + sample_input = (torch.randn(1, 256, 125),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_example_models(self): # TODO Fix MobileBertModelExample and TorchVisionViTModel instances = [ @@ -1111,6 +1156,14 @@ def test_qnn_backend_adaptive_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_amax(self): + modules = [AMax(dim=1, keepdim=False), AMax(dim=1, keepdim=True)] # noqa: F405 + sample_input = (torch.randn(4, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_any(self): modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)] # noqa: F405 sample_input = (torch.randn(3, 3, 3) > 0,) @@ -1164,6 +1217,15 @@ def test_qnn_backend_cat(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cdist(self): + module = CDist() # noqa: F405 + sample_input = ( + torch.randn(1, 125, 256), + torch.randn(1, 2048, 256), + ) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_chunk_single(self): module = Chunk() # noqa: F405 sample_input = (torch.randn(1, 1, 4, 3),) @@ -1237,7 +1299,6 @@ def test_qnn_backend_conv2d_block(self): module = self.get_qdq_module( module, sample_input, - is_conv_per_block=True, quant_dtype=QuantDtype.use_16a4w_block, block_size_map={"conv2d": (1, 
128, 1, 1)}, ) @@ -1282,6 +1343,12 @@ def test_qnn_backend_cos(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cumsum(self): + module = CumSum() # noqa: F405 + sample_input = (torch.randn(4),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_einsum_outer_product(self): module = EinsumOuterProduct() # noqa: F405 x = torch.randn(5) @@ -1326,8 +1393,8 @@ def test_qnn_backend_element_wise_add(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_element_wise_and(self): @@ -1367,8 +1434,8 @@ def test_qnn_backend_element_wise_div(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_element_wise_mul(self): @@ -1395,8 +1462,8 @@ def test_qnn_backend_element_wise_mul(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_element_wise_or(self): @@ -1455,8 +1522,8 @@ def test_qnn_backend_element_wise_sub(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_elu(self): @@ -1491,11 +1558,21 @@ def test_qnn_backend_equal(self): def test_qnn_backend_expand(self): modules = [ExpandAs(), ExpandCopy()] # noqa: F405 - sample_input = (torch.randn([3, 1]),) - for i, module in enumerate(modules): - with self.subTest(i=i): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + sample_inputs = [ + (torch.randn([3, 1]),), + (torch.randn([4]),), + ] + passes_job = get_capture_program_passes() + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True + index = 0 + for module in modules: + for sample_input in sample_inputs: + with self.subTest(i=index): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, sample_input, passes_job=passes_job + ) + index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -1929,6 +2006,12 @@ def test_qnn_backend_softmax(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_squared_relu(self): + module = SquaredReLU() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + 
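The extra delegate events emitted by `QnnProfile::ProfileData` earlier in this diff (one per QNN profile entry, with a unit suffix such as " (us)" or " (bytes)") are what bump `expected_profile_events` from 24/25 to 34/35 below. A minimal sketch of reading them back, assuming an ETDump/ETRecord pair on disk (paths are placeholders) and following the `Inspector` construction used in `backends/qualcomm/tests/utils.py` later in this diff:

```python
# Hedged sketch: inspecting the additional QNN delegate profile events.
# The etdump/etrecord paths are placeholders.
from executorch.devtools import Inspector
from executorch.devtools.inspector._inspector_utils import TimeScale

inspector = Inspector(
    etdump_path="etdump.etdp",
    etrecord="etrecord.bin",
    # QNN reports raw backend values for these events, so the tests keep both
    # scales in cycles rather than converting to wall-clock time.
    source_time_scale=TimeScale.CYCLES,
    target_time_scale=TimeScale.CYCLES,
)

df = inspector.to_dataframe()
print(len(df.index))  # corresponds to the bumped expected_profile_events counts
print(df.head())      # event identifiers now carry a unit suffix, e.g. " (us)"
```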
def test_qnn_backend_squeeze(self): module = Squeeze() # noqa: F405 sample_input = (torch.randn([1, 3, 3]),) @@ -1980,14 +2063,14 @@ def test_qnn_backend_where(self): Where(), # noqa: F405 WhereConstant(torch.randn(3, 2), torch.randn(3, 2)), # noqa: F405 WhereConstantOther(), # noqa: F405 - # WhereConstantAll(), # noqa: F405, TODO: constant dtype does not propogate when doing const i64->32, causing where to fail since where does not support int64 output + WhereConstantAll(), # noqa: F405 WhereConstantInf(), # noqa: F405 ] sample_inputs = [ (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)), (torch.randn(3, 2),), (torch.randn(3, 2),), - # (torch.randn(3, 2),), + (torch.randn(3, 2),), (torch.randn(30, 20),), ] for i, module in enumerate(modules): @@ -2122,6 +2205,32 @@ def test_qnn_backend_simple_model(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_submodules(self): + module = SimpleSubModules() # noqa: F405 + sample_input = ( + torch.rand(1, 3, 8, 8), + torch.rand(1, 3, 8, 8), + torch.rand(1, 3, 8, 8), + torch.rand(1, 3, 8, 8), + ) + + from executorch.backends.qualcomm.quantizer.quantizer import ( + get_submodule_type_predicate, + ) + + submodule_qconfig_list = [ + ( + get_submodule_type_predicate("Add"), + ModuleQConfig(QuantDtype.use_16a16w), + ) # noqa: F405 + ] + module = self.get_qdq_module( + module, + sample_input, + submodule_qconfig_list=submodule_qconfig_list, + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_topk_and_index(self): module = TopKandIndex() # noqa: F405 sample_input = (torch.randn(3, 10),) @@ -2135,6 +2244,12 @@ def test_qnn_backend_view_permute_matmul(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_zero_dim_tensor(self): + module = ZeroDimTensor() # noqa: F405 + sample_input = (torch.randn(1, 256, 125),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_example_models(self): instances = [ { @@ -2390,7 +2505,7 @@ def test_qnn_backend_profile_op(self): module, sample_input, expected_partitions=1, - expected_profile_events=24, + expected_profile_events=34, ) def test_qnn_backend_shared_buffer(self): @@ -3005,7 +3120,7 @@ def test_qnn_backend_profile_op(self): module, sample_input, expected_partitions=1, - expected_profile_events=25, + expected_profile_events=35, ) def test_qnn_backend_shared_buffer(self): @@ -3496,7 +3611,6 @@ def test_conv_former(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 80) - @unittest.skip("bicubic resize is not supported") def test_dino_v2(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -3532,6 +3646,46 @@ def test_dino_v2(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 85) + def test_efficientSAM(self): + if not self.required_envs( + [self.image_dataset, self.pretrained_weight, self.oss_repo] + ): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--oss_repo", + self.oss_repo, + "--pretrained_weight", + self.pretrained_weight, + "--ip", + self.ip, 
+ "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["MIoU"], 0.55) + def test_esrgan(self): if not self.required_envs(): self.skipTest("missing required envs") diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 41c56c08a85..71d3b9e7ec2 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -9,14 +9,14 @@ import subprocess import tempfile import unittest -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, OrderedDict, Tuple import numpy as np import torch from executorch import exir from executorch.backends.qualcomm.qnn_preprocess import QnnBackend -from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.quantizer.quantizer import ModuleQConfig, QuantDtype from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.constants import ( QCOM_DTYPE, @@ -30,6 +30,7 @@ to_edge_transform_and_lower_to_qnn, ) from executorch.devtools import generate_etrecord, Inspector +from executorch.devtools.inspector._inspector_utils import TimeScale from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, @@ -290,7 +291,12 @@ def post_process(): outputs.append(output) def validate_profile(): - inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) + inspector = Inspector( + etdump_path=etdump_path, + etrecord=etrecord_path, + source_time_scale=TimeScale.CYCLES, + target_time_scale=TimeScale.CYCLES, + ) self.assertTrue( len(inspector.to_dataframe().index) == expected_profile_events ) @@ -435,6 +441,7 @@ def lower_module_and_test_output( expected_profile_events: int = -1, expected_intermediate_events: int = -1, assert_output_equal: bool = True, + passes_job: Optional[OrderedDict] = None, skip_node_id_set: set = None, skip_node_op_set: set = None, dynamic_shapes: Dict = None, @@ -444,6 +451,7 @@ def lower_module_and_test_output( sample_inputs, self.compiler_specs, dynamic_shapes=dynamic_shapes, + passes_job=passes_job, skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, ) @@ -497,7 +505,6 @@ def get_qdq_module( self, module: torch.nn.Module, inputs: Tuple[torch.Tensor], - is_conv_per_block: Optional[bool] = False, is_conv_per_channel: Optional[bool] = True, is_linear_per_channel: Optional[bool] = False, custom_quant_annotations: Tuple[Callable] = (), @@ -505,6 +512,7 @@ def get_qdq_module( dynamic_shapes: Dict = None, bypass_check: bool = False, block_size_map: Dict[str, Tuple] = None, + submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None, ) -> torch.fx.GraphModule: m = torch.export.export( module, inputs, dynamic_shapes=dynamic_shapes, strict=True @@ -513,9 +521,9 @@ def get_qdq_module( quantizer = make_quantizer( quant_dtype=quant_dtype, custom_annotations=custom_quant_annotations, - per_block_conv=is_conv_per_block, per_channel_conv=is_conv_per_channel, per_channel_linear=is_linear_per_channel, + submodule_qconfig_list=submodule_qconfig_list, ) if block_size_map is not None: quantizer.set_block_size_map(block_size_map) @@ -543,6 +551,7 @@ def get_prepared_qat_module( is_linear_per_channel: 
Optional[bool] = False, custom_quant_annotations: Tuple[Callable] = (), quant_dtype: QuantDtype = QuantDtype.use_8a8w, + submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None, ) -> torch.fx.GraphModule: m = torch.export.export_for_training(module, inputs, strict=True).module() @@ -551,12 +560,12 @@ def get_prepared_qat_module( custom_annotations=custom_quant_annotations, per_channel_conv=is_conv_per_channel, per_channel_linear=is_linear_per_channel, + is_qat=True, + submodule_qconfig_list=submodule_qconfig_list, ) - if quant_dtype == QuantDtype.use_8a8w: - quantizer.set_quant_config(quant_dtype, is_qat=True) - else: - raise RuntimeError("Shuld not be here") + submodule_qconfig_list = submodule_qconfig_list or [] + quantizer.set_submodule_qconfig_list(submodule_qconfig_list) prepared = prepare_qat_pt2e(m, quantizer) return torch.ao.quantization.move_exported_model_to_train(prepared) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index f7b966ee8ea..e0ebc5beebe 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -16,7 +16,7 @@ import torch -from executorch.backends.qualcomm._passes import AnnotateStack +from executorch.backends.qualcomm._passes import AnnotateStack, AnnotateUnbind from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager from executorch.backends.qualcomm.builders.node_visitor import ( @@ -304,11 +304,12 @@ def get_decomp_table(passes_job) -> Dict[torch._ops.OperatorBase, Callable]: skip_decompositions = get_skip_decomp_table() # If we want to annotate the decomposed ops, then we should decompose the operation. - if passes_job and passes_job.get(AnnotateStack, False): + if passes_job: skip_decompositions = [ skip_decomp_op for skip_decomp_op in skip_decompositions - if skip_decomp_op not in AnnotateStack.decomp_ops + if skip_decomp_op + not in AnnotateStack.decomp_ops + AnnotateUnbind.decomp_ops ] remove_decompositions(source_decompositions, skip_decompositions) diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index 329dab96df2..73e9d986c3d 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -62,6 +62,9 @@ def call( # Copy node from decompose graph module for decomposed_node in decomposed_module.graph.nodes: + node.meta["nn_module_stack"] = decomposed_node.meta.get( + "nn_module_stack" + ) if decomposed_node.op == "placeholder": continue diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md index 2cfff6a6eb6..3ae80950645 100644 --- a/backends/vulkan/README.md +++ b/backends/vulkan/README.md @@ -133,7 +133,7 @@ will be executed on the GPU. ::::{note} -The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/supported_ops.py) +The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/op_registry.py#L194) Vulkan partitioner code can be inspected to examine which ops are currently implemented in the Vulkan delegate. 
:::: diff --git a/backends/vulkan/_passes/int4_weight_only_quantizer.py b/backends/vulkan/_passes/int4_weight_only_quantizer.py index 409cbb4b755..d0b73b8af0e 100644 --- a/backends/vulkan/_passes/int4_weight_only_quantizer.py +++ b/backends/vulkan/_passes/int4_weight_only_quantizer.py @@ -118,9 +118,6 @@ def _vk_replace_linear_int4( # Use custom vulkan linear layer as default linear_class: Type[torch.nn.Module] = VkWeightOnlyInt4Linear, copy_weights: bool = False, - # Serves the same purpose as `tensor_dim_limit` in - # executorch.backends.vulkan.partitioner.VulkanSupportedOperators - feature_limit: int = 16384, ): for name, child in module.named_children(): if isinstance(child, torch.nn.Linear) and ( @@ -131,8 +128,6 @@ def _vk_replace_linear_int4( if ( _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) or padding_allowed - ) and ( - child.out_features < feature_limit and child.in_features < feature_limit ): new_linear = linear_class( child.in_features, @@ -175,7 +170,6 @@ def __init__( inner_k_tiles: Optional[int] = 8, device: torch.device = torch.device("cpu"), # noqa precision: torch.dtype = torch.float32, - feature_limit: int = 16384, ) -> None: super().__init__() assert inner_k_tiles in [2, 4, 8] @@ -186,9 +180,6 @@ def __init__( self.padding_allowed: bool = padding_allowed self.device: torch.device = device self.precision: torch.dtype = precision - # Serves the same purpose as `tensor_dim_limit` in - # executorch.backends.vulkan.partitioner.VulkanSupportedOperators - self.feature_limit = feature_limit @torch.no_grad() def _create_quantized_state_dict( @@ -197,10 +188,7 @@ def _create_quantized_state_dict( cur_state_dict = model.state_dict() for fqn, mod in model.named_modules(): # Add additional check to make sure features do not exceed feature limit - if isinstance(mod, torch.nn.Linear) and ( - mod.out_features < self.feature_limit - and mod.in_features < self.feature_limit - ): + if isinstance(mod, torch.nn.Linear): out_features = mod.out_features in_features = mod.in_features logging.info(f"linear: {fqn}, in={in_features}, out={out_features}") diff --git a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py index a0160efa90f..b4337829d7f 100644 --- a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py +++ b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py @@ -27,6 +27,19 @@ class SqueezeUnsqueezeInputs(ExportPass): exir_ops.edge.aten.gelu.default, } + def should_squeeze(self, op, shape: List[int]) -> bool: # pyre-ignore + if len(shape) == 3: + return shape[1] == 1 and shape[0] > 1 + if len(shape) == 4: + # No need to squeeze if all dims are 1 except the width dim + if all(dim == 1 for dim in shape[:-1]): + return False + # Otherwise, check for squeezable dim + return 1 in shape[:-1] + + # Prefer not to introduce additional orchestration ops by default + return False + def call_operator( self, op, # pyre-ignore @@ -34,18 +47,18 @@ def call_operator( kwargs: Dict[str, Argument], meta: NodeMetadata, ) -> ProxyValue: - def _squeezable(shape: List[int]) -> bool: - return len(shape) > 2 and 1 in shape - if op not in self._squeezable_ops: return super().call_operator(op, args, kwargs, meta) - # pyre-ignore[16]: `None` has no attribute `node` input_shape = args[0].node.meta["val"].shape output_shape = meta["val"].shape - if not _squeezable(input_shape): + + if not self.should_squeeze(op, input_shape): return super().call_operator(op, args, kwargs, meta) + def _squeezable(shape: List[int]) -> bool: + return len(shape) 
> 2 and 1 in shape + # squeeze input tensor squeeze_shape = list(input_shape) while _squeezable(squeeze_shape): diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 67285738b4c..adbffaa76fd 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -45,16 +45,20 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) function(gen_vulkan_shader_lib_cpp shaders_path) set(VULKAN_SHADERGEN_ENV "") - set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/${ARGV1}) + set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/vulkan_compute_shaders) - execute_process( + add_custom_command( + COMMENT "Generating Vulkan Compute Shaders" + OUTPUT ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp COMMAND "${PYTHON_EXECUTABLE}" ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} - --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH}/shader_cache/ - --env ${VULKAN_GEN_ARG_ENV} - RESULT_VARIABLE error_code + --glslc-path=${GLSLC_PATH} + --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH}/shader_cache/ --env + ${VULKAN_GEN_ARG_ENV} + DEPENDS ${shaders_path}/* + ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py ) set(generated_spv_cpp @@ -86,13 +90,6 @@ macro(vulkan_shader_library shaders_path library_name) set(VULKAN_SHADERGEN_ENV "") set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/${library_name}) - # execute_process( COMMAND "${PYTHON_EXECUTABLE}" - # ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path - # ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} - # --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH} --env - # ${VULKAN_GEN_ARG_ENV} RESULT_VARIABLE error_code ) set(ENV{PYTHONPATH} - # ${PYTHONPATH}) - set(generated_spv_cpp ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp) add_library(${library_name} STATIC ${generated_spv_cpp}) diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index 7eab1c21f89..1f36b76ec6f 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -1,6 +1,6 @@ # Building and Running ExecuTorch with the Vulkan Backend -The [ExecuTorch Vulkan Delegate](./native-delegates-executorch-vulkan-delegate.md) +The [ExecuTorch Vulkan Delegate](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) is a native GPU delegate for ExecuTorch. @@ -12,8 +12,8 @@ is a native GPU delegate for ExecuTorch. ::: :::{grid-item-card} Prerequisites: :class-card: card-prerequisites -* Follow [**Setting up ExecuTorch**](./getting-started-setup.md) -* It is also recommended that you read through [**ExecuTorch Vulkan Delegate**](./native-delegates-executorch-vulkan-delegate.md) and follow the example in that page +* Follow [**Setting up ExecuTorch**](../../../docs/source/getting-started-setup.rst) +* It is also recommended that you read through [**ExecuTorch Vulkan Delegate**](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) and follow the example in that page ::: :::: @@ -59,7 +59,7 @@ partially lower the Llama model to Vulkan. 
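For illustration, the squeeze heuristic introduced in `SqueezeUnsqueezeInputs.should_squeeze` above can be restated as a standalone check; the shapes below are chosen only to make the rule concrete and are not taken from this PR:

```python
# Hedged sketch: the should_squeeze rule from the hunk above, restated as a
# standalone function (the original is a method that also receives the op).
from typing import List


def should_squeeze(shape: List[int]) -> bool:
    if len(shape) == 3:
        # Squeeze e.g. [B, 1, W] down to [B, W] only when B > 1.
        return shape[1] == 1 and shape[0] > 1
    if len(shape) == 4:
        # [1, 1, 1, W] is already effectively 1-D; squeezing adds no value.
        if all(dim == 1 for dim in shape[:-1]):
            return False
        # Otherwise squeeze when some non-width dim is 1, e.g. [1, 8, 1, 32].
        return 1 in shape[:-1]
    # Prefer not to introduce additional orchestration ops for other ranks.
    return False


assert should_squeeze([4, 1, 32]) is True      # squeezable middle dim, batch > 1
assert should_squeeze([1, 1, 32]) is False     # batch of 1: nothing gained
assert should_squeeze([1, 1, 1, 32]) is False  # all leading dims already 1
assert should_squeeze([1, 8, 1, 32]) is True   # squeezable non-width dim present
```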
# The files will usually be downloaded to ~/.llama python -m examples.models.llama.export_llama \ --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ - --model "llama3_2" \ + --model "llama3_2" \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index b33430a6bca..026f1db9273 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -277,6 +277,7 @@ def register_binary_op(features: OpFeatures): exir_ops.edge.aten.rsqrt.default, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.round.default, + exir_ops.edge.aten.leaky_relu.default, ] ) def register_unary_op(features: OpFeatures): @@ -392,6 +393,7 @@ def register_int8_mm_op(features: OpFeatures): @update_features(exir_ops.edge.et_vk.linear_weight_int4.default) def register_int4_mm_op(features: OpFeatures): + features.buffer_impl = True features.texture_impl = TextureImplFeatures( uses_axis_map=False, valid_packed_dims={PackedDim.WIDTH}, @@ -400,6 +402,7 @@ def register_int4_mm_op(features: OpFeatures): features.optimal_storage = VkStorageType.TEXTURE_3D features.optimal_layout = VkMemoryLayout.TENSOR_WIDTH_PACKED features.handles_own_prepacking = True + features.skip_limits_check = {1} return features diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 4cbd1290401..62b53f9a76c 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -260,6 +260,26 @@ vkapi::VulkanImage allocate_image( return vkapi::VulkanImage(); } + // TODO(ssjia): change to always check that the image extents do not exceed + // physical limits. Adding the check now based on `maxImageDimension3D` will + // cause some existing models to break. Anecdotally, on Adreno and + // SwiftShader devices, using 3D textures that exceed `maxImageDimension3D` + // appears to be ok. So we need to figure out if is it undefined behaviour + // or if there's a better way to figure out what the limit is. For now, only + // check during debug build so that we can detect when exceeding physical + // limits could be a potential cause for model outputs to be wrong. In the + // meantime, the threshold for using texture storage can be configured at + // export time. +#ifdef VULKAN_DEBUG + uint32_t max_extent = storage_type == utils::kTexture3D + ? 
adapter_ptr->max_texture3d_dim() + : adapter_ptr->max_texture2d_dim(); + + VK_CHECK_COND( + image_extents[0] <= max_extent && image_extents[1] <= max_extent && + image_extents[2] <= max_extent); +#endif + VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); return adapter_ptr->vma().create_image( @@ -291,6 +311,8 @@ vkapi::VulkanBuffer allocate_buffer( return vkapi::VulkanBuffer(); } + VK_CHECK_COND(numel <= context_ptr->adapter_ptr()->max_buffer_numel()); + return adapter_ptr->vma().create_storage_buffer( element_size(dtype) * numel, allocate_memory); } @@ -497,9 +519,7 @@ vTensor::vTensor( VK_CHECK_COND( dim_order_is_valid(dim_order_), "computed dim order is invalid"); - if (storage_type != utils::kBuffer) { - set_logical_limits(storage_.image_extents_); - } + set_logical_limits(storage_.image_extents_); } // NOLINTNEXTLINE diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index a29f7d14964..e52780b6a4d 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -125,6 +125,8 @@ def buffer_gvec_type(dtype: str, n: int) -> str: if dtype == "float": return f"vec{n}" + if dtype == "uint": + return f"uvec{n}" elif dtype == "half": return f"f16vec{n}" elif dtype == "int": diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl index 1c2ffe7afe4..c0ed9204227 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl @@ -47,7 +47,17 @@ void main() { // Compute the start and end of the input indices to load. Padding is assumed // to be constant 0 padding, so reads from the padding region are skipped. - const ivec2 start = max(ivec2(0), ipos); + ivec2 start = ipos; + if (start.x < 0) { + // number of "steps" to get to >= zero is div_up(-start, dilation) + int num_steps = ((-ipos.x) + dilation.x - 1) / dilation.x; + start.x = ipos.x + num_steps * dilation.x; + } + if (start.y < 0) { + // number of "steps" to get to >= zero is div_up(-start, dilation) + int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y; + start.y = ipos.y + num_steps * dilation.y; + } const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy)); // Compute the start of the kernel based on how far we are skipping ahead when // reading the input. Note that these are "canonical" indices. diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl index d6c94661ace..c3e53cbfc3b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl @@ -43,106 +43,275 @@ ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); const lowp int out_packed_dim = unhash_packed_dim(out_layout); -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); +#define MAX_WORKGROUP_SIZE 64 + +// Shared memory factor increases shared memory allocation by a scale that should either be 1 or a power of 2. +// +// Increasing factor allows more data to be stored in shared memory and increase thread utilization during reduction. +// Why? Because when performing reduction, the number of active threads becomes half in each iteration. +// Increasing scaling factor increases the thread occupancy and hence utilize the GPU better. +// eg. 
+// If local thread size in x dimension is 32, and SHARED_MEMORY_FACTOR is 1, 32 elements will be loaded into shared memory. +// First iteration of reduce will have 16 threads sum up 32 elements. +// Second iteration will have 8 threads sum up 16 elements from previous iteration and so on. +// So thread utilization starts at 50%. +// +// By contrast if local thread size in x dimension is 32, and SHARED_MEMORY_FACTOR is 2, 64 elements will be loaded into shared memory. +// First iteration of reduce will have 32 threads sum up 64 elements. +// Second iteration will have 32 threads sum up 16 elements from previous iteration and so on. +// Thus thread utilization starts at 100%. +#define SHARED_MEMORY_FACTOR 2 + +#define offset_pos_index(index) ((index) + ((index) >> 2)) + +shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)]; + +// Function to reduce input data in workgroup's x dimension +// +// The implementation resembles reduction as depicted below +// | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | 2 | 3 | 2 | 7 | 0 | 11 | 0 | 2 | current_stride -> 1 +// | / | / | / | / | / | / | / | / +// | / | / | / | / | / | / | / | / +// | / | / | / | / | / | / | / | / +// | 11 | 1 | 9 | 1 | 2 | 2 | 8 | 5 | 5 | 3 | 9 | 7 | 11 | 11 | 2 | 2 | current_stride -> 2 +// | / | / | / | / +// | / | / | / | / +// | / | / | / | / +// | 20 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |14 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 4 +// | / | / +// | / | / +// | / | / +// | / | / +// | / | / +// | 30 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 8 +// | / +// | / +// | / +// | / +// | / +// | / +// | / +// | / +// | / +// | 57 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride = -> 16 +// +// Threads access shared index in following pattern +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 1 +// Shared Index | 0 | 2 | 4 | 6 | 8 | 10 | 12 | 14 | X | X | X | X | X | X | X | X | index *= 1 +// +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 2 +// Shared Index | 0 | 4 | 8 | 12 | X | X | X | X | X | X | X | X | X | X | X | X | index *= 2 +// +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 4 +// Shared Index | 0 | 8 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 4 +// +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 8 +// Shared Index | 0 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 8 + +void reduce_input(const int width_stride, const int shared_idx_offset) { + // wait for all shared memory writes to finish + memoryBarrierShared(); + barrier(); + + // loop log(width_stride) times + for (int current_stride = 1, index = int(gl_LocalInvocationID.x << 1); current_stride < width_stride; current_stride *= 2, index <<= 1) { + // if the index at this thread is within the width stride + if (index < width_stride) { + const int local_shared_idx = shared_idx_offset + index; + // add the value at current stride to this thread's value + shared_input[offset_pos_index(local_shared_idx)] += shared_input[offset_pos_index(local_shared_idx + current_stride)]; + } - if (any(greaterThanEqual(lpos, out_limits))) { - return; + memoryBarrierShared(); + barrier(); } +} +void reduce_non_packed_dim() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); const int width = int(sizes.x); + ivec3 in_pos = lpos_to_pos(lpos, 
in_axis_map); - if (in_packed_dim != W_DIM) { - VEC4_T mean = VEC4_T(0); - VEC4_T delta = VEC4_T(0); - VEC4_T delta2 = VEC4_T(0); - VEC4_T M2 = VEC4_T(0); - - // Use Welford's online algorithm to compute mean and variance in one pass - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - for (int w = 0; w < width; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - delta = v - mean; - mean += delta / (w + 1); - delta2 = v - mean; - M2 += delta * delta2; + // width batch read stride + const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; + + // local memory starting offset for this thread + const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); + + // local memory index for this thread + const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); + + VEC4_T mean = VEC4_T(0); + VEC4_T var = VEC4_T(0); + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset < width; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + + VEC4_T in_val = VEC4_T(0); + if (all(lessThan(in_pos, out_limits))) { + in_val = load_texel(t_in, in_pos); + } + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val; } - VEC4_T var = M2 / width; - VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5)); - VEC4_T offset = -rstd * mean; - - for (int w = 0; w < width; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - // broadcasting - VEC4_T weight = load_texel(t_weight, ivec3(w, 0, 0)).xxxx; - VEC4_T bias = load_texel(t_bias, ivec3(w, 0, 0)).xxxx; - VEC4_T outtex = (v * rstd + offset) * weight + bias; - write_texel_lpos(t_out, ivec3(w, lpos.y, lpos.z), outtex, out_axis_map); + reduce_input(width_stride, shared_idx_offset); + mean += shared_input[offset_pos_index(shared_idx_offset)]; + } + + mean /= width; + + memoryBarrierShared(); + barrier(); + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset < width; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + + VEC4_T in_val = mean; + if (all(lessThan(in_pos, out_limits))) { + in_val = load_texel(t_in, in_pos); + } + + const VEC4_T delta = in_val - mean; + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta * delta; } + reduce_input(width_stride, shared_idx_offset); + var += shared_input[offset_pos_index(shared_idx_offset)]; + } + + var /= width; + + VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5)); + VEC4_T offset = -rstd * mean; + + VEC4_T v = load_texel(t_in, lpos); + VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)).xxxx; + VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)).xxxx; + VEC4_T outtex = (v * rstd + offset) * weight + bias; + + if (all(lessThan(lpos, out_limits))) { + write_texel_lpos(t_out, lpos, outtex, out_axis_map); + } + + if (gl_GlobalInvocationID.x == 0) { write_texel(t_mean, lpos, mean); write_texel(t_rstd, lpos, rstd); - } else { - const int packed_width = divup4(width); - - T mean = T(0); - T delta = T(0); - T delta2 = T(0); - T M2 = T(0); - // Use Welford's online algorithm to 
compute mean and variance in one pass - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - T width_counter = T(1); - - const bool has_unaligned_width = (width & 0x3) != 0; - const int fully_packed_4_comp_count = packed_width - mix(0, 1, has_unaligned_width); - - // iterate through texels that are fully packed ie. has 4 components - for (int w = 0; w < fully_packed_4_comp_count; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - for (int i=0; i<4; i++) { - delta = v[i] - mean; - mean += delta / width_counter; - delta2 = v[i] - mean; - M2 += delta * delta2; - width_counter++; + } +} + +void reduce_packed_dim() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const int width = int(sizes.x); + ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); + + // width batch read stride + const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; + + // local memory starting offset for this thread + const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); + + // local memory index for this thread + const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); + + const int last_packed_width_index = divup4(width) - 1; + T mean = T(0); + T var = T(0); + const int remain = width & 3; + + const int in_pos_x_limit = out_limits[in_axis_map.x]; + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + in_pos[in_axis_map.x] = in_pos_x; + + VEC4_T in_val = VEC4_T(0); + if (in_pos_x < in_pos_x_limit) { + in_val = load_texel(t_in, in_pos); } - } - // handle last texel if its not 4 aligned - if (has_unaligned_width) { - in_pos[in_axis_map.x] = fully_packed_4_comp_count; - const int remaining_width = width & 0x3; - - VEC4_T v = load_texel(t_in, in_pos); - for (int i=0; i 2); + in_val.z = mix(in_val.z, T(0), remain_inv > 1); + in_val.w = mix(in_val.w, T(0), remain_inv > 0); } + + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val; } - T var = M2 / (width_counter - 1); - T rstd = inversesqrt(var + epsilon); - T offset = -rstd * mean; - - for (int w = 0; w < packed_width; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - VEC4_T weight = load_texel(t_weight, ivec3(w, 0, 0)); - VEC4_T bias = load_texel(t_bias, ivec3(w, 0, 0)); - VEC4_T outtex = (v * rstd + offset) * weight + bias; - write_texel_lpos(t_out, ivec3(w, lpos.y, lpos.z), outtex, out_axis_map); + reduce_input(width_stride, shared_idx_offset); + const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)]; + mean += val.x + val.y + val.z + val.w; + } + + mean /= width; + + memoryBarrierShared(); + barrier(); + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + in_pos[in_axis_map.x] = in_pos_x; + + VEC4_T in_val = VEC4_T(mean); + if (in_pos_x < in_pos_x_limit) { + in_val = load_texel(t_in, in_pos); + } + + if (in_pos_x == last_packed_width_index && remain 
!= 0) { + const int remain_inv = 4 - remain; + in_val.y = mix(in_val.y, mean.x, remain_inv > 2); + in_val.z = mix(in_val.z, mean.x, remain_inv > 1); + in_val.w = mix(in_val.w, mean.x, remain_inv > 0); + } + + const VEC4_T delta = in_val - mean; + const VEC4_T delta2 = delta * delta; + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta2; } + reduce_input(width_stride, shared_idx_offset); + const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)]; + var += val.x + val.y + val.z + val.w; + } + + var /= width; + + T rstd = pow(var + epsilon, T(-0.5)); + T offset = -rstd * mean; + + VEC4_T v = load_texel(t_in, lpos); + VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)); + VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)); + VEC4_T outtex = (v * rstd + offset) * weight + bias; + + if (all(lessThan(lpos, out_limits))) { + write_texel_lpos(t_out, lpos, outtex, out_axis_map); + } + + if (gl_GlobalInvocationID.x == 0) { write_texel(t_mean, lpos, VEC4_T(mean)); write_texel(t_rstd, lpos, VEC4_T(rstd)); } } + +void main() { + // if packed dimension width + if (in_packed_dim != W_DIM) { + reduce_non_packed_dim(); + } else { + reduce_packed_dim(); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl new file mode 100644 index 00000000000..0079526c248 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl @@ -0,0 +1,163 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if not NO_INT8_BUFFERS: + ${define_required_extensions("uint8")} +$if STORAGE == "buffer": + ${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_qmat2", "uint8", STORAGE, is_scalar_array=False)} +$if NO_INT8_BUFFERS: + ${layout_declare_tensor(B, "r", "nchw_4x2", "uint", "buffer")} +$else: + ${layout_declare_tensor(B, "r", "nchw_4x2", "uint8", "buffer")} + +layout(push_constant) uniform restrict Block { + ivec4 qmat2_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +$if NO_INT8_BUFFERS: + #define BUF_T uint +$else: + #define BUF_T uint8_t + +$if STORAGE == "buffer": + #define UVEC4_T u8vec4 +$else: + #define UVEC4_T uvec4 + +uint get_first(const BUF_T packed) { + return (packed & 0xF0) >> 4; +} + +uint get_second(const BUF_T packed) { + return packed & 0x0F; +} + +uint combine(const uint first, const uint second) { + return (first << 4 | second); +} + +$if NO_INT8_BUFFERS: + uint extract_comp(const uint packed4, const uint idx) { + return (packed4 >> (idx * 8)) & 0xFF; + } + +/* + * This shader packs the weight tensor into a texture. + * + * The original tensor has a (W, H) shape of (K / 2, N) and each scalar element + * is a uint8_t, which contains 2 packed 4 bit uint values. + * + * The transform performed by this shader is to first transpose the tensor, so + * the shape of the packed tensor becomes (N / 2, K). Then, the 4 bit integers + * are re-packed in groups of 8. For each 4 uint8_t values, the "left" 4-bits + * of each value contain the 0, 1, 2, 3 4-bit values, and the "right" 4-bits of + * each value contain the 4, 5, 6, 7 4-bit values. 
+ * + * As a concrete example, consider the following weight tensor. The | demarks + * the packing boundary, so 1| 2 represents a single uint8_t value with 1 in the + * leftmost 4 bits and 2 in the rightmost 4 bits. + * + * 1| 2, 3| 4, 5| 6, 7| 8, + * 9|10, 11|12, 13|14, 15|16, + * 17|18, 19|20, 21|22, 23|24, + * 25|26, 27|28, 29|30, 31|32, + * 33|34, 35|36, 37|38, 39|40, + * 41|42, 43|44, 45|46, 47|48, + * 49|50, 51|52, 53|54, 55|56, + * 57|58, 59|60, 61|62, 63|64, + * + * After packing, the packed tensor would contain + * + * 1|33, 9|41, 17|49, 25|57, + * 2|34, 10|42, 18|50, 26|58, + * 3|35, 11|43, 19|51, 27|59, + * 4|36, 12|44, 20|52, 28|60, + * 5|37, 13|45, 21|53, 29|61, + * 6|38, 14|46, 22|54, 30|62, + * 7|39, 15|47, 23|55, 31|63, + * 8|40, 16|48, 24|56, 32|64, + * + * The purpose of interleaving is to make it easier to extract the unpacked + * values in order using the u8vec4 vectorized type. With the packing in place, + * The 4-bit values can be extracted via + * + * u8vec4 packed; + * u8vec4 vals_0123 = (packed & 0xF0) >> 4; + * u8vec4 vals_4567 = (packed | 0x0F); + */ +void main() { + // Each thread writes 2 output texels along the height axis + ivec2 packed_pos = ivec2( + gl_GlobalInvocationID.x, + gl_GlobalInvocationID.y << 1); + + // The packed tensor is width packed + if ((packed_pos.x << 2) >= qmat2_sizes.x || packed_pos.y >= qmat2_sizes.y) { + return; + } + + int out_col = packed_pos.x << 3; + int out_row = packed_pos.y; + + int in_col = out_row; + int in_int8_col = in_col >> 1; + int in_row = out_col; + + int in_numrows = qmat2_sizes.x << 1; + int in_numcols = qmat2_sizes.y; + int in_num_int8_cols = qmat2_sizes.y >> 1; + + uint in_vals[8][2]; + for (int r = 0; r < 8; ++r) { + if (in_row + r < in_numrows) { + uint scalar_idx = (in_row + r) * in_num_int8_cols + in_int8_col; + $if NO_INT8_BUFFERS: + BUF_T in_val_packed_texel = nchw_4x2[scalar_idx >> 2]; + const uint packed_idx = scalar_idx % 4; + uint in_val_packed = extract_comp(in_val_packed_texel, packed_idx); + $else: + BUF_T in_val_packed = nchw_4x2[scalar_idx]; + + in_vals[r][0] = get_first(in_val_packed); + in_vals[r][1] = get_second(in_val_packed); + } else { + in_vals[r][0] = uint(0); + in_vals[r][1] = uint(0); + } + } + + UVEC4_T out_tex_1 = UVEC4_T( + combine(in_vals[0][0], in_vals[4][0]), + combine(in_vals[1][0], in_vals[5][0]), + combine(in_vals[2][0], in_vals[6][0]), + combine(in_vals[3][0], in_vals[7][0])); + + UVEC4_T out_tex_2 = UVEC4_T( + combine(in_vals[0][1], in_vals[4][1]), + combine(in_vals[1][1], in_vals[5][1]), + combine(in_vals[2][1], in_vals[6][1]), + combine(in_vals[3][1], in_vals[7][1])); + + $if STORAGE == "buffer": + int stride = qmat2_sizes.x >> 2; + t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1; + t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2; + $else: + imageStore(t_qmat2, packed_pos.xy, out_tex_1); + imageStore(t_qmat2, ivec2(packed_pos.x, packed_pos.y + 1), out_tex_2); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml new file mode 100644 index 00000000000..145f4301f14 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +pack_int4_linear_weight_transposed_interleaved: + parameter_names_with_default_values: + STORAGE: texture2d + NO_INT8_BUFFERS: false + shader_variants: + - NAME: pack_int4_linear_weight_transposed_interleaved_texture2d + - NAME: pack_int4_linear_weight_transposed_interleaved_buffer + STORAGE: buffer + - NAME: pack_int4_linear_weight_transposed_interleaved_nobitw8buffer_texture2d + NO_INT8_BUFFERS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl index 8a8703becd9..716c42e8ede 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl @@ -31,6 +31,8 @@ layout(push_constant) uniform PRECISION restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; +#extension GL_EXT_control_flow_attributes : require + void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); @@ -54,11 +56,16 @@ void main() { in_bchw_pos[out_ndims[2]] = pos.y; in_bchw_pos[out_ndims[3]] = pos.x; - for (int j = 0; j < 4; ++j) { + const int in_packed_dim_size = in_sizes[3 - out_ndims[in_packed_dim_bchw_index]]; + + [[unroll]] for (int j = 0, bchw_index = in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]; j < 4; ++j, ++bchw_index) { // terminate the loop if trying to access input texture out of bounds - if (any(greaterThanEqual(in_bchw_pos.wzyx, in_sizes.xyzw))) { + if (bchw_index >= in_packed_dim_size) { break; } + // go to position in the input, that is mapped to the packed dim in the output + in_bchw_pos[out_ndims[in_packed_dim_bchw_index]] = bchw_index; + ivec3 fetch_pos; fetch_pos.xy = in_bchw_pos.wz; @@ -74,9 +81,6 @@ void main() { // fetch input texel VEC4_T inval = VEC4_T(load_texel(t_in, fetch_pos)); outval[j] = inval[in_packed_dim_lane_index]; - - // go to next position in the input, that is mapped to the packed dim in the output - in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]++; } pos[packed_dim] = int(gl_GlobalInvocationID[packed_dim]); diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl deleted file mode 100644 index b702a110a65..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -#define FOUR 4 - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define FLOAT_T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions([DTYPE, "uint8", "uint16"])} -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "ret", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "x", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "weights", "uint8", "buffer")} -${layout_declare_tensor(B, "r", "qparams", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "ret_limits")} -${layout_declare_ubo(B, "ivec4", "x_sizes")} -${layout_declare_ubo(B, "ivec4", "weights_strides")} -${layout_declare_ubo(B, "ivec4", "qparams_strides")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 1; - -/* - * This shader computes a linear operator between a floating point input matrix - * x and a weights matrix that is quantized to 4 bits. - * - * The (W, H, C) shape of each tensor is: - * - x: (K, M) - * - weights: (K / 2, N) - * - The weights tensor has a data type of `uint8`. Each element in the tensor - * contains 2 4-bit values packed into a uint8. - * - qparams: (2, N, number_of_groups) - * - This tensor contains the scales and zeros quantization parameters for the - * weights tensor. The weight tensor is quantized group-wise, which means - * that every `group_size` elements along the K dimension of the weights - * tensor has independent quantization parameters. Along the width dim, the - * first value contains the scale for the group and the second value - * contains the zero point for the group. - * - * Note that this shader assumes that all tensors are width packed. - */ -void main() { - // output positions being calculated are (n, m), (n + 1, m), ... - // This means multiplying the m-th row of x with the n-th, (n+1)-th, ... rows - // of the weights tensor. 
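For reference while reading this shader (and its replacements later in the patch), the group-wise int4 linear computation it performs can be written as a scalar Python sketch. This is illustrative only and not code from the patch; it assumes the dequantization convention used above, i.e. the unsigned 4-bit value is centered by subtracting 8 before the scale and zero point are applied.

```python
def q4_linear_reference(x, w_q, scales, zeros, group_size):
    # x: M x K floats; w_q: N x K unpacked 4-bit ints (0..15);
    # scales, zeros: (K // group_size) x N floats.
    M, K, N = len(x), len(x[0]), len(w_q)
    out = [[0.0] * N for _ in range(M)]
    for m in range(M):
        for n in range(N):
            acc = 0.0
            for k in range(K):
                g = k // group_size
                # center the unsigned nibble around 0, then dequantize
                w = (w_q[n][k] - 8) * scales[g][n] + zeros[g][n]
                acc += x[m][k] * w
            out[m][n] = acc
    return out
```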
- const u16vec3 ret_pos = u16vec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(ret_pos, ret_limits))) { - return; - } - - // Since ret is width packed, need to multiply by 4 - const uint16_t n = uint16_t(ret_pos.x * 4); - - // K is guaranteed to be a multiple of group size - const uint16_t num_blocks = uint16_t(x_sizes.x / group_size); - - uint16_t k_texel_i = uint16_t(0); - vec4 sums = vec4(0.0); - for (uint16_t block_idx = uint16_t(0); block_idx < num_blocks; block_idx++) { - vec4 scales; - vec4 zeros; - - [[unroll]] for (int comp = 0; comp < 4; comp++) { - const vec4 scale_and_zero = load_texel( - qparams, u16vec3(0, n + comp, block_idx)); - scales[comp] = scale_and_zero.x; - zeros[comp] = scale_and_zero.y; - } - - for (uint16_t i = uint16_t(0); i < group_size; i += uint16_t(4), k_texel_i++) { - const VEC4_T x_texel = load_texel( - x, u16vec3(k_texel_i, ret_pos.y, ret_pos.z)); - - [[unroll]] for (int comp = 0; comp < 4; comp++) { - const int weights_bufi = (n + comp) * weights_strides.y + (k_texel_i * 2); - // Need to read 4 unpacked values, which corresponds to 2 packed values - const uint8_t weights_val_1 = weights[weights_bufi]; - const uint8_t weights_val_2 = weights[weights_bufi + 1]; - - const u8vec4 weights_texel = u8vec4( - (weights_val_1 & 0xF0) >> 4, - weights_val_1 & 0x0F, - (weights_val_2 & 0xF0) >> 4, - weights_val_2 & 0x0F); - - // Note that the unpacked 4-bit values are unsigned, therefore they must - // first be "centered" around 0 by subtracting 8 before applying the - // scale and zero point. - sums[comp] += dot( - x_texel, (vec4(weights_texel) - 8.0) * scales[comp] + zeros[comp]); - } - } - } - write_texel(ret, ret_pos, sums); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml deleted file mode 100644 index 40d95d4a05f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -q_4w_linear: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: q_4w_linear_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.glsl new file mode 100644 index 00000000000..715f84d3a56 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.glsl @@ -0,0 +1,199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} + +#define TILE_ROWS ${TILE_ROWS} + +#define NGROUPS 8 +#define NWORKERS 8 + +${define_required_extensions(DTYPE)} +$if WEIGHT_STORAGE == "buffer": + ${define_required_extensions("uint8")} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, IN_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 mat1_sizes; + ivec4 qmat2_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 64; + +shared VEC4_T partial_sums[NGROUPS][NWORKERS][TILE_ROWS][2]; + +/* + * This shader computes a linear operator between a floating point input matrix + * x and a weights matrix that is quantized to 4 bits. Please refer to the + * q_4w_linear_tiled shader for more details. + * + * This shader implements a co-operative algorithm to compute the output. The + * work group size is {NGROUPS, 1, NWORKERS}, and each group of NWORKERS threads + * co-operates to compute TILE_ROWS * 2 output texels. Therefore, + * NGROUPS * TILE_ROWS * 2 output texels are computed across one work group. + * + * The threads co-operate by each thread computing a partial reduction along the + * K dimension. To illustrate the computation, consider a scalar variant of the + * algorithm that computes the dot product of 2 vectors. Also assume that + * NWORKERS is 8. + * + * Thread 1 in each group will compute: + * (mat1[0] * mat2[0]) + (mat1[8] * mat2[8]) + (mat1[16] * mat2[16]) + ... + * + * Thread 2 in each group will compute: + * (mat1[1] * mat2[1]) + (mat1[9] * mat2[9]) + (mat1[17] * mat2[17]) + ... + * + * Thread 3 in each group will compute: + * (mat1[2] * mat2[2]) + (mat1[10] * mat2[10]) + (mat1[18] * mat2[18]) + ... + * + * The partial accumulations are structured such that memory accesses in each + * loop iteration can be coalesced. + * + * Then, at the end, the first thread in each group will accumulate the partial + * accumulations computed by each thread to obtain the final result. + * + * Note that this shader assumes that all tensors are width packed. + */ +void main() { + const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; + // Each thread writes out 2 texels along the width axis, equivalent to 8 + // scalar elements. Therefore multiply the thread_idx.x by 8. + const uint out_col = gl_GlobalInvocationID.x << 3; + // Similar reasoning to the above, each thread works on 2 texels along the + // width axis so multiply thread_idx.x by 2.
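A scalar Python model of the co-operative reduction described in the comment above may help; this is a simplified sketch with hypothetical names, not the shader itself, showing how NWORKERS threads each accumulate a strided slice of the K dimension before a single thread combines the partial sums.

```python
def coop_dot_product(mat1, mat2, nworkers=8):
    # Each worker accumulates a strided partial sum over K; in the shader the
    # workers run in parallel and the strided access lets loads be coalesced.
    K = len(mat1)
    partial = [0.0] * nworkers
    for worker in range(nworkers):
        for k in range(worker, K, nworkers):
            partial[worker] += mat1[k] * mat2[k]
    # Worker 0 combines the partial sums (after the shared-memory barrier in
    # the shader) to produce the final result.
    return sum(partial)

assert coop_dot_product([1.0] * 16, [2.0] * 16) == 32.0
```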
+ const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; + + const uint gid = gl_LocalInvocationID.x; // group id + const uint wid = gl_LocalInvocationID.z; // worker id + + if (out_col >= out_sizes.x || out_row >= out_sizes.y) { + return; + } + + const int num_blocks = mat1_sizes.x / group_size; + + VEC4_T mat1[TILE_ROWS]; + VEC4_T qmat2[4][2]; + VEC4_T local_sums[TILE_ROWS][2]; + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + local_sums[r][0] = VEC4_T(0); + local_sums[r][1] = VEC4_T(0); + } + + VEC4_T scales[2]; + VEC4_T zeros[2]; + + $if WEIGHT_STORAGE == "buffer": + const int qmat2_stride = qmat2_sizes.x >> 2; + $if PARAMS_STORAGE == "buffer": + const int qparams_y_stride = out_sizes.x >> 2; + const int qparams_z_stride = qparams_y_stride * 2; + + for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { + $if PARAMS_STORAGE == "buffer": + scales[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx]; + zeros[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + qparams_y_stride]; + + scales[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1]; + zeros[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1 + qparams_y_stride]; + $else: + scales[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 0, block_idx), 0); + zeros[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 1, block_idx), 0); + + scales[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 0, block_idx), 0); + zeros[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 1, block_idx), 0); + + for (uint g_idx = 4 * wid; g_idx < group_size; g_idx += (4 * NWORKERS)) { + const uint k = block_idx * group_size + g_idx; + + // Preload B + [[unroll]] for (int r = 0; r < 4; ++r) { + $if WEIGHT_STORAGE == "buffer": + const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; + $else: + const uvec4 packed_weight_tex = texelFetch( + t_qmat2, + ivec2(gl_GlobalInvocationID.x, k + r), + 0); + + qmat2[r][0] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0) * scales[0] + zeros[0]; + qmat2[r][1] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0) * scales[1] + zeros[1]; + } + + // Preload A + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if IN_STORAGE == "buffer": + mat1[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2]; + $else: + mat1[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0); + } + + // Accumulate local output tile + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + local_sums[r][0] += mat1[r].x * qmat2[0][0] + + mat1[r].y * qmat2[1][0] + + mat1[r].z * qmat2[2][0] + + mat1[r].w * qmat2[3][0]; + + local_sums[r][1] += mat1[r].x * qmat2[0][1] + + mat1[r].y * qmat2[1][1] + + mat1[r].z * qmat2[2][1] + + mat1[r].w * qmat2[3][1]; + } + } + } + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + partial_sums[gid][wid][r][0] = local_sums[r][0]; + partial_sums[gid][wid][r][1] = local_sums[r][1]; + } + + memoryBarrierShared(); + barrier(); + + if (wid != 0) { + return; + } + + VEC4_T sums[TILE_ROWS][2]; + + for (int r = 0; r < TILE_ROWS; ++r) { + sums[r][0] = VEC4_T(0); + sums[r][1] = VEC4_T(0); + [[unroll]] for (int worker = 0; worker < NWORKERS; ++ worker) { + sums[r][0] += partial_sums[gid][worker][r][0]; + sums[r][1] += partial_sums[gid][worker][r][1]; + } + } + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if OUT_STORAGE == "buffer": + t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = sums[r][0]; + t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = sums[r][1]; + $else: + imageStore(t_out, 
ivec3(out_col_texel_idx, out_row + r, 0), sums[r][0]); + imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), sums[r][1]); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.yaml new file mode 100644 index 00000000000..504cc4ab3b1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.yaml @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q_4w_linear_coop: + parameter_names_with_default_values: + DTYPE: float + OUT_STORAGE: texture3d + IN_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + PARAMS_STORAGE: buffer + TILE_ROWS: 1 + shader_variants: + - NAME: q_4w_linear_coop_texture3d_texture3d_texture2d_float + - NAME: q_4w_linear_coop_buffer_buffer_texture2d_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + - NAME: q_4w_linear_coop_buffer_buffer_buffer_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.glsl new file mode 100644 index 00000000000..64d0991e489 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.glsl @@ -0,0 +1,161 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} + +#define TILE_ROWS ${TILE_ROWS} + +${define_required_extensions(DTYPE)} +$if WEIGHT_STORAGE == "buffer": + ${define_required_extensions("uint8")} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, IN_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 mat1_sizes; + ivec4 qmat2_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 64; + +/* + * This shader computes a linear operator between a floating point input matrix + * x and a weights matrix that is quantized to 4 bits. + * + * The (W, H, C) shape of each tensor is: + * - x: (K, M) + * - weights: (N / 2, K) + * - The weights tensor has a data type of `uint8`. Each element in the tensor + * contains 2 4-bit values packed into a uint8. + * - See the pack_int4_linear_weight_transposed_interleave shader to see more + * details on how the weight tensor is stored. + * - qparams: (2, N, number_of_groups) + * - This tensor contains the scales and zeros quantization parameters for the + * weights tensor. The weight tensor is quantized group-wise, which means + * that every `group_size` elements along the K dimension of the weights + * tensor has independent quantization parameters. 
Along the width dim, the + * first value contains the scale for the group and the second value + * contains the zero point for the group. + * + * Each thread computes a tile of TILE_ROWS * 2 texels of the output tensor. + * + * Note that this shader assumes that all tensors are width packed. + */ +void main() { + const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; + // Each thread writes out 2 texels along the width axis, equivalent to 8 + // scalar elements. Therefore multiply the thread_idx.x by 8. + const uint out_col = gl_GlobalInvocationID.x << 3; + // Similar reasoning to the above, each thread works on 2 texels along the + // width axis so multiply thread_idx.x by 2. + const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; + + if (out_col >= out_sizes.x || out_row >= out_sizes.y) { + return; + } + + const int num_blocks = mat1_sizes.x / group_size; + + VEC4_T mat1[TILE_ROWS]; + VEC4_T qmat2[4][2]; + VEC4_T sums[TILE_ROWS][2]; + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + sums[r][0] = VEC4_T(0); + sums[r][1] = VEC4_T(0); + } + + VEC4_T scales[2]; + VEC4_T zeros[2]; + + $if WEIGHT_STORAGE == "buffer": + const int qmat2_stride = qmat2_sizes.x >> 2; + $if PARAMS_STORAGE == "buffer": + const int qparams_y_stride = out_sizes.x >> 2; + const int qparams_z_stride = qparams_y_stride * 2; + + for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { + $if PARAMS_STORAGE == "buffer": + scales[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx]; + zeros[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + qparams_y_stride]; + + scales[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1]; + zeros[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1 + qparams_y_stride]; + $else: + scales[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 0, block_idx), 0); + zeros[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 1, block_idx), 0); + + scales[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 0, block_idx), 0); + zeros[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 1, block_idx), 0); + + for (int g_idx = 0; g_idx < group_size; g_idx += 4) { + const int k = block_idx * group_size + g_idx; + + // Preload B + [[unroll]] for (int r = 0; r < 4; ++r) { + $if WEIGHT_STORAGE == "buffer": + const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; + $else: + const uvec4 packed_weight_tex = texelFetch( + t_qmat2, + ivec2(gl_GlobalInvocationID.x, k + r), + 0); + + qmat2[r][0] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0) * scales[0] + zeros[0]; + qmat2[r][1] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0) * scales[1] + zeros[1]; + } + + // Preload A + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if IN_STORAGE == "buffer": + mat1[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2]; + $else: + mat1[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0); + } + + // Accumulate output tile + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + sums[r][0] += mat1[r].x * qmat2[0][0] + + mat1[r].y * qmat2[1][0] + + mat1[r].z * qmat2[2][0] + + mat1[r].w * qmat2[3][0]; + + sums[r][1] += mat1[r].x * qmat2[0][1] + + mat1[r].y * qmat2[1][1] + + mat1[r].z * qmat2[2][1] + + mat1[r].w * qmat2[3][1]; + } + } + } + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if OUT_STORAGE == "buffer": + if (out_row + r < out_sizes.y) { + t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = sums[r][0]; + t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = sums[r][1]; + } + $else: + 
imageStore(t_out, ivec3(out_col_texel_idx, out_row + r, 0), sums[r][0]); + imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), sums[r][1]); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.yaml new file mode 100644 index 00000000000..865a46629df --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.yaml @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q_4w_linear_tiled: + parameter_names_with_default_values: + DTYPE: float + OUT_STORAGE: texture3d + IN_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + PARAMS_STORAGE: buffer + TILE_ROWS: 3 + shader_variants: + - NAME: q_4w_linear_tiled_texture3d_texture3d_texture2d_float + - NAME: q_4w_linear_tiled_buffer_buffer_texture2d_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + - NAME: q_4w_linear_tiled_buffer_buffer_buffer_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 228e2e8f870..dfb5f1f2f9c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -29,16 +29,20 @@ ${layout_declare_tensor(2, "r", "t_qmat2", "int8", STORAGE)} ${layout_declare_tensor(3, "r", "t_scales", DTYPE, STORAGE)} $if STORAGE == "buffer": - ${layout_declare_ubo(4, "ivec4", "out_sizes")} - ${layout_declare_ubo(5, "ivec4", "out_strides")} - ${layout_declare_ubo(6, "int", "out_numel")} - ${layout_declare_ubo(7, "ivec4", "mat1_sizes")} - ${layout_declare_ubo(8, "ivec4", "mat1_strides")} - ${layout_declare_ubo(9, "ivec4", "qmat2_strides")} - ${layout_declare_ubo(10, "ivec4", "scales_strides")} + layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 out_strides; + ivec4 mat1_sizes; + ivec4 mat1_strides; + ivec4 qmat2_strides; + ivec4 scales_strides; + int out_numel; + }; $else: - ${layout_declare_ubo(4, "ivec3", "out_limits")} - ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} + layout(push_constant) uniform restrict Block { + ivec3 out_limits; + ivec4 mat1_sizes; + }; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -83,42 +87,40 @@ void main() { #else // USING_TEXTURE -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require - void main() { - const u16vec2 out_pos = u16vec2( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y); + const ivec2 out_pos = ivec2( + gl_GlobalInvocationID.x % out_limits.x, + gl_GlobalInvocationID.x / out_limits.x); - if (out_pos.x >= out_limits.x || out_pos.y >= out_limits.y) { + if (out_pos.y >= out_limits.y) { return; } - const uint16_t qmat2_pos_x = out_pos.x; + const int qmat2_pos_x = out_pos.x; VEC4_T outtex = VEC4_T(0); - const VEC4_T scales = load_texel(t_scales, u16vec3(out_pos.x, 0, 0)); + const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0)); VEC4_T mat1_tex; VEC4_T mat2_tex[4]; for ( - uint16_t i = uint16_t(0), x = uint16_t(0); - i < uint16_t(mat1_sizes.x); - i += uint16_t(4), x++) + int i = 0, x = 0; + i < mat1_sizes.x; + i += 4, x++) { - mat1_tex = load_texel(t_mat1, u16vec3(x, out_pos.y, 0)); + mat1_tex = load_texel(t_mat1, ivec3(x, out_pos.y, 0)); - mat2_tex[0] = load_texel(t_qmat2, u16vec3(out_pos.x, i, 0)); - mat2_tex[1] = 
load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(1), 0)); - mat2_tex[2] = load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(2), 0)); - mat2_tex[3] = load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(3), 0)); + mat2_tex[0] = load_texel(t_qmat2, ivec3(out_pos.x, i, 0)); + mat2_tex[1] = load_texel(t_qmat2, ivec3(out_pos.x, i + 1, 0)); + mat2_tex[2] = load_texel(t_qmat2, ivec3(out_pos.x, i + 2, 0)); + mat2_tex[3] = load_texel(t_qmat2, ivec3(out_pos.x, i + 3, 0)); outtex += mat1_tex.x * mat2_tex[0] + mat1_tex.y * mat2_tex[1] + mat1_tex.z * mat2_tex[2] + mat1_tex.w * mat2_tex[3]; } outtex *= scales; - write_texel(t_out, u16vec3(out_pos, 0), outtex); + write_texel(t_out, ivec3(out_pos, 0), outtex); } #endif diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 18599ed4ba6..060f5028c02 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -262,11 +262,6 @@ void check_conv2d_params(const Kernel2dParams& p, const bool transposed) { "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!"); } } - if ((p.padding[0] > 0 && p.kernel_size[0] > 1 && p.dilation[0] > 1) || - (p.padding[1] > 0 && p.kernel_size[1] > 1 && p.dilation[1] > 1)) { - VK_THROW( - "aten.convolution.default: padding > 0 while dilation, kernel_size > 1 is not supported yet!"); - } } Conv2dMethod get_conv2d_method( diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp index 7aa98e52654..f2e8eff763a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -83,8 +83,19 @@ void add_native_layer_norm_node( std::vector in_sizes = t_input->sizes(); - utils::uvec3 global_size = t_mean->logical_limits(); - utils::uvec3 local_size = adaptive_work_group_size(global_size); + utils::uvec3 global_size = t_out->logical_limits(); + utils::uvec3 local_size; + + // Since the shader uses a shared memory scale factor > 1 when the dispatch + // is larger than the maximum WG size, setting the WG size in the X axis to + // the max WG size allows the best thread utilization. + if (global_size[0] > 64) { + local_size = {64, 1, 1}; + } else { + // If the thread count in the X axis is smaller than or equal to the maximum + // WG size, we can let the function decide the best WG size. + local_size = graph.create_local_wg_size(global_size); + } std::string kernel_name("native_layer_norm"); kernel_name.reserve(kShaderNameReserve); diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp new file mode 100644 index 00000000000..4b33dd9b806 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
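The q_8w_linear texture path above now derives the 2-D output position from a flat 1-D invocation index instead of a 2-D dispatch. A small Python sketch of that mapping, purely illustrative with hypothetical names:

```python
def texel_positions_from_flat_dispatch(numel, limits_x, limits_y):
    # Mirrors out_pos = ivec2(gid % limits.x, gid / limits.x) plus the
    # out_pos.y bounds check performed by the shader.
    positions = []
    for gid in range(numel):
        x, y = gid % limits_x, gid // limits_x
        if y >= limits_y:
            continue
        positions.append((x, y))
    return positions

# Every texel of a 3 x 2 output is covered exactly once.
assert len(texel_positions_from_flat_dispatch(6, 3, 2)) == 6
```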
+ */ + +#include + +#include + +#include +#include + +namespace vkcompute { + +void check_q_4w_linear_args( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros, + const ValueRef out) { + VK_CHECK_COND(graph.val_is_tensor(mat1)); + VK_CHECK_COND(graph.val_is_tref(mat2_data)); + VK_CHECK_COND(graph.val_is_tref(scales_and_zeros)); + + VK_CHECK_COND(graph.dim_of(mat1) <= 3); + VK_CHECK_COND(graph.dim_of(mat2_data) == 2); + VK_CHECK_COND(graph.dim_of(scales_and_zeros) == 3); + + VK_CHECK_COND(graph.size_at(-3, mat1) == 1); + const int K = graph.size_at(-1, mat1); + VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); + + const int group_size_val = graph.extract_scalar(group_size); + VK_CHECK_COND(K % group_size_val == 0); + // Due to the way weight packing works, group size needs to be a multiple of 8 + VK_CHECK_COND(group_size_val % 8 == 0); + + VK_CHECK_COND(graph.has_standard_axis_map(mat1)); + VK_CHECK_COND(graph.has_standard_axis_map(out)); +} + +void resize_q_4w_linear_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); + + const int out_cols = utils::val_at(-2, mat1->sizes()); + const int out_rows = utils::val_at(-1, mat2->sizes()) * 2; + + std::vector new_out_sizes(3); + if (mat1->sizes().size() == 2) { + new_out_sizes.resize(2); + new_out_sizes.at(0) = out_cols; + new_out_sizes.at(1) = out_rows; + } else { + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = out_cols; + new_out_sizes.at(2) = out_rows; + } + + out->virtual_resize(new_out_sizes); +} + +ValueRef prepack_int4_linear_weight_transposed_interleaved( + ComputeGraph& graph, + const ValueRef qmat2_data) { + std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); + const int64_t ndim = graph.dim_of(qmat2_data); + + const int64_t K = qmat2_orig_sizes.at(ndim - 1) * 2; + const int64_t N = qmat2_orig_sizes.at(ndim - 2); + const int64_t N_div2 = N / int64_t(2); + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (N_div2 > max_extent * 4 || K > max_extent) { + storage_type = utils::kBuffer; + } + + std::vector qmat2_sizes{K, N_div2}; + ValueRef qmat2 = graph.add_tensor( + qmat2_sizes, vkcompute::vkapi::kByte, storage_type, utils::kWidthPacked); + + utils::uvec3 global_wg_size; + global_wg_size = graph.logical_limits_of(qmat2); + global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(2)); + + std::string kernel_name = + graph.context()->adapter_ptr()->has_full_int8_buffers_support() + ? 
"pack_int4_linear_weight_transposed_interleaved" + : "pack_int4_linear_weight_transposed_interleaved_nobitw8buffer"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + qmat2_data, + qmat2, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(qmat2)})); + + return qmat2; +} + +void add_q_4w_linear_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros_data, + const ValueRef out) { + check_q_4w_linear_args( + graph, mat1, mat2_data, group_size, scales_and_zeros_data, out); + + const uint32_t group_size_val = graph.extract_scalar(group_size); + + bool use_coop_algorithm = false; + // Apply the coop algorithm for gemv cases, i.e. mat1 is a vector as opposed + // to a matrix. + if (graph.size_at(-2, mat1) == 1) { + use_coop_algorithm = true; + } + + ValueRef mat2 = + prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data); + + ValueRef scales_and_zeros = prepack_standard_hw_transposed( + graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked); + + std::string kernel_name = "q_4w_linear"; + if (use_coop_algorithm) { + kernel_name += "_coop"; + } else { + kernel_name += "_tiled"; + } + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(mat2)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + utils::uvec3 global_wg_size = graph.logical_limits_of(out); + global_wg_size[0] = utils::div_up(global_wg_size[0], uint32_t(2)); + utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); + + if (use_coop_algorithm) { + local_wg_size = {8, 1, 8}; + } else { + global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(3)); + } + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {{mat1, mat2, scales_and_zeros}, vkapi::kRead}}, + // Shader params buffers + {}, + // Specialization Constants + {SV(group_size_val)}, + // Resizing Logic + resize_q_4w_linear_node, + {}, + // Push Constants + {graph.sizes_pc_of(out), + graph.sizes_pc_of(mat1), + graph.sizes_pc_of(mat2)})); +} + +void linear_weight_int4( + ComputeGraph& graph, + const std::vector& args) { + return add_q_4w_linear_node( + graph, + args[0], // mat1 + args[1], // mat2 + args[2], // group_size + args[3], // scales_and_zeros + // There is an unused variable inner_k_tiles which is used to call + // _convert_weight_to_int4pack in the AOT custom op, which is why the 4th + // argument is skipped. 
+ args[5] // out + ); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.linear_weight_int4.default, linear_weight_int4); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp similarity index 56% rename from backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp rename to backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp index f4f5c853ddd..5054b2e5e9c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp @@ -98,47 +98,25 @@ void add_q_8w_linear_node( add_dtype_suffix(kernel_name, graph.dtype_of(out_W_packed)); add_storage_type_suffix(kernel_name, graph.storage_type_of(out_W_packed)); - vkapi::ParamsBindList ubos({}); + std::vector pcs; if (graph.is_buffer_storage(out_W_packed)) { - ubos.append( - {graph.sizes_ubo(out_W_packed), - graph.strides_ubo(out_W_packed), - graph.numel_ubo(out_W_packed), - graph.sizes_ubo(mat1_W_packed), - graph.strides_ubo(mat1), - graph.strides_ubo(q_mat2), - graph.strides_ubo(scales)}); + pcs = { + graph.sizes_pc_of(out_W_packed), + graph.strides_pc_of(out_W_packed), + graph.sizes_pc_of(mat1_W_packed), + graph.strides_pc_of(mat1), + graph.strides_pc_of(q_mat2), + graph.strides_pc_of(scales), + graph.numel_pc_of(out_W_packed)}; } else { - ubos.append( - {graph.logical_limits_ubo(out_W_packed), - graph.sizes_ubo(mat1_W_packed)}); + pcs = { + graph.logical_limits_pc_of(out_W_packed), + graph.sizes_pc_of(mat1_W_packed)}; } - utils::uvec3 global_wg; - if (graph.is_buffer_storage(out)) { - global_wg = {static_cast(graph.numel_of(out_W_packed)), 1, 1}; - } else { - global_wg = graph.logical_limits_of(out_W_packed); - } - - utils::uvec3 local_wg{8, 8, 1}; - int32_t out_W = graph.size_at(-1, out_W_packed); - - if (graph.is_buffer_storage(out_W_packed)) { - local_wg[0] = 64; - local_wg[1] = 1; - local_wg[2] = 1; - } else { - if (out_W % 8 != 0) { - if (out_W % 4 == 0) { - local_wg[0] = 4; - local_wg[1] = 16; - } else { - local_wg[0] = 2; - local_wg[1] = 32; - } - } - } + const utils::uvec3 global_wg = { + static_cast(graph.numel_of(out_W_packed)), 1, 1}; + const utils::uvec3 local_wg{64, 1, 1}; graph.execute_nodes().emplace_back(new DispatchNode( graph, @@ -149,11 +127,13 @@ void add_q_8w_linear_node( {{out_W_packed, vkapi::MemoryAccessType::WRITE}, {{mat1_W_packed, q_mat2, scales}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - ubos, + {}, // Specialization Constants {}, // Resizing Logic - resize_q_8w_linear_node)); + resize_q_8w_linear_node, + {}, + pcs)); if (!graph.is_buffer_storage(out) && graph.packed_dim_of(out) != WHCN::kWidthDim) { viewFn(graph, {out_W_packed, graph.add_none(), out}); @@ -268,157 +248,8 @@ void weight_int8pack_mm( return add_q_8w_linear_node(graph, args[0], args[1], args[2], args[3]); } -void check_q_4w_linear_args( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros, - const ValueRef out) { - VK_CHECK_COND(graph.int16_shader_types_enabled()); - VK_CHECK_COND(graph.int8_buffers_enabled()); - - VK_CHECK_COND(graph.val_is_tensor(mat1)); - VK_CHECK_COND(graph.val_is_tref(mat2_data)); - VK_CHECK_COND(graph.val_is_tref(scales_and_zeros)); - - VK_CHECK_COND(graph.dim_of(mat1) <= 3); - VK_CHECK_COND(graph.dim_of(mat2_data) == 2); - VK_CHECK_COND(graph.dim_of(scales_and_zeros) == 3); - - VK_CHECK_COND(graph.size_at(-3, mat1) == 1); - const int 
K = graph.size_at(-1, mat1); - VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); - - const int group_size_val = graph.extract_scalar(group_size); - VK_CHECK_COND(K % group_size_val == 0); - - VK_CHECK_COND(graph.has_standard_axis_map(mat1)); - VK_CHECK_COND(graph.has_standard_axis_map(out)); -} - -void resize_q_4w_linear_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - vTensorPtr out = graph->get_tensor(args[0].refs[0]); - vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); - vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); - - const int out_cols = utils::val_at(-2, mat1->sizes()); - const int out_rows = utils::val_at(-2, mat2->sizes()); - - std::vector new_out_sizes(3); - if (mat1->sizes().size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1->sizes().at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - out->virtual_resize(new_out_sizes); -} - -void add_q_4w_linear_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros_data, - const ValueRef out) { - check_q_4w_linear_args( - graph, mat1, mat2_data, group_size, scales_and_zeros_data, out); - - utils::StorageType storage_type = graph.storage_type_of(out); - - ValueRef mat2 = prepack_direct_copy_buffer(graph, mat2_data); - - ValueRef scales_and_zeros = prepack_standard( - graph, - scales_and_zeros_data, - graph.storage_type_of(out), - utils::kWidthPacked); - - std::string kernel_name = "q_4w_linear"; - add_storage_type_suffix(kernel_name, storage_type); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const uint32_t group_size_val = graph.extract_scalar(group_size); - - ValueRef mat1_W_packed = mat1; - ValueRef out_W_packed = out; - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - // Create temporary tensors to store the width packed versions of mat1 and out - TmpTensor mat1_tmp( - &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked); - TmpTensor out_tmp( - &graph, graph.sizes_of(out), graph.dtype_of(out), utils::kWidthPacked); - if (storage_type == utils::kTexture3D) { - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(mat1) != WHCN::kWidthDim) { - // Ensure mat1 is width packed - mat1_W_packed = mat1_tmp; - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - // Ensure out is packed correctly - out_W_packed = out_tmp; - } - } - - vkapi::ParamsBindList ubos({}); - ubos.append(graph.logical_limits_ubo(out_W_packed)); - ubos.append(graph.sizes_ubo(mat1_W_packed)); - ubos.append(graph.strides_ubo(mat2)); - ubos.append(graph.strides_ubo(scales_and_zeros)); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out_W_packed); - utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Inputs and Outputs - {{out_W_packed, vkapi::MemoryAccessType::WRITE}, - {{mat1_W_packed, mat2, scales_and_zeros}, - vkapi::MemoryAccessType::READ}}, - // Shader params buffers - ubos, - // Specialization Constants - {SV(group_size_val)}, - // Resizing Logic - resize_q_4w_linear_node, - {})); - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(out) != WHCN::kWidthDim) { - viewFn(graph, {out_W_packed, graph.add_none(), out}); - } -} - -void linear_weight_int4( - 
ComputeGraph& graph, - const std::vector& args) { - return add_q_4w_linear_node( - graph, - args[0], // mat1 - args[1], // mat2 - args[2], // group_size - args[3], // scales_and_zeros - // There is an unused variable inner_k_tiles which is used to call - // _convert_weight_to_int4pack in the AOT custom op, which is why the 4th - // argument is skipped. - args[5] // out - ); -} - REGISTER_OPERATORS { VK_REGISTER_OP(aten._weight_int8pack_mm.default, weight_int8pack_mm); - VK_REGISTER_OP(et_vk.linear_weight_int4.default, linear_weight_int4); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index be0554161d3..d73ed1bc0ce 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -211,6 +211,18 @@ class Adapter final { return physical_device_.min_ubo_alignment; } + inline uint32_t max_texture2d_dim() const { + return physical_device_.properties.limits.maxImageDimension2D; + } + + inline uint32_t max_texture3d_dim() const { + return physical_device_.properties.limits.maxImageDimension3D; + } + + inline uint32_t max_buffer_numel() const { + return physical_device_.properties.limits.maxStorageBufferRange; + } + // Command Buffer Submission void diff --git a/backends/vulkan/test/CMakeLists.txt b/backends/vulkan/test/CMakeLists.txt index 4559077ccf8..592d7fca40e 100644 --- a/backends/vulkan/test/CMakeLists.txt +++ b/backends/vulkan/test/CMakeLists.txt @@ -46,8 +46,7 @@ if(LIB_VULKAN_BACKEND) set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../third-party) set(GTEST_INCLUDE_PATH - ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include set - (PYTORCH_PATH ${EXECUTORCH_ROOT}/third-party/pytorch) + ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include ) set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) diff --git a/backends/vulkan/test/op_tests/CMakeLists.txt b/backends/vulkan/test/op_tests/CMakeLists.txt new file mode 100644 index 00000000000..0c0558b7917 --- /dev/null +++ b/backends/vulkan/test/op_tests/CMakeLists.txt @@ -0,0 +1,166 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON + +cmake_minimum_required(VERSION 3.19) +project(executorch) + +find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) +find_package(GTest CONFIG REQUIRED) + +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) +endif() + +# Include this file to access target_link_options_shared_lib This is required to +# provide access to target_link_options_shared_lib which allows libraries to be +# linked with the --whole-archive flag. This is required for libraries that +# perform dynamic registration via static initialization. 
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + +get_torch_base_path(TORCH_BASE_PATH) +message(STATUS "torch base path: ${TORCH_BASE_PATH}") + +# Only build tests if Vulkan was compiled +find_library(LIB_VULKAN_BACKEND vulkan_backend) +find_library(LIB_TORCH torch ${TORCH_BASE_PATH}/lib) +find_library(LIB_TORCH_CPU torch_cpu ${TORCH_BASE_PATH}/lib) +find_library(LIB_C10 c10 ${TORCH_BASE_PATH}/lib) + +message(STATUS "Vulkan backend lib ${LIB_VULKAN_BACKEND}") +message(STATUS "Torch ${LIB_TORCH}") + +if(NOT PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) +endif() + +# Third party include paths + +set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party) + +set(GTEST_INCLUDE_PATH + ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include +) +set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) +set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) +set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) + +set(COMMON_INCLUDES + ${EXECUTORCH_ROOT}/.. + ${VULKAN_HEADERS_PATH} + ${VOLK_PATH} + ${VMA_PATH} + ${GTEST_INCLUDE_PATH} + ${TORCH_BASE_PATH}/include + ${TORCH_BASE_PATH}/include/torch/csrc/api/include +) + +target_link_options_shared_lib(vulkan_backend) + +function(vulkan_op_test test_name test_src) + set(extra_deps ${ARGN}) + + add_executable(${test_name} ${test_src}) + target_include_directories(${test_name} PRIVATE ${COMMON_INCLUDES}) + target_link_libraries( + ${test_name} + PRIVATE GTest::gtest_main + vulkan_backend + executorch + ${LIB_TORCH} + ${LIB_TORCH_CPU} + ${LIB_C10} + ${extra_deps} + ) + + add_test(${test_name} ${test_name}) +endfunction() + +if(LIB_VULKAN_BACKEND AND LIB_TORCH) + find_library( + CUSTOM_OPS_LIB custom_ops_aot_lib + HINTS ${CMAKE_INSTALL_PREFIX}/executorch/extension/llm/custom_ops + ) + if(CUSTOM_OPS_LIB) + vulkan_op_test( + vulkan_sdpa_test ${CMAKE_CURRENT_SOURCE_DIR}/sdpa_test.cpp + ${CUSTOM_OPS_LIB} + ) + else() + message( + STATUS "Skip building sdpa_test because custom_ops_aot_lib is not found" + ) + endif() + vulkan_op_test( + vulkan_rope_test ${CMAKE_CURRENT_SOURCE_DIR}/rotary_embedding_test.cpp + ) + vulkan_op_test( + vulkan_linear_weight_int4_test + ${CMAKE_CURRENT_SOURCE_DIR}/linear_weight_int4_test.cpp + ) + + # Only build generated op tests if a path to tags.yaml and + # native_functions.yaml is provided. These files are required for codegen. 
+ if(TORCH_OPS_YAML_PATH) + set(GENERATED_VULKAN_TESTS_CPP_PATH ${CMAKE_CURRENT_BINARY_DIR}/vk_gen_cpp) + + # Generated operator correctness tests + + set(generated_test_cpp ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_tests.cpp) + + add_custom_command( + COMMENT "Generating Vulkan operator correctness tests" + OUTPUT ${generated_test_cpp} + COMMAND + ${PYTHON_EXECUTABLE} + ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_correctness_tests.py + -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path + ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path + ${TORCH_OPS_YAML_PATH}/native_functions.yaml + DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py + ) + + vulkan_op_test(vulkan_op_correctness_tests ${generated_test_cpp}) + + # Generated operator benchmarks (only built in google benchmark is + # installed) + find_package(benchmark CONFIG) + + if(benchmark_FOUND) + set(generated_benchmark_cpp + ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_benchmarks.cpp + ) + + add_custom_command( + COMMENT "Generating Vulkan operator benchmarks" + OUTPUT ${generated_benchmark_cpp} + COMMAND + ${PYTHON_EXECUTABLE} + ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_benchmarks.py + -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path + ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path + ${TORCH_OPS_YAML_PATH}/native_functions.yaml + DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py + ) + + vulkan_op_test(vulkan_op_benchmarks ${generated_benchmark_cpp}) + endif() + else() + message( + STATUS + "Skipping generated operator correctness tests and benchmarks. Please specify TORCH_OPS_YAML_PATH to build these tests." + ) + endif() +endif() diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 85008a52ff0..a1b03db27c9 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -226,153 +226,190 @@ def get_max_pool2d_inputs(): @register_test_suite("aten.convolution.default") def get_conv_inputs(): - test_suite = VkTestSuite( + Test = namedtuple( + "ConvTest", [ - ( - (1, 6, 40, 50), - (8, 6, 3, 3), - (8,), - [1, 2], - [2, 3], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 6, 40, 50), - (6, 8, 3, 3), - (8,), - [1, 2], - [2, 3], - [1, 1], - True, - [0, 1], - 1, - ), - ( - (1, 8, 72, 96), - (8, 1, 3, 3), - (8,), - [1, 1], - [1, 1], - [1, 1], - False, - [0, 0], - 8, - ), - ( - (1, 8, 72, 96), - (8, 8, 1, 1), - (8,), - [1, 1], - [1, 1], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 6, 40, 50), - (8, 6, 3, 3), - None, - [1, 2], - [2, 3], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 6, 7), - (6, 1, 3), - (6,), - [1], - [0], - [1], - False, - [0], - 6, - ), - ( - (2, 20, 30), - (10, 4, 6), - (10,), - [5], - [5], - [3], - False, - [0], - 5, - ), - ( - (1, 9, 11), - (9, 1, 3), - None, - [1], - [0], - [1], - False, - [0], - 9, - ), - ( - (5, 15, 30), - (20, 3, 3), - None, - [3], - [5], - [7], - False, - [0], - 5, - ), - ( - (1, 16, 672, 512), - (64, 16, 1, 1), - (64,), - [1, 1], - [0, 0], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 4, 234, 234), - (4, 1, 3, 3), - (4,), - [2, 1], - [1, 1], - [1, 1], - False, - [0, 0], - 4, - ), - ( - (1, 4, 234, 234), - (4, 1, 3, 3), - (4,), - [1, 2], - [1, 1], - [1, 1], - False, - [0, 0], - 4, - ), - ( - (1, 4, 234, 234), - (4, 1, 3, 3), - (4,), - [2, 2], - [1, 1], - [1, 1], - False, - [0, 0], - 4, - ), - ] + "self", + "weight", + "bias", + "stride", + "padding", + "dilation", + "transposed", + "output_padding", + "groups", + ], + ) + Test.__new__.__defaults__ = ( + 
None, + None, + None, + [1, 1], + [0, 0], + [1, 1], + False, + [9, 0], + 1, ) + test_cases = [] + test_cases = [ + Test( + self=(1, 6, 40, 50), + weight=(8, 6, 3, 3), + bias=(8,), + stride=[1, 2], + padding=[2, 3], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 6, 40, 50), + weight=(6, 8, 3, 3), + bias=(8,), + stride=[1, 2], + padding=[2, 3], + dilation=[1, 1], + transposed=True, + output_padding=[0, 1], + groups=1, + ), + Test( + self=(1, 8, 72, 96), + weight=(8, 1, 3, 3), + bias=(8,), + stride=[1, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=8, + ), + Test( + self=(1, 8, 72, 96), + weight=(8, 8, 1, 1), + bias=(8,), + stride=[1, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 6, 40, 50), + weight=(8, 6, 3, 3), + bias=None, + stride=[1, 2], + padding=[2, 3], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 6, 7), + weight=(6, 1, 3), + bias=(6,), + stride=[1], + padding=[0], + dilation=[1], + transposed=False, + output_padding=[0], + groups=6, + ), + Test( + self=(2, 20, 30), + weight=(10, 4, 6), + bias=(10,), + stride=[5], + padding=[5], + dilation=[3], + transposed=False, + output_padding=[0], + groups=5, + ), + Test( + self=(1, 9, 11), + weight=(9, 1, 3), + bias=None, + stride=[1], + padding=[0], + dilation=[1], + transposed=False, + output_padding=[0], + groups=9, + ), + Test( + self=(5, 15, 30), + weight=(20, 3, 3), + bias=None, + stride=[3], + padding=[5], + dilation=[7], + transposed=False, + output_padding=[0], + groups=5, + ), + Test( + self=(1, 16, 672, 512), + weight=(64, 16, 1, 1), + bias=(64,), + stride=[1, 1], + padding=[0, 0], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[2, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[1, 2], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[2, 2], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 8, 90, 77), + weight=(1, 8, 3, 3), + bias=(1,), + stride=[1, 1], + padding=[2, 2], + dilation=[2, 2], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + ] + + test_suite = VkTestSuite(test_cases) return test_suite diff --git a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py index 4e51e23940b..8814070abd3 100644 --- a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py +++ b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py @@ -58,6 +58,9 @@ def process_test_suites( def generate_cpp( native_functions_yaml_path: str, tags_path: str, output_dir: str ) -> None: + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_file = os.path.join(output_dir, "op_tests.cpp") cpp_generator = VkCorrectnessTestFileGen(output_file) diff --git a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp index 66a585844cf..e617f5b5249 100644 --- a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp +++ 
b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp @@ -152,13 +152,17 @@ vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { } } -void test_vulkan_linear_int4( +void test_vulkan_linear_int4_impl( const int B, const int M, const int K, const int N, const int group_size = 32, - const int inner_k_tiles = 8) { + const int inner_k_tiles = 8, + const vkcompute::utils::StorageType in_storage = + vkcompute::utils::kTexture3D, + const vkcompute::utils::StorageType out_storage = + vkcompute::utils::kTexture3D) { assert(K % group_size == 0); at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); @@ -169,8 +173,13 @@ void test_vulkan_linear_int4( at::Tensor scales_and_zeros = at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor out_ref = dequantize_and_linear( - x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); + at::Tensor weights_int = unpack_weights_4x2(weights_4x2); + at::Tensor out_ref = linear_weight_int4_reference_impl( + x, + at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size), + group_size, + scales_and_zeros, + inner_k_tiles); // Build Vulkan graph using namespace vkcompute; @@ -188,14 +197,13 @@ void test_vulkan_linear_int4( MAKE_TENSORREF_FOR(weights_4x2); MAKE_TENSORREF_FOR(scales_and_zeros); -#define MAKE_INPUT_FOR(x) \ - IOValueRef r_##x = graph.add_input_tensor( \ - x.sizes().vec(), from_at_scalartype(x.scalar_type())); - - MAKE_INPUT_FOR(x); + IOValueRef r_x = graph.add_input_tensor( + x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), from_at_scalartype(out_ref.scalar_type())); + out_ref.sizes().vec(), + from_at_scalartype(out_ref.scalar_type()), + out_storage); VK_GET_OP_FN("et_vk.linear_weight_int4.default") (graph, @@ -229,6 +237,34 @@ void test_vulkan_linear_int4( ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); } +void test_vulkan_linear_int4( + const int B, + const int M, + const int K, + const int N, + const int group_size = 32, + const int inner_k_tiles = 8) { + test_vulkan_linear_int4_impl( + B, + M, + K, + N, + group_size, + inner_k_tiles, + vkcompute::utils::kBuffer, + vkcompute::utils::kBuffer); + + test_vulkan_linear_int4_impl( + B, + M, + K, + N, + group_size, + inner_k_tiles, + vkcompute::utils::kTexture3D, + vkcompute::utils::kTexture3D); +} + TEST(VulkanInt4LinearTest, test_reference_impl) { test_reference_linear_int4( /*B = */ 1, @@ -237,15 +273,24 @@ TEST(VulkanInt4LinearTest, test_reference_impl) { /*N = */ 32); } -TEST(VulkanInt4LinearTest, test_vulkan_impl) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } +TEST(VulkanInt4LinearTest, test_vulkan_impl_small_m) { test_vulkan_linear_int4( /*B = */ 1, /*M = */ 4, /*K = */ 128, /*N = */ 32); + + test_vulkan_linear_int4( + /*B = */ 1, + /*M = */ 1, + /*K = */ 256, + /*N = */ 256); +} + +TEST(VulkanInt4LinearTest, test_vulkan_impl_gemm) { + test_vulkan_linear_int4( + /*B = */ 1, + /*M = */ 256, + /*K = */ 256, + /*N = */ 256); } diff --git a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py index 983d2c82bd0..65bb959f6d1 100644 --- a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py +++ b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py @@ -228,7 +228,7 @@ def generate_benchmark_fixture(self) -> str: return at::from_blob(values.data(), sizes, at::kFloat).toType(dtype).detach().clone(); 
}} -at::Tensor make_index_tensor(std::vector indices) {{ +at::Tensor make_index_tensor_1d(std::vector indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{static_cast(indices.size())}}; @@ -236,7 +236,7 @@ def generate_benchmark_fixture(self) -> str: return at::from_blob(indices.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector> indices) {{ +at::Tensor make_index_tensor_2d(std::vector> indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), @@ -252,7 +252,7 @@ def generate_benchmark_fixture(self) -> str: return at::from_blob(acc.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector>> indices) {{ +at::Tensor make_index_tensor_3d(std::vector>> indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py index 708da8eab85..b24879f660a 100644 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -229,11 +229,10 @@ def create_aten_fn_call(self) -> str: def create_aten_method_call(self) -> str: # For functions with only Method variant, we fallback to the function - # declared in MethodOperators.h. The method is declared as - # at::_ops::{name}::call(*), and ATEN_FN is a handly macro. + # declared in MethodOperators.h cpp_sig = gen_static_dispatch_backend_call_signature(self.f_sig, self.f) exprs = translate_args(self.f_sig, cpp_sig) - func_call = f"ATEN_FN({self.f_sig.name()})({exprs});" + func_call = f"at::_ops::{self.f_sig.name()}::call({exprs});" return func_call def create_out_src(self, include_declarations: bool = True) -> str: diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py index d7e38969452..e6ce135736b 100644 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py @@ -170,7 +170,13 @@ def create_input_data(self, arg: Argument, data: Any) -> str: # noqa: C901 if cpp_type == AT_TENSOR: if arg.name == "index" or arg.name == "indices": - ret_str += f"make_index_tensor({init_list_str(data)});" + args_str = init_list_str(data) + if args_str[:3] == "{{{": + ret_str += f"make_index_tensor_3d({init_list_str(data)});" + elif args_str[:2] == "{{": + ret_str += f"make_index_tensor_2d({init_list_str(data)});" + else: + ret_str += f"make_index_tensor_1d({init_list_str(data)});" else: ret_str += self.call_data_gen_fn(arg, data) elif cpp_type == OPT_AT_TENSOR: @@ -278,7 +284,7 @@ def generate_suite_cpp(self) -> str: float high = 1.0) {{ if (high == 1.0 && low == 0.0) return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); - + if (dtype == at::kChar) return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); @@ -307,7 +313,7 @@ def generate_suite_cpp(self) -> str: return at::from_blob(values.data(), sizes, at::kFloat).toType(dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector indices) {{ +at::Tensor make_index_tensor_1d(std::vector indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{static_cast(indices.size())}}; @@ -315,7 +321,7 @@ def generate_suite_cpp(self) -> str: return at::from_blob(indices.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector> indices) {{ +at::Tensor make_index_tensor_2d(std::vector> indices) {{ 
at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), @@ -331,7 +337,7 @@ def generate_suite_cpp(self) -> str: return at::from_blob(acc.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector>> indices) {{ +at::Tensor make_index_tensor_3d(std::vector>> indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 1c1c51bb58a..188311e5f2c 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -47,7 +47,7 @@ ) from executorch.exir.backend.utils import DelegateMappingBuilder -from executorch.exir.memory_planning import greedy, memory_planning_algorithm_suite +from executorch.exir.memory_planning import greedy, MemoryPlanningAlgorithmSuite from executorch.exir.pass_base import ExportPass, PassBase from executorch.exir.passes import MemoryPlanningPass, SpecPropPass @@ -199,8 +199,8 @@ def preprocess( # noqa: C901 # Finally, apply dynamic shape passes and memory planning pass. These passes # must be applied only when the graph structure is finalized. greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False) - mem_planning_suite = partial( - memory_planning_algorithm_suite, algo_list=[greedy_memory_planning] + mem_planning_suite = MemoryPlanningAlgorithmSuite( + algo_list=[greedy_memory_planning] ) program = apply_passes( program, diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 967a852599a..2328f8e4b90 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -131,6 +131,6 @@ create an issue on [github](https://www.github.com/pytorch/executorch/issues). ## See Also -For more information about the XNNPACK Delegate, please check out the following resources: -- [ExecuTorch XNNPACK Delegate](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html) -- [Building and Running ExecuTorch with XNNPACK Backend](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html) +For more information about the XNNPACK Backend, please check out the following resources: +- [XNNPACK Backend](https://pytorch.org/executorch/main/backends-xnnpack) +- [XNNPACK Backend Internals](https://pytorch.org/executorch/main/backend-delegates-xnnpack-reference) diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py index 89a44f303df..768df1f4f04 100644 --- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py @@ -8,6 +8,7 @@ import torch from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack.utils.quant_utils import is_dynamic_qdq from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult @@ -283,6 +284,14 @@ def input_to_nhwc( ] else: # Need to create NHWC node + # Check if input uses dynamic quantization + is_dynamic_input = is_dynamic_qdq(input_node) + + if is_dynamic_input: + # Trace back to original source node + while getattr(input_node, "args", None): + input_node = input_node.args[0] + with graph_module.graph.inserting_after(input_node): input_node_nhwc = self.create_call_function_node( graph_module=graph_module, @@ -290,7 +299,11 @@ def input_to_nhwc( 
args=(input_node,), memory_format=torch.channels_last, ) - self.mark_as_nhwc_node(input_node_nhwc) + + if is_dynamic_input: + # Replace downstream input_nodes with NHWC node + input_node.replace_all_uses_with(input_node_nhwc) + input_node_nhwc.args = (input_node,) self.insert_copy_and_assign_partner_nodes_quantization_sensitive( graph_module=graph_module, diff --git a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py index 82a35236294..bef9ac40c02 100644 --- a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py @@ -78,7 +78,7 @@ def define_node( @register_node_visitor class OpDequantizeAffine(NodeVisitor): - target = "quant.dequantize_affine.default" + target = "torchao.dequantize_affine.default" def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/xnnpack/operators/op_dynamic_quantize_ops.py b/backends/xnnpack/operators/op_dynamic_quantize_ops.py index 9369f025216..6c6d31d82a4 100644 --- a/backends/xnnpack/operators/op_dynamic_quantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_quantize_ops.py @@ -127,7 +127,7 @@ def define_node( @register_node_visitor class OpQuantizeAffine(NodeVisitor): - target = "quant.quantize_affine.default" + target = "torchao.quantize_affine.default" def define_node( self, diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py index face7342d8f..19df74e77ac 100644 --- a/backends/xnnpack/operators/op_skip_ops.py +++ b/backends/xnnpack/operators/op_skip_ops.py @@ -85,7 +85,7 @@ class OpChooseQparamsAffine(OpSkipOps): do nothing if node is choose_qparams_affine.default """ - target = "quant.choose_qparams_affine.default" + target = "torchao.choose_qparams_affine.default" @register_node_visitor diff --git a/backends/xnnpack/operators/op_slice_copy.py b/backends/xnnpack/operators/op_slice_copy.py index 40d8e5f04eb..d9056afa832 100644 --- a/backends/xnnpack/operators/op_slice_copy.py +++ b/backends/xnnpack/operators/op_slice_copy.py @@ -69,7 +69,9 @@ def define_node( output_shape = [output_shape[i] for i in PERM_NCHW_TO_NHWC] dim_of_slice = PERM_NHWC_TO_NCHW[dim_of_slice] - slice_begin_index = cast(int, node.args[2]) + slice_begin_index = 0 + if len(node.args) > 2 and node.args[2]: + slice_begin_index = cast(int, node.args[2]) if slice_begin_index < 0: slice_begin_index = input_shape[dim_of_slice] + slice_begin_index diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index e695b151560..fbee1d192cf 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -141,12 +141,27 @@ def quantize_tensor(self, tensor: torch.Tensor) -> torch.Tensor: tensor, self.scale, self.zp, self.qmin, self.qmax, self.dtype ) + # Temporary helper until non-batch dimensions can be inferred + # Detects if a node feeds into a conv op by checking all downstream users + @staticmethod + def _feeds_into_conv(node: torch.fx.Node) -> bool: + users_list = [node] + + while users_list: + current_user = users_list.pop() + if "convolution" in str(current_user.target): + return True + users_list.extend(current_user.users) + + return False + @classmethod def _from_dynamic_input_node(cls, quant_node: torch.fx.Node) -> QuantParams: q_input = quant_node.args[0] # fp32 input assert isinstance(q_input, torch.fx.Node) # TODO - materialize this from the quant_node scale count and val shape - num_nonbatch_dims = 1 + 
# Set non-batch dims to 3 if node feeds into conv (only 2D is supported), otherwise set to 1 for linear + num_nonbatch_dims = 3 if cls._feeds_into_conv(quant_node) else 1 return cls( per_channel=False, # True is not valid diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index 8712c2709ac..67bccbc52d1 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -9,6 +9,7 @@ from typing import cast, List, Optional, Tuple import torch +from executorch.backends.transforms import get_shape from executorch.backends.xnnpack.operators.quant_params import QuantParams from executorch.backends.xnnpack.partition.config.xnnpack_config import ( ConfigPrecisionType, @@ -27,6 +28,7 @@ ) from executorch.backends.xnnpack.utils.utils import ( get_input_node, + is_depthwise_conv, is_getitem, is_node, is_param_node, @@ -359,12 +361,23 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False # Only support 1D + 2D Conv kernel_node = get_input_node(node, 1) + kernel_shape = get_shape(kernel_node) weight_quant_params = QuantParams.from_weights(kernel_node, ep) - - is_transpose = node.args[6] groups = cast(int, node.args[8]) + is_transpose = node.args[6] + + # XNNPACK does not support dynamic quantization convs that are not 2D or are depthwise + if self._detect_precision(node) == ConfigPrecisionType.DYNAMIC_QUANT and ( + len(conv_stride) != 2 + or is_depthwise_conv(kernel_shape, groups, is_transpose) + ): + why( + node, + "XNNPACK only supports standard 2D convolutions for dynamic quantization", + ) + return False - # XNNPack does not support non-zero output padding in transposed + # XNNPACK does not support non-zero output padding in transposed # convolutions. 
if is_transpose and any( out_pad != 0 for out_pad in cast(List[int], node.args[7]) @@ -394,6 +407,7 @@ def supported_precision_types(self): return [ ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT, + ConfigPrecisionType.DYNAMIC_QUANT, ] diff --git a/backends/xnnpack/partition/config/quant_affine_configs.py b/backends/xnnpack/partition/config/quant_affine_configs.py index d9e789104b6..046402800a3 100644 --- a/backends/xnnpack/partition/config/quant_affine_configs.py +++ b/backends/xnnpack/partition/config/quant_affine_configs.py @@ -36,7 +36,7 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: try: import torchao.quantization.quant_primitives # noqa - return torch.ops.quant.quantize_affine.default + return torch.ops.torchao.quantize_affine.default except: return None @@ -48,7 +48,7 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: try: import torchao.quantization.quant_primitives # noqa - return torch.ops.quant.dequantize_affine.default + return torch.ops.torchao.dequantize_affine.default except: return None @@ -60,6 +60,6 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: try: import torchao.quantization.quant_primitives # noqa - return torch.ops.quant.choose_qparams_affine.default + return torch.ops.torchao.choose_qparams_affine.default except: return None diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer.py b/backends/xnnpack/quantizer/xnnpack_quantizer.py index 0ddee53a41a..fdabd0383e6 100644 --- a/backends/xnnpack/quantizer/xnnpack_quantizer.py +++ b/backends/xnnpack/quantizer/xnnpack_quantizer.py @@ -265,6 +265,7 @@ class XNNPACKQuantizer(Quantizer): DYNAMIC_OPS = [ "linear", + "conv", ] def __init__(self) -> None: diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py index ce459806c6e..4b961bef81d 100644 --- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py +++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F +from executorch.backends.xnnpack.utils.utils import is_depthwise_conv from torch._subclasses import FakeTensor from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix from torch.ao.quantization.pt2e.export_utils import _WrapperModule @@ -29,7 +30,6 @@ ) from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - __all__ = [ "OperatorConfig", "OperatorPatternType", @@ -323,6 +323,23 @@ def _do_annotate_conv( assert isinstance(weight, Node) input_qspec_map[weight] = get_weight_qspec(quantization_config) + # Only annotate dynamically quantized conv if it's 2D and not depthwise + if ( + quantization_config + and quantization_config.input_activation + and quantization_config.input_activation.is_dynamic + ): + weight_val = weight.meta.get("val", None) + weight_shape = getattr(weight_val, "shape", None) + + # Skip if not a 4D weight tensor (i.e. 
not conv2d) + if weight_shape is not None and len(weight_shape) != 4: + continue + + # Skip if depthwise (default to groups=1 since it's not an arg) + if is_depthwise_conv(weight_shape, 1, is_conv_transpose): + continue + # adding weight node to the partition as well partition = [conv_node, conv_node.args[1]] diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index c0204831c07..0b187d05df0 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -512,11 +512,6 @@ Error defineTensor( buffer_ptr == nullptr, Internal, "Dynamically quantized tensor should not have constant data but found non-nullptr"); - // TODO(T179441835): Dynamic Quantization with num_nonbatch_dims > 1 - ET_CHECK_OR_RETURN_ERROR( - qparams->num_nonbatch_dims() == 1, - Internal, - "Dynamically Quantized Tensors currently only support per token quantization"); status = xnn_define_dynamically_quantized_tensor_value( /*subgraph=*/subgraph_ptr, /*datatype=*/getDataType(tensor_value->datatype()), @@ -1172,7 +1167,7 @@ Error defineStaticTransposeNode( ET_CHECK_OR_RETURN_ERROR( status == xnn_status_success, Internal, - "Failed to create sigmoid node %i with code: %s", + "Failed to create static transpose node %i with code: %s", node->debug_handle(), xnn_status_to_string(status)); diff --git a/backends/xnnpack/test/ops/test_check_quant_params.py b/backends/xnnpack/test/ops/test_check_quant_params.py index b76935a9f72..d05b1fce540 100644 --- a/backends/xnnpack/test/ops/test_check_quant_params.py +++ b/backends/xnnpack/test/ops/test_check_quant_params.py @@ -52,7 +52,7 @@ def _test_check_quant_message(self, ep_modifier, expected_message): torch._dynamo.reset() mod = torch.nn.Linear(10, 10) quantizer = XNNPACKQuantizer() - captured = export_for_training(mod, (torch.randn(1, 10),)).module() + captured = export_for_training(mod, (torch.randn(1, 10),), strict=True).module() quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True)) prepared = prepare_pt2e(captured, quantizer) @@ -65,10 +65,9 @@ def _test_check_quant_message(self, ep_modifier, expected_message): with self.assertRaises(ValueError) as context: to_edge_transform_and_lower(aten, partitioner=[XnnpackPartitioner()]) - self.assertEquals(str(context.exception), expected_message) + self.assertEqual(str(context.exception), expected_message) def test_in_per_tensor_quant(self): - for invalid_scale in [ float("nan"), float("inf"), diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py index 80b731bd18e..92bb03c907a 100644 --- a/backends/xnnpack/test/ops/test_conv2d.py +++ b/backends/xnnpack/test/ops/test_conv2d.py @@ -18,6 +18,10 @@ except: has_quantized_ops = False +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, +) +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, ) @@ -26,7 +30,7 @@ ) from executorch.backends.xnnpack.test.test_xnnpack_utils import randomize_bn from executorch.backends.xnnpack.test.tester import Quantize, Tester - +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower from executorch.exir.dialects._ops import ops as exir_ops @@ -169,6 +173,43 @@ def get_inputs(self): return (torch.randn(2, 2, 4, 4),) +class Conv2dDQSeq(torch.nn.Module): + def __init__(self): + super().__init__() + self.first = 
torch.nn.Conv2d( + in_channels=3, out_channels=8, kernel_size=3, padding=1 + ) + self.second = torch.nn.Conv2d( + in_channels=8, out_channels=10, kernel_size=3, padding=1 + ) + + def forward(self, x): + y = self.first(x) + return self.second(y) + + def get_inputs(self): + return (torch.randn(1, 3, 8, 8),) + + +class Conv2dDQParallel(torch.nn.Module): + def __init__(self): + super().__init__() + self.first = torch.nn.Conv2d( + in_channels=3, out_channels=8, kernel_size=3, padding=1 + ) + self.second = torch.nn.Conv2d( + in_channels=3, out_channels=8, kernel_size=3, padding=1 + ) + + def forward(self, x): + first = self.first(x) + second = self.second(x) + return first, second + + def get_inputs(self): + return (torch.randn(1, 3, 8, 8),) + + class TestConv2d(unittest.TestCase): def setUp(self): torch._dynamo.reset() @@ -223,6 +264,37 @@ def _test( .run_method_and_compare_outputs(qtol=1) ) + def _test_dq( + self, + m: torch.nn.Module, + conv_count=1, + dynamic_shapes=None, + ): + quant_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=True, + ) + + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=True, + ) + + tester = Tester(m, m.get_inputs(), dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) + tester.export() + tester.check(["torch.ops.quantized_decomposed.choose_qparams"]) + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) + ) + tester.check_count( + {"torch.ops.higher_order.executorch_call_delegate": conv_count} + ) + tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"]) + tester.to_executorch() + tester.serialize() + tester.run_method_and_compare_outputs(qtol=1) + def test_fp16_conv2d(self) -> None: for transpose in (True, False): for has_bias in (True, False): @@ -699,3 +771,26 @@ def forward(self, x): .serialize() .run_method_and_compare_outputs(qtol=1) ) + + def test_dq_conv2d(self) -> None: + model = Conv2d( + in_channels=3, + out_channels=10, + kernel_size=(3, 3), + stride=(1, 1), + padding=(0, 0), + batches=1, + width=8, + height=8, + ) + self._test_dq(model) + + def test_dq_conv2d_seq(self) -> None: + model = Conv2dDQSeq() + conv_count = sum(1 for m in model.modules() if type(m) is torch.nn.Conv2d) + self._test_dq(model, conv_count) + + def test_dq_conv2d_parallel(self) -> None: + model = Conv2dDQParallel() + conv_count = sum(1 for m in model.modules() if type(m) is torch.nn.Conv2d) + self._test_dq(model, conv_count) diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index bcaf2e82a08..421e59c0b08 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -402,9 +402,9 @@ def _test_groupwise_dq_linear( .export() .check_count( { - "torch.ops.quant.choose_qparams_affine.default": 1 * num_linears, - "torch.ops.quant.quantize_affine.default": 1 * num_linears, - "torch.ops.quant.dequantize_affine.default": 2 * num_linears, + "torch.ops.torchao.choose_qparams_affine.default": 1 * num_linears, + "torch.ops.torchao.quantize_affine.default": 1 * num_linears, + "torch.ops.torchao.dequantize_affine.default": 2 * num_linears, "torch.ops.aten.linear.default": 1 * num_linears, } ) diff --git a/backends/xnnpack/test/ops/test_slice_copy.py b/backends/xnnpack/test/ops/test_slice_copy.py index ea65571b1e8..857c78480ad 100644 --- a/backends/xnnpack/test/ops/test_slice_copy.py +++ 
b/backends/xnnpack/test/ops/test_slice_copy.py @@ -69,6 +69,18 @@ def forward(self, x): # Note that two of the slices are optimized away as they are identity. self._test_slice_copy(ConvSlice(), inputs, 4, 2) + def test_fp32_slice_copy_default_start(self): + """ + XNNPACK supports default start in slice op. + """ + + class Slice(torch.nn.Module): + def forward(self, x): + return torch.ops.aten.slice.Tensor(x, 0, None, 2) + + inputs = (torch.randn(5, 5),) + self._test_slice_copy(Slice(), inputs, 1, 1) + def test_fp32_slice_copy_stride_non_1(self): """ XNNPACK does not support strided slicing. diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py index 6d60f9d76b5..a00209f4ea6 100644 --- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py +++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py @@ -10,10 +10,13 @@ from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ( ChannelsLastTaggedReshapePass, ) +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, +) from executorch.backends.xnnpack.test.test_xnnpack_utils_classes import ( OpSequencesAddConv2d, ) -from executorch.backends.xnnpack.test.tester import RunPasses, Tester +from executorch.backends.xnnpack.test.tester import Quantize, RunPasses, Tester class TestChannelsLastTaggedReshapePass(unittest.TestCase): @@ -35,6 +38,10 @@ def setUp(self): dequant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default" conv_name = "executorch_exir_dialects_edge__ops_aten_convolution_default" relu_name = "executorch_exir_dialects_edge__ops_aten_relu_default" + choose_qparams_name = ( + "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_tensor" + ) + dynamic_quant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_tensor" def test_fp32_channels_last_tagged_reshape_pass(self): for module, num_reshape in self.modules.items(): @@ -179,3 +186,37 @@ def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self): ) .run_method_and_compare_outputs() ) + + class Conv2dDynamicQuant(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 10, 3) + + def forward(self, x): + return self.conv(x) + + def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None: + ( + Tester(self.Conv2dDynamicQuant().eval(), (torch.randn(1, 3, 8, 8),)) + .quantize( + Quantize( + quantization_config=get_symmetric_quantization_config( + is_dynamic=True + ) + ) + ) + .export() + .to_edge() + .run_passes(self.PassStage) + .check( + [ + self.to_copy_name, + self.choose_qparams_name, + self.dynamic_quant_name, + self.dequant_name, + self.conv_name, + self.to_copy_name, + ] + ) + .run_method_and_compare_outputs() + ) diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index db1914e3910..cb91b78c123 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -12,6 +12,7 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from torch.fx.experimental.symbolic_shapes import free_symbols, has_free_symbols _Q_OPS = { "quantize_per_tensor.tensor", @@ -126,8 +127,8 @@ def is_affine_qdq(node: torch.fx.Node) -> bool: def _get_block_size_input_scale(node: torch.fx.Node): assert is_affine_qdq(node) block_size = node.args[1] 
- input_val = node.all_input_nodes[0].meta["val"] - scale_val = node.all_input_nodes[1].meta["val"] + input_val = cast(torch.fx.Node, node.args[0]).meta["val"] + scale_val = cast(torch.fx.Node, node.args[2]).meta["val"] return block_size, input_val, scale_val @@ -145,7 +146,21 @@ def is_per_token(node: torch.fx.Node): flag &= block_size[i] == 1 scale_numel_expected *= input_val.shape[i] - flag &= block_size[-1] == input_val.shape[-1] + ic_block_size = block_size[-1] + if isinstance(ic_block_size, torch.fx.Node): + ic_block_size = ic_block_size.meta["val"] + assert free_symbols( + ic_block_size + ), f"block_size: {block_size} given, but {block_size[-1]} is not a dynamic symint" + + ic_dim = input_val.shape[-1] + if isinstance(ic_dim, torch.fx.Node): + ic_dim = ic_dim.meta["val"] + assert free_symbols( + ic_dim + ), f"input_shape: {input_val.shape} given, but {input_val.shape[-1]} is not a dynamic symint" + + flag &= ic_dim == ic_block_size flag &= scale_val.numel() == scale_numel_expected return flag @@ -160,6 +175,11 @@ def is_per_channel_group(node: torch.fx.Node): return True elif is_affine_qdq(node): block_size, input_val, scale_val = _get_block_size_input_scale(node) + # per channel group is only valid on static weights + # so scales and weights can't have dynamic shape + if has_free_symbols(input_val.shape) or has_free_symbols(scale_val.shape): + return False + flag = True flag &= len(block_size) == 2 flag &= block_size[0] == 1 diff --git a/backends/xnnpack/utils/utils.py b/backends/xnnpack/utils/utils.py index fab95618807..b23fd444117 100644 --- a/backends/xnnpack/utils/utils.py +++ b/backends/xnnpack/utils/utils.py @@ -158,3 +158,33 @@ def get_source_fn(node: torch.fx.Node) -> Optional[torch.fx.Node]: return None source_fn = source_fn_st[-1] return source_fn[1] + + +def is_depthwise_conv( + kernel_shape: Tuple[int, ...], groups: int = 1, is_transpose: bool = False +) -> bool: + """ + A convolution is depthwise if: + 1) groups = input_channels (i.e. group_input_channels = 1) + 2) output_channels is a positive integer multiple of input channels + + For standard convolutions: + weight shape = (out_channels, in_channels_per_group, height, width) + For transposed convolutions: + weight shape = (in_channels, out_channels_per_group, height, width) + + Returns True if the convolution is depthwise + """ + if len(kernel_shape) < 2 or groups < 1: + return False + + if is_transpose: + group_input_channels = int(kernel_shape[0] / groups) + group_output_channels = kernel_shape[1] + else: + group_input_channels = kernel_shape[1] + group_output_channels = int(kernel_shape[0] / groups) + + return ( + group_input_channels == 1 and group_output_channels % group_input_channels == 0 + ) diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index 3076cde1a99..180baf9b2a9 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -22,8 +22,8 @@ // JIT op registry instead of c10 dispatcher. JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. 
-using KernelSpan = - ::executorch::runtime::Span; +using KernelSpan = ::executorch::runtime::Span< + const ::executorch::ET_RUNTIME_NAMESPACE::Kernel>; namespace torch { namespace executor { namespace function { diff --git a/devtools/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp index f12262f7dd0..df4124e0038 100644 --- a/devtools/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -27,13 +27,13 @@ using executorch::aten::ArrayRef; using executorch::aten::Half; using executorch::aten::ScalarType; using executorch::aten::Tensor; +using ::executorch::ET_RUNTIME_NAMESPACE::Method; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; -using ::executorch::runtime::Method; using ::executorch::runtime::Result; namespace executorch { -namespace bundled_program { +namespace BUNDLED_PROGRAM_NAMESPACE { namespace { @@ -332,8 +332,9 @@ ET_NODISCARD Error load_bundled_input( static_cast(status)); } - ::executorch::runtime::internal::event_tracer_set_bundled_input_index( - method.get_event_tracer(), testset_idx); + ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_set_bundled_input_index( + method.get_event_tracer(), testset_idx); return Error::Ok; } @@ -432,5 +433,5 @@ bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { file_data); } -} // namespace bundled_program +} // namespace BUNDLED_PROGRAM_NAMESPACE } // namespace executorch diff --git a/devtools/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h index 884ca6f21bc..14f26ce00f7 100644 --- a/devtools/bundled_program/bundled_program.h +++ b/devtools/bundled_program/bundled_program.h @@ -10,15 +10,20 @@ #include #include +#ifdef USE_ATEN_LIB +#define BUNDLED_PROGRAM_NAMESPACE bundled_program::aten +#else // !USE_ATEN_LIB +#define BUNDLED_PROGRAM_NAMESPACE bundled_program +#endif // USE_ATEN_LIB namespace executorch { -namespace bundled_program { +namespace BUNDLED_PROGRAM_NAMESPACE { /** * An opaque pointer to a serialized bundled program. */ using SerializedBundledProgram = const void; - +using ::executorch::ET_RUNTIME_NAMESPACE::Method; /** * Load testset_idx-th bundled input of method_idx-th Method test in * bundled_program_ptr to given Method. @@ -31,7 +36,7 @@ using SerializedBundledProgram = const void; * execution. */ ET_NODISCARD ::executorch::runtime::Error load_bundled_input( - ::executorch::runtime::Method& method, + Method& method, SerializedBundledProgram* bundled_program_ptr, size_t testset_idx); @@ -49,7 +54,7 @@ ET_NODISCARD ::executorch::runtime::Error load_bundled_input( * execution. */ ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( - ::executorch::runtime::Method& method, + Method& method, SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, @@ -94,7 +99,7 @@ ET_DEPRECATED inline bool is_bundled_program(void* file_data) { return is_bundled_program(file_data, 128); } -} // namespace bundled_program +} // namespace BUNDLED_PROGRAM_NAMESPACE } // namespace executorch namespace torch { @@ -103,24 +108,24 @@ namespace bundled_program { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
using serialized_bundled_program = - ::executorch::bundled_program::SerializedBundledProgram; + ::executorch::BUNDLED_PROGRAM_NAMESPACE::SerializedBundledProgram; ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( - ::executorch::runtime::Method& method, + Method& method, serialized_bundled_program* bundled_program_ptr, size_t testset_idx) { - return ::executorch::bundled_program::load_bundled_input( + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( method, bundled_program_ptr, testset_idx); } ET_NODISCARD inline ::executorch::runtime::Error VerifyResultWithBundledExpectedOutput( - ::executorch::runtime::Method& method, + Method& method, serialized_bundled_program* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8) { - return ::executorch::bundled_program::verify_method_outputs( + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs( method, bundled_program_ptr, testset_idx, rtol, atol); } @@ -129,13 +134,14 @@ ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( size_t file_data_len, const void** out_program_data, size_t* out_program_data_len) { - return ::executorch::bundled_program::get_program_data( + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::get_program_data( file_data, file_data_len, out_program_data, out_program_data_len); } inline bool IsBundledProgram(void* file_data) { // 128 is enough data to contain the identifier in the flatbuffer header. - return ::executorch::bundled_program::is_bundled_program(file_data, 128); + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::is_bundled_program( + file_data, 128); } } // namespace bundled_program } // namespace executor diff --git a/devtools/etdump/etdump_filter.h b/devtools/etdump/etdump_filter.h index 545823a5556..29db43be8b9 100644 --- a/devtools/etdump/etdump_filter.h +++ b/devtools/etdump/etdump_filter.h @@ -77,8 +77,9 @@ class ETDumpFilter : public ::executorch::runtime::EventTracerFilterBase { * * @return A Result indicating whether the event matches the filter * criteria. - * - True if the event matches the filter, or filter is unset. - * - False if the event does not match or is unknown. + * - True if the event matches the filter. + * - False if the event does not match, or is unknown, or filter is + * unset. * - An error code if an error occurs during filtering. 
*/ Result filter( diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp index ea464f2f5ce..4b5da78550e 100644 --- a/devtools/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -42,7 +42,7 @@ namespace executorch { namespace etdump { namespace { -executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( +Result get_flatbuffer_scalar_type( executorch::aten::ScalarType tensor_scalar_type) { switch (tensor_scalar_type) { case executorch::aten::ScalarType::Byte: @@ -66,21 +66,26 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( case executorch::aten::ScalarType::UInt16: return executorch_flatbuffer_ScalarType_UINT16; default: - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( 0, + InvalidArgument, "This ScalarType = %hhd is not yet supported in ETDump", static_cast(tensor_scalar_type)); } } -etdump_Tensor_ref_t add_tensor_entry( +Result add_tensor_entry( flatcc_builder_t* builder_, const executorch::aten::Tensor& tensor, long offset) { etdump_Tensor_start(builder_); - etdump_Tensor_scalar_type_add( - builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + Result scalar_type = + get_flatbuffer_scalar_type(tensor.scalar_type()); + if (!scalar_type.ok()) { + return scalar_type.error(); + } + etdump_Tensor_scalar_type_add(builder_, scalar_type.get()); etdump_Tensor_sizes_start(builder_); for (auto dim : tensor.sizes()) { @@ -323,40 +328,32 @@ Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const ArrayRef output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const int& output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const bool& output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const double& output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } template @@ -369,6 +366,19 @@ Result ETDumpGen::log_intermediate_output_delegate_helper( InvalidArgument, "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); + if (filter_) { + Result result = filter_->filter(name, delegate_debug_index); + if (!result.ok()) { + return result; + } + + // If the filter returns true, meaning this event should be filtered out and + // we should not log it. + if (result.get()) { + return false; + } + } + check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; @@ -385,18 +395,26 @@ Result ETDumpGen::log_intermediate_output_delegate_helper( // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { long offset = write_tensor_or_raise_error(output); - etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); + Result tensor_ref = + add_tensor_entry(builder_, output, offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } etdump_Value_start(builder_); etdump_Value_val_add(builder_, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder_, tensor_ref); + etdump_Value_tensor_add(builder_, tensor_ref.get()); } else if constexpr (std::is_same>::value) { etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { long offset = write_tensor_or_raise_error(output[i]); - etdump_Tensor_vec_push( - builder_, add_tensor_entry(builder_, output[i], offset)); + Result tensor_ref = + add_tensor_entry(builder_, output[i], offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } + etdump_Tensor_vec_push(builder_, tensor_ref.get()); } etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = @@ -518,22 +536,26 @@ ETDumpResult ETDumpGen::get_etdump_data() { return result; } -void ETDumpGen::set_debug_buffer(Span buffer) { +Result ETDumpGen::set_debug_buffer(Span buffer) { Result bds_ret = BufferDataSink::create(buffer); - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( bds_ret.ok(), + InvalidArgument, "Failed to create data sink from debug buffer with error 0x%" PRIx32, static_cast(bds_ret.error())); buffer_data_sink_ = std::move(bds_ret.get()); data_sink_ = &buffer_data_sink_; + return true; } void ETDumpGen::set_data_sink(DataSinkBase* data_sink) { data_sink_ = data_sink; } -void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { +Result ETDumpGen::log_evalue( + const EValue& evalue, + LoggedEValueType evalue_type) { check_ready_to_add_events(); etdump_DebugEvent_start(builder_); @@ -545,12 +567,15 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { case Tag::Tensor: { executorch::aten::Tensor tensor = evalue.toTensor(); long offset = write_tensor_or_raise_error(tensor); - etdump_Tensor_ref_t tensor_ref = + Result tensor_ref = add_tensor_entry(builder_, tensor, offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } etdump_Value_start(builder_); etdump_Value_val_add(builder_, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder_, tensor_ref); + etdump_Value_tensor_add(builder_, tensor_ref.get()); if (evalue_type == LoggedEValueType::kProgramOutput) { auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); etdump_Value_output_add(builder_, bool_ref); @@ -567,8 +592,12 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { long offset = write_tensor_or_raise_error(tensors[i]); - etdump_Tensor_vec_push( - builder_, add_tensor_entry(builder_, tensors[i], offset)); + Result tensor_ref = + add_tensor_entry(builder_, tensors[i], offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } + etdump_Tensor_vec_push(builder_, tensor_ref.get()); } etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = @@ -640,6 +669,7 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { 
etdump_RunData_events_push_start(builder_); etdump_Event_debug_event_add(builder_, debug_event); etdump_RunData_events_push_end(builder_); + return true; } size_t ETDumpGen::get_num_blocks() { @@ -654,6 +684,11 @@ DataSinkBase* ETDumpGen::get_data_sink() { return data_sink_; } +void ETDumpGen::set_delegation_intermediate_output_filter( + EventTracerFilterBase* filter) { + filter_ = filter; +} + long ETDumpGen::write_tensor_or_raise_error(Tensor tensor) { // Previously, the function copy_tensor_to_debug_buffer returned 0xFF..F when // given an empty tensor, which is an invalid offset for most buffers. In our diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h index 6b51745eee3..ea0c1cb653d 100644 --- a/devtools/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -25,6 +25,7 @@ namespace executorch { namespace etdump { using ::executorch::runtime::DelegateDebugIntId; +using ::executorch::runtime::EventTracerFilterBase; using ::executorch::runtime::Result; namespace internal { @@ -101,7 +102,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { size_t size) override; virtual ::executorch::runtime::AllocatorID track_allocator( const char* name) override; - virtual void log_evalue( + virtual Result log_evalue( const ::executorch::runtime::EValue& evalue, ::executorch::runtime::LoggedEValueType evalue_type = ::executorch::runtime::LoggedEValueType::kIntermediateOutput) @@ -146,7 +147,14 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { const char* name, DelegateDebugIntId delegate_debug_index, const double& output) override; - void set_debug_buffer(::executorch::runtime::Span buffer); + + /** + * Set the filter of event tracer for delegation intermediate outputs. + */ + virtual void set_delegation_intermediate_output_filter( + EventTracerFilterBase* event_tracer_filter) override; + + Result set_debug_buffer(::executorch::runtime::Span buffer); void set_data_sink(DataSinkBase* data_sink); ETDumpResult get_etdump_data(); size_t get_num_blocks(); @@ -188,6 +196,8 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { int bundled_input_index_ = -1; State state_ = State::Init; struct internal::ETDumpStaticAllocator alloc_; + + EventTracerFilterBase* filter_ = nullptr; }; } // namespace etdump diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp index c64bab0448c..9d39a8bbde1 100644 --- a/devtools/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ using ::executorch::runtime::AllocatorID; using ::executorch::runtime::ArrayRef; using ::executorch::runtime::BoxedEvalueList; using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::DelegateDebugIntId; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerEntry; @@ -45,6 +47,8 @@ using ::executorch::runtime::testing::TensorFactory; using ::executorch::etdump::BufferDataSink; using ::executorch::etdump::FileDataSink; +using ::executorch::etdump::ETDumpFilter; + class ProfilerETDumpTest : public ::testing::Test { protected: void SetUp() override { @@ -75,6 +79,70 @@ class ProfilerETDumpTest : public ::testing::Test { "Must set data sink before writing tensor-like data"); } + void check_log_with_filter( + const char* name, + DelegateDebugIntId delegate_debug_index, + bool use_tensor_input, + bool expected_log, + bool expected_ok) { + TensorFactory tf; + 
for (size_t i = 0; i < 2; i++) { + const size_t buffer_size = 2048; + + void* ptr = malloc(buffer_size); + auto buffer_data_sink = BufferDataSink::create(ptr, buffer_size); + auto filter = ETDumpFilter(); + filter.add_regex("filtered.*"); + filter.set_debug_handle_range(1, 10); + etdump_gen[i]->set_delegation_intermediate_output_filter(&filter); + + etdump_gen[i]->create_event_block("test_block"); + etdump_gen[i]->set_data_sink(&buffer_data_sink.get()); + + // size of empty etdump + size_t initial_size = 68; + + // Perform logging + + if (use_tensor_input) { + auto tensor = tf.ones({3, 2}); + auto result = etdump_gen[i]->log_intermediate_output_delegate( + name, delegate_debug_index, tensor); + ASSERT_EQ(result.ok(), expected_ok); + if (expected_ok) { + ASSERT_EQ(result.get(), expected_log); + } + } else { // use tensor_list instead + std::vector tensors = {tf.ones({5, 4}), tf.ones({7, 6})}; + Result result = etdump_gen[i]->log_intermediate_output_delegate( + name, + delegate_debug_index, + ArrayRef(tensors.data(), tensors.size())); + ASSERT_EQ(result.ok(), expected_ok); + if (expected_ok) { + ASSERT_EQ(result.get(), expected_log); + } + } + + // Get final size of etdump + ETDumpResult final_result = etdump_gen[i]->get_etdump_data(); + size_t final_size = final_result.size; + // Check if the size of etdump has changed based on logging success + if (expected_log) { + ASSERT_NE(initial_size, final_size); // Expect size change if logged + } else { + ASSERT_EQ( + initial_size, final_size); // Expect no size change if not logged + } + + if (!etdump_gen[i]->is_static_etdump()) { + free(final_result.buf); + } + + free(ptr); + } + } + ETDumpGen* etdump_gen[2]; uint8_t* buf = nullptr; std::unique_ptr temp_file; @@ -652,7 +720,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { void* ptr = malloc(2048); Span buffer((uint8_t*)ptr, 2048); - ; + auto buffer_data_sink = BufferDataSink::create(ptr, 2048); auto file_data_sink = FileDataSink::create(dump_file_path.c_str()); @@ -892,3 +960,62 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } + +TEST_F(ProfilerETDumpTest, LogWithRegexAndUnsetDelegateDebugIdOnTensor) { + check_log_with_filter( + "filtered_event", + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/true); +} + +TEST_F(ProfilerETDumpTest, LogWithRegexAndUnsetDelegateDebugIdOnTensorList) { + check_log_with_filter( + "filtered_event", + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/true); +} + +TEST_F(ProfilerETDumpTest, LogWithNullptrAndInRange) { + check_log_with_filter( + nullptr, + 5, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/true); +} +TEST_F(ProfilerETDumpTest, LogWithNonMatchingRegexAndOutOfRange) { + check_log_with_filter( + "unfiltered_event", + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/true, + /*expected_ok=*/true); +} +TEST_F(ProfilerETDumpTest, LogWithNullptrAndOutOfRange) { + check_log_with_filter( + nullptr, + 20, + /*use_tensor_input=*/true, + /*expected_log=*/true, + /*expected_ok=*/true); +} +TEST_F(ProfilerETDumpTest, LogWithRegexAndInRange) { + check_log_with_filter( + "filtered_event", + 5, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/false); +} +TEST_F(ProfilerETDumpTest, LogWithNullptrAndUnsetDebugHandle) { + check_log_with_filter( + nullptr, + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/false); +} 
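The new filter tests above combine a name regex (`filtered.*`) with a delegate debug-handle range (1, 10) and then check whether an intermediate output is actually logged. As a conceptual summary only, here is a minimal Python sketch of the decision rule those tests imply: a regex match on the event name or an in-range debug handle means the event is dropped, and supplying both identifiers or neither is an error. The real implementation is the C++ `ETDumpFilter`, and the exact regex-matching semantics here are an assumption.

```python
import re
from typing import Optional


class IntermediateOutputFilterSketch:
    """Conceptual mirror of the filtering behaviour exercised by the tests above."""

    def __init__(self, name_pattern: str, handle_range: range) -> None:
        self.name_pattern = re.compile(name_pattern)
        self.handle_range = handle_range

    def should_drop(self, name: Optional[str], handle: Optional[int]) -> bool:
        # Exactly one identifier may be set, mirroring the "Only name or
        # delegate_debug_index can be valid" check in the ETDump generator.
        if (name is None) == (handle is None):
            raise ValueError("provide exactly one of: event name, delegate debug handle")
        if name is not None:
            return self.name_pattern.fullmatch(name) is not None
        return handle in self.handle_range


f = IntermediateOutputFilterSketch(r"filtered.*", range(1, 10))
assert f.should_drop("filtered_event", None)        # regex match     -> dropped
assert not f.should_drop("unfiltered_event", None)  # no match        -> logged
assert f.should_drop(None, 5)                       # handle in range -> dropped
assert not f.should_drop(None, 20)                  # out of range    -> logged
```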
diff --git a/devtools/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl index 7f266eed5a7..10eb8608362 100644 --- a/devtools/etdump/tests/targets.bzl +++ b/devtools/etdump/tests/targets.bzl @@ -19,6 +19,7 @@ def define_common_targets(): "//executorch/extension/testing_util:temp_file", "//executorch/runtime/platform:platform", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/devtools/etdump:etdump_filter", ], ) diff --git a/devtools/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py index 5e224415bb6..ee571e365fe 100644 --- a/devtools/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -205,7 +205,7 @@ def test_compare_results(self): self.assertAlmostEqual(calculate_cosine_similarity([a], [b])[0], 1.0) def test_compare_results_uint8(self): - a = torch.randint(0, 255, (4, 4), dtype=torch.uint8) + a = torch.randint(1, 255, (4, 4), dtype=torch.uint8) # Create tensor b which has very close value to tensor a b = a.clone() diff --git a/docs/README.md b/docs/README.md index e6dc66d335e..e30decb9362 100644 --- a/docs/README.md +++ b/docs/README.md @@ -102,7 +102,7 @@ The current version of PyTorch is ${executorch_version:pytorch}. This will result in the following output: - + Right now we only support PyTorch version as custom variable, but will support others in the future. @@ -130,7 +130,7 @@ Use the to contribute to the documentation. In addition to that, see -[Markdown in Sphinx Tips and Tricks](https://pytorch.org/executorch/markdown-sphinx-tips-tricks.html) +[Markdown in Sphinx Tips and Tricks](source/markdown-sphinx-tips-tricks.md) for tips on how to author high-quality markdown pages with Myst Parser. ## Adding Tutorials @@ -143,12 +143,12 @@ directory. Use one of the following templates: - [Markdown template](https://github.com/pytorch/executorch/blob/main/docs/source/tutorial-template.md) After creating a tutorial, make sure to add the corresponding path in the -[index.rst](./source/index.rst) file in the following places: +[index.md](source/index.md) file in the following places: - In the - [tutorials torctree](https://github.com/pytorch/executorch/blob/main/docs/source/index.rst?plain=1#L183) + [tutorials torctree](https://github.com/pytorch/executorch/blob/main/docs/source/index.md?plain=1#L185) - In the - [customcard section](https://github.com/pytorch/executorch/blob/main/docs/source/index.rst?plain=1#L201) + [customcard section](https://github.com/pytorch/executorch/blob/main/docs/source/index.md?plain=1#L201) If you want to include a Markdown tutorial that is stored in another directory outside of the `docs/source` directory, complete the following steps: @@ -163,7 +163,7 @@ outside of the `docs/source` directory, complete the following steps: **NOTE:** Your tutorial source file needs to follow the tutorial template. -3. Add the file that you have created in **Step 1** to the `index.rst` toctree +3. Add the file that you have created in **Step 1** to the `index.md` toctree and add a `customcarditem` with the link to that file. For example, if I wanted to include the `README.md` file from @@ -176,7 +176,7 @@ file: ```{include} ../../../examples/selective_build/README.md ```` -In the `index.rst` file, I would add `tutorials/selective-build-tutorial` in +In the `index.md` file, I would add `tutorials/selective-build-tutorial` in both the `toctree` and the `cusotmcarditem` sections. 
# Auto-generated API documentation @@ -211,7 +211,7 @@ executorch.exir ``` These separate `.rst` files should all be linked together, with the initial -landing page under `index.rst`. +landing page under `index.md`. ### C++ APIs @@ -236,4 +236,4 @@ important/relevant parts are: If you need to include new files, simply add them to the `INPUT` in the `Doxyfile`. The generated output is included to the ExecuTorch documentation -build and referenced in `index.rst`. +build and referenced in `index.md`. diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index 0d60bf51c7e..ef076e5794d 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -399,9 +399,9 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. +# https://python-sip.readthedocs.io/en/stable/introduction.html) sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO @@ -1483,8 +1483,9 @@ HTML_INDEX_NUM_ENTRIES = 100 # output directory. Running make will produce the docset in that directory and # running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy -# genXcode/_index.html for more information. +# startup. See +# https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html +# for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. 
diff --git a/docs/source/_static/img/demo_ios_app.jpg b/docs/source/_static/img/demo_ios_app.jpg deleted file mode 100644 index d45b3dd38f6..00000000000 Binary files a/docs/source/_static/img/demo_ios_app.jpg and /dev/null differ diff --git a/docs/source/_static/img/demo_ios_app.png b/docs/source/_static/img/demo_ios_app.png new file mode 100644 index 00000000000..97622123093 Binary files /dev/null and b/docs/source/_static/img/demo_ios_app.png differ diff --git a/docs/source/_static/img/llama_ios_app.mp4 b/docs/source/_static/img/llama_ios_app.mp4 index 2f5df08984d..b4bf23cfdf6 100644 Binary files a/docs/source/_static/img/llama_ios_app.mp4 and b/docs/source/_static/img/llama_ios_app.mp4 differ diff --git a/docs/source/_static/img/llama_ios_app.png b/docs/source/_static/img/llama_ios_app.png index d9088abc4f9..fff399cfe1d 100644 Binary files a/docs/source/_static/img/llama_ios_app.png and b/docs/source/_static/img/llama_ios_app.png differ diff --git a/docs/source/_static/img/new-contributor-guide/ci1.png b/docs/source/_static/img/new-contributor-guide/ci1.png new file mode 100644 index 00000000000..ba26f572913 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/ci1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/cla1.png b/docs/source/_static/img/new-contributor-guide/cla1.png new file mode 100644 index 00000000000..0e9918bd542 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/cla1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/cla2.png b/docs/source/_static/img/new-contributor-guide/cla2.png new file mode 100644 index 00000000000..e62d90b46fd Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/cla2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/end_of_draft_pr1.png b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr1.png new file mode 100644 index 00000000000..4bd8d085a9f Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/end_of_draft_pr2.png b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr2.png new file mode 100644 index 00000000000..0de46229b22 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/good_first_issues.png b/docs/source/_static/img/new-contributor-guide/good_first_issues.png new file mode 100644 index 00000000000..0c3a0564678 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/good_first_issues.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_clone.png b/docs/source/_static/img/new-contributor-guide/how_to_clone.png new file mode 100644 index 00000000000..6a8ba7e9a35 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_clone.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_draft_pr1.png b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr1.png new file mode 100644 index 00000000000..b92a7016d52 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_draft_pr2.png b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr2.png new file mode 100644 index 00000000000..46110ba7886 Binary files /dev/null and 
b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_draft_pr3.png b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr3.png new file mode 100644 index 00000000000..ca5bb03436c Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr3.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_fork1.png b/docs/source/_static/img/new-contributor-guide/how_to_fork1.png new file mode 100644 index 00000000000..c8f56d5a841 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_fork1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_fork2.png b/docs/source/_static/img/new-contributor-guide/how_to_fork2.png new file mode 100644 index 00000000000..ea4b2e9dfa2 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_fork2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_label1.png b/docs/source/_static/img/new-contributor-guide/how_to_label1.png new file mode 100644 index 00000000000..fb2d4e03868 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_label1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_label2.png b/docs/source/_static/img/new-contributor-guide/how_to_label2.png new file mode 100644 index 00000000000..f5d38561744 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_label2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_merge1.png b/docs/source/_static/img/new-contributor-guide/how_to_merge1.png new file mode 100644 index 00000000000..6f06911db97 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_merge1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_merge2.png b/docs/source/_static/img/new-contributor-guide/how_to_merge2.png new file mode 100644 index 00000000000..e7a38177b36 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_merge2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_merge3.png b/docs/source/_static/img/new-contributor-guide/how_to_merge3.png new file mode 100644 index 00000000000..88911271f04 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_merge3.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_pr1.png b/docs/source/_static/img/new-contributor-guide/how_to_pr1.png new file mode 100644 index 00000000000..454c86a6a02 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_pr1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_pr2.png b/docs/source/_static/img/new-contributor-guide/how_to_pr2.png new file mode 100644 index 00000000000..b3eb7900e81 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_pr2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_pr3.png b/docs/source/_static/img/new-contributor-guide/how_to_pr3.png new file mode 100644 index 00000000000..6c5037f78f4 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_pr3.png differ diff --git a/docs/source/_static/img/new-contributor-guide/pr_approval1.png b/docs/source/_static/img/new-contributor-guide/pr_approval1.png new file mode 100644 index 00000000000..d21ddd966ba Binary files /dev/null and 
b/docs/source/_static/img/new-contributor-guide/pr_approval1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/pr_approval2.png b/docs/source/_static/img/new-contributor-guide/pr_approval2.png new file mode 100644 index 00000000000..88c0c2389d4 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/pr_approval2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/release_notes.png b/docs/source/_static/img/new-contributor-guide/release_notes.png new file mode 100644 index 00000000000..8f5d34cf03d Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/release_notes.png differ diff --git a/docs/source/_static/img/new-contributor-guide/synced_fork.png b/docs/source/_static/img/new-contributor-guide/synced_fork.png new file mode 100644 index 00000000000..a2ba263df84 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/synced_fork.png differ diff --git a/docs/source/_static/img/new-contributor-guide/unsynced_fork.png b/docs/source/_static/img/new-contributor-guide/unsynced_fork.png new file mode 100644 index 00000000000..916b08424d5 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/unsynced_fork.png differ diff --git a/docs/source/_static/img/swiftpm_xcode1.png b/docs/source/_static/img/swiftpm_xcode1.png index 11b9c237827..3fcad383610 100644 Binary files a/docs/source/_static/img/swiftpm_xcode1.png and b/docs/source/_static/img/swiftpm_xcode1.png differ diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html index 210153e123c..55f91103b35 100644 --- a/docs/source/_templates/layout.html +++ b/docs/source/_templates/layout.html @@ -2,8 +2,8 @@ {% block extrahead %} {% if 'getting-started-setup' in pagename%} - - + + {% elif 'compiler-delegate-and-partitioner' in pagename%} @@ -74,7 +74,7 @@
{{ toc }}
{% endif %} {% endblock %} - + {% block footer %} {{ super() }} @@ -131,14 +131,14 @@ $(".main-menu a:contains('GitHub')").each(overwrite); // Overwrite link to Tutorials and Get Started top navigation. If these sections are moved // this overrides need to be updated. - $(".main-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/stable/index.html#tutorials-and-examples"); - $(".main-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/stable/getting-started-setup.html"); + $(".main-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/main/index#tutorials-and-examples"); + $(".main-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/main/getting-started-setup"); // Mobile $(".mobile-menu a:contains('Github')").each(overwrite); // Overwrite link to Tutorials and Get Started top navigation. If these sections are moved // this overrides need to be updated. - $(".mobile-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/stable/index.html#tutorials-and-examples"); - $(".mobile-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/stable/getting-started-setup.html"); + $(".mobile-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/main/index#tutorials-and-examples"); + $(".mobile-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/main/getting-started-setup"); }); diff --git a/docs/source/backend-delegates-integration.md b/docs/source/backend-delegates-integration.md index c127252e2f4..0179ceff872 100644 --- a/docs/source/backend-delegates-integration.md +++ b/docs/source/backend-delegates-integration.md @@ -16,7 +16,7 @@ Delegate Python files such as those implementing `preprocess()` or `partition()` functions for ExecuTorch AOT flow, excluding any external third-party dependencies and their files, should be installed and available with the top level ExecuTorch package. For third-party dependencies, please refer to -[this](./backend-delegates-dependencies.md). +[this](backend-delegates-dependencies.md). ## C++ Source Files @@ -28,7 +28,7 @@ top level `CMakeLists.txt` file using `add_subdirectory` CMake command, and should be built conditionally with an ExecuTorch build flag like `EXECUTORCH_BUILD_`, see `EXECUTORCH_BUILD_XNNPACK` for example. For third-party dependencies, please refer to -[this](./backend-delegates-dependencies.md). +[this](backend-delegates-dependencies.md). Scheme --> Edit Scheme --> Info --> Build Configuration We recommend that you only use the Debug build scheme during development, where you might need to access additional logs. Debug build has logging overhead and will impact inferencing performance, while release build has compiler optimizations enabled and all logging overhead removed. -For more details integrating and Running ExecuTorch on Apple Platforms or building the package locally, checkout this [link](https://pytorch.org/executorch/main/apple-runtime.html). +For more details integrating and Running ExecuTorch on Apple Platforms or building the package locally, checkout this [link](https://pytorch.org/executorch/main/using-executorch-ios). ### 4. Build and Run the project diff --git a/examples/demo-apps/react-native/rnllama/README.md b/examples/demo-apps/react-native/rnllama/README.md index 33c607d635f..f017c8bfa22 100644 --- a/examples/demo-apps/react-native/rnllama/README.md +++ b/examples/demo-apps/react-native/rnllama/README.md @@ -1,7 +1,7 @@ # React Native Llama

- rnllama Logo + rnllama Logo

A React Native mobile application for running LLaMA language models using ExecuTorch. This example is for iOS only for now. diff --git a/examples/devtools/README.md b/examples/devtools/README.md index e4fbadfcca0..0b516ad629e 100644 --- a/examples/devtools/README.md +++ b/examples/devtools/README.md @@ -17,7 +17,7 @@ examples/devtools We will use an example model (in `torch.nn.Module`) and its representative inputs, both from [`models/`](../models) directory, to generate a [BundledProgram(`.bpte`)](../../docs/source/bundled-io.md) file using the [script](scripts/export_bundled_program.py). Then we will use [devtools/example_runner](example_runner/example_runner.cpp) to execute the `.bpte` model on the ExecuTorch runtime and verify the model on BundledProgram API. -1. Sets up the basic development environment for ExecuTorch by [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/stable/getting-started-setup). +1. Sets up the basic development environment for ExecuTorch by [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/main/getting-started-setup). 2. Using the [script](scripts/export_bundled_program.py) to generate a BundledProgram binary file by retreiving a `torch.nn.Module` model and its representative inputs from the list of available models in the [`models/`](../models) dir. diff --git a/examples/llm_manual/README.md b/examples/llm_manual/README.md index e465255fc66..6318bbe7e84 100644 --- a/examples/llm_manual/README.md +++ b/examples/llm_manual/README.md @@ -1,3 +1,3 @@ # LLM Manual -This repository is a storage place for the files that [LLM Manual](https://pytorch.org/executorch/main/llm/getting-started.html) needs. Please refer to the documentation website for more information. +This repository is a storage place for the files that [LLM Manual](https://pytorch.org/executorch/main/llm/getting-started) needs. Please refer to the documentation website for more information. diff --git a/examples/llm_manual/export_nanogpt.py b/examples/llm_manual/export_nanogpt.py index 9de2e831e25..8c948479f2a 100644 --- a/examples/llm_manual/export_nanogpt.py +++ b/examples/llm_manual/export_nanogpt.py @@ -28,7 +28,7 @@ # The torch.no_grad() call tells PyTorch to exclude training-specific logic. with sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): m = export_for_training( - model, example_inputs, dynamic_shapes=dynamic_shape + model, example_inputs, dynamic_shapes=dynamic_shape, strict=True ).module() traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape, strict=True) diff --git a/examples/llm_pte_finetuning/README.md b/examples/llm_pte_finetuning/README.md index 8aeea31608c..b8d0b1eac1a 100644 --- a/examples/llm_pte_finetuning/README.md +++ b/examples/llm_pte_finetuning/README.md @@ -63,7 +63,7 @@ shuffle: True batch_size: 1 ``` -Torchtune supports datasets using huggingface dataloaders, so custom datasets could also be defined. For examples on defining your own datasets, review the [torchtune docs](https://pytorch.org/torchtune/stable/tutorials/datasets.html#hugging-face-datasets). +Torchtune supports datasets using huggingface dataloaders, so custom datasets could also be defined. For examples on defining your own datasets, review the [torchtune docs](https://pytorch.org/torchtune/stable/basics/text_completion_datasets.html#loading-text-completion-datasets-from-hugging-face). 
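The devtools README above generates a BundledProgram (`.bpte`) with `scripts/export_bundled_program.py` and replays it with `devtools/example_runner`. The sketch below walks that flow end to end for a toy module; the `BundledProgram`, `MethodTestCase`, `MethodTestSuite`, and serializer names are recalled from the bundled-io docs rather than taken from this patch, so treat them as assumptions and defer to the referenced script for the canonical flow.

```python
import torch
from torch.export import export

from executorch.exir import to_edge
# The helper names below are recalled from the bundled-io docs; treat them as assumptions.
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)


class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1


example_inputs = (torch.randn(2, 2),)
et_program = to_edge(export(AddOne(), example_inputs)).to_executorch()

# One test suite per method; the runner replays these inputs and checks the outputs.
suites = [
    MethodTestSuite(
        method_name="forward",
        test_cases=[
            MethodTestCase(
                inputs=example_inputs,
                expected_outputs=(AddOne()(*example_inputs),),
            )
        ],
    )
]

with open("add_one.bpte", "wb") as f:
    f.write(serialize_from_bundled_program_to_flatbuffer(BundledProgram(et_program, suites)))
```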
### Loss diff --git a/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py b/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py index 9b5ef2c0c85..00ca0bf5b77 100644 --- a/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py +++ b/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py @@ -454,7 +454,7 @@ def create_token_type_ids_from_sequences( Optional second list of IDs for sequence pairs. Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + `List[int]`: List of token type IDs according to the given sequence(s). """ bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py index 2246b8eeb15..25362788e31 100755 --- a/examples/mediatek/aot_utils/oss_utils/utils.py +++ b/examples/mediatek/aot_utils/oss_utils/utils.py @@ -30,7 +30,9 @@ def build_executorch_binary( if quant_dtype not in Precision: raise AssertionError(f"No support for Precision {quant_dtype}.") - captured_model = torch.export.export_for_training(model, inputs).module() + captured_model = torch.export.export_for_training( + model, inputs, strict=True + ).module() annotated_model = prepare_pt2e(captured_model, quantizer) print("Quantizing the model...") # calibration diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 5274d0925ae..131ad95e34b 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -80,11 +80,9 @@ bool MTKLlamaRunner::is_loaded() const { Error MTKLlamaRunner::generate( const std::string& prompt, - int32_t seq_len, + executorch::extension::llm::GenerationConfig config, std::function token_callback, - std::function stats_callback, - bool echo, - bool warming) { + std::function stats_callback) { if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index 0f76f610a7e..5dd8a85005e 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -43,11 +43,9 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner { Error load(); Error generate( const std::string& prompt, - int32_t seq_len = 128, + executorch::extension::llm::GenerationConfig config, std::function token_callback = {}, - std::function stats_callback = {}, - bool echo = true, - bool warming = false); + std::function stats_callback = {}); void stop(); LlamaModelOptions get_model_options(); diff --git a/examples/mediatek/model_export_scripts/llama.py b/examples/mediatek/model_export_scripts/llama.py index 5da17727075..413df21d5cc 100644 --- a/examples/mediatek/model_export_scripts/llama.py +++ b/examples/mediatek/model_export_scripts/llama.py @@ -319,7 +319,7 @@ def export_to_et_ir( ) print("Getting pre autograd ATen Dialect Graph") pre_autograd_aten_dialect = torch.export.export_for_training( - model, example_inputs, dynamic_shapes=dynamic_shapes + model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True ).module() # NOTE: Will be replaced with export quantizer = NeuropilotQuantizer() quantizer.setup_precision(getattr(Precision, precision)) diff --git 
a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md index 3a7a723c73b..5fd47ad61ec 100644 --- a/examples/models/deepseek-r1-distill-llama-8B/README.md +++ b/examples/models/deepseek-r1-distill-llama-8B/README.md @@ -17,7 +17,7 @@ pip install -U "huggingface_hub[cli]" huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-8B --local-dir /target_dir/DeepSeek-R1-Distill-Llama-8B --local-dir-use-symlinks False ``` -2. Download the [tokenizer.model](https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/original/tokenizer.model) from the Llama3.1 repo which will be needed later on when running the model using the runtime. +2. Download the [tokenizer.model](https://huggingface.co/meta-llama/Llama-3.1-8B/tree/main/original) from the Llama3.1 repo which will be needed later on when running the model using the runtime. 3. Convert the model to pth file. ``` @@ -48,16 +48,13 @@ print("saving checkpoint") torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth") ``` -4. Download and save the params.json file -``` -wget https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/original/params.json -o /tmp/params.json -``` +4. Download and save the [params.json](https://huggingface.co/meta-llama/Llama-3.1-8B/tree/main/original) file. 5. Generate a PTE file for use with the Llama runner. ``` python -m examples.models.llama.export_llama \ --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ - -p /tmp/params.json \ + -p params.json \ -kv \ --use_sdpa_with_kv_cache \ -X \ diff --git a/examples/models/efficient_sam/README.md b/examples/models/efficient_sam/README.md index bce1f7c5319..1f89a3ec5b3 100644 --- a/examples/models/efficient_sam/README.md +++ b/examples/models/efficient_sam/README.md @@ -12,7 +12,7 @@ Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup# ### Exporting to Core ML -Make sure to install the [required dependencies](https://pytorch.org/executorch/main/build-run-coreml.html#setting-up-your-developer-environment) for Core ML export. +Make sure to install the [required dependencies](https://pytorch.org/executorch/main/backends-coreml#development-requirements) for Core ML export. To export the model to Core ML, run the following command: @@ -32,7 +32,7 @@ python -m examples.xnnpack.aot_compiler -m efficient_sam # Performance -Tests were conducted on an Apple M1 Pro chip using the instructions for building and running Executorch with [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html#runtime) and [XNNPACK](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering.html#running-the-xnnpack-model-with-cmake) backends. +Tests were conducted on an Apple M1 Pro chip using the instructions for building and running Executorch with [Core ML](https://pytorch.org/executorch/main/backends-coreml#runtime-integration) and [XNNPACK](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering#running-the-xnnpack-model-with-cmake) backends. | Backend Configuration | Average Inference Time (seconds) | | ---------------------- | -------------------------------- | @@ -46,4 +46,4 @@ All models were tested with `float32` precision. # Licensing -The code in the `efficient_sam_core` directory is licensed under the [Apache License 2.0](./efficient_sam_core/LICENSE.txt). +The code in the `efficient_sam_core` directory is licensed under the [Apache License 2.0](efficient_sam_core/LICENSE.txt). 
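Several export scripts in this patch (the nanoGPT manual, the MediaTek OSS utilities and llama export script) now pass `strict=True` to `torch.export.export_for_training` explicitly. A minimal, self-contained sketch of that pattern on a toy module, independent of any of the real export scripts:

```python
import torch
from torch.export import export_for_training


class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) * 2


example_inputs = (torch.randn(2, 8),)
# strict=True requests the TorchDynamo-backed strict tracing mode, now spelled out
# explicitly in this patch instead of relying on the default.
captured = export_for_training(Toy(), example_inputs, strict=True).module()
print(captured(torch.randn(2, 8)).shape)
```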
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index e6d45424bd4..12385f32d20 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -111,7 +111,8 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - list(APPEND link_libraries $) + target_link_options_shared_lib(custom_ops) + list(APPEND link_libraries custom_ops) endif() if(EXECUTORCH_BUILD_TORCHAO) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 0bef45ea3ae..3f616b86e19 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -11,7 +11,7 @@ Here are supported models: Pretrained models are not included in this repo. Users are suggested to download them [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). -This page contains the basic recipe for running Llama. See [Llama utils page](./UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes. +This page contains the basic recipe for running Llama. See [Llama utils page](UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes. # What is Llama? Llama is a collection of large language models that use publicly available data for training. These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence. @@ -80,12 +80,12 @@ Llama 3.2 1B and 3B performance was measured on Android OnePlus 12 device. The p
- +
Llama3.2 1B, unquantized, BF16 on Android phone.
- +
Llama3.2 3B, 4bit quantized (SpinQuant) on Android phone @@ -129,7 +129,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus


- +
Llama3.1 8B, 4bit quantized on Android phone @@ -143,7 +143,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus ## Tested on - MacOS M1/M2, Linux. -- For Llama 3 8B, your device may require at least 32GB RAM. If this is a constraint for you, please try the [smaller stories model](./UTILS.md). +- For Llama 3 8B, your device may require at least 32GB RAM. If this is a constraint for you, please try the [smaller stories model](UTILS.md). ## Step 1: Setup > :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. @@ -177,6 +177,7 @@ python -m examples.models.llama.export_llama \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ --output_name="llama3_2.pte" ``` +For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). - To use **SpinQuant**, here are two ways: - Download directly from [Llama website](https://www.llama.com/llama-downloads). The model weights are prequantized and can be exported to `pte` file directly. @@ -206,6 +207,8 @@ python -m examples.models.llama.export_llama \ --use_spin_quant native \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' ``` +For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). + - To use **QAT+LoRA**, download directly from [Llama website](https://www.llama.com/llama-downloads). The model weights are prequantized and can be exported to `pte` file directly by: @@ -234,6 +237,7 @@ python -m examples.models.llama.export_llama \ --output_name "llama3_2.pte" \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' ``` +For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). ### Option B: Download and export Llama 3 8B instruct model @@ -371,14 +375,14 @@ adb push cmake-out-android/examples/models/llama/llama_main /data/local/tmp/llam ``` adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"What is the capital of France?\" --seq_len 120" --warmup=1 ``` -## Step 6: Build Mobile apps +## Step 5: Build Mobile apps ### iOS -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios) to for full instructions on building the iOS LLAMA Demo App. 
Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. ### Android -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android) to for full instructions on building the Android LLAMA Demo App. ## Running with low-bit kernels @@ -412,7 +416,7 @@ python -m examples.models.llama.export_llama \ ``` A few notes: -- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized with weight zeros or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and uses weight zeros (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32, but is quantized with scales-only. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. +- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. - To do channelwise quantization, specify group_size to 0. This works for both linear and embedding layers. Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels. @@ -427,7 +431,7 @@ cmake -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ @@ -492,7 +496,7 @@ python -m examples.models.llama.eval_llama \ --max_context_len ``` -See [Llama utils page](./UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes, and quick iteration and verification. +See [Llama utils page](UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes, and quick iteration and verification. # What is coming next? ## Quantization @@ -544,3 +548,22 @@ clang: error: linker command failed with exit code 1 (use -v to see invocation) ``` It's a known issue for Xcode version 15.1. Mitigation: update to most recent Xcode version, clean and rebuild. 
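The `-E "torchao:4,32,true"` option described in the low-bit kernels section above packs bitwidth, group size, and the symmetric-vs-asymmetric choice into one string. Below is a toy parse of that string into the torchao objects used elsewhere in this patch (`PerAxis`, `PerGroup`, `MappingType`); the real parsing lives in `get_quant_embedding_transform` in `source_transformation/quantize.py`, and the string-to-bool handling here is this sketch's own choice rather than the exported script's.

```python
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import MappingType


def parse_embedding_quant(option: str):
    """Toy parser for '-E torchao:<bits>,<group_size>[,<asymmetric>]' style strings."""
    bitwidth, group_size, *rest = option.split(":")[1].split(",")
    group_size = int(group_size)
    # Third field defaults to asymmetric when omitted, matching the README text above.
    asymmetric = rest[0].strip().lower() == "true" if rest else True
    granularity = PerAxis(0) if group_size == 0 else PerGroup(group_size)
    mapping = MappingType.ASYMMETRIC if asymmetric else MappingType.SYMMETRIC
    return int(bitwidth), granularity, mapping


print(parse_embedding_quant("torchao:4,32,false"))  # -> 4-bit, PerGroup(32), symmetric
```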
+ +- If you encounter issues with missing abseil-cpp or re2, try running `git submodule update --init --recursive` to pull in those submodules. +Example error: +``` +CMake Error at runner/CMakeLists.txt:68 (add_subdirectory): + The source directory + + /Users/../executorch/extension/llm/tokenizers/third-party/abseil-cpp + + does not contain a CMakeLists.txt file. + + +CMake Error at runner/CMakeLists.txt:72 (add_subdirectory): + The source directory + + /Users/../executorch/extension/llm/tokenizers/third-party/re2 + + does not contain a CMakeLists.txt file. +``` diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index 93ac18c993d..f2aa396f7a1 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -3,7 +3,7 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_common_targets") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") oncall("executorch") @@ -90,7 +90,7 @@ runtime.python_binary( runtime.command_alias( name = "export_llama_qnn", env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), }, exe = ":export_llama", ) @@ -108,7 +108,7 @@ runtime.python_library( "source_transformation/pre_quantization.py", "source_transformation/prune_vocab.py", "source_transformation/quantize.py", - "source_transformation/quantized_kv_cache.py", + "source_transformation/custom_kv_cache.py", "source_transformation/rms_norm.py", "source_transformation/rope.py", "source_transformation/sdpa.py", @@ -208,9 +208,9 @@ runtime.python_library( ) runtime.python_library( - name = "quantized_kv_cache", + name = "custom_kv_cache", srcs = [ - "source_transformation/quantized_kv_cache.py", + "source_transformation/custom_kv_cache.py", ], _is_external_target = True, visibility = ["//executorch/..."], @@ -240,7 +240,7 @@ runtime.python_test( "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", ], deps = [ - ":quantized_kv_cache", + ":custom_kv_cache", "//caffe2:torch", "//executorch/examples/models/llama:llama_transformer", ], @@ -255,7 +255,7 @@ runtime.python_test( "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", ], deps = [ - ":quantized_kv_cache", + ":custom_kv_cache", ":sdpa", "//caffe2:torch", "//executorch/examples/models/llama:llama_transformer", @@ -274,3 +274,20 @@ runtime.python_test( ":export_library", ], ) + +runtime.python_test( + name = "quantized_sdpa_source_transform_test", + srcs = [ + "source_transformation/test_quantized_sdpa.py", + ], + preload_deps = [ + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/extension/llm/custom_ops:custom_ops_aot_py", + ], + deps = [ + ":custom_kv_cache", + ":sdpa", + "//caffe2:torch", + "//executorch/examples/models/llama:llama_transformer", + ], +) diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index dd014240ace..5f760ad7670 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -25,7 +25,7 @@ From `executorch` root: ## Smaller model delegated to other backends Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. 
Please refer to the instruction -for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is +for each backend ([CoreML](https://pytorch.org/executorch/main/backends-coreml), [MPS](https://pytorch.org/executorch/main/backends-mps), [QNN](https://pytorch.org/executorch/main/backends-qualcomm)) before trying to lower them. After the backend library is installed, the script to export a lowered model is - Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` - MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 8e6d4fefb0e..79a225232e0 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -59,14 +59,15 @@ ) from .source_transformation.attention import replace_attention_to_attention_sha +from .source_transformation.custom_kv_cache import ( + replace_kv_cache_with_custom_kv_cache, + replace_kv_cache_with_quantized_kv_cache, +) + from .source_transformation.quantize import ( get_quant_embedding_transform, get_quant_weight_transform, ) -from .source_transformation.quantized_kv_cache import ( - replace_kv_cache_with_custom_kv_cache, - replace_kv_cache_with_quantized_kv_cache, -) from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis @@ -77,6 +78,7 @@ replace_sdpa_with_coreml_sdpa, replace_sdpa_with_custom_op, replace_sdpa_with_flex_sdpa, + replace_sdpa_with_quantized_sdpa, replace_sdpa_with_simple_sdpa, ) from .source_transformation.vulkan_rope import replace_with_vulkan_rotary_emb @@ -651,7 +653,7 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager: _get_source_transforms( modelname=args.model, dtype_override=dtype_override, - checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), + checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), # type: ignore args=args, ) ) @@ -793,10 +795,6 @@ def _to_edge_and_lower_llama( # noqa: C901 args.enable_dynamic_shape, ) ) - # Apply XNNPACK after Vulkan so that undelegated ops can be accelerated by XNNPACK - partitioners.append( - get_xnnpack_partitioner(dynamic_quant_only_partitioner=False) - ) modelname = f"vulkan_{modelname}" # Need to remove asserts from the graph to prevent graph breaks @@ -818,6 +816,10 @@ def _to_edge_and_lower_llama( # noqa: C901 modelname = f"coreml_{modelname}" if args.qnn: + logging.warning( + "The model definition in current repro is not performant, please refer to the instruction" + " in https://github.com/pytorch/executorch/tree/main/examples/qualcomm/oss_scripts/llama/README.md for better performance." 
+ ) from executorch.extension.llm.custom_ops import model_sharding partitioners.append( @@ -1104,7 +1106,7 @@ def _load_llama_model( return LLMEdgeManager( model=model, modelname=modelname, - max_seq_len=model.max_seq_len, + max_seq_len=model.max_seq_len, # type: ignore dtype=dtype_override, use_kv_cache=use_kv_cache, generate_full_logits=generate_full_logits, @@ -1117,6 +1119,8 @@ def _load_llama_model( calibration_seq_length=calibration_seq_length, calibration_data=calibration_data, tokenizer_path=tokenizer_path, + use_legacy_export=args.qnn, + save_exported_program=args.export_only, verbose=verbose, metadata=_load_llama_model_metadata( weight_type, @@ -1137,7 +1141,6 @@ def _load_llama_model( model.vocab_size, metadata_str, ), - args=args, ) @@ -1224,13 +1227,28 @@ def _get_source_transforms( # noqa if args.expand_rope_table: transforms.append(materialze_broadcast_of_rope_freq_cis) + use_attention_mask_for_custom_sdpa = False + if isinstance(args, argparse.Namespace): + if getattr(args, "use_custom_sdpa_with_attention_mask", None): + use_attention_mask_for_custom_sdpa = True + if args.use_sdpa_with_kv_cache: transforms.append(replace_kv_cache_with_custom_kv_cache) - transforms.append(replace_sdpa_with_custom_op) + # todo: do this optionally + # if use attention mask instead of causal attention + # then create partial function that sets use_attention_mask=True + if use_attention_mask_for_custom_sdpa: + transforms.append( + partial(replace_sdpa_with_custom_op, use_attention_mask=True) + ) + else: + transforms.append(replace_sdpa_with_custom_op) if args.quantize_kv_cache: assert args.use_kv_cache, "quantize_kv_cache requires use_kv_cache=True" transforms.append(replace_kv_cache_with_quantized_kv_cache) + # Right now + transforms.append(replace_sdpa_with_quantized_sdpa) if args.use_kv_cache: if args.qnn: diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 5fe0ce93cf6..5179bf28fc7 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -53,7 +53,7 @@ int32_t main(int32_t argc, char** argv) { const char* prompt = FLAGS_prompt.c_str(); - double temperature = FLAGS_temperature; + float temperature = FLAGS_temperature; int32_t seq_len = FLAGS_seq_len; @@ -73,13 +73,18 @@ int32_t main(int32_t argc, char** argv) { } #endif // create llama runner - example::Runner runner(model_path, tokenizer_path, temperature); + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + example::Runner runner(model_path, tokenizer_path); if (warmup) { - runner.warmup(prompt, seq_len); + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + runner.warmup(prompt, /*max_new_tokens=*/seq_len); } // generate - runner.generate(prompt, seq_len); + executorch::extension::llm::GenerationConfig config{ + .seq_len = seq_len, .temperature = temperature}; + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + runner.generate(prompt, config); return 0; } diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index 19829576482..2c82841c573 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -18,6 +18,7 @@ from executorch.examples.models.llama.llama_transformer import Transformer from executorch.examples.models.llama.model_args import ModelArgs +from torchao.utils import TorchAOBaseTensor try: from .fairseq2 import convert_to_llama_checkpoint @@ -257,6 +258,9 @@ def __init__(self, **kwargs): strict=False, assign=True, ) # self.model_ = Transformer(gptconf) + for param in self.model_.parameters(): + if isinstance(param, 
TorchAOBaseTensor): + param.requires_grad = False else: print("Checkpoint not provided, defaulting weights to zeros.") self.model_.to_empty(device="cpu") diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md index 1ee594ebd83..f414582a3c1 100644 --- a/examples/models/llama/non_cpu_backends.md +++ b/examples/models/llama/non_cpu_backends.md @@ -2,7 +2,7 @@ # Running Llama 3/3.1 8B on non-CPU backends ### QNN -Please follow [the instructions](https://pytorch.org/executorch/stable/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.html) to deploy Llama 3 8B to an Android smartphone with Qualcomm SoCs. +Please follow [the instructions](https://pytorch.org/executorch/main/llm/build-run-llama3-qualcomm-ai-engine-direct-backend) to deploy Llama 3 8B to an Android smartphone with Qualcomm SoCs. ### MPS Export: @@ -10,7 +10,7 @@ Export: python -m examples.models.llama2.export_llama --checkpoint llama3.pt --params params.json -kv --disable_dynamic_shape --mps --use_sdpa_with_kv_cache -d fp32 -qmode 8da4w -G 32 --embedding-quantize 4,32 ``` -After exporting the MPS model .pte file, the [iOS LLAMA](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) app can support running the model. ` --embedding-quantize 4,32` is an optional args for quantizing embedding to reduce the model size. +After exporting the MPS model .pte file, the [iOS LLAMA](https://pytorch.org/executorch/main/llm/llama-demo-ios) app can support running the model. ` --embedding-quantize 4,32` is an optional args for quantizing embedding to reduce the model size. ### CoreML Export: diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index 429e4b61c36..53c777fa80b 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -41,13 +41,11 @@ static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; Runner::Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature, std::optional data_path) // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. - : temperature_(temperature), - tokenizer_path_(tokenizer_path), + : tokenizer_path_(tokenizer_path), metadata_({ {kEnableDynamicShape, false}, {kMaxSeqLen, 128}, @@ -68,6 +66,17 @@ Runner::Runner( tokenizer_path.c_str()); } +[[deprecated( + "This constructor is deprecated. 
Use the constructor without temperature parameter instead.")]] +Runner::Runner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature, + std::optional data_path) + : Runner(model_path, tokenizer_path, std::move(data_path)) { + temperature_ = temperature; +} + bool Runner::is_loaded() const { return module_->is_loaded() && tokenizer_ && text_decoder_runner_ && text_prefiller_ && text_token_generator_; @@ -133,11 +142,9 @@ Error Runner::load() { ET_LOG(Info, "eos_id = %" PRId64, value); } } + // @lint-ignore CLANGTIDY facebook-hte-Deprecated text_decoder_runner_ = std::make_unique( - module_.get(), - metadata_.at(kUseKVCache), - metadata_.at(kVocabSize), - temperature_); + module_.get(), metadata_.at(kUseKVCache)); text_prefiller_ = std::make_unique( text_decoder_runner_.get(), metadata_.at(kUseKVCache), @@ -164,11 +171,9 @@ Error Runner::load() { Error Runner::generate( const std::string& prompt, - int32_t seq_len, + const ::executorch::extension::llm::GenerationConfig& config, std::function token_callback, - std::function stats_callback, - bool echo, - bool warmup) { + std::function stats_callback) { // Prepare the inputs. // Use ones-initialized inputs. ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); @@ -178,19 +183,19 @@ Error Runner::generate( stats_.model_load_end_ms = llm::time_in_ms(); } - if (warmup) { + if (config.warming) { ET_LOG(Info, "Doing a warmup run..."); } RUNNER_ET_LOG( - warmup, + config.warming, "RSS after loading model: %f MiB (0 if unsupported)", llm::get_rss_bytes() / 1024.0 / 1024.0); // Wrap the token_callback with print function std::function wrapped_callback = - [token_callback, warmup](const std::string& piece) { - if (!warmup) { + [token_callback, config](const std::string& piece) { + if (!config.warming) { llm::safe_printf(piece.c_str()); fflush(stdout); } @@ -204,11 +209,6 @@ Error Runner::generate( stats_.inference_start_ms = llm::time_in_ms(); shouldStop_ = false; - // Set the sequence length to the max seq length if not provided - seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxContextLen)) - ? seq_len - : metadata_.at(kMaxContextLen); - ::tokenizers::Result> encode_res = tokenizer_->encode( prompt, /* bos */ 0, @@ -225,21 +225,22 @@ Error Runner::generate( ET_CHECK_MSG( num_prompt_tokens < metadata_.at(kMaxContextLen), "num_prompt_tokens %d >= max_seq_len_ %" PRId64 - ", Max seq length exceeded - please increase max seq len value in .../llama2/model.py", + ", Max seq length exceeded - please increase max seq len value in your export script", num_prompt_tokens, metadata_.at(kMaxContextLen)); - ET_CHECK_MSG( - num_prompt_tokens < seq_len, - "num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - please increase the seq_len value passed to generate()", - num_prompt_tokens, - seq_len); + + // Determine max_new_tokens using the GenerationConfig's resolve method + int max_new_tokens = config.resolve_max_new_tokens( + metadata_.at(kMaxContextLen), num_prompt_tokens); + + ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); // Prefill first // Here feed all tokens to the model and get the next predicted token // after the prompt. After that we will enter generate loop. 
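The refactored runner above delegates sequence-length handling to `GenerationConfig::resolve_max_new_tokens(...)`, whose implementation is not part of this patch. The Python sketch below is only an assumed analog of how such a resolution could reconcile `seq_len` and `max_new_tokens` with the context budget; it is not the actual ExecuTorch class.

```python
from dataclasses import dataclass


@dataclass
class GenerationConfig:
    echo: bool = True
    warming: bool = False
    seq_len: int = -1          # total tokens (prompt + generated); -1 means unset
    max_new_tokens: int = -1   # -1 means unset
    temperature: float = 0.8

    def resolve_max_new_tokens(self, max_context_len: int, num_prompt_tokens: int) -> int:
        # Never generate past the model's context window.
        budget = max_context_len - num_prompt_tokens
        if self.max_new_tokens > 0:
            return min(self.max_new_tokens, budget)
        if self.seq_len > 0:
            return min(self.seq_len - num_prompt_tokens, budget)
        return budget


cfg = GenerationConfig(seq_len=120)
print(cfg.resolve_max_new_tokens(max_context_len=2048, num_prompt_tokens=16))  # 104
```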
// print prompts - if (echo) { + if (config.echo) { wrapped_callback(prompt); } int64_t pos = 0; @@ -253,32 +254,38 @@ Error Runner::generate( wrapped_callback( ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); RUNNER_ET_LOG( - warmup, + config.warming, "RSS after prompt prefill: %f MiB (0 if unsupported)", llm::get_rss_bytes() / 1024.0 / 1024.0); // start the main loop prompt_tokens.push_back(cur_token); + + // Generate max_new_tokens - 1 because prefill already generated 1 token. int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback)); + prompt_tokens, + num_prompt_tokens, + max_new_tokens - 1, + temperature_ == -1.0f ? config.temperature : temperature_, + wrapped_callback)); stats_.inference_end_ms = llm::time_in_ms(); - if (!warmup) { + if (!config.warming) { printf("\n"); } RUNNER_ET_LOG( - warmup, + config.warming, "RSS after finishing text generation: %f MiB (0 if unsupported)", llm::get_rss_bytes() / 1024.0 / 1024.0); - if (num_prompt_tokens + num_generated_tokens == seq_len) { - RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len); + if (num_generated_tokens == max_new_tokens) { + RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); } stats_.num_prompt_tokens = num_prompt_tokens; stats_.num_generated_tokens = num_generated_tokens; - if (warmup) { + if (config.warming) { ET_LOG(Info, "Warmup run finished!"); } else { // Do not print report during warmup @@ -291,14 +298,15 @@ Error Runner::generate( return Error::Ok; } -Error Runner::warmup(const std::string& prompt, int32_t seq_len) { - Error err = generate( - prompt, - seq_len, - /*token_callback=*/nullptr, - /*stats_callbak=*/nullptr, - /*echo=*/false, - /*warmup=*/true); +Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) { + // Create a GenerationConfig for warmup + llm::GenerationConfig config{ + .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; + + // Call generate with the warmup config + Error err = generate(prompt, config); + + // Reset stats after warmup stats_.reset(); return err; } diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index 509fe234027..97ffe4b98b7 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -33,26 +33,30 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { explicit Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature = 0.8f, std::optional data_path = std::nullopt); - bool is_loaded() const; - ::executorch::runtime::Error load(); + [[deprecated( + "This constructor is deprecated. 
Use the constructor without temperature parameter instead.")]] + explicit Runner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature, + std::optional data_path = std::nullopt); + + bool is_loaded() const override; + ::executorch::runtime::Error load() override; ::executorch::runtime::Error generate( const std::string& prompt, - int32_t seq_len = 128, + const ::executorch::extension::llm::GenerationConfig& config, std::function token_callback = {}, std::function - stats_callback = {}, - bool echo = true, - bool warming = false); + stats_callback = {}) override; ::executorch::runtime::Error warmup( const std::string& prompt, - int32_t seq_len = 128); - void stop(); + int32_t max_new_tokens); + void stop() override; private: - float temperature_; bool shouldStop_{false}; // model @@ -68,6 +72,10 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { // stats ::executorch::extension::llm::Stats stats_; + + // temperature. + // Deprecated, we should rely on the temperature in GenerationConfig instead. + float temperature_ = -1.0f; }; } // namespace example diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/custom_kv_cache.py similarity index 88% rename from examples/models/llama/source_transformation/quantized_kv_cache.py rename to examples/models/llama/source_transformation/custom_kv_cache.py index e7138622ed9..1158a8ba7a6 100644 --- a/examples/models/llama/source_transformation/quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/custom_kv_cache.py @@ -52,6 +52,8 @@ def __init__( self.use_custom_update_cache_op = use_custom_update_cache_op self.quantized_cache_dtype = torch.int8 self.cache_fp_type = torch.float32 + self.return_float_values = True + self.max_context_length = max_context_length cache_shape = (max_batch_size, max_context_length, n_heads, head_dim) scale_shape = (max_batch_size, max_context_length, n_heads, 1) self.register_buffer( @@ -61,17 +63,17 @@ def __init__( "v_cache", torch.zeros(cache_shape, dtype=self.quantized_cache_dtype) ) self.register_buffer( - "k_cache_scales", torch.ones(scale_shape, dtype=torch.float64) + "k_cache_scales", torch.ones(scale_shape, dtype=torch.float32) ) self.register_buffer( - "v_cache_scales", torch.ones(scale_shape, dtype=torch.float64) + "v_cache_scales", torch.ones(scale_shape, dtype=torch.float32) ) if cache_type == QuantizedCacheType.AffineAsymmetric: self.register_buffer( - "k_cache_zero_points", torch.ones(scale_shape, dtype=torch.int64) + "k_cache_zero_points", torch.ones(scale_shape, dtype=torch.int8) ) self.register_buffer( - "v_cache_zero_points", torch.ones(scale_shape, dtype=torch.int64) + "v_cache_zero_points", torch.ones(scale_shape, dtype=torch.int8) ) def _quantize(self, value): @@ -91,20 +93,15 @@ def _quantize(self, value): ) return quantized_value, scales, zero_points - def update(self, input_pos, k_val, v_val): - """ - k_val, v_val: [B, H, S, D] - return: [B, H, S, D] - However the storage is [B, S, H, D] so we incur transpose in, transpose out - This shall be removed by subsequent post-export graph pass - """ - k_val = k_val.transpose(1, 2) - v_val = v_val.transpose(1, 2) - # quantize current k_val and store it in the cache + def _quantize_and_update(self, input_pos, k_val, v_val): quantized_k_val, k_scales, k_zero_points = self._quantize(k_val) - quantized_v_val, v_scales, v_zero_points = self._quantize(v_val) + k_scales = k_scales.to(torch.float32) + 
k_zero_points = k_zero_points.to(self.quantized_cache_dtype) + v_scales = v_scales.to(torch.float32) + v_zero_points = v_zero_points.to(self.quantized_cache_dtype) + if self.use_custom_update_cache_op: start_pos = input_pos[0].item() _ = torch.ops.llama.update_cache(quantized_k_val, self.k_cache, start_pos) @@ -125,10 +122,13 @@ def update(self, input_pos, k_val, v_val): self.v_cache_scales[:, input_pos] = v_scales self.v_cache_zero_points[:, input_pos] = v_zero_points + def _update_and_return_float_values(self, input_pos, k_val, v_val): + self._quantize_and_update(input_pos, k_val, v_val) + k_out = torch.ops.quantized_decomposed.dequantize_per_token( self.k_cache, - self.k_cache_scales, - self.k_cache_zero_points, + self.k_cache_scales.to(torch.float64), + self.k_cache_zero_points.to(torch.int64), torch.iinfo(self.quantized_cache_dtype).min, torch.iinfo(self.quantized_cache_dtype).max, self.quantized_cache_dtype, @@ -136,14 +136,16 @@ def update(self, input_pos, k_val, v_val): ) v_out = torch.ops.quantized_decomposed.dequantize_per_token( self.v_cache, - self.v_cache_scales, - self.v_cache_zero_points, + self.v_cache_scales.to(torch.float64), + self.v_cache_zero_points.to(torch.int64), torch.iinfo(self.quantized_cache_dtype).min, torch.iinfo(self.quantized_cache_dtype).max, self.quantized_cache_dtype, self.cache_fp_type, ) + # When returning float values we jsut use the last value + # instead of dequantized value. start_pos = input_pos[0].item() if self.use_custom_update_cache_op: _ = torch.ops.llama.update_cache(k_val, k_out, start_pos) @@ -152,6 +154,29 @@ def update(self, input_pos, k_val, v_val): k_out[:, input_pos] = k_val v_out[:, input_pos] = v_val + return k_out, v_out + + def _update_and_return_quantized_values(self, input_pos, k_val, v_val): + self._quantize_and_update(input_pos, k_val, v_val) + + return self.k_cache, self.v_cache + + def update(self, input_pos, k_val, v_val): + """ + k_val, v_val: [B, H, S, D] + return: [B, H, S, D] + However the storage is [B, S, H, D] so we incur transpose in, transpose out + This shall be removed by subsequent post-export graph pass + """ + k_val = k_val.transpose(1, 2) + v_val = v_val.transpose(1, 2) + + if self.return_float_values: + k_out, v_out = self._update_and_return_float_values(input_pos, k_val, v_val) + else: + k_out, v_out = self._update_and_return_quantized_values( + input_pos, k_val, v_val + ) return k_out.transpose(1, 2), v_out.transpose(1, 2) @classmethod diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index 2ef016de097..ec02f442217 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -107,14 +107,24 @@ def quantize( # noqa C901 print("quantized model:", model) return model elif qmode.startswith("torchao:8da"): + # Check for required args + if group_size is None: + raise Exception( + "For torchao:8daxw quantization, group size must be specified." 
+ ) + pattern = r"torchao:8da(\d+)w" matches = re.findall(pattern, qmode) assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}" bitwidth = int(matches[0][0]) - from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig - from torchao.quantization.granularity import PerGroup, PerRow - from torchao.quantization.quant_api import quantize_ + from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout + from torchao.quantization.granularity import PerAxis, PerGroup + from torchao.quantization.quant_api import ( + Int8DynamicActivationIntxWeightConfig, + MappingType, + quantize_, + ) from torchao.utils import unwrap_tensor_subclass with torch.no_grad(): @@ -124,8 +134,11 @@ def quantize( # noqa C901 model, Int8DynamicActivationIntxWeightConfig( weight_dtype=getattr(torch, f"int{bitwidth}"), - granularity=(PerRow() if group_size == 0 else PerGroup(group_size)), - has_weight_zeros=False, + weight_granularity=( + PerAxis(0) if group_size == 0 else PerGroup(group_size) + ), + weight_mapping_type=MappingType.SYMMETRIC, + layout=PackedLinearInt8DynamicActivationIntxWeightLayout(), ), ) model = unwrap_tensor_subclass(model) @@ -164,7 +177,7 @@ def quantize( # noqa C901 try: # torchao 0.3+ - from torchao._eval import InputRecorder # pyre-fixme[21] + from torchao._models._eval import InputRecorder except ImportError: from torchao.quantization.GPTQ import InputRecorder # pyre-ignore @@ -206,17 +219,6 @@ def quantize( # noqa C901 q_group_size = 256 if group_size is None else group_size model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model) - # Apply additional quantizer for linear layers that aren't lowered to Vulkan - # at the moment - from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer - - # 1. Quantize in checkpoint dtype. - model = Int8DynActInt4WeightQuantizer( - precision=checkpoint_torch_dtype, groupsize=q_group_size - ).quantize(model) - # 2. Set the computation dtype (what weights/acts dequantize to). 
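The updated `torchao:8da<bits>w` path above now builds an `Int8DynamicActivationIntxWeightConfig` with a packed layout instead of the older experimental config. A stand-alone sketch of that call on a toy linear stack, using only names that appear in this patch; it assumes a torchao build that ships the experimental packed low-bit kernels.

```python
import torch
from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
    MappingType,
    quantize_,
)

# Toy float model; the real path applies this to the exported Llama modules.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 16))

group_size = 32  # 0 would mean channelwise quantization (PerAxis(0)), per the README note above
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,  # the "4" in 8da4w
        weight_granularity=PerAxis(0) if group_size == 0 else PerGroup(group_size),
        weight_mapping_type=MappingType.SYMMETRIC,
        layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
    ),
)
print(model)
```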
- model = set_8da4w_computation_dtype(model, computation_torch_dtype) - return model else: raise Exception(f"Unrecognized quantize mode: {qmode}") @@ -788,23 +790,27 @@ def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None): EmbeddingQuantizer, SharedEmbeddingQuantizer, ) - from torchao.quantization.granularity import PerGroup, PerRow + from torchao.quantization.granularity import PerAxis, PerGroup + from torchao.quantization.quant_api import MappingType quant_args = args.embedding_quantize.split(":")[1].split(",") if len(quant_args) == 2: bitwidth, group_size = quant_args - has_weight_zeros = True + is_asymmetric = True else: - bitwidth, group_size, has_weight_zeros = quant_args + bitwidth, group_size, is_asymmetric = quant_args if group_size in ["none", "None", "0"]: group_size = 0 group_size = int(group_size) bitwidth = int(bitwidth) - has_weight_zeros = bool(has_weight_zeros) + is_asymmetric = bool(is_asymmetric) weight_dtype = getattr(torch, f"int{bitwidth}") - granularity = PerRow() if group_size == 0 else PerGroup(group_size) + granularity = PerAxis(0) if group_size == 0 else PerGroup(group_size) + mapping_type = ( + MappingType.ASYMMETRIC if is_asymmetric else MappingType.SYMMETRIC + ) def _torchao_embedding_quantizer(model): with torch.no_grad(): @@ -812,14 +818,14 @@ def _torchao_embedding_quantizer(model): EmbeddingQuantizer( weight_dtype=weight_dtype, granularity=granularity, - has_weight_zeros=has_weight_zeros, + mapping_type=mapping_type, use_fallback=False, ).quantize(model) else: SharedEmbeddingQuantizer( weight_dtype=weight_dtype, granularity=granularity, - has_weight_zeros=has_weight_zeros, + mapping_type=mapping_type, ).quantize(model) return model diff --git a/examples/models/llama/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py index 1bb7d277545..1bc54198fba 100644 --- a/examples/models/llama/source_transformation/sdpa.py +++ b/examples/models/llama/source_transformation/sdpa.py @@ -13,16 +13,24 @@ import torch -from executorch.examples.models.llama.attention import KVCache, SDPA +from executorch.examples.models.llama.attention import Attention, KVCache, SDPA + +from .custom_kv_cache import QuantizedKVCache class SDPACustom(torch.nn.Module): def __init__( self, dim: int, + max_context_len, + enable_dynamic_shape, + use_attention_mask: bool = False, ): super().__init__() self.dim = dim + self.max_context_len = max_context_len + self.use_attention_mask = use_attention_mask + self.enable_dynamic_shape = enable_dynamic_shape def forward( self, @@ -34,6 +42,16 @@ def forward( seqlen, mask, ): + if self.use_attention_mask: + if self.enable_dynamic_shape: + start_pos = input_pos[-1].item() + torch._check_is_size(start_pos) + torch._check(start_pos < self.max_context_len) + seq_length = q.size(2) + mask = mask.narrow(0, start_pos, seq_length) + else: + mask = mask[input_pos] + q = q.transpose(1, 2) # (bs, seqlen, n_local_heads, head_dim) k = k.transpose(1, 2) v = v.transpose(1, 2) @@ -45,34 +63,172 @@ def forward( k = k.to(dtype=torch.float) v = v.to(dtype=torch.float) - output = torch.ops.llama.custom_sdpa( - q, - k, - v, - input_pos[0].item(), - None, # Attention mask - 0, # dropout probability. Ignored by the code - True, # is_causal - ) + if self.use_attention_mask: + output = torch.ops.llama.custom_sdpa( + q, + k, + v, + input_pos[0].item(), + mask, # Attention mask + 0, # dropout probability. 
Ignored by the code + False, # is_causal + ) + else: + output = torch.ops.llama.custom_sdpa( + q, + k, + v, + input_pos[0].item(), + None, # Attention mask + 0, # dropout probability. Ignored by the code + True, # is_causal + ) return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype) -def _replace_sdpa_with_custom_op(module: torch.nn.Module): +def _replace_sdpa_with_custom_op( + module: torch.nn.Module, use_attention_mask: bool = False +): for name, child in module.named_children(): if isinstance(child, SDPA): setattr( module, name, - SDPACustom(child.dim), + SDPACustom( + child.dim, + child.max_context_len, + child.enable_dynamic_shape, + use_attention_mask=use_attention_mask, + ), + ) + else: + _replace_sdpa_with_custom_op(child, use_attention_mask=use_attention_mask) + + +def replace_sdpa_with_custom_op( + module: torch.nn.Module, use_attention_mask: bool = False +) -> torch.nn.Module: + from executorch.extension.llm.custom_ops import custom_ops # noqa + + _replace_sdpa_with_custom_op(module, use_attention_mask=use_attention_mask) + return module + + +class QuantizedSDPA(torch.nn.Module): + """ + A quantized version of the SDPA (Scaled Dot Product Attention) module. + + This module implements attention computation using quantized key-value pairs + to reduce memory footprint and potentially improve performance. It works with + a QuantizedKVCache to store and retrieve quantized key-value tensors. + + The quantization process converts floating point tensors to int8, which requires + maintaining scale and zero point values for proper dequantization during computation. + + Args: + dim (int): The dimension of the model + kv_cache (QuantizedKVCache): The cache for storing quantized key-value pairs + Note that it needs to own kv_cache to access scales and zero points, and since + SDPA forward signature only accepts q, k and v, to allow accessing scales and + zero points, we need to pass kv_cache to SDPA. 
+ """ + + def __init__(self, dim: int, kv_cache: QuantizedKVCache): + super().__init__() + self.dim = dim + self.quantized_dtype = torch.int8 + self.float_dtype = torch.float32 + self.kv_cache = kv_cache + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k_quantized: torch.Tensor, + v_quantized: torch.Tensor, + bsz, + seqlen, + mask, + ): + q = q.transpose(1, 2) # (bs, seqlen, n_local_heads, head_dim) + k_quantized = k_quantized.transpose(1, 2) + v_quantized = v_quantized.transpose(1, 2) + + q_scale, q_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + q, self.quantized_dtype ) + ) + q_quantized = torch.ops.quantized_decomposed.quantize_per_token( + q, + q_scale, + q_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + ) + q_zero_point_int8 = q_zero_point.to(dtype=torch.int8) + q_scale_fp32 = q_scale.to(dtype=torch.float32) + + k_zero_point_int8 = self.kv_cache.k_cache_zero_points + k_scale_fp32 = self.kv_cache.k_cache_scales + v_zero_point_int8 = self.kv_cache.v_cache_zero_points + v_scale_fp32 = self.kv_cache.v_cache_scales + + start_pos = input_pos[0].item() + output = torch.ops.llama.custom_quantized_sdpa( + q_quantized, + k_quantized, + v_quantized, + start_pos, + None, + 0, + True, + None, + q_zero_point_int8, + q_scale_fp32, + k_zero_point_int8, + k_scale_fp32, + v_zero_point_int8, + v_scale_fp32, + ) + + return output.view(bsz, seqlen, self.dim) + + +def _update_attention_module_with_quantized_sdpa( + module: torch.nn.Module, kv_cache: QuantizedKVCache +): + sdpa = getattr(module, "SDPA", None) + assert sdpa is not None + # pyre-ignore + setattr(module, "SDPA", QuantizedSDPA(sdpa.dim, kv_cache)) # noqa: B010 + + +def _replace_sdpa_with_quantized_sdpa(module: torch.nn.Module): + for _, child in module.named_children(): + if isinstance(child, Attention): + kv_cache = getattr(child, "kv_cache", None) + if kv_cache is None: + continue + if not isinstance(kv_cache, QuantizedKVCache): + continue + # Only when kv_cache is QuantizedKVCache, we replace SDPA with QuantizedSDPA + sdpa = getattr(child, "SDPA", None) + if sdpa is None: + continue + if not isinstance(sdpa, SDPACustom): + continue + kv_cache.return_float_values = False + _update_attention_module_with_quantized_sdpa(child, kv_cache) else: - _replace_sdpa_with_custom_op(child) + _replace_sdpa_with_quantized_sdpa(child) -def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: +def replace_sdpa_with_quantized_sdpa(module: torch.nn.Module) -> torch.nn.Module: from executorch.extension.llm.custom_ops import custom_ops # noqa - _replace_sdpa_with_custom_op(module) + _replace_sdpa_with_quantized_sdpa(module) return module diff --git a/examples/models/llama/source_transformation/test_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_quantized_kv_cache.py index 4252518a4ee..07c8e1bf9a0 100644 --- a/examples/models/llama/source_transformation/test_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_quantized_kv_cache.py @@ -10,7 +10,7 @@ from executorch.examples.models.llama.attention import KVCache -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( QuantizedCacheType, QuantizedKVCache, ) diff --git a/examples/models/llama/source_transformation/test_quantized_sdpa.py 
b/examples/models/llama/source_transformation/test_quantized_sdpa.py new file mode 100644 index 00000000000..242f3a0876d --- /dev/null +++ b/examples/models/llama/source_transformation/test_quantized_sdpa.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch + +from executorch.examples.models.llama.attention import Attention, KVCache, SDPA +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( + QuantizedCacheType, + QuantizedKVCache, +) +from executorch.examples.models.llama.source_transformation.sdpa import ( + QuantizedSDPA, + replace_sdpa_with_custom_op, + replace_sdpa_with_quantized_sdpa, + SDPACustom, +) + + +class MockAttention(Attention): + """Mock Attention class for testing purposes.""" + + def __init__( + self, dim, head_dim, n_rep, max_context_len=100, enable_dynamic_shape=False + ): + super().__init__() + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + self.SDPA = SDPA(dim, head_dim, n_rep, max_context_len, enable_dynamic_shape) + self.kv_cache = None + + def forward(self, x, freqs_cos, freqs_sin, **kwargs): + # Not used in tests + pass + + +class QuantizedSDPATest(unittest.TestCase): + def setUp(self): + torch.manual_seed(42) + self.max_batch_size = 1 + self.max_context_len = 5 + self.n_kv_heads = 4 + self.n_heads = 8 + self.head_dim = 16 + self.dim = self.n_heads * self.head_dim + self.enable_dynamic_shape = False + self.dtype = torch.float32 + + def _create_test_model(self): + """Create a simple model with SDPA modules for testing.""" + model = torch.nn.Module() + attention = MockAttention( + self.dim, self.head_dim, self.n_heads // self.n_kv_heads + ) + # Add KVCache to the attention module + attention.kv_cache = KVCache( + self.max_batch_size, + self.max_context_len, + self.n_kv_heads, + self.head_dim, + self.enable_dynamic_shape, + dtype=self.dtype, + ) + model.attention = attention + return model + + def test_replace_sdpa_with_quantized_sdpa(self): + """Test that replace_sdpa_with_quantized_sdpa correctly transforms SDPA to QuantizedSDPA.""" + # Create a model with SDPA + model = self._create_test_model() + + # First replace standard SDPA with SDPACustom (required before quantization) + model = replace_sdpa_with_custom_op(model) + self.assertIsInstance(model.attention.SDPA, SDPACustom) + + # Replace KVCache with QuantizedKVCache + model.attention.kv_cache = QuantizedKVCache.from_float( + model.attention.kv_cache, + QuantizedCacheType.AffineAsymmetric, + use_custom_update_cache_op=True, + ) + self.assertIsInstance(model.attention.kv_cache, QuantizedKVCache) + + # Set return_float_values to False to enable quantized operation + model.attention.kv_cache.return_float_values = False + + # Apply the transformation + model = replace_sdpa_with_quantized_sdpa(model) + + # Verify that SDPA has been replaced with QuantizedSDPA + self.assertIsInstance(model.attention.SDPA, QuantizedSDPA) + + # Verify that the QuantizedSDPA has the correct properties + self.assertEqual(model.attention.SDPA.dim, self.dim) + self.assertEqual(model.attention.SDPA.quantized_dtype, torch.int8) + self.assertEqual(model.attention.SDPA.float_dtype, torch.float32) + self.assertIs(model.attention.SDPA.kv_cache, model.attention.kv_cache) + + def test_no_replacement_when_no_quantized_kv_cache(self): + """Test that SDPA is not replaced when there's no 
QuantizedKVCache.""" + # Create a model with SDPA + model = self._create_test_model() + + # First replace standard SDPA with SDPACustom + model = replace_sdpa_with_custom_op(model) + self.assertIsInstance(model.attention.SDPA, SDPACustom) + + # Apply the transformation without replacing KVCache + model = replace_sdpa_with_quantized_sdpa(model) + + # Verify that SDPA has NOT been replaced with QuantizedSDPA + self.assertIsInstance(model.attention.SDPA, SDPACustom) + self.assertNotIsInstance(model.attention.SDPA, QuantizedSDPA) + + def test_forward_functionality(self): + """Test that the QuantizedSDPA forward function works correctly.""" + # This test requires the custom ops to be loaded, so we'll check if they're available + try: + from executorch.extension.llm.custom_ops import custom_ops # noqa + except ImportError: + self.skipTest( + "Custom ops not available, skipping forward functionality test" + ) + + # Create a model with SDPA + model = self._create_test_model() + + # First replace standard SDPA with SDPACustom + model = replace_sdpa_with_custom_op(model) + + # Replace KVCache with QuantizedKVCache + model.attention.kv_cache = QuantizedKVCache.from_float( + model.attention.kv_cache, + QuantizedCacheType.AffineAsymmetric, + use_custom_update_cache_op=True, + ) + + # Set return_float_values to False to enable quantized operation + model.attention.kv_cache.return_float_values = False + + # Save the original SDPACustom for comparison + # Apply the transformation + model = replace_sdpa_with_quantized_sdpa(model) + + # Create test inputs + input_pos = torch.tensor([0], dtype=torch.int64) + bsz = 1 + seqlen = 1 + q = torch.randn(bsz, self.n_heads, seqlen, self.head_dim, dtype=self.dtype) + k = torch.randn(bsz, self.n_kv_heads, seqlen, self.head_dim, dtype=self.dtype) + v = torch.randn(bsz, self.n_kv_heads, seqlen, self.head_dim, dtype=self.dtype) + + # Update the KV cache + k_quantized, v_quantized = model.attention.kv_cache.update(input_pos, k, v) + + # Run the forward pass with the quantized SDPA + try: + output = model.attention.SDPA( + input_pos, q, k_quantized, v_quantized, bsz, seqlen, None + ) + + # Verify the output shape + self.assertEqual(output.shape, (bsz, seqlen, self.dim)) + except Exception: + # If the forward pass fails, it might be due to missing custom ops + self.skipTest( + "Custom ops not available, skipping forward functionality test" + ) diff --git a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py index 35c88e10b6b..e5e278f8ce8 100644 --- a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py @@ -10,7 +10,7 @@ from executorch.examples.models.llama.attention import KVCache -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( CustomKVCache, QuantizedCacheType, QuantizedKVCache, @@ -71,8 +71,8 @@ def test_simple(self, is_dynamic_shape=False): self.seq_len = 3 self._init_cache() q, k_val, v_val = self._init_kv() - self.float_sdpa = SDPACustom(self.dim) - self.quantized_sdpa = SDPACustom(self.dim) + self.float_sdpa = SDPACustom(self.dim, self.max_context_len, True) + self.quantized_sdpa = SDPACustom(self.dim, self.max_context_len, True) k, v = self.custom_kv_cache.update(input_pos, k_val, v_val) float_out = 
self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None) k, v = self.quantized_kv_cache.update(input_pos, k_val, v_val) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 92ddbf74d94..615ad3948fc 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -41,7 +41,7 @@ You can export and run the original Llama 2 7B model. ``` 4. Create tokenizer.bin. ``` - python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin + python -m pytorch_tokenizers.tools.llama2c.convert -t -o tokenizer.bin ``` Pass the converted `tokenizer.bin` file instead of `tokenizer.model` for subsequent steps. diff --git a/examples/models/llama3_2_vision/preprocess/test_preprocess.py b/examples/models/llama3_2_vision/preprocess/test_preprocess.py index 4c0a5635e5c..220b0dc9b6f 100644 --- a/examples/models/llama3_2_vision/preprocess/test_preprocess.py +++ b/examples/models/llama3_2_vision/preprocess/test_preprocess.py @@ -124,9 +124,9 @@ class TestImageTransform: same output as the reference model. Reference model: CLIPImageTransform - https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L115 + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transform.py#L127 Eager and exported models: _CLIPImageTransform - https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L26 + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transform.py#L28 """ models_no_resize = initialize_models(resize_to_max_canvas=False) @@ -147,7 +147,7 @@ def prepare_inputs( without distortion. These calculations are done by the reference model inside __init__ and __call__ - https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L115 + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transform.py#L198 """ image_tensor = F.to_dtype( F.grayscale_to_rgb_image(F.to_image(image)), scale=True diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 6003f3a000d..eeb6c296dd5 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -15,7 +15,7 @@ # ~~~ # It should also be cmake-lint clean. # -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE project(llava) # Duplicating options as root CMakeLists.txt @@ -124,7 +124,7 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - list(APPEND link_libraries custom_ops) + list(APPEND link_libraries $) endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) diff --git a/examples/models/llava/README.md b/examples/models/llava/README.md index d0dc71c0a85..6ba9ef21555 100644 --- a/examples/models/llava/README.md +++ b/examples/models/llava/README.md @@ -11,7 +11,7 @@ huggingface page [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llav
Running Llava1.5 7B on Android phone @@ -26,17 +26,26 @@ model) for general-purpose visual and language understanding, achieving impressive chat capabilities mimicking spirits of the cutting edge multimodal models and setting a high bar for accuracy on Science QA. -## Instructions +## Instructions to run Llava on Android/iOS First you need to generate a .PTE file for the model, along with input image, and other artifacts. Then you need either a C++ runner, or Android or iOS application to test things out on device. +### Host machine requirements + +The biggest requirement is to have a host machine with at least 32GiB memory, preferably 64GiB. + +The model weights is 15GiB, and the other memory usage at export stage (`export_llava`) is around 10GiB. So you need at least 25GiB memory to run the export script. + + ### Generate ExecuTorch .PTE and other artifacts Run the following command to generate `llava.pte`, `tokenizer.bin` and an image tensor (serialized in TorchScript) `image.pt`. +> **Warning**: The C++ runner `llava_main` binary cannot process raw image inputs such as JPEG, PNG, or BMP files directly. You must convert these images to a `.pt` file format using the `examples/models/llava/image_util.py` script before using them with `llava_main`. + Prerequisite: run `install_executorch.sh` to install ExecuTorch and run `examples/models/llava/install_requirements.sh` to install dependencies. @@ -69,6 +78,13 @@ cmake-out/examples/models/llava/llava_main ### Build Mobile Apps +#### Device Requirements + +To run the Android/iOS apps, you need a device with at least 12GiB memory. + +- iPhone 13 Pro or above +- Samsung Galaxy S23 or above + #### Android We can run LLAVA using the LLAMA Demo Apps. Please refer to [this diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 63ae0f4a118..66b61840866 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -20,13 +20,13 @@ build_args_parser, get_quantizer_and_quant_params, ) +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( + replace_kv_cache_with_custom_kv_cache, +) from executorch.examples.models.llama.source_transformation.quantize import ( EmbeddingQuantHandler, get_quant_weight_transform, ) -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( - replace_kv_cache_with_custom_kv_cache, -) from executorch.examples.models.llama.source_transformation.sdpa import ( replace_sdpa_with_custom_op, ) @@ -92,7 +92,6 @@ def forward(self, input_pos, embeddings): use_kv_cache=True, example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings), dynamic_shapes=dynamic_shapes, - args=llava.text_model_args, ) dtype_override = DType.fp32 @@ -161,7 +160,6 @@ def forward(self, images): use_kv_cache=True, example_inputs=(resized,), dynamic_shapes=dynamic_shapes, - args=None, ) .export() .pt2e_quantize([quantizer]) diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 6ce4b701bbe..351356607c8 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -15,7 +15,7 @@ from executorch.examples.models.llama.llama_transformer import Transformer from executorch.examples.models.llama.model_args import ModelArgs -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( replace_kv_cache_with_custom_kv_cache, ) from 
executorch.examples.models.llama.source_transformation.sdpa import ( diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 971e126a14c..aab5bfb4720 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -47,8 +47,10 @@ Error LlavaRunner::load() { tokenizer_->load(tokenizer_path_); // Load the text decoder runner - text_decoder_runner_ = std::make_unique( - module_.get(), tokenizer_->vocab_size(), temperature_); + text_decoder_runner_ = + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + std::make_unique(module_.get()); + // @lint-ignore CLANGTIDY facebook-hte-Deprecated text_decoder_runner_->load(); // Load the text prefiller @@ -117,7 +119,11 @@ Error LlavaRunner::generate_from_pos( // Generate tokens int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - {prefill_next_token}, start_pos, seq_len, token_callback)); + /*tokens=*/{prefill_next_token}, + /*start_pos=*/start_pos, + /*max_new_tokens=*/seq_len - start_pos + 1, + /*temperature=*/temperature_, + /*token_callback=*/token_callback)); // Bookkeeping stats_.num_generated_tokens = num_generated_tokens; diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index 4c7809361b0..3de418b57ea 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -17,11 +17,8 @@ namespace example { class ET_EXPERIMENTAL LlavaTextDecoderRunner : public executorch::extension::llm::TextDecoderRunner { public: - LlavaTextDecoderRunner( - executorch::extension::Module* module, - int32_t vocab_size, - float temperature) - : TextDecoderRunner(module, true, vocab_size, temperature){}; + explicit LlavaTextDecoderRunner(executorch::extension::Module* module) + : TextDecoderRunner(module, true) {} inline executorch::runtime::Result step( executorch::extension::TensorPtr& tokens, diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index 5fd60399415..36381b27124 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -131,7 +131,7 @@ def test_llava_export(self): # being tested, using llama_transformer new_tokens = [torch.argmax(pte_prefill_after_img).item()] # TODO: uncomment this line - # self.assertEquals(new_tokens[0], 1932) # When + # self.assertEqual(new_tokens[0], 1932) # When for i in range(4): print(i, llava_model.tokenizer.decode(new_tokens[i])) token_embeds = llava_module.run_method( diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py index 69859fa39bc..7e2cfb14c49 100644 --- a/examples/models/moshi/mimi/test_mimi.py +++ b/examples/models/moshi/mimi/test_mimi.py @@ -135,16 +135,28 @@ def test_streaming_encoding_decoding(self): all_codes_th = torch.cat(all_codes, dim=-1) + pcm_ref = self.mimi.decode(all_codes_th) + all_pcms = [] + for i in range(all_codes_th.shape[-1]): + codes = all_codes_th[..., i : i + 1] + pcm = self.mimi.decode(codes) + all_pcms.append(pcm) + all_pcms = torch.cat(all_pcms, dim=-1) + sqnr = compute_sqnr(pcm_ref, all_pcms) + print(f"sqnr = {sqnr} dB") + self.assertTrue(sqnr > 4) + + all_pcms_streaming = [] with self.mimi.streaming(1): for i in range(all_codes_th.shape[-1]): codes = all_codes_th[..., i : i + 1] - pcm = self.mimi.decode(codes) - all_pcms.append(pcm) - all_pcms = torch.cat(all_pcms, dim=-1) - - pcm_ref = 
self.mimi.decode(all_codes_th) - self.assertTrue(torch.allclose(pcm_ref, all_pcms, atol=1e-5)) + pcm_streaming = self.mimi.decode(codes) + all_pcms_streaming.append(pcm_streaming) + all_pcms_streaming = torch.cat(all_pcms_streaming, dim=-1) + sqnr_streaming = compute_sqnr(pcm_ref, all_pcms_streaming) + print(f"sqnr_streaming = {sqnr_streaming} dB") + self.assertTrue(sqnr_streaming > 100) def test_exported_encoding(self): """Ensure exported encoding model is consistent with reference output.""" diff --git a/examples/models/phi-3-mini-lora/README.md b/examples/models/phi-3-mini-lora/README.md index 2b7cc0ba401..62efda6c3dc 100644 --- a/examples/models/phi-3-mini-lora/README.md +++ b/examples/models/phi-3-mini-lora/README.md @@ -16,8 +16,9 @@ To see how you can use the model exported for training in a fully involved finet python export_model.py ``` -2. Run the inference model using an example runtime. For more detailed steps on this, check out [Build & Run](https://pytorch.org/executorch/stable/getting-started-setup.html#build-run). +2. Run the inference model using an example runtime. For more detailed steps on this, check out [Building from Source](https://pytorch.org/executorch/main/using-executorch-building-from-source). ``` + # Clean and configure the CMake build system. Compiled programs will appear in the executorch/cmake-out directory we create here. ./install_executorch.sh --clean (mkdir cmake-out && cd cmake-out && cmake ..) diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index ba878d42a3f..f52f2a3a06d 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -13,7 +13,7 @@ pip uninstall -y transformers ; pip install transformers==4.44.2 ``` cd executorch wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true" -python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin ``` 2. Export the model. This step will take a few minutes to finish. 
``` diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py index 8fa948e7dc7..11c2f3834eb 100644 --- a/examples/models/phi-3-mini/export_phi-3-mini.py +++ b/examples/models/phi-3-mini/export_phi-3-mini.py @@ -65,7 +65,7 @@ def export(args) -> None: xnnpack_quantizer.set_global(xnnpack_quant_config) model = export_for_training( - model, example_inputs, dynamic_shapes=dynamic_shapes + model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True ).module() model = prepare_pt2e(model, xnnpack_quantizer) # pyre-fixme[6] model(*example_inputs) diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py index 18f82957f94..3d91747f468 100644 --- a/examples/models/phi_4_mini/convert_weights.py +++ b/examples/models/phi_4_mini/convert_weights.py @@ -1,4 +1,5 @@ import argparse +import os from typing import Dict import torch @@ -7,6 +8,63 @@ from torchtune.training import FullModelHFCheckpointer +_HF_PHI_4_FROM_META = { + "tok_embeddings.weight": "model.embed_tokens.weight", + "norm.weight": "model.norm.weight", + "layers.{}.attention.wq.weight": "model.layers.{}.self_attn.q_proj.weight", + "layers.{}.attention.wk.weight": "model.layers.{}.self_attn.k_proj.weight", + "layers.{}.attention.wv.weight": "model.layers.{}.self_attn.v_proj.weight", + "layers.{}.attention.wo.weight": "model.layers.{}.self_attn.o_proj.weight", + "layers.{}.attention_norm.weight": "model.layers.{}.input_layernorm.weight", + "layers.{}.ffn_norm.weight": "model.layers.{}.post_attention_layernorm.weight", + "layers.{}.feed_forward.w1.weight": "model.layers.{}.mlp.gate_proj.weight", + "layers.{}.feed_forward.w3.weight": "model.layers.{}.mlp.up_proj.weight", + "layers.{}.feed_forward.w2.weight": "model.layers.{}.mlp.down_proj.weight", + "output.weight": "lm_head.weight", +} + + +def phi_4_hf_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Convert a state dict from hf's format to Meta's format. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in hf's format. + + Returns: + Dict[str, torch.Tensor]: State dict in Meta's format. + """ + converted_state_dict = {} + inverted_mapping_dict = {v: k for k, v in _HF_PHI_4_FROM_META.items()} + + for key, value in state_dict.items(): + if key.endswith("mlp.gate_up_proj.weight"): + # Split the gate_up_proj into gate_proj and up_proj + hidden_dim = value.shape[0] // 2 + assert 2 * hidden_dim == value.shape[0] + gate = value[0:hidden_dim, :] + up = value[hidden_dim:, :] + for new_key, new_value in [("gate_proj", gate), ("up_proj", up)]: + new_key = key.replace("gate_up_proj", new_key) + new_key = get_mapped_key(new_key, inverted_mapping_dict) + converted_state_dict[new_key] = new_value + elif key.endswith("self_attn.qkv_proj.weight"): + # Split the qkv_proj into q_proj, k_proj, and v_proj + q_dim = value.shape[1] + kv_dim = (value.shape[0] - q_dim) // 2 + assert 2 * kv_dim + q_dim == value.shape[0] + q = value[0:q_dim, :] + k = value[q_dim : (q_dim + kv_dim), :] + v = value[(q_dim + kv_dim) :, :] + for new_key, new_value in [("q_proj", q), ("k_proj", k), ("v_proj", v)]: + new_key = key.replace("qkv_proj", new_key) + new_key = get_mapped_key(new_key, inverted_mapping_dict) + converted_state_dict[new_key] = new_value + else: + new_key = get_mapped_key(key, inverted_mapping_dict) + converted_state_dict[new_key] = value + return converted_state_dict + # Standard _FROM_META weight mapping of Meta weights to TorchTune. 
_PHI_4_FROM_META = { @@ -51,22 +109,30 @@ def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.T return converted_state_dict -def convert_weights(input_dir: str, output_file: str) -> None: - # Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. - checkpointer = FullModelHFCheckpointer( - checkpoint_dir=input_dir, - checkpoint_files=[ - "model-00001-of-00002.safetensors", - "model-00002-of-00002.safetensors", - ], - output_dir=".", - model_type="PHI4", - ) +def convert_weights(input_dir_or_checkpoint: str, output_file: str) -> None: + # If input_dir_or_checkpoint is a directory downloaded from HF, FullModelHFCheckpointer is used to extract the state dict + # If input_dir_or_checkpoint is a checkpoint (from eager model model), it is loaded directly + if os.path.isdir(input_dir_or_checkpoint): + checkpointer = FullModelHFCheckpointer( + checkpoint_dir=input_dir_or_checkpoint, + checkpoint_files=[ + "model-00001-of-00002.safetensors", + "model-00002-of-00002.safetensors", + ], + output_dir=".", + model_type="PHI4", + ) + print("Loading checkpoint from directory...") + sd = checkpointer.load_checkpoint() + sd = sd["model"] + print("Converting checkpoint...") + sd = phi_4_tune_to_meta(sd) + else: + print("Loading checkpoint from file...") + sd = torch.load(input_dir_or_checkpoint, map_location="cpu", weights_only=True) + print("Converting checkpoint...") + sd = phi_4_hf_to_meta(sd) - print("Loading checkpoint...") - sd = checkpointer.load_checkpoint() - print("Converting checkpoint...") - sd = phi_4_tune_to_meta(sd["model"]) print("Saving checkpoint...") torch.save(sd, output_file) print("Done.") @@ -79,7 +145,7 @@ def main(): parser.add_argument( "input_dir", type=str, - help="Path to directory containing checkpoint files", + help="Path to directory containing checkpoint files, or path to a single checkpoint file.", ) parser.add_argument("output", type=str, help="Path to the output checkpoint") diff --git a/examples/models/test/test_export.py b/examples/models/test/test_export.py index 9a4ff7a35ed..306f54c0e89 100644 --- a/examples/models/test/test_export.py +++ b/examples/models/test/test_export.py @@ -29,7 +29,9 @@ def collect_executorch_and_eager_outputs( Returns a tuple containing the outputs of the eager mode model and the executorch mode model. """ eager_model = eager_model.eval() - model = torch.export.export_for_training(eager_model, example_inputs).module() + model = torch.export.export_for_training( + eager_model, example_inputs, strict=True + ).module() edge_model = export_to_edge(model, example_inputs) executorch_prog = edge_model.to_executorch() diff --git a/examples/portable/README.md b/examples/portable/README.md index a6658197da3..ef9b44a48a3 100644 --- a/examples/portable/README.md +++ b/examples/portable/README.md @@ -20,7 +20,7 @@ We will walk through an example model to generate a `.pte` file in [portable mod from the [`models/`](../models) directory using scripts in the `portable/scripts` directory. Then we will run on the `.pte` model on the ExecuTorch runtime. For that we will use `executor_runner`. -1. Following the setup guide in [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) +1. Following the setup guide in [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup) you should be able to get the basic development environment for ExecuTorch working. 2. 
Using the script `portable/scripts/export.py` generate a model binary file by selecting a @@ -78,4 +78,4 @@ Output 0: tensor(sizes=[1, 1000], [ ## Custom Operator Registration -Explore the demos in the [`custom_ops/`](./custom_ops) directory to learn how to register custom operators into ExecuTorch as well as register its kernels into ExecuTorch runtime. +Explore the demos in the [`custom_ops/`](custom_ops) directory to learn how to register custom operators into ExecuTorch as well as register its kernels into ExecuTorch runtime. diff --git a/examples/portable/custom_ops/README.md b/examples/portable/custom_ops/README.md index db517e84a0c..bf17d6a6753 100644 --- a/examples/portable/custom_ops/README.md +++ b/examples/portable/custom_ops/README.md @@ -3,7 +3,7 @@ This folder contains examples to register custom operators into PyTorch as well ## How to run -Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/stable/getting-started-setup). +Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/main/getting-started-setup). Run: diff --git a/examples/portable/custom_ops/custom_ops_2_out.cpp b/examples/portable/custom_ops/custom_ops_2_out.cpp index 138a8eeed89..2fb50e521c1 100644 --- a/examples/portable/custom_ops/custom_ops_2_out.cpp +++ b/examples/portable/custom_ops/custom_ops_2_out.cpp @@ -13,7 +13,7 @@ namespace native { using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; namespace { void check_preconditions(const Tensor& in, Tensor& out) { diff --git a/examples/portable/scripts/export_and_delegate.py b/examples/portable/scripts/export_and_delegate.py index 6a8a28d5338..1c2adf67688 100644 --- a/examples/portable/scripts/export_and_delegate.py +++ b/examples/portable/scripts/export_and_delegate.py @@ -61,7 +61,7 @@ def export_composite_module_with_lower_graph(): m_compile_spec = m.get_compile_spec() # pre-autograd export. eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") @@ -84,7 +84,7 @@ def forward(self, *args): m = CompositeModule() m = m.eval() # pre-autograd export. eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() composited_edge = export_to_edge(m, m_inputs) # The graph module is still runnerable @@ -134,7 +134,7 @@ def get_example_inputs(self): m = Model() m_inputs = m.get_example_inputs() # pre-autograd export. eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") @@ -171,7 +171,7 @@ def export_and_lower_the_whole_graph(): m_inputs = m.get_example_inputs() # pre-autograd export. 
eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index bdac58d2bfc..04354cda3f6 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -4,12 +4,12 @@ This directory contains examples for some AI models. We have seperated the example scripts into the following subfolders, please refer to [README.md](../../backends/qualcomm/README.md) for the example scripts' directory structure: -1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama](./oss_scripts/llama/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. +1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama](oss_scripts/llama/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. 2. oss_scripts: OSS stands for Open Source Software. This folder contains python scripts for open source models. Some models under this folder might also have their own customized runner. - For example, [llama](./oss_scripts/llama/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. + For example, [llama](oss_scripts/llama/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. -3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. Different from oss_scripts & scripts, which the example scripts are converting a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionaly, users can find customized example runners specific to the QAIHub models for execution. For example [qaihub_llama2_7b](./qaihub_scripts/llama2/qaihub_llama2_7b.py) is a script converting context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](./qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner to execute llama2 .pte files. Please be aware that context-binaries downloaded from QAIHub are tied to a specific QNN SDK version. +3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. 
Different from oss_scripts & scripts, which the example scripts are converting a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionaly, users can find customized example runners specific to the QAIHub models for execution. For example [qaihub_llama2_7b](qaihub_scripts/llama/llama2/qaihub_llama2_7b.py) is a script converting context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner to execute llama2 .pte files. Please be aware that context-binaries downloaded from QAIHub are tied to a specific QNN SDK version. Before executing the scripts and runner, please ensure that you are using the QNN SDK version that is matching the context binary. Please refer to [Check context binary version](#check-context-binary-version) for tutorial on how to check the QNN Version for a context binary. 4. scripts: This folder contains scripts to build models provided by Executorch. @@ -22,15 +22,15 @@ Here are some general information and limitations. ## Prerequisite -Please finish tutorial [Setting up executorch](https://pytorch.org/executorch/stable/getting-started-setup). +Please finish tutorial [Setting up executorch](https://pytorch.org/executorch/main/getting-started-setup). -Please finish [setup QNN backend](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). +Please finish [setup QNN backend](../../docs/source/backends-qualcomm.md). ## Environment Please set up `QNN_SDK_ROOT` environment variable. Note that this version should be exactly same as building QNN backend. -Please check [setup](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). +Please check [setup](../../docs/source/backends-qualcomm.md). Please set up `LD_LIBRARY_PATH` to `$QNN_SDK_ROOT/lib/x86_64-linux-clang`. Or, you could put QNN libraries to default search path of the dynamic linker. 
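As a minimal sketch of the environment setup described above (the SDK install location below is a placeholder, not a path shipped with this repository; substitute the directory of the QNN SDK version that was actually used to build the QNN backend):

```bash
# Placeholder path: point this at your local QNN SDK install, and make sure the
# SDK version matches the one used when building the QNN backend.
export QNN_SDK_ROOT=/path/to/qnn-sdk
# Make the host (x86_64 Linux) QNN libraries visible to the dynamic linker.
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang:${LD_LIBRARY_PATH}"
```

Alternatively, as noted above, the QNN libraries can be copied into a directory that is already on the dynamic linker's default search path instead of extending `LD_LIBRARY_PATH`.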
diff --git a/examples/qualcomm/TARGETS b/examples/qualcomm/TARGETS index 47f4fa422ce..43ca4db6be5 100644 --- a/examples/qualcomm/TARGETS +++ b/examples/qualcomm/TARGETS @@ -4,7 +4,7 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") oncall("executorch") @@ -27,8 +27,8 @@ python_binary( runtime.command_alias( name = "export_example_qnn", env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), - "QNN_SDK_ROOT": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:__dir__)".format(get_qnn_library_verision()), + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), + "QNN_SDK_ROOT": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:__dir__)".format(get_qnn_library_version()), }, exe = ":export_example", ) diff --git a/examples/qualcomm/oss_scripts/conv_former.py b/examples/qualcomm/oss_scripts/conv_former.py index 76131d659df..8ce16abcc87 100644 --- a/examples/qualcomm/oss_scripts/conv_former.py +++ b/examples/qualcomm/oss_scripts/conv_former.py @@ -12,10 +12,14 @@ import numpy as np import timm import torch -from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.utils.constants import ( - QCOM_PASS_EXPAND_BROADCAST_SHAPE, +from executorch.backends.qualcomm._passes.expand_broadcast_tensor_shape import ( + ExpandBroadcastTensorShape, +) +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, ) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY from executorch.examples.qualcomm.utils import ( build_executorch_binary, get_imagenet_dataset, @@ -55,6 +59,9 @@ def main(args): model = model.eval() + # lower to QNN + passes_job = get_capture_program_passes() + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True build_executorch_binary( model, inputs[0], @@ -62,7 +69,7 @@ def main(args): f"{args.artifact}/{pte_filename}", inputs, quant_dtype=QuantDtype.use_8a8w, - custom_pass_config={QCOM_PASS_EXPAND_BROADCAST_SHAPE}, + passes_job=passes_job, ) if args.compile_only: diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index 2eb26e6cece..18b5ade8b35 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -10,7 +10,12 @@ import numpy as np import torch +from executorch.backends.qualcomm._passes import ConvertUpsampleBicubicWithBilinear +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY from executorch.examples.qualcomm.utils import ( build_executorch_binary, @@ -56,6 +61,8 @@ def main(args): pte_filename = "dino_v2" instance = get_instance() + passes_job = get_capture_program_passes() + passes_job[ConvertUpsampleBicubicWithBilinear][QCOM_PASS_ACTIVATE_KEY] = True 
build_executorch_binary( instance, sample_input, @@ -65,6 +72,7 @@ def main(args): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + passes_job=passes_job, shared_buffer=args.shared_buffer, ) diff --git a/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py new file mode 100644 index 00000000000..ea65917dcd9 --- /dev/null +++ b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py @@ -0,0 +1,357 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import getpass +import json +import os +import zipfile +from multiprocessing.connection import Client +from typing import Callable, List + +import numpy as np +import torch +from executorch.backends.qualcomm._passes import ( + ConvertUpsampleBicubicWithBilinear, + ExpandBroadcastTensorShape, +) +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY +from executorch.examples.qualcomm.oss_scripts.efficientSAM.source_transformation import ( + replace_maskdecoder_with_custom_op, + replace_pos_emb_with_custom_op, +) + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + class_agnostic_mIoU, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from PIL import Image, ImageDraw +from scipy.ndimage import label +from torch.utils.data import DataLoader, Dataset +from torchvision import datasets, transforms + + +def load_dataset(dataset_path): + image_shape = (224, 224) + preprocess = transforms.Compose( + [ + transforms.Resize(image_shape), + transforms.ToTensor(), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + + return list(imagenet_data) + + +class EfficientSAMDataset(Dataset): + def __init__(self, dataset_path, data_size=1) -> None: + self.to_tensor = transforms.ToTensor() + dataset = load_dataset(dataset_path) + self.inputs = self.get_val_dataset(dataset, data_size) + self.data_size = data_size + + def get_val_dataset(self, dataset, data_size): + imgs, pt_prompts, pt_labels = [], [], [] + for i, data in enumerate(dataset): + if i >= data_size: + break + img = data[0] + h, w = img.shape[-2:] + + # Assuming the main object usually appears in the middle of the image, this default value is set for better demo visualization. + # Users can modify/add the point prompt here. + pt_prompt = torch.tensor([[w / 2, (h * 2 / 3)]], dtype=torch.float32)[ + None, ... + ] + # Users can increase the tensor size by adding more labels (0 for negative samples, 1 for positive samples) to label the corresponding points. + # The default label is [[1]], indicating that the point is a positive sample. 
+ pt_label = torch.tensor([[1]], dtype=torch.float32) + + imgs.append(img) + pt_prompts.append(pt_prompt) + pt_labels.append(pt_label) + + imgs = torch.stack(imgs) + pt_prompts = torch.stack(pt_prompts) + pt_labels = torch.stack(pt_labels) + inputs = (imgs, pt_prompts, pt_labels) + return inputs + + def __getitem__(self, idx): + return self.inputs[0][idx], self.inputs[1][idx], self.inputs[2][idx] + + def __len__(self): + return self.data_size + + +def get_dataset(dataset_path, data_size=1): + + dataset = EfficientSAMDataset(dataset_path, data_size=data_size) + dataloader = DataLoader(dataset) + + # prepare input data + inputs, input_list = [], "" + for index, data in enumerate(dataloader): + if index >= data_size: + break + inputs.append(tuple(data)) + num_feature = len(data) + for idx, _ in enumerate(data): + input_name = f"input_{index}_{idx}.raw" + input_list += input_name + " " if idx < num_feature - 1 else input_name + + input_list = input_list + "\n" + + return inputs, input_list + + +def source_transform( + model, transforms: List[Callable[[torch.nn.Module], torch.nn.Module]] +): + for transform in transforms: + model = transform(model) + return model + + +def get_instance(args): + import sys + + sys.path.insert(0, args.oss_repo) + from efficient_sam.efficient_sam import build_efficient_sam + + ckpt = args.pretrained_weight + file_path, file_extension = os.path.splitext(ckpt) + file_dir, filename = os.path.split(file_path) + + if file_extension == ".zip": + with zipfile.ZipFile(ckpt, "r") as zip_ref: + zip_ref.extractall(file_dir) + ckpt = file_path + filename = os.path.splitext(filename)[0] + + model_arch = filename.split("_")[-1] + + if model_arch == "vitt": + encoder_patch_embed_dim, encoder_num_heads = (192, 3) + elif model_arch == "vits": + encoder_patch_embed_dim, encoder_num_heads = (384, 6) + else: + raise ValueError(f"Unsupported model architecture: {model_arch}") + + model = build_efficient_sam( + encoder_patch_embed_dim=encoder_patch_embed_dim, + encoder_num_heads=encoder_num_heads, + checkpoint=ckpt, + ).eval() + + return model + + +def generate_mask(predicted_logits, predicted_iou): + sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True) + predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2) + predicted_logits = torch.take_along_dim( + predicted_logits, sorted_ids[..., None, None], dim=2 + ) + + # The masks are already sorted by their predicted IOUs. + # We use the first mask. 
+ mask = torch.ge(predicted_logits[0, 0, 0, :, :], 0).cpu().detach().numpy() + return mask + + +def save_mask(mask, input, save_path): + image, prompt, pt_label = input + original_image_tensor = image[0] + + # Convert tensor to numpy array if necessary + if not isinstance(original_image_tensor, np.ndarray): + original_image_tensor = original_image_tensor.detach().numpy() + + # Transpose if the image has 3 channels + if original_image_tensor.shape[0] == 3: + original_image_tensor = original_image_tensor.transpose(1, 2, 0) + + original_img = Image.fromarray( + (original_image_tensor * 255).astype(np.uint8) + ).convert("RGBA") + + # Create an empty RGBA image for the mask + mask_img = np.ones((mask.shape[0], mask.shape[1], 4)) + mask_img[:, :, 3] = 0 + + colors = [ + [1, 0, 0, 0.5], + [0, 1, 0, 0.5], + [0, 0, 1, 0.5], + [1, 1, 0, 0.5], + [1, 0, 1, 0.5], + [0, 1, 1, 0.5], + ] + + # Apply mask + labeled_mask, num_feature = label(mask) + for i in range(1, num_feature + 1): + mask_img[labeled_mask == i] = colors[(i - 1) % len(colors)] + + mask_img = Image.fromarray((mask_img * 255).astype(np.uint8), "RGBA") + + # Combine original image with mask + combined_img = Image.alpha_composite(original_img, mask_img) + + # Draw prompts point ("green" for positive samples, "red" for negative samples) + draw = ImageDraw.Draw(combined_img) + for pt, l in zip(prompt[0][0], pt_label[0][0]): + color = "green" if l else "red" + point_size = 3 + x1, y1 = max(0, int(pt[0]) - point_size), max(0, int(pt[1]) - point_size) + x2, y2 = min(combined_img.size[0], int(pt[0]) + point_size), min( + combined_img.size[1], int(pt[1]) + point_size + ) + draw.ellipse((x1, y1, x2, y2), fill=color, outline=color) + + combined_img.save(save_path) + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + + data_size = 1 + inputs, input_list = get_dataset(args.dataset, data_size) + assert args.pretrained_weight, "Checkpoint params can't be empty" + + # Get the EfficientSAM model. 
+ model = get_instance(args) + model = source_transform( + model, + [ + replace_maskdecoder_with_custom_op, + replace_pos_emb_with_custom_op, + ], + ) + + pte_filename = "efficientSAM_qnn" + + # lower to QNN + passes_job = get_capture_program_passes() + passes_job[ConvertUpsampleBicubicWithBilinear][QCOM_PASS_ACTIVATE_KEY] = True + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" + pte_path = f"{args.artifact}/{pte_filename}.pte" + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + outputs = [] + + def post_process(): + for i, f in enumerate(sorted(os.listdir(output_data_folder))): + filename = os.path.join(output_data_folder, f) + output = np.fromfile(filename, dtype=np.float32) + output_shape = [1, 1, 3] if i % 2 else [1, 1, 3, 224, 224] + output = torch.from_numpy(output).reshape(output_shape) + outputs.append(output) + + adb.pull(output_path=args.artifact, callback=post_process) + + # MIoU analysis + miou = 0 + targets = [model(img, pt, pt_label) for img, pt, pt_label in inputs] + for i in range(data_size): + pred_mask = generate_mask(outputs[i * 2], outputs[i * 2 + 1]) + save_mask(pred_mask, inputs[i], f"{args.artifact}/output_{i}.png") + target_mask = generate_mask(targets[i][0], targets[i][1]) + miou += class_agnostic_mIoU([pred_mask], [target_mask]) + miou /= data_size + + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"MIoU": miou})) + else: + print(f"MIoU->{miou}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./EfficientSAM_qnn", + default="./EfficientSAM_qnn", + type=str, + ) + + parser.add_argument( + "--pretrained_weight", + help="Path to ESAM checkpoint, such as ./efficient_sam_vitt.pt or ./efficient_sam_vits.pt.zip", + type=str, + required=True, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. 
--dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "--oss_repo", + help="Path to clone https://github.com/yformer/EfficientSAM", + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py new file mode 100644 index 00000000000..fd54a727136 --- /dev/null +++ b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from executorch.examples.qualcomm.oss_scripts.efficientSAM.source_transformation.mask_decoder import ( + replace_maskdecoder_with_custom_op, +) +from executorch.examples.qualcomm.oss_scripts.efficientSAM.source_transformation.pos_emb import ( + replace_pos_emb_with_custom_op, +) + + +__all__ = [ + replace_maskdecoder_with_custom_op, + replace_pos_emb_with_custom_op, +] diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py new file mode 100644 index 00000000000..c70d51a48fe --- /dev/null +++ b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py @@ -0,0 +1,125 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Tuple + +import torch +import torch.nn as nn + + +class MaskDecoderCustom(nn.Module): + def forward( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + multimask_output: bool, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Arguments: + image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W] + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings (the batch dimension is broadcastable). + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + multimask_output (bool): Whether to return multiple masks or a single + mask. 
+ + Returns: + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality + """ + + ( + batch_size, + max_num_queries, + sparse_embed_dim_1, + sparse_embed_dim_2, + ) = sparse_prompt_embeddings.shape + + ( + _, + image_embed_dim_c, + image_embed_dim_h, + image_embed_dim_w, + ) = image_embeddings.shape + + # QNN don't support dim greater than 4 + image_embeddings_expanded = image_embeddings.expand(max_num_queries, -1, -1, -1) + image_embeddings_tiled = image_embeddings_expanded.contiguous().view( + batch_size * max_num_queries, + image_embed_dim_c, + image_embed_dim_h, + image_embed_dim_w, + ) + sparse_prompt_embeddings = sparse_prompt_embeddings.reshape( + batch_size * max_num_queries, sparse_embed_dim_1, sparse_embed_dim_2 + ) + masks, iou_pred = self.predict_masks( + image_embeddings=image_embeddings_tiled, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + ) + if multimask_output and self.num_multimask_outputs > 1: + return masks[:, 1:, :], iou_pred[:, 1:] + else: + return masks[:, :1, :], iou_pred[:, :1] + + def predict_masks( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predicts masks. See 'forward' for more details.""" + # Concatenate output tokens + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight], dim=0 + ) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1 + ) + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + # Expand per-image data in batch direction to be per-mask + # QNN don't support dim greater than 4, + pos_src = image_pe.expand([tokens.shape[0]] + [*image_pe.shape[1:]]) + b, c, h, w = image_embeddings.shape + hs, src = self.transformer(image_embeddings, pos_src, tokens) + iou_token_out = hs[:, 0, :] + mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + upscaled_embedding = src.transpose(1, 2).view(b, c, h, w) + + for upscaling_layer in self.final_output_upscaling_layers: + upscaled_embedding = upscaling_layer(upscaled_embedding) + hyper_in_list: List[torch.Tensor] = [] + for i, output_hypernetworks_mlp in enumerate(self.output_hypernetworks_mlps): + hyper_in_list.append(output_hypernetworks_mlp(mask_tokens_out[:, i, :])) + hyper_in = torch.stack(hyper_in_list, dim=1) + b, c, h, w = upscaled_embedding.shape + masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + return masks, iou_pred + + +def _replace_maskdecoder_with_custom_op(module: torch.nn.Module): + from efficient_sam.efficient_sam_decoder import MaskDecoder # B007 + + for _, child in module.named_children(): + if isinstance(child, MaskDecoder): + child.forward = MaskDecoderCustom.forward.__get__(child, MaskDecoder) + child.predict_masks = MaskDecoderCustom.predict_masks.__get__( + child, MaskDecoder + ) + else: + _replace_maskdecoder_with_custom_op(child) + + +def replace_maskdecoder_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: + + _replace_maskdecoder_with_custom_op(module) + return module diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py new file mode 100644 index 00000000000..7a3a91c7607 --- /dev/null +++ 
b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py @@ -0,0 +1,64 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import Tuple + +import numpy as np + +import torch +import torch.nn as nn + + +class PositionEmbeddingRandomCustom(nn.Module): + """ + Positional encoding using random spatial frequencies. + """ + + def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor: + """Positionally encode points that are normalized to [0,1].""" + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coords = 2 * coords - 1 + coords = coords.unsqueeze(0) + coords = torch.matmul( + coords, self.positional_encoding_gaussian_matrix.unsqueeze(0) + ) + coords = coords.squeeze(0) + coords = 2 * np.pi * coords + # outputs d_1 x ... x d_n x C shape + return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1) + + def forward_with_coords( + self, coords_input: torch.Tensor, image_size: Tuple[int, int] + ) -> torch.Tensor: + """Positionally encode points that are not normalized to [0,1].""" + coords = coords_input.clone() + coords_0 = coords[:, :, 0] / image_size[1] + coords_1 = coords[:, :, 1] / image_size[0] + coords = torch.stack((coords_0, coords_1), dim=-1) + + return self._pe_encoding(coords.to(torch.float)) # B x N x C + + +def _replace_pos_emb_with_custom_op(module: torch.nn.Module): + from efficient_sam.efficient_sam_decoder import PositionEmbeddingRandom # B007 + + for _, child in module.named_children(): + if isinstance(child, PositionEmbeddingRandom): + child._pe_encoding = PositionEmbeddingRandomCustom._pe_encoding.__get__( + child, PositionEmbeddingRandom + ) + child.forward_with_coords = ( + PositionEmbeddingRandomCustom.forward_with_coords.__get__( + child, PositionEmbeddingRandom + ) + ) + else: + _replace_pos_emb_with_custom_op(child) + + +def replace_pos_emb_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: + + _replace_pos_emb_with_custom_op(module) + return module diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py index 501ea522acd..ee062735fbd 100644 --- a/examples/qualcomm/oss_scripts/fastvit.py +++ b/examples/qualcomm/oss_scripts/fastvit.py @@ -101,20 +101,22 @@ def main(args): ), ) # rewrite default per-channel ptq config - quantizer.per_channel_quant_config = QuantizationConfig( + quantizer.default_quant_config.per_channel_quant_config = QuantizationConfig( input_activation=act_qspec, output_activation=act_qspec, weight=weight_qspec, bias=_derived_bias_quant_spec, ) + # rewrite default ptq config - q_config = quantizer.bit8_quant_config - quantizer.bit8_quant_config = QuantizationConfig( + q_config = quantizer.default_quant_config.quant_config + quantizer.default_quant_config.quant_config = QuantizationConfig( input_activation=act_qspec, output_activation=act_qspec, weight=q_config.weight, bias=q_config.bias, ) + # lower to QNN passes_job = get_capture_program_passes() passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index cd468eebb26..3ee2d3789e4 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -14,7 +14,7 @@ Hybrid Mode: Hybrid mode leverages the strengths of both AR-N model and KV cache - AR-N model: The 
auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce. Use it to process the prompt and generate the key-value (kv) cache, which serves as a prompt processor in hybrid mode. - Prompt processing with AR-N model:

- Prompt Processing With AR-N Model + Prompt Processing With AR-N Model
Prompt processing is done using a for-loop. An N-token block is taken, and the KV cache is updated for that block. This process is repeated until all tokens are consumed, with the last block potentially requiring padding. For flexibility, the AR-N model can handle any input length less than the maximum sequence length. For TTFT, the input length (or number of blocks) will vary depending on the actual input length, rather than always being the same.
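
As a rough illustration of the loop described above, the sketch below processes a prompt in AR_LEN-sized blocks. The names `ar_n_model`, `kv_cache`, `AR_LEN`, and `PAD_ID` are placeholders chosen for this example, not the actual runner API in this repository.

```python
# Illustrative sketch only; the real AR-N runner and its I/O names differ.
from typing import Any, List, Tuple

AR_LEN = 32   # tokens consumed per AR-N forward pass (assumed value)
PAD_ID = 0    # padding token id (assumed value)


def process_prompt(prompt_tokens: List[int], ar_n_model, kv_cache) -> Tuple[Any, Any]:
    """Consume the prompt in AR_LEN-sized blocks, updating the KV cache per block."""
    logits = None
    for start in range(0, len(prompt_tokens), AR_LEN):
        block = prompt_tokens[start : start + AR_LEN]
        # Only the last block may be shorter than AR_LEN, so pad it.
        block = block + [PAD_ID] * (AR_LEN - len(block))
        # One forward pass emits the block's logits and its KV-cache entries.
        logits, kv_cache = ar_n_model(block, kv_cache)
    return logits, kv_cache
```

The logits of the last real prompt token then seed the KV cache model, which generates the remaining tokens one at a time.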
@@ -28,7 +28,7 @@ Hybrid Mode: Hybrid mode leverages the strengths of both AR-N model and KV cache ### Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. ### Step 2: Prepare Model @@ -41,7 +41,7 @@ wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" # tokenizer.bin: -python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin # params.json: echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json @@ -70,14 +70,14 @@ We have two distinct mechanisms for updating the key-value (KV) cache, which can #### Shift Pointer mechanism
- Shift Pointer mechanism
+ Shift Pointer mechanism
The figure illustrates the process of updating the key and value caches during each inference step. In the key cache update process, we initially allocate memory for each layer with num_head buffers of size (head_dim + 1) * (seq_len - 1). After a single inference, the new key cache is copied from the key output pointer k_out and appended to the key cache. Subsequently, the buffer start pointer of the key cache, k_in, moves to the next token, making the previous position of the buffer start pointer unused. This process is repeated for each subsequent inference step. For the value cache update process, we first allocate a contiguous memory region of size (num_head + 1) * head_dim * (seq_len - 1) for each layer, with the last head reserved for I/O shifting. After the first inference, the cache is updated by simply shifting the pointers of all heads to the next token position, making only the previous head_dim * 1 section of the buffer start pointer v_in of the first head unused. This process is repeated for each subsequent inference step.
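
The toy NumPy sketch below shows only the pointer-advance idea for a single head's key cache; the names (`k_buffer`, `k_in`) and the flat layout are simplifications and do not reflect the actual QNN shared-buffer layout described above.

```python
# Toy illustration of the shift-pointer update; not the actual runner code.
import numpy as np

head_dim = 4
cache_len = 7   # plays the role of seq_len - 1 (assumed value)
max_steps = 8   # extra columns so the window can keep sliding without reallocation

# One flat buffer per head; only the window [k_in, k_in + cache_len) is fed to the model.
k_buffer = np.zeros((head_dim, cache_len + max_steps), dtype=np.float32)
k_in = 0


def append_and_shift(k_out_column: np.ndarray) -> np.ndarray:
    """Copy this step's key output next to the current window, then advance k_in."""
    global k_in
    k_buffer[:, k_in + cache_len] = k_out_column  # append the new token's key
    k_in += 1                                     # the column just before the new window is now unused
    return k_buffer[:, k_in : k_in + cache_len]   # window consumed by the next inference
```

Per the description above, the value cache goes one step further: after the first inference it avoids even the copy and only shifts the per-head pointers to the next token position.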
#### Smart Mask mechanism
- Smart Mask mechanism + Smart Mask mechanism
The Smart Mask mechanism streamlines the process of updating tokens in the cache. Unlike the Shift Pointer mechanism, which requires moving the buffer start pointer k_in/v_in of the cache, the Smart Mask mechanism updates only the new token at the specified position. This approach eliminates the need to adjust the buffer start pointer. This mechanism is beneficial for shared buffers but requires CPU memory copying.
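
For contrast, a minimal sketch of a smart-mask style update is shown below. The array names and shapes are assumptions made for illustration, and the attention-mask handling is inferred from the mechanism's name; the exact details live in the runner.

```python
# Toy illustration of the smart-mask update; not the actual runner code.
import numpy as np

head_dim, max_seq_len = 4, 16
k_cache = np.zeros((head_dim, max_seq_len), dtype=np.float32)
# Every position starts masked out; 0.0 means "may attend to this position".
attn_mask = np.full(max_seq_len, -np.inf, dtype=np.float32)


def smart_mask_update(k_out_column: np.ndarray, pos: int) -> None:
    """Write the new token's key at its fixed position and unmask that position."""
    k_cache[:, pos] = k_out_column  # CPU memory copy into the shared buffer
    attn_mask[pos] = 0.0            # no buffer start pointer ever moves
```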
diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS index e4bad10a234..024b45b65cd 100644 --- a/examples/qualcomm/oss_scripts/llama/TARGETS +++ b/examples/qualcomm/oss_scripts/llama/TARGETS @@ -1,5 +1,5 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") @@ -48,7 +48,7 @@ python_binary( runtime.command_alias( name = "llama_qnn", env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), # Place holder to pass the QNN_SDK_ROOT check in executorch/examples/qualcomm/utils.py "QNN_SDK_ROOT": "", }, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 7e1b6872882..375edf9fb6c 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -390,6 +390,14 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): fx_graph_module = torch.export.export( self.llama_graph_module, self.inputs, strict=True ).module() + + if quant_dtype == QuantDtype.use_16a4w_block: + conv_nodes = [ + n for n in fx_graph_module.graph.nodes if "conv" in n.name + ] + block_size_map = {n.name: (1, 64, 1, 1) for n in conv_nodes} + quantizer.set_block_size_map(block_size_map) + fx_graph_module = prepare_pt2e(fx_graph_module, quantizer) logging.info("Quantizing the model...") @@ -574,13 +582,14 @@ def permute(w, heads): fixed_point_type["kv_type"] = torch.uint8 if args.ptq == "8a8w": fixed_point_type["io_type"] = torch.uint8 - elif args.ptq == "16a4w": + elif args.ptq in ("16a4w", "16a4w_block"): fixed_point_type["io_type"] = torch.uint16 else: assert args.ptq in [ "8a8w", "16a4w", - ], f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." + "16a4w_block", + ], f"No support for quant type {args.ptq}. Support 8a8w, 16a4w and 16a4w_block." quant_dtype = getattr(QuantDtype, f"use_{args.ptq}") assert args.tokenizer_model is not None, "Need tokenizer model for calibration" @@ -954,7 +963,7 @@ def _build_parser(): parser.add_argument( "-P", "--ptq", - help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. Support 8a8w and 16a4w.", + help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. 
Support 8a8w, 16a4w and 16a4w_block.", type=str, ) diff --git a/examples/qualcomm/oss_scripts/llama/targets.bzl b/examples/qualcomm/oss_scripts/llama/targets.bzl index c3f7e7fbbda..a67281e7e66 100644 --- a/examples/qualcomm/oss_scripts/llama/targets.bzl +++ b/examples/qualcomm/oss_scripts/llama/targets.bzl @@ -1,5 +1,5 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_oss_build_kwargs", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") def define_common_targets(): runtime.cxx_library( @@ -20,7 +20,7 @@ def define_common_targets(): "//executorch/extension/llm/runner:stats", "//executorch/extension/tensor:tensor", "//executorch/kernels/quantized:generated_lib", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], exported_deps = [ "//executorch/extension/module:module", diff --git a/examples/qualcomm/oss_scripts/moshi/mimi.py b/examples/qualcomm/oss_scripts/moshi/mimi.py new file mode 100644 index 00000000000..6b59a71ae64 --- /dev/null +++ b/examples/qualcomm/oss_scripts/moshi/mimi.py @@ -0,0 +1,402 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# import argparse +import io +import json +import os +import random +from multiprocessing.connection import Client + +import numpy as np +import requests + +import sphn +import torch + +import torch.nn as nn +import torchaudio + +from executorch.backends.qualcomm.quantizer.custom_annotation import ( + annotate_mimi_decoder, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + make_output_dir, + make_quantizer, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) + +from huggingface_hub import hf_hub_download +from moshi.models import loaders + +from torch.ao.quantization.observer import MinMaxObserver + + +def seed_all(seed): + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # for multi-GPU setups + random.seed(seed) + np.random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def read_mp3_from_url(url): + response = requests.get(url) + response.raise_for_status() # Ensure request is successful + + # Convert to a file-like object + audio_stream = io.BytesIO(response.content) + + # Load audio using torchaudio + waveform, sample_rate = torchaudio.load(audio_stream, format="mp3") + + return waveform.numpy(), sample_rate + + +def compute_scores(cpu_decode_res: torch.Tensor, htp_decode_res: torch.Tensor): + assert cpu_decode_res.shape == htp_decode_res.shape, "Tensor shapes do not match" + abs_diff = torch.abs(cpu_decode_res - htp_decode_res) + atol = torch.max(abs_diff) + print("Atol: ", atol) + + cpu_decode_res = cpu_decode_res.float() + htp_decode_res = htp_decode_res.float() + error = cpu_decode_res - htp_decode_res + original_power = torch.mean(torch.pow(cpu_decode_res, 2)) + error_power = torch.mean(torch.pow(error, 2)) + sqnr = 10 * torch.log10(original_power / error_power) + print("SQNR: ", sqnr) + + +def 
test_decoder_with_emb_input(mimi, args): + class MimiDecode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + x = x.transpose(1, 2) + x = self.mimi_model.upsample(x) + (emb,) = self.mimi_model.decoder_transformer(x) + emb.transpose(1, 2) + with self.mimi_model._context_for_encoder_decoder: + out = self.mimi_model.decoder(emb) + return out + + emb_input = torch.rand(1, 1, 512, device="cpu") + mimi_decode = MimiDecode(mimi).eval() + cpu_res = mimi_decode(emb_input) + pte_filename = "mimi_decoder_emb_qnn" + + quantizer = make_quantizer( + quant_dtype=QuantDtype.use_16a8w, + per_channel_conv=True, + per_channel_linear=True, + act_observer=MinMaxObserver, + ) + quantizer.add_custom_quant_annotations((annotate_mimi_decoder,)) + + emb_inputs = [(emb_input,)] + build_executorch_binary( + mimi_decode, + emb_inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + emb_inputs, + custom_quantizer=quantizer, + quant_dtype=QuantDtype.use_16a8w, + shared_buffer=args.shared_buffer, + ) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + ) + adb.push(inputs=emb_inputs, input_list="input_0_0.raw\n") + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + emb_predictions = [] + for i in range(len(emb_inputs)): + np_arr = np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + emb_predictions.append(torch.from_numpy(np_arr).view(1, 1, 1920)) + print("Emb input test results") + compute_scores(cpu_res, emb_predictions[0]) + + +def mimi_encode( + mimi, + encode_inputs, + encoder_input_list, + pcm_chunk_size, + skip_node_id_set, + skip_node_op_set, +) -> torch.Tensor: + class MimiEncode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.encode(x) + + mimi_encode_model = MimiEncode(mimi) + + pte_filename = "mimi_encoder_qnn" + build_executorch_binary( + mimi_encode_model.eval(), + encode_inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + encode_inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, + ) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + ) + adb.push(inputs=encode_inputs, input_list=encoder_input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + encoder_predictions = [] + # Num chunks should align with args.chunks_per_batch + num_chunks = encode_inputs[0][0].shape[-1] // pcm_chunk_size + for i in range(len(encode_inputs)): + np_arr = np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.int64 + ) + encoder_predictions.append(torch.from_numpy(np_arr).view(1, 8, num_chunks)) + return encoder_predictions 
+ + +def mimi_decode( + mimi, encode_res_list, pcm_chunk_size, skip_node_id_set, skip_node_op_set +) -> torch.Tensor: + class MimiDecode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.decode(x) + + mimi_decode_model = MimiDecode(mimi) + decode_inputs, decode_input_list = [], "" + for index, encoder_res in enumerate(encode_res_list): + decode_inputs.append((encoder_res.to(torch.int32),)) + decode_input_list += f"input_{index}_0.raw\n" + + pte_filename = "mimi_decoder_qnn" + + quantizer = make_quantizer( + quant_dtype=QuantDtype.use_16a8w, + per_channel_conv=True, + per_channel_linear=True, + act_observer=MinMaxObserver, + ) + quantizer.add_custom_quant_annotations((annotate_mimi_decoder,)) + + build_executorch_binary( + mimi_decode_model.eval(), + decode_inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + decode_inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + custom_quantizer=quantizer, + quant_dtype=QuantDtype.use_16a8w, + shared_buffer=args.shared_buffer, + ) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + ) + adb.push(inputs=decode_inputs, input_list=decode_input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + decoder_predictions = [] + # Num chunks should align with args.chunks_per_batch + num_chunks = decode_inputs[0][0].shape[-1] + shape = num_chunks * pcm_chunk_size + for i in range(len(decode_inputs)): + np_arr = np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + decoder_predictions.append(torch.from_numpy(np_arr).view(1, 1, shape)) + htp_decode_res = torch.cat(decoder_predictions, dim=-1) + + return htp_decode_res + + +def export_mimi(mimi, args, max_duration_sec=10.0): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + os.makedirs(args.artifact, exist_ok=True) + + if args.emb_input_test: + test_decoder_with_emb_input(mimi, args) + return + + sample_rate = mimi.sample_rate + url = "https://huggingface.co/lmz/moshi-swift/resolve/main/bria-24khz.mp3" + sample_pcm, sample_sr = read_mp3_from_url(url) + sample_rate = mimi.sample_rate + sample_pcm = torch.tensor(sample_pcm, device="cpu") + max_duration_len = int(sample_rate * max_duration_sec) + if sample_pcm.shape[-1] > max_duration_len: + sample_pcm = sample_pcm[..., :max_duration_len] + sample_pcm = sample_pcm[None].to(device="cpu") + + encoder_inputs, encoder_input_list = [], "" + # 1920 chunk_size = 0.08sec + pcm_chunk_size = int(mimi.sample_rate / mimi.frame_rate) + batch_size = pcm_chunk_size * args.chunks_per_batch + count = 0 + for start_idx in range(0, sample_pcm.shape[-1], batch_size): + end_idx = min(sample_pcm.shape[-1], start_idx + batch_size) + chunk = sample_pcm[..., start_idx:end_idx] + encoder_inputs.append((chunk,)) + encoder_input_list += f"input_{count}_0.raw\n" + count += 1 + + print("streaming encoding...") + cpu_encode_res = mimi.encode(sample_pcm) + htp_encode_res = mimi_encode( + mimi, + encoder_inputs, + encoder_input_list, + pcm_chunk_size, + skip_node_id_set, + skip_node_op_set, + ) + + # Leave it here for now, 
uncomment this to check htp_encoder with cpu_decoder + # htp_res = torch.cat(htp_encode_res, dim=-1) + # cpu_decode_htp_encode = mimi.decode(htp_res) + # sphn.write_wav("cpu_decode_htp_encode.wav", cpu_decode_htp_encode[0, 0].cpu().numpy(), sample_rate) + + print("streaming decoding...") + cpu_decode_res = mimi.decode(cpu_encode_res) + # TODO: Enable streaming mode, which is the correct way to execute 1 chunk at a time. + # with mimi.streaming(1): + htp_decode_res = mimi_decode( + mimi, htp_encode_res, pcm_chunk_size, skip_node_id_set, skip_node_op_set + ) + compute_scores(cpu_decode_res, htp_decode_res) + + sphn.write_wav( + f"{args.artifact}/cpu_decode_res.wav", + cpu_decode_res[0, 0].cpu().numpy(), + sample_rate, + ) + sphn.write_wav( + f"{args.artifact}/htp_decode_res.wav", + htp_decode_res[0, 0].cpu().numpy(), + sample_rate, + ) + + +def main(args): + seed_all(42424242) + + print("loading mimi") + if args.mimi_weight is None: + args.mimi_weight = hf_hub_download(args.hf_repo, loaders.MIMI_NAME) + mimi = loaders.get_mimi(args.mimi_weight, "cpu") + print("mimi loaded") + + with torch.no_grad(): + export_mimi(mimi, args) + + +if __name__ == "__main__": + + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./mimi", + default="./mimi", + type=str, + ) + + parser.add_argument( + "--chunks_per_batch", + help="Number of chunks to process per time. Default is 1 chunk per batch, which equals to 0.08 second", + default=1, + type=int, + ) + + parser.add_argument( + "--emb_input_test", + help="This is just a metrics used to compute accuracy scores, not recommended for general users.", + action="store_true", + default=False, + ) + + parser.add_argument("--mimi-weight", type=str) + parser.add_argument("--hf-repo", type=str, default=loaders.DEFAULT_REPO) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/qaihub_scripts/llama/README.md b/examples/qualcomm/qaihub_scripts/llama/README.md index 0fec6ea867f..1e08b97bccb 100644 --- a/examples/qualcomm/qaihub_scripts/llama/README.md +++ b/examples/qualcomm/qaihub_scripts/llama/README.md @@ -12,14 +12,14 @@ Note that the pre-compiled context binaries could not be futher fine-tuned for o ### Instructions #### Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. #### Step2: Prepare Model 1. Create account for https://aihub.qualcomm.com/ 2. Follow instructions in https://huggingface.co/qualcomm/Llama-v2-7B-Chat to export context binaries (will take some time to finish) ```bash -# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model +# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main # tokenizer.bin: python -m examples.models.llama.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin ``` @@ -40,7 +40,7 @@ Note that the pre-compiled context binaries could not be futher fine-tuned for o ### Instructions #### Step 1: Setup 1. 
Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. #### Step2: Prepare Model 1. Create account for https://aihub.qualcomm.com/ @@ -54,4 +54,4 @@ Please refer to [Check context binary version](../../README.md#check-context-bin ```bash # AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v3_8b_chat_quantized python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?" -``` \ No newline at end of file +``` diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md index b008d3135d4..d2649cf72c2 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md @@ -11,7 +11,7 @@ The model architecture, scheduler, and time embedding are from the [stabilityai/ ### Instructions #### Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. #### Step2: Prepare Model 1. Download the context binaries for TextEncoder, UNet, and VAEDecoder under https://huggingface.co/qualcomm/Stable-Diffusion-v2.1/tree/main diff --git a/examples/qualcomm/qaihub_scripts/utils/README.md b/examples/qualcomm/qaihub_scripts/utils/README.md index 61f465f3926..996b72f7937 100644 --- a/examples/qualcomm/qaihub_scripts/utils/README.md +++ b/examples/qualcomm/qaihub_scripts/utils/README.md @@ -1,6 +1,6 @@ # CLI Tool for Compile / Deploy Pre-Built QNN Artifacts -An easy-to-use tool for generating / executing .pte program from pre-built model libraries / context binaries from Qualcomm AI Engine Direct. Tool is verified with [host environement](../../../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md#host-os). +An easy-to-use tool for generating / executing .pte program from pre-built model libraries / context binaries from Qualcomm AI Engine Direct. Tool is verified with [host environement](../../../../docs/source/backends-qualcomm.md#host-os). ## Description @@ -20,7 +20,7 @@ If users are interested in well-known applications, [Qualcomm AI HUB](https://ai ### Dependencies * Register for Qualcomm AI HUB. -* Download the corresponding QNN SDK via [link](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) which your favorite model is compiled with. Ths link will automatically download the latest version at this moment (users should be able to specify version soon, please refer to [this](../../../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md#software) for earlier releases). +* Download the corresponding QNN SDK via [link](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) which your favorite model is compiled with. 
Ths link will automatically download the latest version at this moment (users should be able to specify version soon, please refer to [this](../../../../docs/source/backends-qualcomm.md#software) for earlier releases). ### Target Model diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index fa7efc0c459..a12a5069c3f 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -24,7 +24,7 @@ from PIL import Image from torch.utils.data import Dataset -from torchsr.datasets import B100 +from torchsr.datasets import B100, Div2K from torchvision.transforms.functional import to_pil_image, to_tensor @@ -75,6 +75,16 @@ def get_b100( return SrDataset(hr_dir, lr_dir) +def get_Div2K( + dataset_dir: str, +): + hr_dir = f"{dataset_dir}/sr_bm_dataset/DIV2K/DIV2K_valid_HR" + lr_dir = f"{dataset_dir}/sr_bm_dataset/DIV2K/DIV2K_valid_LR_bicubic/X2" + if not os.path.exists(hr_dir) or not os.path.exists(lr_dir): + Div2K(root=f"{dataset_dir}/sr_bm_dataset", scale=2, download=True) + return SrDataset(hr_dir, lr_dir) + + def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str): if not (lr_dir and hr_dir) and not default_dataset: raise RuntimeError( @@ -85,7 +95,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str raise RuntimeError("Either use custom dataset, or use default dataset.") if default_dataset: - return get_b100(dataset_dir) + return get_Div2K(dataset_dir) return SrDataset(hr_dir, lr_dir) diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 47a489f6d52..bd0b6dfbcf2 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -23,7 +23,6 @@ make_output_dir, make_quantizer, parse_skip_delegation_node, - QnnPartitioner, setup_common_args_and_variables, SimpleADB, ) @@ -103,10 +102,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): from transformers import get_linear_schedule_with_warmup # grab dataset - url = ( - "https://raw.githubusercontent.com/susanli2016/NLP-with-Python" - "/master/data/title_conference.csv" - ) + url = "https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv" content = requests.get(url, allow_redirects=True).content data = pd.read_csv(BytesIO(content)) @@ -273,19 +269,15 @@ def calibrator(gm): quantizer = make_quantizer(quant_dtype=quant_dtype) backend_options = generate_htp_compiler_spec(quant_dtype is not None) - partitioner = QnnPartitioner( - generate_qnn_executorch_compiler_spec( - soc_model=getattr(QcomChipset, args.model), - backend_options=backend_options, - ), - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=getattr(QcomChipset, args.model), + backend_options=backend_options, ) # skip embedding layer cause it's quantization sensitive graph_module, _ = skip_annotation( nn_module=model, quantizer=quantizer, - partitioner=partitioner, + compiler_specs=compiler_specs, sample_input=inputs[0], calibration_cb=calibrator, fp_node_op_set={torch.ops.aten.embedding.default}, diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 2b2f32b037b..242170712e1 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -14,12 +14,16 @@ import tempfile from pathlib import Path -from typing import Callable, List, Optional +from typing import Callable, 
List, Optional, Tuple import numpy as np import torch -from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype +from executorch.backends.qualcomm.quantizer.quantizer import ( + ModuleQConfig, + QnnQuantizer, + QuantDtype, +) from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( generate_htp_compiler_spec, @@ -254,18 +258,23 @@ def qat_train(ori_model, captured_model, quantizer, dataset): def make_quantizer( quant_dtype: Optional[QuantDtype] = QuantDtype.use_8a8w, custom_annotations=(), - per_block_conv=False, per_channel_conv=True, per_channel_linear=False, act_observer=MovingAverageMinMaxObserver, is_qat=False, + submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None, ): quantizer = QnnQuantizer() quantizer.add_custom_quant_annotations(custom_annotations) - quantizer.set_per_block_conv_quant(per_block_conv) - quantizer.set_per_channel_conv_quant(per_channel_conv) - quantizer.set_per_channel_linear_quant(per_channel_linear) - quantizer.set_quant_config(quant_dtype, is_qat, act_observer) + quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=per_channel_conv, + is_linear_per_channel=per_channel_linear, + act_observer=act_observer, + ) + submodule_qconfig_list = submodule_qconfig_list or [] + quantizer.set_submodule_qconfig_list(submodule_qconfig_list) return quantizer @@ -279,7 +288,7 @@ def build_executorch_binary( skip_node_id_set=None, skip_node_op_set=None, quant_dtype: Optional[QuantDtype] = None, - custom_quantizer=None, + custom_quantizer: Optional[QnnQuantizer] = None, shared_buffer=False, metadata=None, dump_intermediate_outputs=False, @@ -316,8 +325,8 @@ def build_executorch_binary( shared_buffer=shared_buffer, dump_intermediate_outputs=dump_intermediate_outputs, ) - if quant_dtype is not None: - captured_model = torch.export.export(model, inputs, strict=True).module() + if quant_dtype is not None or custom_quantizer is not None: + captured_model = torch.export.export(model, inputs, strict=False).module() if qat_training_data: quantizer = custom_quantizer or make_quantizer( quant_dtype=quant_dtype, is_qat=True @@ -419,6 +428,15 @@ def histogram(golden, predict): return (pa, mpa, miou, cls_iou) +def class_agnostic_mIoU(predictions, targets): + total_iou = 0 + for pred, tar in zip(predictions, targets): + inter = np.count_nonzero(pred & tar) + union = np.count_nonzero(pred | tar) + total_iou += inter / (union + 1e-10) + return total_iou / len(predictions) + + def get_imagenet_dataset( dataset_path, data_size, image_shape, crop_size=None, shuffle=True ): diff --git a/examples/selective_build/README.md b/examples/selective_build/README.md index 6c655e18a3d..97706d70c48 100644 --- a/examples/selective_build/README.md +++ b/examples/selective_build/README.md @@ -3,7 +3,7 @@ To optimize binary size of ExecuTorch runtime, selective build can be used. This ## How to run -Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/stable/getting-started-setup). +Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/main/getting-started-setup). Run: diff --git a/examples/xnnpack/README.md b/examples/xnnpack/README.md index 179e47004a1..5c307d34717 100644 --- a/examples/xnnpack/README.md +++ b/examples/xnnpack/README.md @@ -1,8 +1,8 @@ # XNNPACK Backend [XNNPACK](https://github.com/google/XNNPACK) is a library of optimized neural network operators for ARM and x86 CPU platforms. 
Our delegate lowers models to run using these highly optimized CPU operators. You can try out lowering and running some example models in the demo. Please refer to the following docs for information on the XNNPACK Delegate -- [XNNPACK Backend Delegate Overview](https://pytorch.org/executorch/stable/native-delegates-executorch-xnnpack-delegate.html) -- [XNNPACK Delegate Export Tutorial](https://pytorch.org/executorch/stable/tutorial-xnnpack-delegate-lowering.html) +- [XNNPACK Backend Delegate Overview](https://pytorch.org/executorch/main/backends-xnnpack) +- [XNNPACK Delegate Export Tutorial](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering) ## Directory structure @@ -60,7 +60,7 @@ Now finally you should be able to run this model with the following command ``` ## Quantization -First, learn more about the generic PyTorch 2 Export Quantization workflow in the [Quantization Flow Docs](https://pytorch.org/executorch/stable/quantization-overview.html), if you are not familiar already. +First, learn more about the generic PyTorch 2 Export Quantization workflow in the [Quantization Flow Docs](https://pytorch.org/executorch/main/quantization-overview), if you are not familiar already. Here we will discuss quantizing a model suitable for XNNPACK delegation using XNNPACKQuantizer. diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py index 6db0d82a274..f67150169dc 100644 --- a/examples/xnnpack/aot_compiler.py +++ b/examples/xnnpack/aot_compiler.py @@ -87,14 +87,14 @@ model = model.eval() # pre-autograd export. eventually this will become torch.export - ep = torch.export.export_for_training(model, example_inputs) + ep = torch.export.export_for_training(model, example_inputs, strict=True) model = ep.module() if args.quantize: logging.info("Quantizing Model...") # TODO(T165162973): This pass shall eventually be folded into quantizer model = quantize(model, example_inputs, quant_type) - ep = torch.export.export_for_training(model, example_inputs) + ep = torch.export.export_for_training(model, example_inputs, strict=True) edge = to_edge_transform_and_lower( ep, diff --git a/examples/xnnpack/quantization/example.py b/examples/xnnpack/quantization/example.py index 3e30c239215..90a6b94d02b 100644 --- a/examples/xnnpack/quantization/example.py +++ b/examples/xnnpack/quantization/example.py @@ -60,7 +60,9 @@ def verify_xnnpack_quantizer_matching_fx_quant_model(model_name, model, example_ m = model # 1. pytorch 2.0 export quantization flow (recommended/default flow) - m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() + m = torch.export.export_for_training( + m, copy.deepcopy(example_inputs), strict=True + ).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config(is_per_channel=True) quantizer.set_global(quantization_config) @@ -177,7 +179,9 @@ def main() -> None: model = model.eval() # pre-autograd export. 
eventually this will become torch.export - model = torch.export.export_for_training(model, example_inputs).module() + model = torch.export.export_for_training( + model, example_inputs, strict=True + ).module() start = time.perf_counter() quantized_model = quantize(model, example_inputs) end = time.perf_counter() diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index ab2e66f7885..310e5ea9379 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -8,8 +8,9 @@ import copy import logging from contextlib import contextmanager, nullcontext +from dataclasses import dataclass from functools import singledispatch -from typing import Generator, List +from typing import Dict, Generator, List, Mapping import torch @@ -36,7 +37,7 @@ update_to_real_program, ) from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param -from torch.export import ExportedProgram +from torch.export.exported_program import ExportedProgram, InputSpec, OutputSpec @singledispatch @@ -190,6 +191,65 @@ def _get_node_list_with_same_tag( return node_list +def _insert_lowered_submodule( + submodule_program: ExportedProgram, + owning_program: ExportedProgram, + call_submodule_node: torch.fx.Node, + submodule_output_node: torch.fx.Node, + lowered_module: LoweredBackendModule, + is_submodule: bool, + toplevel_input_specs_to_delete: Dict[str, InputSpec], + toplevel_output_specs_to_delete: Dict[str, OutputSpec], +): + owning_graph_module = call_submodule_node.graph.owning_module + # call delegate args should only use user_inputs + call_delegate_args = [] + # Preserve input order as user_inputs + for inp_name in submodule_program.graph_signature.user_inputs: + for inp_node in call_submodule_node.all_input_nodes: + if inp_node.name == inp_name: + call_delegate_args.append(inp_node) + break + + def generate_debug_handle(ep: ExportedProgram) -> int: + """ + Generate a debug handle for the given ExportedProgram. 
+ """ + debug_handle = 0 + for node in ep.graph_module.graph.nodes: + debug_handle = max(debug_handle, node.meta.get("debug_handle", 0)) + return debug_handle + 1 + + # Replace the partitioned submodule with a lowered submodule + # Add call_method node with function "forward" + with owning_graph_module.graph.inserting_before(call_submodule_node): + lowered_name = get_lowered_module_name(owning_graph_module, lowered_module) + lowered_node = owning_graph_module.graph.get_attr(lowered_name) + call_delegate_node = owning_graph_module.graph.call_function( + executorch_call_delegate, + (lowered_node,) + tuple(call_delegate_args), + call_submodule_node.kwargs, + ) + call_delegate_node.meta["debug_handle"] = generate_debug_handle(owning_program) + call_delegate_node.meta["val"] = submodule_output_node.meta["val"] + call_submodule_node.replace_all_uses_with(call_delegate_node) + owning_graph_module.graph.erase_node(call_submodule_node) + + if is_submodule: + assert len(toplevel_input_specs_to_delete) == 0 + assert len(toplevel_output_specs_to_delete) == 0 + elif ( + len(toplevel_input_specs_to_delete) > 0 + or len(toplevel_output_specs_to_delete) > 0 + ): + _unsafe_adjust_original_program( + owning_program, + call_delegate_node, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) + + def _partition_and_lower_one_graph_module( tagged_graph_module: torch.fx.GraphModule, partition_result: PartitionResult, @@ -254,56 +314,16 @@ def _partition_and_lower_one_graph_module( delegation_spec.compile_specs, ) - # call delegate args should only use user_inputs - call_delegate_args = [] - # Preserve input order as user_inputs - for inp_name in submodule_program.graph_signature.user_inputs: - for inp_node in call_module_node.all_input_nodes: - if inp_node.name == inp_name: - call_delegate_args.append(inp_node) - break - - def generate_debug_handle(ep: ExportedProgram) -> int: - """ - Generate a debug handle for the given ExportedProgram. 
- """ - debug_handle = 0 - for node in ep.graph_module.graph.nodes: - debug_handle = max(debug_handle, node.meta.get("debug_handle", 0)) - return debug_handle + 1 - - # Replace the partitioned submodule with a lowered submodule - # Add call_method node with function "forward" - with tagged_graph_module.graph.inserting_before(call_module_node): - lowered_name = get_lowered_module_name( - tagged_graph_module, lowered_submodule - ) - lowered_node = tagged_graph_module.graph.get_attr(lowered_name) - call_delegate_node = tagged_graph_module.graph.call_function( - executorch_call_delegate, - (lowered_node,) + tuple(call_delegate_args), - call_module_node.kwargs, - ) - call_delegate_node.meta["debug_handle"] = generate_debug_handle( - owning_program - ) - call_delegate_node.meta["val"] = submodule_output_node.meta["val"] - call_module_node.replace_all_uses_with(call_delegate_node) - tagged_graph_module.graph.erase_node(call_module_node) - - if is_submodule: - assert len(toplevel_input_specs_to_delete) == 0 - assert len(toplevel_output_specs_to_delete) == 0 - elif ( - len(toplevel_input_specs_to_delete) > 0 - or len(toplevel_output_specs_to_delete) > 0 - ): - _unsafe_adjust_original_program( - owning_program, - call_delegate_node, - toplevel_input_specs_to_delete, - toplevel_output_specs_to_delete, - ) + _insert_lowered_submodule( + submodule_program, + owning_program, + call_module_node, + submodule_output_node, + lowered_submodule, + is_submodule, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) return tagged_graph_module @@ -417,3 +437,330 @@ def to_backend( constants=tagged_exported_program.constants, verifiers=[tagged_exported_program.verifier], ) + + +def _create_partitions_in_graph_module( + tagged_graph_module: torch.fx.GraphModule, + partition_result: PartitionResult, + owning_program: ExportedProgram, + is_submodule: bool, +) -> Dict[str, List[torch.fx.Node]]: + backend_id_to_submodule_name = {} + for tag, delegation_spec in partition_result.partition_tags.items(): + # Create partition with nodes containing this tag. 
There should only be + # one contained submodule per tag + node_list = _get_node_list_with_same_tag( + tagged_graph_module, tag, owning_program + ) + + if len(node_list) == 0: + logging.debug(f"Did not find any nodes for tag {tag}") + continue + + logging.debug(f"For tag {tag}, found nodes {node_list}") + # Tag the nodes that are params as buffers, so we can order the submodule as (Parms + Buffers) (User Inputs) + + replace_ctx = ( + tagged_graph_module._set_replace_hook( + owning_program.graph_signature.get_replace_hook() + ) + if not is_submodule + else nullcontext() + ) + with replace_ctx: + submodule, call_module_node = create_submodule_from_nodes( + tagged_graph_module, node_list, tag + ) + + tagged_graph_module_output_node = [ + node for node in tagged_graph_module.graph.nodes if node.op == "output" + ][0] + submodule_output_node = [ + node for node in submodule.graph.nodes if node.op == "output" + ][0] + # Copy the output node meta from the original output node, because + # create_submodule_from_nodes doesn't cover the meta field + submodule_output_node.meta = tagged_graph_module_output_node.meta + logging.debug(f"Partitioned graph module: {tagged_graph_module}") + ( + submodule_program, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) = create_exported_program_from_submodule( + submodule, + owning_program, + tag, + call_module_node, + is_submodule, + ) + call_module_node.meta["backend_id"] = delegation_spec.backend_id + call_module_node.meta["compile_spec"] = delegation_spec.compile_specs + call_module_node.meta["submodule_program"] = submodule_program + call_module_node.meta["toplevel_input_specs_to_delete"] = ( + toplevel_input_specs_to_delete + ) + call_module_node.meta["toplevel_output_specs_to_delete"] = ( + toplevel_output_specs_to_delete + ) + call_module_node.meta["is_submodule"] = is_submodule + call_module_node.meta["submodule_output_node"] = submodule_output_node + + if delegation_spec.backend_id not in backend_id_to_submodule_name: + backend_id_to_submodule_name[delegation_spec.backend_id] = [] + + # The call_module_node created here might not be the same node instance as + # the one in the final graph module. This is because this node might be replaced + # in future edits to the graph. 
As a result, we just keep track of the node's name + # and at the end we search for this node in our final graph module + backend_id_to_submodule_name[delegation_spec.backend_id].append( + call_module_node.target + ) + + created_submodule_nodes = {key: [] for key in backend_id_to_submodule_name.keys()} + for backend_id, submodule_name in backend_id_to_submodule_name.items(): + for node in tagged_graph_module.graph.nodes: + if node.op == "call_module" and node.target in submodule_name: + created_submodule_nodes[backend_id].append(node) + + # check the number of submodule_names and submodule_nodes are equal + for backend_id in created_submodule_nodes.keys(): + assert len(created_submodule_nodes[backend_id]) == len( + backend_id_to_submodule_name[backend_id] + ) + + return created_submodule_nodes + + +def _create_partitions( + tagged_graph_module: torch.fx.GraphModule, + partition_result: PartitionResult, + owning_program: ExportedProgram, + is_submodule: bool = False, +) -> Dict[str, List[torch.fx.Node]]: + backend_id_to_call_submodules = _create_partitions_in_graph_module( + tagged_graph_module, partition_result, owning_program, is_submodule + ) + + # Recursively partition and lower for submodules + for _, submod, _ in get_control_flow_submodules(tagged_graph_module): + nested_backend_id_to_call_submodules = _create_partitions( + submod, partition_result, owning_program, is_submodule=True + ) + for ( + backend_id, + nested_submodules, + ) in nested_backend_id_to_call_submodules.items(): + if backend_id not in backend_id_to_call_submodules: + backend_id_to_call_submodules[backend_id] = nested_submodules + else: + backend_id_to_call_submodules[backend_id].extend(nested_submodules) + + return backend_id_to_call_submodules + + +def lower_all_submodules_to_backend( + backend_id: str, + method_to_submodules_nodes: Dict[str, List[torch.fx.Node]], + method_to_tagged_edge_program: Dict[str, ExportedProgram], +) -> None: + """ + Lower all submodules nodes given in the method_to_submodule_nodes map to backend_id. 
+ """ + # The created exported program for the submodules are in the call_module node's meta data + # We just map the method_to_submodule_nodes directly to the method_to_partitioned_exported_programs + method_to_partitioned_program = { + method_name: [node.meta["submodule_program"] for node in call_submodule_nodes] + for method_name, call_submodule_nodes in method_to_submodules_nodes.items() + } + method_to_compile_specs = { + method_name: [node.meta["compile_spec"] for node in call_submodule_nodes] + for method_name, call_submodule_nodes in method_to_submodules_nodes.items() + } + backend_found = False + for cls in BackendDetails.__subclasses__(): + if backend_id == cls.__name__: + method_to_preprocess_result: dict[str, List[PreprocessResult]] = ( + cls.preprocess_multimethod( + method_to_partitioned_program, method_to_compile_specs + ) + ) + backend_found = True + + if not backend_found: + raise NotImplementedError(f"Backend {backend_id} was not found.") + + for method_name in method_to_preprocess_result.keys(): + owning_program = method_to_tagged_edge_program[method_name] + list_of_preprocess_results = method_to_preprocess_result[method_name] + list_of_call_submodule_nodes = method_to_submodules_nodes[method_name] + list_of_compile_specs = method_to_compile_specs[method_name] + for preprocess_result, call_submodule_node, compile_spec in zip( + list_of_preprocess_results, + list_of_call_submodule_nodes, + list_of_compile_specs, + ): + submodule_program = call_submodule_node.meta["submodule_program"] + lowered_module = LoweredBackendModule( + edge_program=submodule_program, + backend_id=backend_id, + processed_bytes=preprocess_result.processed_bytes, + compile_specs=compile_spec, + named_data_store_output=preprocess_result.data_store_output, + ) + is_submodule = call_submodule_node.meta["is_submodule"] + toplevel_input_specs_to_delete = call_submodule_node.meta[ + "toplevel_input_specs_to_delete" + ] + toplevel_output_specs_to_delete = call_submodule_node.meta[ + "toplevel_output_specs_to_delete" + ] + submodule_output_node = call_submodule_node.meta["submodule_output_node"] + + _insert_lowered_submodule( + submodule_program, + owning_program, + call_submodule_node, + submodule_output_node, + lowered_module, + is_submodule, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) + + +@dataclass +class MethodProgramsPartitionerSpec: + """ + Since single dispatch for to_backend requires the first argument to be a + valid class, we create the following dataclass spec to hold the dictionaries + mapping the method name to the corresponding program, partitioner + """ + + method_to_edge_program: Mapping[str, ExportedProgram] + method_to_partitioner: Mapping[str, Partitioner] + + +@to_backend.register +def _( + method_edge_program_partitioners: MethodProgramsPartitionerSpec, +) -> Dict[str, ExportedProgram]: + """ + Add overloaded implementations for to_backend: + + :: + + def to_backend( + method_edge_program_partitioners: MethodProgramsPartitionerSpec + ) -> Dict[str, ExportedProgram]: + + Returns a semantically-equivalent dictionary of programs to the programs given as input (represented + as a graph module in Edge dialect), but with portions of the program targeted for + delegation as determined by the partitioner. + + Args: + method_edge_program_partitioners: contains two mappings, + - method_to_edge_program: mapping of method names to their respective programs in Edge dialect. 
+ - method_to_partitioner: mapping of method names to an instance of the partitioner, in charge with tagging + portions of the specified program for delegation. A valid partitioner must return PartitionerResult + including both tagged exported program and partitioner_tag: Dict[str, DelegationSpec], where each key is a tag name and + the nodes with same tag will be fused a one subgraph and delegated to backend specififed in delegation spec. + + + Returns: + ExportedProgram: The input program, with some portions targeted for delegation. + """ + method_to_edge_program = method_edge_program_partitioners.method_to_edge_program + method_to_partitioner = method_edge_program_partitioners.method_to_partitioner + + partitioned_and_lowered_exported_programs = {} + backend_id_to_method_submodules_map = {} + method_to_tagged_exported_program = {} + + for method_name, partitioner_instance in method_to_partitioner.items(): + assert ( + method_name in method_to_edge_program + ), f"Partitioner for method {method_name} is not provided" + edge_program = method_to_edge_program[method_name] + edge_program._validate() + + # Use fake program, with FakeTensors in the state dict, to avoid copying large constant values. + # Fall back to deepcopy if no fake mode is found. TODO(T182910699): Remove this fallback. + try: + fake_edge_program = get_fake_program(edge_program) + except Exception as e: + logging.warning( + f"Error in get_fake_program for graph {edge_program.graph_module}, fallback to deepcopy: {e}" + ) + fake_edge_program = copy.deepcopy(edge_program) + partitioner_result = partitioner_instance(fake_edge_program) + tagged_exported_program = partitioner_result.tagged_exported_program + method_to_tagged_exported_program[method_name] = tagged_exported_program + + # Check that the partitioner did not modify the original graph + if _ENABLE_VALIDATION: + assert is_identical_graph( + tagged_exported_program.graph_module, + edge_program.graph_module, + ), f"The partitioner {partitioner_instance} should not modify the graph module" + else: + logging.warning("Disabled validating the partitioner.") + + assert ( + partitioner_result.partition_tags is not None + ), f"Partitioner {partitioner_instance} needs a `partition_tags` field containing a mapping of tags to delegate spec" + + update_to_real_program(tagged_exported_program, edge_program) + + for tag, _ in partitioner_result.partition_tags.items(): + _maybe_duplicate_constant_nodes(tagged_exported_program, tag) + + backend_id_to_call_submodule_nodes = _create_partitions( + tagged_exported_program.graph_module, + partitioner_result, + tagged_exported_program, + ) + for ( + backend_id, + call_submodule_nodes, + ) in backend_id_to_call_submodule_nodes.items(): + if backend_id not in backend_id_to_method_submodules_map: + backend_id_to_method_submodules_map[backend_id] = {} + backend_id_to_method_submodules_map[backend_id][ + method_name + ] = call_submodule_nodes + + for ( + backend_id, + method_to_submodule_nodes, + ) in backend_id_to_method_submodules_map.items(): + lower_all_submodules_to_backend( + backend_id, + method_to_submodule_nodes, + method_to_tagged_exported_program, + ) + + for method_name in method_to_edge_program.keys(): + if method_name in method_to_tagged_exported_program: + tagged_exported_program = method_to_tagged_exported_program[method_name] + partitioned_and_lowered_exported_programs[method_name] = ExportedProgram( + root=tagged_exported_program.graph_module, + graph=tagged_exported_program.graph_module.graph, + 
graph_signature=tagged_exported_program.graph_signature, + state_dict=tagged_exported_program.state_dict, + range_constraints=copy.deepcopy( + tagged_exported_program.range_constraints + ), + module_call_graph=copy.deepcopy( + tagged_exported_program.module_call_graph + ), + example_inputs=None, + constants=tagged_exported_program.constants, + verifiers=[tagged_exported_program.verifier], + ) + else: + # this edge program wasn't partitioned, so we can just return it as is + partitioned_and_lowered_exported_programs[method_name] = ( + method_to_edge_program[method_name] + ) + + return partitioned_and_lowered_exported_programs diff --git a/exir/backend/backend_details.py b/exir/backend/backend_details.py index 248d03f2b05..513ae7c64b3 100644 --- a/exir/backend/backend_details.py +++ b/exir/backend/backend_details.py @@ -50,15 +50,6 @@ class BackendDetails(ABC): the decorators, this interface will be static, abstract and all inheritances are enforced to implement this method. - Args: - edge_program: The original exported program. It will not be modified in place. - compile_specs: List of values needed for compilation - - Returns: - PreprocessResult: It wraps the following information: - processed_bytes -> bytes: A compiled blob - a binary that can run the desired program in the backend. - debug_handle_map (Optional[Dict[int, Tuple[int]]]): For profiling purposes, a map from the node_id in the final graph (either EXIR or the user's self-defined IR) - to debug handle id attached in the original exported program. """ @staticmethod @@ -70,6 +61,69 @@ def preprocess( edge_program: ExportedProgram, compile_specs: List[CompileSpec], ) -> PreprocessResult: + """ + Preprocesses an edge program and returns the preprocess result fo the given backend + + Args: + edge_program: The original exported program. It will not be modified in place. + compile_specs: List of values needed for compilation + + Returns: + PreprocessResult: It wraps the following information: + processed_bytes -> bytes: A compiled blob - a binary that can run the desired + program in the backend. + debug_handle_map (Optional[Dict[int, Tuple[int]]]): For profiling purposes, a + map from the node_id in the final graph (either EXIR or the user's self-defined + IR) to debug handle id attached in the original exported program. + """ # Users should return a compiled blob - a binary that can run the desired # program in the backend. pass + + @classmethod + def preprocess_multimethod( + cls, + edge_programs: Dict[str, List[ExportedProgram]], + compile_specs: Dict[str, List[List[CompileSpec]]], + ) -> Dict[str, list[PreprocessResult]]: + """ + Runs preprocess on all partitioned Edge Programs across multiple methods. This allows + backends to share information across partitioned graphs. Backend can serialize shared + data by putting the shared data into the data_store_output of the preprocess results. + This will record the shared data used by that specific partition. + + Default implementation is running the existing preprocess implementation on all + + Args: + edge_programs: Dictionary mapping the method name to a list of all the partitioned + edge_programs from that method to be lowered. + compile_specs: Dictionary mapping the method name to a list of compile_specs. The + list of compile specs maps directly to the list of edge_programs for the + same given method name i.e. edge_program[method_name][i] --> compile_specs[method_name][i] + + Returns: + Dictionary mapping the method name to a list of PreprocessResults. 
The list of + PreprocessResults maps directly to the list of edge_programs for the same given + method name. i.e. edge_program[method_name][i] --> result[method_name][i] + + + """ + preprocess_results = {} + for method_name, programs in edge_programs.items(): + assert ( + method_name in compile_specs + ), f"Error: missing compile specs for {method_name}" + compile_specs_for_method = compile_specs[method_name] + assert len(compile_specs_for_method) == len( + programs + ), f"Error: method {method_name} has {len(programs)} partitions but only {len(compile_specs_for_method)}" + results_for_method = [] + for program, compile_spec_for_program in zip( + programs, compile_specs_for_method + ): + preprocess_result = cls.preprocess(program, compile_spec_for_program) + results_for_method.append(preprocess_result) + + preprocess_results[method_name] = results_for_method + + return preprocess_results diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index f0ba618936d..5b12d673f7c 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -189,6 +189,59 @@ python_unittest( ], ) +python_unittest( + name = "test_to_backend_multi_method", + srcs = [ + "test_to_backend_multi_method.py", + ], + preload_deps = [ + "//executorch/kernels/portable:custom_ops_generated_lib", + "//executorch/kernels/quantized:custom_ops_generated_lib", + "//executorch/runtime/executor/test:test_backend_compiler_lib", + ], + deps = [ + ":backend_with_preprocess_all_demo", + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir/backend/test:backend_with_compiler_demo", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + ], +) + +python_library( + name = "backend_with_preprocess_all_demo", + srcs = [ + "backend_with_preprocess_all_demo.py" + ], + deps = [ + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + ], +) + python_unittest( name = "test_debug_handle_map", srcs = [ diff --git a/exir/backend/test/backend_with_preprocess_all_demo.py b/exir/backend/test/backend_with_preprocess_all_demo.py new file mode 100644 index 00000000000..ae9a8174be5 --- /dev/null +++ b/exir/backend/test/backend_with_preprocess_all_demo.py @@ -0,0 +1,266 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
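As context for the preprocess_multimethod hook documented above, here is a minimal sketch of how a backend might override it; the MyBackend class is hypothetical (not part of this patch) and its body simply mirrors the default per-method, per-partition contract, i.e. results[method][i] corresponds to edge_programs[method][i] and compile_specs[method][i].

from typing import Dict, List

from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
from executorch.exir.backend.compile_spec_schema import CompileSpec
from torch.export.exported_program import ExportedProgram


class MyBackend(BackendDetails):  # hypothetical backend, for illustration only
    @staticmethod
    def preprocess(
        edge_program: ExportedProgram,
        compile_specs: List[CompileSpec],
    ) -> PreprocessResult:
        # Single-partition entry point; a real backend would compile the program here.
        return PreprocessResult(processed_bytes=b"", debug_handle_map={})

    @classmethod
    def preprocess_multimethod(
        cls,
        edge_programs: Dict[str, List[ExportedProgram]],
        compile_specs: Dict[str, List[List[CompileSpec]]],
    ) -> Dict[str, List[PreprocessResult]]:
        # edge_programs[m][i] pairs with compile_specs[m][i]; the returned
        # results[m][i] must follow the same ordering.
        results: Dict[str, List[PreprocessResult]] = {}
        for method_name, programs in edge_programs.items():
            results[method_name] = [
                cls.preprocess(program, specs)
                for program, specs in zip(programs, compile_specs[method_name])
            ]
        return results

Overriding the hook, rather than preprocess alone, is what lets a backend compute state shared across all partitions and methods before serializing, which is what the data_store_output mechanism described in the docstring above is for.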
+ +from typing import Dict, final, List, Tuple + +import torch + +from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult +from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( + generate_pattern_op_partitions, +) + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.graph_module import get_control_flow_submodules +from torch.export.exported_program import ExportedProgram +from torch.fx.passes.operator_support import any_chain, OperatorSupportBase + + +def _preprocess_multimethod( + edge_programs: Dict[str, List[ExportedProgram]], + compile_specs: Dict[str, List[List[CompileSpec]]], + supported_ops: List[torch._ops.OpOverload], + backend_name: str, +) -> Dict[str, List[PreprocessResult]]: + """ + Helper function to abstract out the logic to be shared between the two backends: + FirstBackendWithPreprocessAll and SecondBackendWithPreprocessAll. This will be used + in testing for a partitioner which tags different partitions for different backends + to be lowered to + """ + total_number_of_ops = 0 + for edge_program in edge_programs.values(): + for partitioned_program in edge_program: + for node in partitioned_program.graph.nodes: + if node.op == "call_function": + if node.target in supported_ops: + total_number_of_ops += 1 + all_processed_results = {key: [] for key in edge_programs.keys()} + + for method_name, partitioned_programs in edge_programs.items(): + compile_specs_for_method = compile_specs[method_name] + + assert len(compile_specs_for_method) == len(partitioned_programs) + for compile_spec_for_partition, partitioned_program in zip( + compile_specs_for_method, partitioned_programs + ): + debug_handle_map = {} + processed_bytes = f"{backend_name}#{total_number_of_ops}#" + for node in partitioned_program.graph.nodes: + if node.op == "call_function": + if node.target in supported_ops: + op_name = node.target.__name__ + processed_bytes += f"{op_name}:" + original_debug_id = node.meta["debug_handle"] + new_debug_id = original_debug_id + debug_handle_map[new_debug_id] = (original_debug_id,) + else: + raise RuntimeError( + f"{node.op} {node.target.__name__} is not supported in backend {backend_name}" + ) + + processed_bytes += "#" + for cs in compile_spec_for_partition: + processed_bytes += f"{cs.key}:{cs.value};" + + all_processed_results[method_name].append( + PreprocessResult( + processed_bytes=bytes(processed_bytes, encoding="utf8"), + debug_handle_map=debug_handle_map, + ) + ) + + return all_processed_results + + +@final +class FirstBackendWithPreprocessAll(BackendDetails): + """ + Backend used to test the preprocess_multimethod for multi methods lowering. 
+    lowered modules are returned in the format:
+    FirstBackendWithPreprocessAll#<total_number_ops across methods>#<op_target_name>:<op_target_name>:...#<compile_spec.key>:<compile_spec.value>;...
+
+
+    lowered blobs are not functional, and are purely used for testing purposes
+    """
+
+    @staticmethod
+    def preprocess(
+        edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> PreprocessResult:
+        """
+        Not used for testing
+        """
+        return PreprocessResult(
+            processed_bytes=bytes(b"\x00"),
+            debug_handle_map={},
+        )
+
+    @staticmethod
+    def preprocess_multimethod(
+        edge_programs: Dict[str, List[ExportedProgram]],
+        compile_specs: Dict[str, List[List[CompileSpec]]],
+    ) -> Dict[str, list[PreprocessResult]]:
+        """
+        Preprocess all the edge programs in the given dictionary and return a dictionary
+        of preprocess results. The preprocess result is a tuple of processed bytes and
+        a map from the node name to the original debug handle.
+        """
+        match_ops = [
+            exir_ops.edge.aten.sin.default,
+            exir_ops.edge.aten.add.Tensor,
+        ]
+
+        return _preprocess_multimethod(
+            edge_programs, compile_specs, match_ops, "FirstBackendWithPreprocessAll"
+        )
+
+
+@final
+class SecondBackendWithPreprocessAll(BackendDetails):
+    """
+    Backend used to test the preprocess_multimethod for multi methods lowering.
+    lowered modules are returned in the format:
+    SecondBackendWithPreprocessAll#<total_number_ops across methods>#<op_target_name>:<op_target_name>:...#<compile_spec.key>:<compile_spec.value>;...
+
+
+    lowered blobs are not functional, and are purely used for testing purposes
+    """
+
+    @staticmethod
+    def preprocess(
+        edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> PreprocessResult:
+        """
+        Not used for testing
+        """
+        return PreprocessResult(
+            processed_bytes=bytes(b"\x00"),
+            debug_handle_map={},
+        )
+
+    @staticmethod
+    def preprocess_multimethod(
+        edge_programs: Dict[str, List[ExportedProgram]],
+        compile_specs: Dict[str, List[List[CompileSpec]]],
+    ) -> Dict[str, list[PreprocessResult]]:
+        """
+        Preprocess all the edge programs in the given dictionary and return a dictionary
+        of preprocess results. The preprocess result is a tuple of processed bytes and
+        a map from the node name to the original debug handle.
+        """
+        match_ops = [
+            exir_ops.edge.aten.cos.default,
+            exir_ops.edge.aten.sub.Tensor,
+        ]
+
+        return _preprocess_multimethod(
+            edge_programs, compile_specs, match_ops, "SecondBackendWithPreprocessAll"
+        )
+
+
+class AddSinOperatorSupport(OperatorSupportBase):
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        return node.op == "call_function" and node.target in [
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.sin.default,
+        ]
+
+
+class SubCosOperatorSupport(OperatorSupportBase):
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        return node.op == "call_function" and node.target in [
+            exir_ops.edge.aten.sub.Tensor,
+            exir_ops.edge.aten.cos.default,
+        ]
+
+
+@final
+class BackendWithPreprocessAllPartitioner(Partitioner):
+    """
+    Partitioner that partitions for both FirstBackendWithPreprocessAll
+    and SecondBackendWithPreprocessAll.
+ + - The partitioner tags all sin and add nodes for delegation to + FirstBackendWithPreprocessAll + - The partitioner tags all cos and sub nodes for delegation to + SecondBackendWithPreprocessAll + """ + + def __init__(self) -> None: + self.add_sin_support = any_chain(AddSinOperatorSupport()) + self.add_sin_backend_id = FirstBackendWithPreprocessAll.__name__ + + self.sub_cos_support = any_chain(SubCosOperatorSupport()) + self.sub_cos_backend_id = SecondBackendWithPreprocessAll.__name__ + + def _partition_graph_module( + self, + graph_module: torch.fx.GraphModule, + id_start=0, + ) -> Tuple[Dict[str, DelegationSpec], int]: + partition_tags: Dict[str, DelegationSpec] = {} + + num_partitions_in_gm = 0 + for op_support, backend_id, tag_prefix in [ + (self.add_sin_support, self.add_sin_backend_id, "first"), + (self.sub_cos_support, self.sub_cos_backend_id, "second"), + ]: + partition_list = generate_pattern_op_partitions( + graph_module, op_support=op_support + ) + num_partitions_in_gm = num_partitions_in_gm + len(partition_list) + for partition in partition_list: + compile_specs = [] + delegation_tag = f"{tag_prefix}_tag{id_start + partition.id}" + for node in partition.nodes: + node.meta["delegation_tag"] = delegation_tag + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.add.Tensor + ): + compile_specs.append(CompileSpec("add", bytes(b"\x00"))) + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.sin.default + ): + compile_specs.append(CompileSpec("sin", bytes(b"\x01"))) + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.sub.Tensor + ): + compile_specs.append(CompileSpec("sub", bytes(b"\x02"))) + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.cos.default + ): + compile_specs.append(CompileSpec("cos", bytes(b"\x03"))) + + delegation_spec = DelegationSpec(backend_id, compile_specs) + partition_tags[delegation_tag] = delegation_spec + + start_idx_for_submodules = num_partitions_in_gm + for _, submodule, _ in get_control_flow_submodules(graph_module): + ret_partition_tags, start_idx_for_submodules = self._partition_graph_module( + submodule, id_start=start_idx_for_submodules + ) + partition_tags.update(ret_partition_tags) + + return partition_tags, start_idx_for_submodules + + def partition(self, exported_program: ExportedProgram) -> PartitionResult: + partition_tags, _ = self._partition_graph_module(exported_program.graph_module) + return PartitionResult( + tagged_exported_program=exported_program, partition_tags=partition_tags + ) diff --git a/exir/backend/test/test_lowered_backend_module.py b/exir/backend/test/test_lowered_backend_module.py index dcc5841bc3e..6cdaf92b3d2 100644 --- a/exir/backend/test/test_lowered_backend_module.py +++ b/exir/backend/test/test_lowered_backend_module.py @@ -22,7 +22,6 @@ from executorch.extension.pybindings.portable_lib import ( # @manual _load_for_executorch_from_buffer, ) -from hypothesis import given, settings, strategies as st from torch.export import export @@ -65,7 +64,6 @@ def forward(self, *args): .executorch_program ) - @settings(deadline=500000) def test_emit_lowered_backend_module_end_to_end(self): class SinModule(torch.nn.Module): def __init__(self): @@ -109,11 +107,7 @@ def forward(self, x): torch.allclose(model_outputs[0], expected_res, atol=1e-03, rtol=1e-03) ) - @given( - unlift=st.booleans(), # verify both lifted and unlifted graph - ) - @settings(deadline=500000) - def test_emit_lowered_backend_module(self, unlift): + def 
test_emit_lowered_backend_module(self): module_list = [ models.Emformer(), models.Repeat(), @@ -166,11 +160,7 @@ def test_emit_lowered_backend_module(self, unlift): _ = lowered_model.buffer() self.validate_lowered_module_program(program) - @given( - unlift=st.booleans(), # verify both lifted and unlifted graph - ) - @settings(deadline=500000) - def test_emit_nested_lowered_backend_module(self, unlift): + def test_emit_nested_lowered_backend_module(self): module_list = [ models.Emformer(), models.Repeat(), diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 917dae32d74..e9320cf415d 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -76,7 +76,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = export_for_training(mlp, example_inputs).module() + model = export_for_training(mlp, example_inputs, strict=True).module() aten = export(model, example_inputs, strict=True) spec_key = "path" spec_value = "/a/b/c/d" @@ -137,7 +137,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = export_for_training(mlp, example_inputs).module() + model = export_for_training(mlp, example_inputs, strict=True).module() aten = export(model, example_inputs, strict=True) edge = exir.to_edge(aten) @@ -177,7 +177,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = export_for_training(mlp, example_inputs).module() + model = export_for_training(mlp, example_inputs, strict=True).module() edge = exir.to_edge(export(model, example_inputs, strict=True)) with self.assertRaisesRegex( @@ -229,7 +229,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() + model = export_for_training( + self.AddConst(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) delegated = edge.to_backend(PartitionerNoTagData()) @@ -308,7 +310,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() + model = export_for_training( + self.AddConst(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) delegated = edge.to_backend(PartitionerTagData()) @@ -383,7 +387,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() + model = export_for_training( + self.AddConst(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) delegated = edge.to_backend(PartitionerTagData()) @@ -471,7 +477,9 @@ def partition( ) inputs = (torch.ones(2, 2),) - model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() + model = export_for_training( + ReuseConstData(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) exec_prog = edge.to_backend(PartitionerTagData()).to_executorch() executorch_module = _load_for_executorch_from_buffer(exec_prog.buffer) @@ -531,7 +539,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() + model = export_for_training( + ReuseConstData(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) with self.assertRaises(RuntimeError) as error: _ = 
edge.to_backend(PartitionerTagData()) diff --git a/exir/backend/test/test_passes.py b/exir/backend/test/test_passes.py index bc18f090238..1cdf494fa01 100644 --- a/exir/backend/test/test_passes.py +++ b/exir/backend/test/test_passes.py @@ -28,7 +28,9 @@ def forward(self, x): z = x - self.const return y, z - model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() + model = export_for_training( + ReuseConstData(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge( torch.export.export(model, (torch.ones(2, 2),), strict=True) ) diff --git a/exir/backend/test/test_to_backend_multi_method.py b/exir/backend/test/test_to_backend_multi_method.py new file mode 100644 index 00000000000..d4f8fccb8f2 --- /dev/null +++ b/exir/backend/test/test_to_backend_multi_method.py @@ -0,0 +1,524 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Dict, List, Tuple + +import torch + +from executorch.exir import EdgeProgramManager, to_edge +from executorch.exir.backend.backend_api import ( + MethodProgramsPartitionerSpec, + to_backend, +) + +from executorch.exir.backend.canonical_partitioners.all_node_partitioner import ( + AllNodePartitioner, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import Partitioner +from executorch.exir.backend.test.backend_with_compiler_demo import ( + BackendWithCompilerDemo, +) + +from executorch.exir.backend.test.backend_with_preprocess_all_demo import ( + BackendWithPreprocessAllPartitioner, +) +from executorch.exir.graph_module import get_control_flow_submodules +from executorch.exir.lowered_backend_module import ( + get_lowered_submodules, + LoweredBackendModule, +) +from executorch.exir.schema import ( + BackendDelegate, + BackendDelegateDataReference, + DataLocation, + Program, +) +from executorch.extension.pybindings.portable_lib import ( # @manual + _load_for_executorch_from_buffer, +) +from torch.export.exported_program import ExportedProgram + +from torch.testing import FileCheck + + +class TestToBackendMultiMethod(unittest.TestCase): + """ + Testing suite used to test multi method to_backend lowering. The test suite uses demo backends + FirstBackendWithPreprocessAll and SecondBackendWithPreprocessAll. + - FirstBackendWithPreprocessAll: supports add + sin + - SecondBackendWithPreprocessAll: supports sub + cos + + Both backends lower exported programs into payloads in the string format: + - (backend_id)#(total_number_ops across methods)#[op_target_name;]#[compile_spec.key:compile_spec.value;] + + We leverage the above expectation to test various lowering across different modules, and ensure + that the right exported programs and compile specs are given when lowering a specifed exported program + + We leverage the demo partitioner BackendWithPreprocessAll which partitions add + sin nodes to + FirstBackendWithPreprocessAll and sub + cos nodes to SecondBackendWithPreprocessAll. This allows + us to test cases in which multiple backends are being lowered. 
+ """ + + def _get_lowered_submodules_across_controlflow( + self, graph_module: torch.fx.GraphModule + ) -> List[Tuple[str, LoweredBackendModule, torch.fx.Node]]: + top_level_submodules = get_lowered_submodules(graph_module) + + for _, submodule, _ in get_control_flow_submodules(graph_module): + top_level_submodules.extend( + self._get_lowered_submodules_across_controlflow(submodule) + ) + + return top_level_submodules + + def check_backend_delegate( + self, + program: Program, + delegate: BackendDelegate, + expected_id: str, + expected_processed: bytes, + ) -> None: + self.assertEqual(delegate.id, expected_id) + processed: BackendDelegateDataReference = delegate.processed + self.assertEqual(processed.location, DataLocation.INLINE) + self.assertLess(processed.index, len(program.backend_delegate_data)) + self.assertEqual( + program.backend_delegate_data[processed.index].data, expected_processed + ) + + def _test( + self, test_set: Dict[str, Tuple[ExportedProgram, Partitioner, List[str]]] + ): + method_to_edge_program = { + method_name: ep for method_name, (ep, _, _) in test_set.items() + } + + method_to_partitioner = { + method_name: partitioner + for method_name, (_, partitioner, _) in test_set.items() + } + + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec( + method_to_edge_program, + method_to_partitioner, + ) + ) + + self.assertEqual(len(lowered_ep_dict.keys()), len(test_set.keys())) + for method_name in test_set.keys(): + self.assertTrue(method_name in lowered_ep_dict.keys()) + (_, _, list_of_payload_as_string) = test_set[method_name] + lowered_ep = lowered_ep_dict[method_name] + FileCheck().check_count( + "torch.ops.higher_order.executorch_call_delegate", + len(list_of_payload_as_string), + exactly=True, + ).run(str(lowered_ep)) + lowered_submodules = self._get_lowered_submodules_across_controlflow( + lowered_ep.graph_module + ) + self.assertEqual(len(lowered_submodules), len(list_of_payload_as_string)) + + for idx, (_, lowered_backend_module, _) in enumerate(lowered_submodules): + self.assertEqual( + lowered_backend_module.processed_bytes.decode("utf-8"), + list_of_payload_as_string[idx], + ) + + def test_multi_method_to_backend_single_method(self): + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + # Payload String: + # [Number of Ops lowered across all methods/partitions]#OpTargetNames#CompileSpecs; + test_set = { + "forward": ( + edgeir_m.exported_program(), + AllNodePartitioner( + "FirstBackendWithPreprocessAll", + [CompileSpec("max_value", bytes([1]))], + ), + [ + "FirstBackendWithPreprocessAll#1#aten.sin.default:#max_value:b'\\x01';" + ], + ) + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods(self): + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x + x + + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + add_edgeir_m = to_edge(torch.export.export(AddModule(), (torch.ones(1),))) + sin_partitioner = AllNodePartitioner( + "FirstBackendWithPreprocessAll", [CompileSpec("sin", bytes([2]))] + ) + add_partitioner = AllNodePartitioner( + "FirstBackendWithPreprocessAll", [CompileSpec("add", bytes([3]))] + ) + # Payload String: + # [Number of Ops lowered across all 
methods/partitions]#OpTargetNames#CompileSpecs; + test_set = { + "sin": ( + sin_edgeir_m.exported_program(), + sin_partitioner, + ["FirstBackendWithPreprocessAll#2#aten.sin.default:#sin:b'\\x02';"], + ), + "add": ( + add_edgeir_m.exported_program(), + add_partitioner, + ["FirstBackendWithPreprocessAll#2#aten.add.Tensor:#add:b'\\x03';"], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods_multiple_partitions(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = x + x + y = y * y + y = y + y + return y + + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = torch.sin(x) + y = y * y + return torch.sin(y) + + add_edgeir_m = to_edge(torch.export.export(AddModule(), (torch.ones(1),))) + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + test_set = { + "add": ( + add_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + ], + ), + "sin": ( + sin_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods_different_partitions(self): + class AddSinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = x + x + y = y * y + y = torch.sin(y) + return y + + class SinAddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = torch.sin(x) + y = y * y + return y + y + + add_sin_edgeir_m = to_edge( + torch.export.export(AddSinModule(), (torch.ones(1),)) + ) + sin_add_edgeir_m = to_edge( + torch.export.export(SinAddModule(), (torch.ones(1),)) + ) + test_set = { + "add_sin": ( + add_sin_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + ], + ), + "sin_add": ( + sin_add_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods_different_backends(self): + class AddSinCosSubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = x + x + y = torch.sin(y) + y = torch.cos(y) + y = y - x + return y + + class CosSubAddSinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = torch.cos(x) + y = y - x + y = y + y + y = torch.sin(y) + return y + + first_second_edgeir_m = to_edge( + torch.export.export(AddSinCosSubModule(), (torch.ones(1),)) + ) + second_first_edgeir_m = to_edge( + torch.export.export(CosSubAddSinModule(), (torch.ones(1),)) + ) + test_set = { + "first_second": ( + first_second_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:aten.sin.default:#add:b'\\x00';sin:b'\\x01';", + "SecondBackendWithPreprocessAll#4#aten.cos.default:aten.sub.Tensor:#cos:b'\\x03';sub:b'\\x02';", + ], + ), + "second_first": ( + 
second_first_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "SecondBackendWithPreprocessAll#4#aten.cos.default:aten.sub.Tensor:#cos:b'\\x03';sub:b'\\x02';", + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:aten.sin.default:#add:b'\\x00';sin:b'\\x01';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_control_flow(self): + class SinCosModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def true_fn(self, x): + return torch.sin(x) + + def false_fn(self, x): + return torch.cos(x) + + def forward(self, x): + x = x + x + return torch.cond(x > 0, self.true_fn, self.false_fn, [x]) + + class SinAddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def true_fn(self, x): + return torch.sin(x) + + def false_fn(self, x): + return x + x + + def forward(self, x): + return torch.cond(x > 0, self.true_fn, self.false_fn, [x]) + + sin_cos_edgeir_m = to_edge( + torch.export.export(SinCosModule(), (torch.ones(1),)) + ) + sin_add_edgeir_m = to_edge( + torch.export.export(SinAddModule(), (torch.ones(1),)) + ) + + test_set = { + "sin_cos": ( + sin_cos_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + # True Module Partition + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + # False Module Partition + "SecondBackendWithPreprocessAll#1#aten.cos.default:#cos:b'\\x03';", + ], + ), + "sin_add": ( + sin_add_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + # True Module Partition + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + # False Module Partition + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_not_found(self): + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x + x + + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + add_edgeir_m = to_edge(torch.export.export(AddModule(), (torch.ones(1),))) + sin_partitioner = AllNodePartitioner( + "Invalid", [CompileSpec("sin", bytes([2]))] + ) + add_partitioner = AllNodePartitioner( + "FirstBackendWithPreprocessAll", [CompileSpec("add", bytes([3]))] + ) + + test_set = { + "sin": ( + sin_edgeir_m.exported_program(), + sin_partitioner, + [], + ), + "add": ( + add_edgeir_m.exported_program(), + add_partitioner, + [], + ), + } + with self.assertRaisesRegex( + NotImplementedError, "Backend Invalid was not found." + ): + self._test(test_set) + + def test_multi_method_end_to_end(self): + """ + Tests multi method lowering end-to-end. Lowers the same Sin Module for two methods + "forward" and "forward_copy". Ensures that the lowered program has two delegates + but only one serialized blob. Ensures that the lowered program runs correctly. 
+ """ + + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + sin_edgeir_m_copy = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + + method_edge_program = { + "forward": sin_edgeir_m.exported_program(), + "forward_copy": sin_edgeir_m_copy.exported_program(), + } + compile_specs = [CompileSpec("max_value", bytes([1]))] + + method_partitioner = { + "forward": AllNodePartitioner( + BackendWithCompilerDemo.__name__, compile_specs + ), + "forward_copy": AllNodePartitioner( + BackendWithCompilerDemo.__name__, compile_specs + ), + } + + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec( + method_edge_program, + method_partitioner, + ) + ) + + new_edge_manager = EdgeProgramManager(lowered_ep_dict) + + exec_prog = new_edge_manager.to_executorch() + + program = exec_prog.executorch_program + # Since the preprocessed bytes are the same, there should only be on copy + self.assertEqual(len(program.backend_delegate_data), 1) + + self.check_backend_delegate( + program=program, + delegate=program.execution_plan[0].delegates[0], + expected_id=BackendWithCompilerDemo.__name__, + expected_processed=b"1version:0#op:demo::aten.sin.default, numel:1, dtype:torch.float322#", + ) + self.check_backend_delegate( + program=program, + delegate=program.execution_plan[1].delegates[0], + expected_id=BackendWithCompilerDemo.__name__, + expected_processed=b"1version:0#op:demo::aten.sin.default, numel:1, dtype:torch.float322#", + ) + + # Check that there are two methods + self.assertEqual(len(program.execution_plan), 2) + + delegate_method_1 = program.execution_plan[0].delegates + delegate_method_2 = program.execution_plan[1].delegates + + # 1 delegate blob for each method + self.assertEqual(len(delegate_method_1), 1) + self.assertEqual(len(delegate_method_2), 1) + + # Delegate Blobs reference the same underlying bytes + delegate_reference1 = delegate_method_1[0].processed + delegate_reference2 = delegate_method_2[0].processed + self.assertEqual(delegate_reference1.index, delegate_reference2.index) + + et_module = _load_for_executorch_from_buffer(exec_prog.buffer) + model_inputs = torch.ones(1) + model_outputs = et_module.run_method("forward", [model_inputs]) + self.assertEqual(model_inputs, torch.ones(1)) + model_outputs_from_copy_method = et_module.run_method( + "forward_copy", [model_inputs] + ) + self.assertEqual(model_inputs, torch.ones(1)) + self.assertEqual(model_outputs, model_outputs_from_copy_method) + self.assertTrue( + torch.allclose( + model_outputs[0], 0.8333 * torch.ones(1), atol=1e-03, rtol=1e-03 + ) + ) diff --git a/exir/capture/_config.py b/exir/capture/_config.py index abb7aa74b93..9267af4f2dc 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -97,3 +97,8 @@ class ExecutorchBackendConfig: # If set to true, all trainable weights will be stored in a separate file, # external to the PTE file. external_mutable_weights: bool = False + + # If set to true, all mutable buffers will have their fully qualified names + # serialized in the PTE file. Its value is ignored if mutable buffers are not + # memory planned as the names must be serialized in that case. 
+ emit_mutable_buffer_names: bool = False diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index f9571143a1b..f456626feed 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -118,6 +118,7 @@ def emit_program( methods: Union[ExportedProgram, Dict[str, ExportedProgram]], emit_stacktrace: bool = False, prim_getters: Optional[Dict[str, Any]] = None, + emit_mutable_buffer_names: bool = False, ) -> EmitterOutput: """ Given a exported program, it returns the program in the format @@ -163,6 +164,7 @@ def emit_program( operator_cache={}, delegate_cache={}, emit_stacktrace=emit_stacktrace, + emit_mutable_buffer_names=emit_mutable_buffer_names, ) gm = _remove_non_user_outputs(exported_program) diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index 0cbc63bde21..fe18e49a623 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -149,6 +149,7 @@ class _EmitterState: # delegate_cache: the key is hash(delegated_payload) and the value is the index in delegates delegate_cache: Dict[str, int] emit_stacktrace: bool + emit_mutable_buffer_names: bool spec2id_dict: Dict[TensorSpec, int] = field(default_factory=dict) @@ -1610,7 +1611,7 @@ def _find_fqn_for_placeholder( ) return fqn, is_mutable_buffer - def placeholder( + def placeholder( # noqa: C901 self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> _AbstractValue: """Emits the value within the placeholder node. @@ -1640,6 +1641,26 @@ def placeholder( spec.extra_tensor_info.fully_qualified_name = fqn spec.extra_tensor_info.location = TensorDataLocation.EXTERNAL + if is_mutable_buffer: + # Emit names if we are supposed to. + if self.emitter_state.emit_mutable_buffer_names: + if spec.extra_tensor_info is None: + spec.extra_tensor_info = ExtraTensorInfo( + fully_qualified_name=fqn, + location=TensorDataLocation.SEGMENT, + ) + else: + spec.extra_tensor_info.fully_qualified_name = fqn + # if We aren't emitting the name then it needs to be memory planned. + elif spec.mem_id is None or spec.mem_offset is None: + raise InternalError( + self._emit_node_specific_error( + self.node, + # [2:] to remove the b_ prefix buffers get + f'Mutable buffer "{target[2:]}" must have a memory id and offset if we are emitting it without a name. 
Please either memory plan your mutable buffers or call to_executorch with config=ExecutorchBackendConfig(emit_mutable_buffer_names=True)', + ) + ) + # From the fqn find the corresponding tensor real_tensor = None if fqn in self.exported_program.state_dict: diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index 362796146ee..186c5a402ab 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -1751,8 +1751,8 @@ def forward(self, x): module_1(*example_inputs) module_2(*example_inputs) - ep1 = export_for_training(module_1, example_inputs) - ep2 = export_for_training(module_2, example_inputs) + ep1 = export_for_training(module_1, example_inputs, strict=True) + ep2 = export_for_training(module_2, example_inputs, strict=True) edge_program_manager = exir.to_edge( {"forward1": ep1, "forward2": ep2}, @@ -1819,3 +1819,59 @@ def forward(self, input, label): ] self.assertEqual(external_map["net.linear.weight"], 0) self.assertEqual(external_map["net.linear.bias"], 1) + + def test_emit_mutable_buffer_names(self) -> None: + class Net(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + self.register_buffer("buffer", torch.zeros(1, 2)) + + def forward(self, x): + self.buffer.add_(1) + return self.linear(x) + self.buffer + + net = Net() + + ep = export(net, (torch.randn(1, 2),), strict=True) + # Lower the graph to edge dialect. + ep = to_edge(ep) + # Lower the graph to executorch. + ep = ep.to_executorch( + config=ExecutorchBackendConfig( + emit_mutable_buffer_names=True, + memory_planning_pass=MemoryPlanningPass(alloc_mutable_buffers=False), + ) + ) + for val in ep.executorch_program.execution_plan[0].values: + if isinstance(val, Tensor) and val.extra_tensor_info: + self.assertEqual(val.extra_tensor_info.fully_qualified_name, "buffer") + self.assertEqual(val.allocation_info, None) + + def test_emit_mutable_buffer_names_fails(self) -> None: + class Net(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + self.register_buffer("buffer", torch.zeros(1, 2)) + + def forward(self, x): + self.buffer.add_(1) + return self.linear(x) + self.buffer + + net = Net() + + ep = export(net, (torch.randn(1, 2),), strict=True) + # Lower the graph to edge dialect. + ep = to_edge(ep) + # Lower the graph to executorch. + # Must emit mutable buffer names if we don't allocate mutable buffers + with self.assertRaises(InternalError): + ep.to_executorch( + config=ExecutorchBackendConfig( + emit_mutable_buffer_names=False, + memory_planning_pass=MemoryPlanningPass( + alloc_mutable_buffers=False + ), + ) + ) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 6bcc1b2f3d8..78b031a238e 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -766,15 +766,15 @@ def create_submodule_from_nodes( gm = insert_subgm(gm, sub_gm, orig_inputs, orig_outputs) submodule_node = None for node in gm.graph.nodes: - if node.op == "call_module": - if node.target == submodule_name: - submodule_node = node - else: - raise RuntimeError( - f"The submodule created with nodes {node_list} did not form \ - one fully contained subgraph. Check that these nodes form a \ - fully contained graph. Partitioned graph: {gm.graph}." - ) + if node.op == "call_module" and node.target == submodule_name: + submodule_node = node + + if submodule_node is None: + raise RuntimeError( + f"The submodule created with nodes {node_list} did not form \ + one fully contained subgraph. 
Check that these nodes form a \ + fully contained graph. Partitioned graph: {gm.graph}." + ) if len(orig_outputs) == 1 and isinstance(orig_outputs[0].meta["val"], FakeTensor): # If the original output is a single tensor, it has been @@ -809,12 +809,13 @@ def create_submodule_from_nodes( for node in gm.graph.nodes: if node.op == "call_module" and node.target == submodule_name: submodule_node = node - elif node.op == "call_module": - raise RuntimeError( - f"The submodule created with nodes {node_list} did not form \ - one fully contained subgraph. Check that these nodes form a \ - fully contained graph. Partitioned graph: {gm.graph}." - ) + + if submodule_node is None: + raise RuntimeError( + f"The submodule created with nodes {node_list} did not form \ + one fully contained subgraph. Check that these nodes form a \ + fully contained graph. Partitioned graph: {gm.graph}." + ) assert ( submodule_node is not None diff --git a/exir/memory_planning.py b/exir/memory_planning.py index 3f45276c9e2..83598940882 100644 --- a/exir/memory_planning.py +++ b/exir/memory_planning.py @@ -44,12 +44,14 @@ def __init__( graph_module: torch.fx.GraphModule, alloc_graph_input: bool, alloc_graph_output: bool, + alloc_mutable_buffers: bool, graph_signature: Optional[ExportGraphSignature] = None, ) -> None: self.graph_module = graph_module self.graph_signature = graph_signature self.alloc_graph_input = alloc_graph_input self.alloc_graph_output = alloc_graph_output + self.alloc_mutable_buffers = alloc_mutable_buffers @classmethod def mem_obj_id_match( @@ -149,6 +151,7 @@ def verify_storage_reuse( ignore_const=True, ignore_graph_input=not self.alloc_graph_input, ignore_graph_output=not self.alloc_graph_output, + ignore_mutable_buffers=not self.alloc_mutable_buffers, do_assertion=False, ignore_out_var_node=False, dedup=True, @@ -374,6 +377,7 @@ def collect_specs_from_nodes( # noqa: C901 graph_signature: Optional[ExportGraphSignature] = None, ignore_graph_input: bool = False, ignore_graph_output: bool = False, + ignore_mutable_buffers: bool = False, ignore_const: bool = True, ignore_out_var_node: bool = True, dedup: bool = True, @@ -414,6 +418,9 @@ def collect_specs_from_nodes( # noqa: C901 if _is_inplace_node(node): continue + if _is_mutable_buffer(node, graph_signature) and ignore_mutable_buffers: + continue + if do_assertion: internal_assert( node.op in ("placeholder", "output") @@ -469,6 +476,7 @@ def update_all_tensors_lifetime( Set the lifetime for all the tensors encountered in the Fx graph. """ specs = set() + for node_idx, node in enumerate(graph_module.graph.nodes): for spec in collect_specs_from_nodes( filter_nodes(itertools.chain([node], node.args, node.kwargs.values())), @@ -731,53 +739,43 @@ def _contains_xnnpack_delegate(graph_module: torch.fx.GraphModule) -> bool: def greedy( - graph_module: torch.fx.GraphModule, alignment: int, - graph_signature: Optional[ExportGraphSignature] = None, - alloc_graph_input: bool = True, - alloc_graph_output: bool = True, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, + extra_padding: int = 0, + *, allow_overlapping_allocations: bool = True, ) -> MemoryAlgoResult: r"""Greedy algorithm to allocate memory for tensors in the graph. - alloc_graph_input: If set to true, the algorithm will allocate memory for graph input. - alloc_graph_output: If set to true, the algorithm will allocate memory for graph output. 
- allow_overlapping_allocations: If set to true, allows for allocations that overlap - in their lifetime but are at different offsets in the storage. By default true. - This flag is added to allow for Vulkan to use MemoryPlanningPass with overlapping - allocations disabled + + Args: + alignment: Memory alignment requirement + specs: Set of TensorSpec objects with updated lifetimes + graph_module: Graph module + graph_signature: Graph signature + extra_padding: Additional padding to add to each memory buffer (in bytes) + allow_overlapping_allocations: If set to true, allows for allocations that overlap + in their lifetime but are at different offsets in the storage. By default true. + This flag is added to allow for Vulkan to use MemoryPlanningPass with overlapping + allocations disabled + + Returns: + MemoryAlgoResult containing the allocation decisions """ greedy_result = MemoryAlgoResult({}, []) - # padding allocation with 64 bytes. - # this requirement is really for XNNPACK backend which can read tensors - # beyond the end of the tensor. This is done for performance - # optimizations in XNNPACK. - # While accounting for backend specific requirement is not the right choice - # in backend agnostic memory planning, we do it here as it seems most appropriate. - # Right now this applies to greedy only so any other - # algorithm that plans memory for XNNPACK backend will - # not have this. - extra_padded_bytes = 0 - if _contains_xnnpack_delegate(graph_module): - extra_padded_bytes = 64 spec2obj = {} shared_objects = defaultdict(list) - # Don't do assertion in collect_specs_from_nodes if we have already encountered - # and ignored some to_out_variant errors. - do_assertion = not getattr(graph_module, "encounter_to_out_var_failure", False) + # For each tensor, pick the available shared object with closest size to # the tensor. If there are no available shared object left, create a new # one. import bisect sorted_specs = [] - for spec in collect_specs_from_nodes( - graph_module.graph.nodes, - graph_signature, - do_assertion=do_assertion, - ignore_graph_input=not alloc_graph_input, - ignore_graph_output=not alloc_graph_output, - ): + for spec in specs: bisect.insort(sorted_specs, spec, key=lambda x: x.allocated_memory) + sorted_specs.reverse() for spec in sorted_specs: @@ -806,15 +804,13 @@ def greedy( for mem_id in shared_objects: input_total_size = 0 if bufsizes := getattr(graph_module, "input_mem_buffer_sizes", None): - # pyre-fixme[6]: For 1st argument expected - # `pyre_extensions.ReadOnly[Sized]` but got `Union[Tensor, Module]`. + assert isinstance(bufsizes, list) if len(bufsizes) > mem_id: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.Ten... 
input_total_size = bufsizes[mem_id] total_sizes[mem_id] = materialize_buffer( shared_objects[mem_id], input_total_size ) - total_sizes[mem_id] += extra_padded_bytes + total_sizes[mem_id] += extra_padding # Since we now know the number of shared objects we need and the size of # each shared object, we can assign offset in the memory buffer for each @@ -838,72 +834,107 @@ def greedy( return greedy_result -def memory_planning_algorithm_suite( - graph_module: torch.fx.GraphModule, - alignment: int, - graph_signature: Optional[ExportGraphSignature] = None, - alloc_graph_input: bool = True, - alloc_graph_output: bool = True, - allow_overlapping_allocations: bool = True, - algo_list: Optional[List[Callable[..., MemoryAlgoResult]]] = None, -) -> List[int]: - r""" - Memory planning algorithm suite that runs a list of memory planning algorithms - and returns the result of the algorithm that minimizes the total memory usage. - """ - if algo_list is None: - algo_list = [greedy] - mem_algo_results = {} - for algo in algo_list: - if isinstance(algo, functools.partial): - name = algo.func.__name__ - else: - name = getattr(algo, "__name__", None) - # Run this memory planning algorithm and store the result in mem_algo_results - # with the name of the algorithm as the key. - mem_algo_results[name] = algo( - graph_module, - alignment, - graph_signature, - alloc_graph_input, - alloc_graph_output, - ) +class MemoryPlanningAlgorithmSuite: + def __init__( + self, + algo_list: Optional[List[Callable[..., MemoryAlgoResult]]] = None, + ) -> None: + if algo_list is None: + algo_list = [greedy] + self.algo_list: List[Callable[..., MemoryAlgoResult]] = algo_list - # All the algorithms should have the same number of buffers allocated. - assert ( - len( - { - len(mem_algo_result.bufsizes) - for mem_algo_result in mem_algo_results.values() - } + def __call__( + self, + alignment: int, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, + extra_padding: int, + ) -> List[int]: + r""" + Memory planning algorithm suite that runs a list of memory planning algorithms + and returns the result of the algorithm that minimizes the total memory usage. + + Args: + graph_module: The graph module to allocate memory for + alignment: Memory alignment requirement + graph_signature: Optional graph signature + alloc_graph_input: Whether to allocate memory for graph input + alloc_graph_output: Whether to allocate memory for graph output + allow_overlapping_allocations: Whether to allow overlapping allocations + algo_list: List of memory planning algorithms to run + specs: Optional set of TensorSpec objects with updated lifetimes. If None, they will be + calculated from the graph_module. + + Returns: + List of buffer sizes for each memory hierarchy + """ + + mem_algo_results = {} + for algo in self.algo_list: + if isinstance(algo, functools.partial): + name = algo.func.__name__ + else: + name = getattr(algo, "__name__", None) + + mem_algo_results[name] = algo( + alignment, + specs, + graph_module, + graph_signature, + extra_padding, + ) + + # All the algorithms should have the same number of buffers allocated. + assert ( + len( + { + len(mem_algo_result.bufsizes) + for mem_algo_result in mem_algo_results.values() + } + ) + == 1 + ), "Different memory planning algorithms should have the same number of buffers allocated." + + # Find the algorithm that minimizes the total memory usage. 
+ best_algo = min( + mem_algo_results, key=lambda k: sum(mem_algo_results[k].bufsizes) ) - == 1 - ), "Different memory planning algorithms should have the same number of buffers allocated." - - # Find the algorithm that minimizes the total memory usage. - best_algo = min(mem_algo_results, key=lambda k: sum(mem_algo_results[k].bufsizes)) - logging.debug(f"Best memory planning algo for this model is {best_algo}") - bufsizes = mem_algo_results[best_algo].bufsizes - - # Update the mem_id and mem_offset for each spec in the graph module based on the - # values provided by the best memory planning algorithm. - for spec in mem_algo_results[best_algo].spec_dict: - spec_alloc_result = mem_algo_results[best_algo].spec_dict[spec] - spec.mem_id = spec_alloc_result.mem_id - spec.mem_offset = spec_alloc_result.mem_offset - spec.mem_obj_id = spec_alloc_result.mem_obj_id + logging.debug(f"Best memory planning algo for this model is {best_algo}") + bufsizes = mem_algo_results[best_algo].bufsizes - return bufsizes + # Update the mem_id and mem_offset for each spec in the graph module based on the + # values provided by the best memory planning algorithm. + for spec in mem_algo_results[best_algo].spec_dict: + spec_alloc_result = mem_algo_results[best_algo].spec_dict[spec] + spec.mem_id = spec_alloc_result.mem_id + spec.mem_offset = spec_alloc_result.mem_offset + spec.mem_obj_id = spec_alloc_result.mem_obj_id + + return bufsizes def naive( - graph_module: torch.fx.GraphModule, alignment: int, - graph_signature: Optional[ExportGraphSignature] = None, - alloc_graph_input: bool = True, - alloc_graph_output: bool = True, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, + extra_padding: int, ) -> MemoryAlgoResult: + """Naive algorithm to allocate memory for tensors in the graph. + + This algorithm simply allocates memory for each tensor sequentially without reusing memory. + + Args: + alignment: Memory alignment requirement + specs: Set of TensorSpec objects with updated lifetimes + graph_module: Graph module + graph_signature: Graph signature + extra_padding: Additional padding to add to each memory buffer (in bytes) + Returns: + MemoryAlgoResult containing the allocation decisions + """ naive_result = MemoryAlgoResult({}, []) # allocate 'allocated' bytes from buffer with id mem_id. @@ -918,14 +949,9 @@ def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int: bufsizes = getattr(graph_module, "input_mem_buffer_sizes", None) if bufsizes is None: bufsizes = [0, 0] - bufsizes = typing.cast(List[int], bufsizes) - for spec in collect_specs_from_nodes( - graph_module.graph.nodes, - graph_signature, - ignore_graph_input=not alloc_graph_input, - ignore_graph_output=not alloc_graph_output, - ): + + for spec in specs: spec_alloc_result = naive_result.spec_dict.get(spec, SpecAllocResult(0, 0, 0)) # assume a single memory layer which has mem_id 1 if spec.mem_id is None: @@ -1027,7 +1053,7 @@ def insert_calls_to_free( def apply_algo( algo: Callable[ - [torch.fx.GraphModule, int, Optional[ExportGraphSignature], bool, bool], + ..., List[int], ], graph_module: torch.fx.GraphModule, @@ -1035,6 +1061,7 @@ def apply_algo( graph_signature: Optional[ExportGraphSignature] = None, alloc_graph_input: bool = True, alloc_graph_output: bool = True, + alloc_mutable_buffers: bool = True, ) -> List[int]: """ Recursively apply algo to graph_module and its submodules for control flow. @@ -1047,12 +1074,35 @@ def apply_algo( storage with tensors in the outer module. 
TODO: make these optimizations once we have some baseline working. """ + # Extract the nodes and their lifespans from the graph_module + # Difficult to just filter the list of specs returned by this due to + # how we flag trainable weights. + _ = update_all_tensors_lifetime(graph_module, graph_signature) + # Filter specs based on alloc_graph_input and alloc_graph_output + specs = collect_specs_from_nodes( + graph_module.graph.nodes, + graph_signature, + do_assertion=False, + ignore_graph_input=not alloc_graph_input, + ignore_graph_output=not alloc_graph_output, + ignore_mutable_buffers=not alloc_mutable_buffers, + ) - specs = update_all_tensors_lifetime(graph_module, graph_signature) + # Get extra padding for XNNPACK if needed + extra_padding = 0 + if _contains_xnnpack_delegate(graph_module): + extra_padding = 64 + + # Pass the filtered specs to the algorithm bufsizes: List[int] = algo( - graph_module, alignment, graph_signature, alloc_graph_input, alloc_graph_output + alignment, + specs, + graph_module, + graph_signature, + extra_padding, ) - insert_calls_to_free(graph_module, specs) + + insert_calls_to_free(graph_module, set(specs)) def handle_submodule( submodule_nd: torch.fx.Node, alloc_graph_input: bool = False @@ -1063,6 +1113,7 @@ def handle_submodule( # memory planning for submodule need to be aware of the amount of # buffer already allocated. submodule.input_mem_buffer_sizes = bufsizes + bufsizes = apply_algo( algo, submodule, diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py index f4881e7ab71..9bd4ab20bf5 100644 --- a/exir/passes/memory_planning_pass.py +++ b/exir/passes/memory_planning_pass.py @@ -17,7 +17,7 @@ _is_out_var_node, apply_algo, get_node_tensor_specs, - memory_planning_algorithm_suite, + MemoryPlanningAlgorithmSuite, Verifier, ) from executorch.exir.operator.convert import get_out_args_from_opoverload @@ -40,12 +40,11 @@ def _callable_name(any_callable: Callable[..., Any]) -> str: class MemoryPlanningPass(PassBase): def __init__( self, - memory_planning_algo: Callable[ - ..., List[int] - ] = memory_planning_algorithm_suite, + memory_planning_algo: Optional[Callable[..., List[int]]] = None, allow_lifetime_and_storage_overlap: bool = False, alloc_graph_input: bool = True, alloc_graph_output: bool = True, + alloc_mutable_buffers: bool = True, alignment: int = ALIGNMENT, ) -> None: r""" @@ -54,10 +53,13 @@ def __init__( the graph input/output. The default behavior is the algorithm will allocate memory for both graph input and output. """ - self.memory_planning_algo = memory_planning_algo + if memory_planning_algo is None: + memory_planning_algo = MemoryPlanningAlgorithmSuite() + self.memory_planning_algo: Callable[..., List[int]] = memory_planning_algo self.allow_lifetime_and_storage_overlap = allow_lifetime_and_storage_overlap self.alloc_graph_input = alloc_graph_input self.alloc_graph_output = alloc_graph_output + self.alloc_mutable_buffers = alloc_mutable_buffers self.alignment = alignment def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None: @@ -124,6 +126,7 @@ def run( # customized fields. Using the graph_module object to convey information across # passes/stages is quite natural and avoid yet another 'context' data structure # to do the job. 
+ _ = apply_algo( self.memory_planning_algo, graph_module, @@ -131,6 +134,7 @@ def run( graph_signature, self.alloc_graph_input, self.alloc_graph_output, + self.alloc_mutable_buffers, ) # TODO: make the verifier do the work recursively to handle @@ -139,6 +143,7 @@ def run( graph_module, self.alloc_graph_input, self.alloc_graph_output, + self.alloc_mutable_buffers, graph_signature, ) diff --git a/exir/program/_program.py b/exir/program/_program.py index 7a2120f9e9b..e0484f4f4ff 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -212,7 +212,30 @@ def _get_updated_graph_signature( return new_signature -def _transform(self, *passes: PassType) -> "ExportedProgram": +def _transform( + self, + *passes: PassType, + override_verifiers: None | list[Type[Verifier]] = None, +) -> "ExportedProgram": + """ + Transforms the program according to the provided passes. + + Args: + self: The ExportedProgram instance to transform + *passes: A sequence of passes to apply to the program + override_verifiers: Optional list of verifier classes to use instead of the default verifiers. + This is needed if the transforms yields illegal graph that the default verifier cannot handle. + + Returns: + ExportedProgram: A new ExportedProgram with the transformations applied, or self if no changes were made + """ + # A user friendly check to avoid vararg surprises, PEP 3102 + assert not any( + isinstance(p, (list, Verifier)) for p in passes + ), f"Expected all passes to be of PassType, not list or Verifier. Use override_verifiers kwarg instead. Got: {list(passes)}" + + for p in list(passes): + print(type(p)) pm = PassManager(list(passes)) res = pm(self.graph_module) transformed_gm = res.graph_module if res is not None else self.graph_module @@ -221,7 +244,9 @@ def _transform(self, *passes: PassType) -> "ExportedProgram": if transformed_gm is self.graph_module and not res.modified: return self - return _update_exported_program_graph_module(self, transformed_gm) + return _update_exported_program_graph_module( + self, transformed_gm, override_verifiers + ) def _update_exported_program_graph_module( @@ -986,7 +1011,7 @@ def keep(op): try: # Ops in torch.ops.quant are not always loaded, so we use try/except # Aliases output, but we need to allow it for XNNPACK - allow_list.append(torch.ops.quant.choose_qparams_affine.default) + allow_list.append(torch.ops.torchao.choose_qparams_affine.default) except: pass @@ -1027,6 +1052,7 @@ def keep(op): torch.ops.aten.item.default, torch.ops.aten._local_scalar_dense.default, torch.ops.aten.unbind.int, + torch.ops.aten.split_with_sizes.default, ]: logging.warn( f"Op {op} was requested for preservation by partitioner. This request is ignored because it is in a blocklist." @@ -1325,7 +1351,7 @@ def to_edge( class EdgeProgramManager: """ Package of one or more `ExportedPrograms` in Edge dialect. Designed to simplify - lowering to ExecuTorch. See: https://pytorch.org/executorch/stable/ir-exir.html + lowering to ExecuTorch. See: https://pytorch.org/executorch/main/ir-exir Allows easy applications of transforms across a collection of exported programs including the delegation of subgraphs. @@ -1565,7 +1591,7 @@ def to_executorch( class ExecutorchProgramManager: """ Package of one or more `ExportedPrograms` in Execution dialect. Designed to simplify - lowering to ExecuTorch. See: https://pytorch.org/executorch/stable/ir-exir.html + lowering to ExecuTorch. 
See: https://pytorch.org/executorch/main/ir-exir When the ExecutorchProgramManager is constructed the ExportedPrograms in execution dialect are used to form the executorch binary (in a process called emission) and then serialized @@ -1612,6 +1638,7 @@ def __init__( self._execution_programs, backend_config.emit_stacktrace, self._config_methods, + backend_config.emit_mutable_buffer_names, ) # Serialize emitter output, ready to be written to a file. diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py index fca8bd2212f..9889417c56e 100644 --- a/exir/program/test/test_program.py +++ b/exir/program/test/test_program.py @@ -22,6 +22,7 @@ from executorch.exir.pass_base import ExportPass from executorch.exir.passes import MemoryPlanningPass from executorch.exir.program._program import ( + _transform, EdgeProgramManager, ExecutorchProgramManager, to_edge, @@ -34,6 +35,7 @@ from executorch.extension.pybindings.portable_lib import ( _load_for_executorch_from_buffer, ) +from torch._export.verifier import Verifier from torch.export import Dim, export, ExportedProgram from torch.export._trace import _export @@ -273,7 +275,6 @@ def get_executorch_memory_planning_passes() -> Dict[str, MemoryPlanningPass]: for output_val in method.outputs: evalue = method.values[output_val] self.assertNotEqual(evalue.val.allocation_info, None) - else: for input_val in method.inputs: evalue = method.values[input_val] self.assertEqual(evalue.val.allocation_info, None) @@ -725,17 +726,17 @@ def count_nodes(graph_module, target): ) def test_edge_dialect_non_core_aten_ops(self): - class LinalgNorm(torch.nn.Module): + class LinalgRank(torch.nn.Module): def __init__(self): super().__init__() def forward(self, x: torch.Tensor) -> torch.Tensor: - return torch.linalg.norm(x) + return torch.linalg.matrix_rank(x) from torch._export.verifier import SpecViolationError - input = torch.arange(9, dtype=torch.float) - 4 - ep = torch.export.export(LinalgNorm(), (input,), strict=True) + input = torch.ones((9, 9, 9), dtype=torch.float) + ep = torch.export.export(LinalgRank(), (input,), strict=True) # aten::linalg_norm is not a core op, so it should error out with self.assertRaises(SpecViolationError): @@ -748,9 +749,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ep, compile_config=EdgeCompileConfig( _check_ir_validity=True, - _core_aten_ops_exception_list=[ - torch.ops.aten.linalg_vector_norm.default - ], + _core_aten_ops_exception_list=[torch.ops.aten._linalg_svd.default], ), ) except SpecViolationError: @@ -849,3 +848,23 @@ def test_save_fails(self): et = edge.to_executorch() with self.assertRaises(ValueError): _ = et.save("/tmp/test_save.pt") + + def test__transform_override_verifiers(self): + """Test that _transform can override verifiers in the exported program.""" + + class MyVerifier(Verifier): + dialect: str = "MY_DIALECT" + + def __init__(self): + super().__init__() + + model = TestLinear() + program = torch.export.export(model, model._get_random_inputs(), strict=True) + self.assertFalse(issubclass(program.verifiers[0], MyVerifier)) + + # Apply transformation with custom verifier + transformed = _transform( + program, AddToMulPassEdge(), override_verifiers=[MyVerifier] + ) + self.assertTrue(issubclass(transformed.verifiers[0], MyVerifier)) + self.assertFalse(issubclass(program.verifiers[0], MyVerifier)) diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py index 8df0cfed0bf..b87ae2dfb58 100644 --- a/exir/tests/test_memory_planning.py +++ 
b/exir/tests/test_memory_planning.py @@ -8,7 +8,6 @@ import itertools import unittest -from functools import partial from typing import Any, Callable, List, Optional, Tuple, Type import executorch.exir as exir @@ -20,8 +19,8 @@ filter_nodes, get_node_tensor_specs, greedy, - memory_planning_algorithm_suite, MemoryAlgoResult, + MemoryPlanningAlgorithmSuite, naive, Verifier, ) @@ -242,6 +241,7 @@ def maketest( use_functionalization: bool = True, alloc_graph_input: bool = True, alloc_graph_output: bool = True, + alloc_mutable_buffer: bool = True, has_unused_graph_input: bool = False, ) -> Callable[..., None]: # parameterized.expand is not compatible with maketest. I'll just loop thru @@ -269,7 +269,7 @@ def wrapper(self: "TestMemoryPlanning") -> None: .exported_program() .graph_module ) - mem_algo = partial(memory_planning_algorithm_suite, algo_list=[algo]) + mem_algo = MemoryPlanningAlgorithmSuite(algo_list=[algo]) graph_module = PassManager( passes=[ SpecPropPass(), @@ -283,10 +283,17 @@ def wrapper(self: "TestMemoryPlanning") -> None: )(graph_module).graph_module self.verify_reuse( - graph_module, expect_reuse, alloc_graph_input, alloc_graph_output + graph_module, + expect_reuse, + alloc_graph_input, + alloc_graph_output, + alloc_mutable_buffer, ) self.verify_graph_input_output( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, + alloc_graph_input, + alloc_graph_output, + alloc_mutable_buffer, ) self.verify_overlap_placeholders(has_unused_graph_input, graph_module) @@ -307,6 +314,7 @@ def verify_reuse( expect_reuse: bool, alloc_graph_input: bool, alloc_graph_output: bool, + alloc_mutable_buffer: bool, ) -> None: r""" Do sanity check and verify tensor storage reuse. @@ -322,6 +330,7 @@ def verify_reuse( graph_module, alloc_graph_input=alloc_graph_input, alloc_graph_output=alloc_graph_output, + alloc_mutable_buffers=alloc_mutable_buffer, ).verify_storage_reuse() print(f"num_reuse_pairs is {num_reuse_pairs}") @@ -335,9 +344,10 @@ def verify_graph_input_output( graph_module: torch.fx.GraphModule, alloc_graph_input: bool, alloc_graph_output: bool, + alloc_mutable_buffers: bool, ) -> None: Verifier( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, alloc_graph_input, alloc_graph_output, alloc_mutable_buffers ).verify_graph_input_output() def verify_overlap_placeholders( @@ -405,13 +415,16 @@ def verify_overlap_placeholders( ) def test_graph_input_output(self) -> None: - for alloc_graph_input, alloc_graph_output in itertools.product( - [True, False], [True, False] - ): + for ( + alloc_graph_input, + alloc_graph_output, + alloc_mutable_buffers, + ) in itertools.product([True, False], [True, False], [True, False]): case = maketest( ModelWithDifferentTensorSizes, alloc_graph_input=alloc_graph_input, alloc_graph_output=alloc_graph_output, + alloc_mutable_buffer=alloc_mutable_buffers, ) case(self) @@ -497,7 +510,6 @@ def quantize(self, eager_model: nn.Module) -> nn.Module: ) return quantized_model - # pyre-ignore @parameterized.expand( [ ( @@ -514,7 +526,7 @@ def quantize(self, eager_model: nn.Module) -> nn.Module: ) def test_multiple_pools( self, - algo: Callable[..., List[int]], + algo: Callable[..., MemoryAlgoResult], expected_allocs: List[Tuple[int, int]], expected_bufsizes: List[int], ) -> None: @@ -522,7 +534,7 @@ def test_multiple_pools( export(MultiplePoolsToyModel(), (torch.ones(1),), strict=True) ) - mem_algo = partial(memory_planning_algorithm_suite, algo_list=[algo]) + mem_algo = MemoryPlanningAlgorithmSuite(algo_list=[algo]) 
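The test changes above show the new entry points; as a reference, here is a short configuration sketch of how a pass might now be built with the class-based suite and the added `alloc_mutable_buffers` flag. Import paths are assumed from the modules touched in this diff, and the argument values are illustrative.

```python
from executorch.exir.memory_planning import greedy, MemoryPlanningAlgorithmSuite
from executorch.exir.passes import MemoryPlanningPass

mem_planning_pass = MemoryPlanningPass(
    # Replaces partial(memory_planning_algorithm_suite, algo_list=[greedy])
    memory_planning_algo=MemoryPlanningAlgorithmSuite(algo_list=[greedy]),
    alloc_graph_input=True,
    alloc_graph_output=True,
    alloc_mutable_buffers=False,  # new knob: skip planning for mutable buffers
)
```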
edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=CustomPoolMemoryPlanningPass( @@ -537,6 +549,7 @@ def test_multiple_pools( graph_module, alloc_graph_input=True, alloc_graph_output=True, + alloc_mutable_buffers=True, ) verifier.verify_storage_reuse() verifier.verify_graph_input_output() diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index 39dbd3f51d3..887ca39864a 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -1164,7 +1164,9 @@ def forward(self, query, key, value): value = torch.randn(32, 32, 32, 32) # Capture the model - m = torch.export.export_for_training(M(32), (query, key, value)).module() + m = torch.export.export_for_training( + M(32), (query, key, value), strict=True + ).module() # 8w16a quantization from torch.ao.quantization.observer import ( @@ -1405,8 +1407,7 @@ def quantize_model( ) -> Tuple[EdgeProgramManager, int, int]: # program capture m = torch.export.export_for_training( - m_eager, - example_inputs, + m_eager, example_inputs, strict=True ).module() quantizer = XNNPACKQuantizer() diff --git a/exir/tests/test_quantization.py b/exir/tests/test_quantization.py index 61e3410186e..0a0a85077bb 100644 --- a/exir/tests/test_quantization.py +++ b/exir/tests/test_quantization.py @@ -52,7 +52,7 @@ def test_resnet(self) -> None: m_copy = copy.deepcopy(m) # program capture m = torch.export.export_for_training( - m, copy.deepcopy(example_inputs) + m, copy.deepcopy(example_inputs), strict=True ).module() quantizer = XNNPACKQuantizer() diff --git a/exir/tests/test_quantize_io_pass.py b/exir/tests/test_quantize_io_pass.py index aab941b538c..ddc0294ba68 100644 --- a/exir/tests/test_quantize_io_pass.py +++ b/exir/tests/test_quantize_io_pass.py @@ -39,12 +39,14 @@ def _quantize(self, mod, example_inputs): operator_config = get_symmetric_quantization_config() quantizer.set_global(operator_config) m = torch.export.export_for_training( - mod, copy.deepcopy(example_inputs) + mod, copy.deepcopy(example_inputs), strict=True ).module() m = prepare_pt2e(m, quantizer) _ = m(*example_inputs) m = convert_pt2e(m) - exported_program = torch.export.export_for_training(m, example_inputs) + exported_program = torch.export.export_for_training( + m, example_inputs, strict=True + ) return exported_program def _check_count(self, op, count, epm): diff --git a/exir/tracer.py b/exir/tracer.py index 82f93424a14..c749df510ad 100644 --- a/exir/tracer.py +++ b/exir/tracer.py @@ -631,8 +631,18 @@ def _default_decomposition_table( ] # pyre-fixme[7]: Expected `Dict[OpOverload, typing.Callable[..., executorch.e... return get_decompositions(decomp_opset) + + decomps = default_decompositions() + # Add edge specific decompositions + additional_decomp_ops = [ + # TODO: Eventually this op should be added to the core decompo table, and will not + # need to be added here. + torch.ops.aten.linalg_vector_norm.default, + ] + additional_decomps = get_decompositions(additional_decomp_ops) + decomps.update(additional_decomps) # pyre-fixme[7]: Expected `Dict[OpOverload, typing.Callable[..., executorch.exir.... - return default_decompositions() + return decomps def dynamo_trace( diff --git a/extension/android/README.md b/extension/android/README.md new file mode 100644 index 00000000000..5fc4ba4429d --- /dev/null +++ b/extension/android/README.md @@ -0,0 +1,50 @@ +# ExecuTorch Android + +This directory contains the Android Java/Kotlin binding. 
The final product is an AAR, +which contains the `.so` libraries for c++ runtime, and `.jar` for Java API, and required +metadata `AndroidManifest.xml`. + +## Core contents + +Under `extension/android/`, + +- `executorch_android/` is the root for the Java `org.pytorch.executorch` package + - `src/` + - `androidTest/` contains the android instrumentation test source + - `main/` contains the Java source + - `test/` contains the Java unit test source + - `build.gradle` is the rule to build the Java package. +- `jni/` contains the JNI layer code, which depends on the ExecuTorch c++ runtime library. +- `CMakeLists.txt` is the rule for building the JNI library. + +## Build + +`scripts/build_android_library.sh` is a helper script to build the Java library (into .jar), native library (into .so), and the packaged AAR file. + +The usage is: +```sh +export ANDROID_HOME=/path/to/sdk +export ANDROID_NDK=/path/to/ndk +sh scripts/build_android_library.sh +``` + +The AAR file will be `extension/android/executorch_android/build/outputs/aar/executorch_android-debug.aar`. +If you set an environment variable `BUILD_AAR_DIR`, then the AAR will be copied to `$BUILD_AAR_DIR/executorch.aar`. +Later, you can copy `$BUILD_AAR_DIR/executorch.aar` to your app directory to use as a library. + +Please see [Android building from source](https://pytorch.org/executorch/main/using-executorch-android#building-from-source) for details + +## Test + +After the library is built, + +```sh +# Set up models for testing +sh executorch_android/android_test_setup.sh + +# Run unit test +./gradlew :executorch_android:testDebugUnitTest + +# Run instrumentation test +./gradlew :executorch_android:connectedAndroidTest +``` diff --git a/extension/android/executorch_android/android_test_setup.sh b/extension/android/executorch_android/android_test_setup.sh new file mode 100644 index 00000000000..c1fb2a19386 --- /dev/null +++ b/extension/android/executorch_android/android_test_setup.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -ex + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" + +BASEDIR=$(dirname "$(realpath $0)") + +prepare_add() { + cp "${BASEDIR}/../../../extension/module/test/resources/add.pte" "${BASEDIR}/src/androidTest/resources" +} + +prepare_tinyllama() { + pushd "${BASEDIR}/../../../" + curl -C - -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt" --output stories15M.pt + curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model + # Create params.json file + touch params.json + echo '{"dim": 288, "multiple_of": 32, "n_heads": 6, "n_layers": 6, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json + python -m examples.models.llama.export_llama -c stories15M.pt -p params.json -d fp16 -n stories15m_h.pte -kv + python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin + + cp stories15m_h.pte "${BASEDIR}/src/androidTest/resources/stories.pte" + cp tokenizer.bin "${BASEDIR}/src/androidTest/resources/tokenizer.bin" + popd +} + +prepare_vision() { + pushd "${BASEDIR}/../../../" + python3 -m examples.xnnpack.aot_compiler --model_name "mv2" --delegate + python3 -m examples.xnnpack.aot_compiler --model_name "mv3" --delegate + python3 -m examples.xnnpack.aot_compiler --model_name "resnet50" --quantize --delegate + cp mv2*.pte mv3*.pte resnet50*.pte "${BASEDIR}/src/androidTest/resources/" + popd +} + +prepare_add +prepare_tinyllama +prepare_vision diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle index b284ce3896e..15088f4097f 100644 --- a/extension/android/executorch_android/build.gradle +++ b/extension/android/executorch_android/build.gradle @@ -27,12 +27,19 @@ android { } sourceSets { + main { + jniLibs.srcDirs = ['../../../cmake-out-android-so/'] + } androidTest { resources.srcDirs += [ 'src/androidTest/resources' ] } } } +task copyTestRes(type: Exec) { + commandLine 'bash', 'android_test_setup.sh' +} + dependencies { implementation 'com.facebook.fbjni:fbjni:0.5.1' implementation 'com.facebook.soloader:nativeloader:0.10.5' @@ -40,6 +47,7 @@ dependencies { androidTestImplementation 'androidx.test.ext:junit:1.1.5' androidTestImplementation 'androidx.test:rules:1.2.0' androidTestImplementation 'commons-io:commons-io:2.4' + androidTestImplementation 'org.json:json:20250107' } import com.vanniktech.maven.publish.SonatypeHost @@ -48,7 +56,7 @@ mavenPublishing { publishToMavenCentral(SonatypeHost.DEFAULT) signAllPublications() - coordinates("org.pytorch", "executorch-android", "0.5.0-SNAPSHOT") + coordinates("org.pytorch", "executorch-android", "0.7.0") pom { name = "ExecuTorch Android" diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java index b3b515d7ed0..c0a43b25a98 100644 --- a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java @@ -34,13 +34,15 @@ import org.apache.commons.io.FileUtils; import androidx.test.ext.junit.runners.AndroidJUnit4; import androidx.test.InstrumentationRegistry; +import org.json.JSONException; +import org.json.JSONObject; import org.pytorch.executorch.extension.llm.LlmCallback; 
import org.pytorch.executorch.extension.llm.LlmModule; /** Unit tests for {@link org.pytorch.executorch.extension.llm.LlmModule}. */ @RunWith(AndroidJUnit4.class) public class LlmModuleInstrumentationTest implements LlmCallback { - private static String TEST_FILE_NAME = "/tinyllama_portable_fp16_h.pte"; + private static String TEST_FILE_NAME = "/stories.pte"; private static String TOKENIZER_FILE_NAME = "/tokenizer.bin"; private static String TEST_PROMPT = "Hello"; private static int OK = 0x00; @@ -86,7 +88,6 @@ public void testGenerate() throws IOException, URISyntaxException{ @Test public void testGenerateAndStop() throws IOException, URISyntaxException{ - int seqLen = 32; mModule.generate(TEST_PROMPT, SEQ_LEN, new LlmCallback() { @Override public void onResult(String result) { @@ -95,8 +96,8 @@ public void onResult(String result) { } @Override - public void onStats(float tps) { - LlmModuleInstrumentationTest.this.onStats(tps); + public void onStats(String stats) { + LlmModuleInstrumentationTest.this.onStats(stats); } }); @@ -110,7 +111,16 @@ public void onResult(String result) { } @Override - public void onStats(float tps) { - tokensPerSecond.add(tps); + public void onStats(String stats) { + float tps = 0; + try { + JSONObject jsonObject = new JSONObject(stats); + int numGeneratedTokens = jsonObject.getInt("generated_tokens"); + int inferenceEndMs = jsonObject.getInt("inference_end_ms"); + int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); + tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; + tokensPerSecond.add(tps); + } catch (JSONException e) { + } } } diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.java new file mode 100644 index 00000000000..3a033851be9 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.fail; + +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.os.Environment; +import androidx.test.rule.GrantPermissionRule; +import android.Manifest; +import android.content.Context; +import org.junit.Test; +import org.junit.Before; +import org.junit.Rule; +import org.junit.runner.RunWith; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicInteger; +import java.io.IOException; +import java.io.File; +import java.io.FileOutputStream; +import org.junit.runners.JUnit4; +import org.apache.commons.io.FileUtils; +import androidx.test.ext.junit.runners.AndroidJUnit4; +import androidx.test.InstrumentationRegistry; + +/** Unit tests for {@link Module}. 
*/ +@RunWith(AndroidJUnit4.class) +public class ModuleE2ETest { + private static String getTestFilePath(String fileName) { + return InstrumentationRegistry.getInstrumentation().getTargetContext().getExternalCacheDir() + fileName; + } + + @Rule + public GrantPermissionRule mRuntimePermissionRule = GrantPermissionRule.grant(Manifest.permission.READ_EXTERNAL_STORAGE); + + static int argmax(float[] array) { + if (array.length == 0) { + throw new IllegalArgumentException("Array cannot be empty"); + } + int maxIndex = 0; + float maxValue = array[0]; + for (int i = 1; i < array.length; i++) { + if (array[i] > maxValue) { + maxValue = array[i]; + maxIndex = i; + } + } + return maxIndex; + } + + public void testClassification(String filePath) throws IOException, URISyntaxException { + File pteFile = new File(getTestFilePath(filePath)); + InputStream inputStream = getClass().getResourceAsStream(filePath); + FileUtils.copyInputStreamToFile(inputStream, pteFile); + inputStream.close(); + + InputStream imgInputStream = getClass().getResourceAsStream("/banana.jpeg"); + Bitmap bitmap = BitmapFactory.decodeStream(imgInputStream); + bitmap = Bitmap.createScaledBitmap(bitmap, 224, 224, true); + imgInputStream.close(); + + Tensor inputTensor = + TensorImageUtils.bitmapToFloat32Tensor( + bitmap, + TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, + TensorImageUtils.TORCHVISION_NORM_STD_RGB); + + Module module = Module.load(getTestFilePath(filePath)); + + EValue[] results = module.forward(EValue.from(inputTensor)); + assertTrue(results[0].isTensor()); + float[] scores = results[0].toTensor().getDataAsFloatArray(); + + int bananaClass = 954; // From ImageNet 1K + assertEquals(bananaClass, argmax(scores)); + } + + @Test + public void testMv2Fp32() throws IOException, URISyntaxException { + testClassification("/mv2_xnnpack_fp32.pte"); + } + + @Test + public void testMv3Fp32() throws IOException, URISyntaxException { + testClassification("/mv3_xnnpack_fp32.pte"); + } + + @Test + public void testResnet50() throws IOException, URISyntaxException { + testClassification("/resnet50_xnnpack_q8.pte"); + } +} diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TensorImageUtils.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TensorImageUtils.java new file mode 100644 index 00000000000..95434dcb734 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TensorImageUtils.java @@ -0,0 +1,150 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch; + +import android.graphics.Bitmap; +import android.util.Log; +import java.nio.FloatBuffer; +import org.pytorch.executorch.Tensor; + +/** + * Contains utility functions for {@link Tensor} creation from {@link android.graphics.Bitmap} or + * {@link android.media.Image} source. + */ +public final class TensorImageUtils { + + public static float[] TORCHVISION_NORM_MEAN_RGB = new float[] {0.485f, 0.456f, 0.406f}; + public static float[] TORCHVISION_NORM_STD_RGB = new float[] {0.229f, 0.224f, 0.225f}; + + /** + * Creates new {@link Tensor} from full {@link android.graphics.Bitmap}, normalized with specified + * in parameters mean and std. 
+ * + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB + * order + */ + public static Tensor bitmapToFloat32Tensor( + final Bitmap bitmap, final float[] normMeanRGB, final float normStdRGB[]) { + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + + return bitmapToFloat32Tensor( + bitmap, 0, 0, bitmap.getWidth(), bitmap.getHeight(), normMeanRGB, normStdRGB); + } + + /** + * Writes tensor content from specified {@link android.graphics.Bitmap}, normalized with specified + * in parameters mean and std to specified {@link java.nio.FloatBuffer} with specified offset. + * + * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data + * @param x - x coordinate of top left corner of bitmap's area + * @param y - y coordinate of top left corner of bitmap's area + * @param width - width of bitmap's area + * @param height - height of bitmap's area + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB + * order + */ + public static void bitmapToFloatBuffer( + final Bitmap bitmap, + final int x, + final int y, + final int width, + final int height, + final float[] normMeanRGB, + final float[] normStdRGB, + final FloatBuffer outBuffer, + final int outBufferOffset) { + checkOutBufferCapacity(outBuffer, outBufferOffset, width, height); + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + final int pixelsCount = height * width; + final int[] pixels = new int[pixelsCount]; + bitmap.getPixels(pixels, 0, width, x, y, width, height); + final int offset_g = pixelsCount; + final int offset_b = 2 * pixelsCount; + for (int i = 0; i < 100; i++) { + final int c = pixels[i]; + Log.i("Image", ": " + i + " " + ((c >> 16) & 0xff)); + } + for (int i = 0; i < pixelsCount; i++) { + final int c = pixels[i]; + float r = ((c >> 16) & 0xff) / 255.0f; + float g = ((c >> 8) & 0xff) / 255.0f; + float b = ((c) & 0xff) / 255.0f; + outBuffer.put(outBufferOffset + i, (r - normMeanRGB[0]) / normStdRGB[0]); + outBuffer.put(outBufferOffset + offset_g + i, (g - normMeanRGB[1]) / normStdRGB[1]); + outBuffer.put(outBufferOffset + offset_b + i, (b - normMeanRGB[2]) / normStdRGB[2]); + } + } + + /** + * Creates new {@link Tensor} from specified area of {@link android.graphics.Bitmap}, normalized + * with specified in parameters mean and std. 
+ * + * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data + * @param x - x coordinate of top left corner of bitmap's area + * @param y - y coordinate of top left corner of bitmap's area + * @param width - width of bitmap's area + * @param height - height of bitmap's area + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB + * order + */ + public static Tensor bitmapToFloat32Tensor( + final Bitmap bitmap, + int x, + int y, + int width, + int height, + float[] normMeanRGB, + float[] normStdRGB) { + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + + final FloatBuffer floatBuffer = Tensor.allocateFloatBuffer(3 * width * height); + bitmapToFloatBuffer(bitmap, x, y, width, height, normMeanRGB, normStdRGB, floatBuffer, 0); + return Tensor.fromBlob(floatBuffer, new long[] {1, 3, height, width}); + } + + private static void checkOutBufferCapacity( + FloatBuffer outBuffer, int outBufferOffset, int tensorWidth, int tensorHeight) { + if (outBufferOffset + 3 * tensorWidth * tensorHeight > outBuffer.capacity()) { + throw new IllegalStateException("Buffer underflow"); + } + } + + private static void checkTensorSize(int tensorWidth, int tensorHeight) { + if (tensorHeight <= 0 || tensorWidth <= 0) { + throw new IllegalArgumentException("tensorHeight and tensorWidth must be positive"); + } + } + + private static void checkRotateCWDegrees(int rotateCWDegrees) { + if (rotateCWDegrees != 0 + && rotateCWDegrees != 90 + && rotateCWDegrees != 180 + && rotateCWDegrees != 270) { + throw new IllegalArgumentException("rotateCWDegrees must be one of 0, 90, 180, 270"); + } + } + + private static void checkNormStdArg(float[] normStdRGB) { + if (normStdRGB.length != 3) { + throw new IllegalArgumentException("normStdRGB length must be 3"); + } + } + + private static void checkNormMeanArg(float[] normMeanRGB) { + if (normMeanRGB.length != 3) { + throw new IllegalArgumentException("normMeanRGB length must be 3"); + } + } +} diff --git a/extension/android/executorch_android/src/androidTest/resources/banana.jpeg b/extension/android/executorch_android/src/androidTest/resources/banana.jpeg new file mode 100644 index 00000000000..2b237ce3d14 Binary files /dev/null and b/extension/android/executorch_android/src/androidTest/resources/banana.jpeg differ diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java index c05b30b0625..639fd0812bd 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java @@ -31,8 +31,11 @@ public interface LlmCallback { /** * Called when the statistics for the generate() is available. * - * @param tps Tokens/second for generated tokens. + * The result will be a JSON string. See extension/llm/stats.h for the field + * definitions. 
+ * + * @param stats JSON string containing the statistics for the generate() */ @DoNotStrip - public void onStats(float tps); + default void onStats(String stats) {} } diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index d9ef6d1455e..da2ac49e446 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -72,6 +72,7 @@ non_fbcode_target(_kind = fb_android_cxx_library, "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", "//xplat/executorch/extension/tensor:tensor_static", + "//xplat/executorch/kernels/quantized:generated_lib_static", ], ) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index d6ade74ee1f..83ca1d898ed 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -100,14 +100,12 @@ class ExecuTorchLlmCallbackJni void onStats(const llm::Stats& result) const { static auto cls = ExecuTorchLlmCallbackJni::javaClassStatic(); - static const auto method = cls->getMethod("onStats"); - double eval_time = - (double)(result.inference_end_ms - result.prompt_eval_end_ms); - - float tps = result.num_generated_tokens / eval_time * - result.SCALING_FACTOR_UNITS_PER_SECOND; - - method(self(), tps); + static const auto on_stats_method = + cls->getMethod)>("onStats"); + on_stats_method( + self(), + facebook::jni::make_jstring( + executorch::extension::llm::stats_to_json_string(result))); } }; @@ -149,7 +147,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { facebook::jni::alias_ref data_path = nullptr) { #if defined(ET_USE_THREADPOOL) // Reserve 1 thread for the main thread. - uint32_t num_performant_cores = + int32_t num_performant_cores = ::executorch::extension::cpuinfo::get_num_performant_cores() - 1; if (num_performant_cores > 0) { ET_LOG(Info, "Resetting threadpool to %d threads", num_performant_cores); @@ -219,12 +217,15 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { [callback](const llm::Stats& result) { callback->onStats(result); }, echo); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { + executorch::extension::llm::GenerationConfig config{ + .echo = static_cast(echo), + .seq_len = seq_len, + }; runner_->generate( prompt->toStdString(), - seq_len, + config, [callback](std::string result) { callback->onResult(result); }, - [callback](const llm::Stats& result) { callback->onStats(result); }, - echo); + [callback](const llm::Stats& result) { callback->onStats(result); }); } return 0; } diff --git a/extension/benchmark/README.md b/extension/benchmark/README.md index a9918864e9c..d1367379bb8 100644 --- a/extension/benchmark/README.md +++ b/extension/benchmark/README.md @@ -61,7 +61,7 @@ Users can schedule a benchmarking workflow on a pull request through GitHub Acti ## Retrieving Benchmark Results -The easiest way to view benchmark results is on the [dashboard](./README.md#dashboard), while raw results for individual configurations can be manually accessed by downloading the `Customer_Artifacts.zip` from the CI. +The easiest way to view benchmark results is on the [dashboard](README.md#dashboard), while raw results for individual configurations can be manually accessed by downloading the `Customer_Artifacts.zip` from the CI. 
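Related to the callback change earlier in this diff: `onStats` now delivers a JSON string (fields defined in `extension/llm/stats.h`), and both the instrumentation test and the minibench activity reduce it to tokens/sec. A small Python sketch of that arithmetic, with field names taken from the Java parsing code above and sample values invented:

```python
import json

# Sample payload; the numbers are made up for illustration.
stats = '{"generated_tokens": 64, "inference_end_ms": 2200, "prompt_eval_end_ms": 200}'
s = json.loads(stats)
tps = s["generated_tokens"] / (s["inference_end_ms"] - s["prompt_eval_end_ms"]) * 1000
print(f"{tps:.1f} tokens/s")  # 32.0 tokens/s for these sample values
```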
## Feedback and Issue Reporting diff --git a/extension/benchmark/android/benchmark/app/build.gradle.kts b/extension/benchmark/android/benchmark/app/build.gradle.kts index dcf99ca9cd0..28dfc8ae49d 100644 --- a/extension/benchmark/android/benchmark/app/build.gradle.kts +++ b/extension/benchmark/android/benchmark/app/build.gradle.kts @@ -39,6 +39,7 @@ dependencies { implementation("com.facebook.soloader:soloader:0.10.5") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") + implementation("org.json:json:20250107") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.2.1") androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java index 3bc38aad403..f6a894d6a1f 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java @@ -21,8 +21,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import org.json.JSONException; +import org.json.JSONObject; public class LlmBenchmarkActivity extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -80,7 +80,17 @@ public void onTokenGenerated(String token) {} @Override public void onStats(String stats) { - mStatsInfo.tokens = stats; + float tps = 0; + try { + JSONObject jsonObject = new JSONObject(stats); + int numGeneratedTokens = jsonObject.getInt("generated_tokens"); + int inferenceEndMs = jsonObject.getInt("inference_end_ms"); + int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); + tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; + mStatsInfo.tps = tps; + } catch (JSONException e) { + Log.e("LLM", "Error parsing JSON: " + e.getMessage()); + } } @Override @@ -108,8 +118,7 @@ public void onGenerationStopped() { (mStatsInfo.generateEnd - mStatsInfo.generateStart) * 1e-6, 0.0f)); // Token per second - results.add( - new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsInfo.tokens), 0.0f)); + results.add(new BenchmarkMetric(benchmarkModel, "token_per_sec", mStatsInfo.tps, 0.0f)); try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { Gson gson = new Gson(); @@ -118,15 +127,6 @@ public void onGenerationStopped() { e.printStackTrace(); } } - - private double extractTPS(final String tokens) { - final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); - if (m.find()) { - return Double.parseDouble(m.group()); - } else { - return 0.0f; - } - } } class StatsInfo { @@ -135,7 +135,7 @@ class StatsInfo { long loadEnd; long generateStart; long generateEnd; - String tokens; + float tps; String modelName; @Override @@ -149,6 +149,6 @@ public String toString() { + "\ngenerateEnd: " + generateEnd + "\n" - + tokens; + + tps; } } diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java index 6ba1f57c4f3..0a75b47f3a6 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java +++ 
b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java @@ -68,8 +68,8 @@ public void onResult(String result) { } @Override - public void onStats(float tps) { - mCallback.onStats("tokens/second: " + tps); + public void onStats(String result) { + mCallback.onStats(result); } } diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java index 63701a7bbc6..8503d47ccce 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java @@ -18,7 +18,7 @@ public interface ModelRunnerCallback { void onTokenGenerated(String token); - void onStats(String token); + void onStats(String result); void onGenerationStopped(); } diff --git a/extension/benchmark/apple/Benchmark/README.md b/extension/benchmark/apple/Benchmark/README.md index a68a9bf8abb..4d8e9374634 100644 --- a/extension/benchmark/apple/Benchmark/README.md +++ b/extension/benchmark/apple/Benchmark/README.md @@ -33,7 +33,7 @@ This command performs a shallow clone to speed up the process. The Benchmark App is configured to use a Swift PM package that provides the prebuilt ExecuTorch frameworks. -By default, the app relies on the package referencing locally built binaries. To ensure it functions correctly, you must first build the frameworks by following the [guide](https://pytorch.org/executorch/main/using-executorch-ios.html#building-from-source). +By default, the app relies on the package referencing locally built binaries. To ensure it functions correctly, you must first build the frameworks by following the [guide](https://pytorch.org/executorch/main/using-executorch-ios#building-from-source). 
## Adding Models and Resources diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index 332c3986b0b..985f77956b6 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -60,7 +60,7 @@ @implementation LLaMATests + (NSDictionary *)predicates { return @{ @"model" : ^BOOL(NSString *filename){ - return [filename hasSuffix:@".pte"] && [filename containsString:@"llama"]; + return [filename hasSuffix:@".pte"] && [filename.lowercaseString containsString:@"llama"]; }, @"tokenizer" : ^BOOL(NSString *filename) { return [filename isEqual:@"tokenizer.bin"] || [filename isEqual:@"tokenizer.model"]; @@ -85,14 +85,18 @@ @implementation LLaMATests [testCase measureWithMetrics:@[ tokensPerSecondMetric, [XCTClockMetric new], [XCTMemoryMetric new] ] block:^{ tokensPerSecondMetric.tokenCount = 0; + // Create a GenerationConfig object + ::executorch::extension::llm::GenerationConfig config{ + .max_new_tokens = 50, + .warming = false, + }; + const auto status = runner->generate( "Once upon a time", - 50, + config, [=](const std::string &token) { tokensPerSecondMetric.tokenCount++; - }, - nullptr, - false); + }); XCTAssertEqual(status, Error::Ok); }]; }, diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp index 1a9ddad259f..503539774a5 100644 --- a/extension/data_loader/file_data_loader.cpp +++ b/extension/data_loader/file_data_loader.cpp @@ -86,6 +86,9 @@ Result FileDataLoader::from( "Alignment %zu is not a power of 2", alignment); + ET_CHECK_OR_RETURN_ERROR( + file_name != nullptr, InvalidArgument, "File name cannot be empty."); + // Use open() instead of fopen() to avoid the layer of buffering that // fopen() does. We will be reading large portions of the file in one shot, // so buffering does not help. diff --git a/extension/data_loader/test/file_data_loader_test.cpp b/extension/data_loader/test/file_data_loader_test.cpp index 1d4f4c16196..7dc872995a5 100644 --- a/extension/data_loader/test/file_data_loader_test.cpp +++ b/extension/data_loader/test/file_data_loader_test.cpp @@ -154,6 +154,12 @@ TEST_P(FileDataLoaderTest, FromMissingFileFails) { EXPECT_NE(fdl.error(), Error::Ok); } +TEST_P(FileDataLoaderTest, FromEmptyFilePathFails) { + // Nullptr should fail + Result fdl = FileDataLoader::from(nullptr); + EXPECT_NE(fdl.error(), Error::Ok); +} + TEST_P(FileDataLoaderTest, BadAlignmentFails) { // Create a temp file; contents don't matter. uint8_t data[256] = {}; diff --git a/extension/export_util/utils.py b/extension/export_util/utils.py index 2679930178a..aa3a736af3c 100644 --- a/extension/export_util/utils.py +++ b/extension/export_util/utils.py @@ -108,7 +108,7 @@ def export_to_exec_prog( ) -> ExecutorchProgramManager: m = model.eval() # pre-autograd export. 
eventually this will become torch.export - m = export_for_training(m, example_inputs).module() + m = export_for_training(m, example_inputs, strict=True).module() core_aten_ep = _to_core_aten( m, diff --git a/extension/flat_tensor/flat_tensor_data_map.cpp b/extension/flat_tensor/flat_tensor_data_map.cpp index bf54ae014b5..8aa0af13928 100644 --- a/extension/flat_tensor/flat_tensor_data_map.cpp +++ b/extension/flat_tensor/flat_tensor_data_map.cpp @@ -25,8 +25,8 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::aten::ScalarType; +using executorch::ET_RUNTIME_NAMESPACE::TensorLayout; using executorch::runtime::DataLoader; -using executorch::runtime::TensorLayout; namespace executorch { namespace extension { diff --git a/extension/flat_tensor/flat_tensor_data_map.h b/extension/flat_tensor/flat_tensor_data_map.h index 972a5fa9c55..0e7aee8ffc8 100644 --- a/extension/flat_tensor/flat_tensor_data_map.h +++ b/extension/flat_tensor/flat_tensor_data_map.h @@ -32,7 +32,8 @@ namespace extension { /** * A NamedDataMap implementation for FlatTensor-serialized data. */ -class FlatTensorDataMap final : public executorch::runtime::NamedDataMap { +class FlatTensorDataMap final + : public executorch::ET_RUNTIME_NAMESPACE::NamedDataMap { public: /** * Creates a new DataMap that wraps FlatTensor data. @@ -51,7 +52,8 @@ class FlatTensorDataMap final : public executorch::runtime::NamedDataMap { * @return Error::NotFound if the key is not present. */ ET_NODISCARD - executorch::runtime::Result + executorch::runtime::Result< + const executorch::ET_RUNTIME_NAMESPACE::TensorLayout> get_metadata(const char* key) const override; /** diff --git a/extension/flat_tensor/targets.bzl b/extension/flat_tensor/targets.bzl index 0d49995aa6e..4ac515b7bf0 100644 --- a/extension/flat_tensor/targets.bzl +++ b/extension/flat_tensor/targets.bzl @@ -1,24 +1,26 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): - runtime.cxx_library( - name = "flat_tensor_data_map", - srcs = [ - "flat_tensor_data_map.cpp", - ], - exported_headers = ["flat_tensor_data_map.h"], - deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue", - "//executorch/runtime/core:named_data_map", - "//executorch/runtime/core/exec_aten:lib", - "//executorch/runtime/core/exec_aten/util:tensor_util", - ], - exported_deps = [ - "//executorch/extension/flat_tensor/serialize:flat_tensor_header", - "//executorch/extension/flat_tensor/serialize:generated_headers", - ], - visibility = [ - "//executorch/...", - ], - ) + for aten_mode in [True, False]: + aten_suffix = "_aten" if aten_mode else "" + runtime.cxx_library( + name = "flat_tensor_data_map" + aten_suffix, + srcs = [ + "flat_tensor_data_map.cpp", + ], + exported_headers = ["flat_tensor_data_map.h"], + deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core:evalue", + "//executorch/runtime/core:named_data_map" + aten_suffix, + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], + exported_deps = [ + "//executorch/extension/flat_tensor/serialize:flat_tensor_header", + "//executorch/extension/flat_tensor/serialize:generated_headers", + ], + visibility = [ + "//executorch/...", + ], + ) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index fd2ead6c8b0..42e82dc360f 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -21,6 +21,9 @@ 
if(NOT EXECUTORCH_ROOT) endif() set(_common_compile_options -Wno-deprecated-declarations -fPIC) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + list(APPEND _common_compile_options "-march=armv8.2-a+dotprod") +endif() include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) @@ -38,6 +41,7 @@ include(${EXECUTORCH_SRCS_FILE}) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) +list(APPEND _common_include_directories ${EXECUTORCH_ROOT}/third-party/ao) # Custom op libraries set(custom_ops_libs pthreadpool) diff --git a/extension/llm/custom_ops/TARGETS b/extension/llm/custom_ops/TARGETS index 5d0c0490506..61be3d191a7 100644 --- a/extension/llm/custom_ops/TARGETS +++ b/extension/llm/custom_ops/TARGETS @@ -47,3 +47,17 @@ runtime.python_test( "//caffe2:torch", ], ) + +runtime.python_test( + name = "test_quantized_sdpa", + srcs = [ + "test_quantized_sdpa.py", + ], + preload_deps = [ + ":custom_ops_aot_lib_mkl_noomp", + ":custom_ops_aot_py", + ], + deps = [ + "//caffe2:torch", + ], +) diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py index d299b314816..6d96a926497 100644 --- a/extension/llm/custom_ops/custom_ops.py +++ b/extension/llm/custom_ops/custom_ops.py @@ -229,3 +229,127 @@ def update_cache_meta( # workaround. Should we just return cache instead? But I am afraid that # will result in extra memory allocation return torch.empty((1,), dtype=value.dtype, device="meta") + + +def _validate_quantized_sdpa_params( + query, + key, + value, + start_pos, + seq_len, + attn_mask, + drpout_p, + is_causal, + scale, + q_scale, + q_zero_point, + k_scale, + k_zero_point, + v_scale, + v_zero_point, + is_seq_at_dim_2, +): + assert ( + query.dim() == 4 + ), f"Expected query to be 4 dimensional but got {query.dim()} dimensions." + assert ( + key.dim() == 4 + ), f"Expected key to be 4 dimensional but got {key.dim()} dimensions." + assert ( + value.dim() == 4 + ), f"Expected value to be 4 dimensional but got {value.dim()} dimensions." 
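A compact illustration of the shape and dtype contract that `_validate_quantized_sdpa_params` enforces here and in the assertions that follow: 4-D int8 q/k/v, with float32 scales and int8 zero points that match their tensor on every dimension except the last. The sizes and dimension order below are illustrative assumptions only.

```python
import torch

B, H, S, D = 1, 8, 16, 64  # arbitrary sizes; actual layout depends on is_seq_at_dim_2
query = torch.zeros(B, H, S, D, dtype=torch.int8)
q_scale = torch.ones(B, H, S, 1, dtype=torch.float32)      # one scale per row
q_zero_point = torch.zeros(B, H, S, 1, dtype=torch.int8)   # one zero point per row

assert query.dim() == 4
assert query.size()[:-1] == q_scale.size()[:-1]
assert query.size()[:-1] == q_zero_point.size()[:-1]
```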
+ + assert (q_scale is not None) and ( + q_zero_point is not None + ), "q_scale and q_zero_point must be provided" + assert (k_scale is not None) and ( + k_zero_point is not None + ), "k_scale and k_zero_point must be provided" + assert (v_scale is not None) and ( + v_zero_point is not None + ), "v_scale and v_zero_point must be provided" + + assert query.dtype == torch.int8, f"Expected query to be int8 but got {query.dtype}" + assert key.dtype == torch.int8, f"Expected key to be int8 but got {key.dtype}" + assert value.dtype == torch.int8, f"Expected value to be int8 but got {value.dtype}" + + assert ( + q_scale.dtype == torch.float32 + ), f"Expected q_scale to be float32 but got {q_scale.dtype}" + assert ( + q_zero_point.dtype == torch.int8 + ), f"Expected q_zero_point to be int8 but got {q_zero_point.dtype}" + assert ( + k_scale.dtype == torch.float32 + ), f"Expected k_scale to be float32 but got {k_scale.dtype}" + assert ( + k_zero_point.dtype == torch.int8 + ), f"Expected k_zero_point to be int8 but got {k_zero_point.dtype}" + assert ( + v_scale.dtype == torch.float32 + ), f"Expected v_scale to be float32 but got {v_scale.dtype}" + assert ( + v_zero_point.dtype == torch.int8 + ), f"Expected v_zero_point to be int8 but got {v_zero_point.dtype}" + + assert ( + query.size()[:-1] == q_scale.size()[:-1] + ), f"Expected query and q_scale to have same size except last dimensions but got {query.size()} and {q_scale.size()}" + assert ( + query.size()[:-1] == q_zero_point.size()[:-1] + ), f"Expected query and q_zero_point to have same size except last dimensions but got {query.size()} and {q_zero_point.size()}" + + assert ( + key.size()[:-1] == k_scale.size()[:-1] + ), f"Expected key and k_scale to have same size except last dimensions but got {key.size()} and {k_scale.size()}" + assert ( + key.size()[:-1] == k_zero_point.size()[:-1] + ), f"Expected key and k_zero_point to have same size except last dimensions but got {key.size()} and {k_zero_point.size()}" + + assert ( + value.size()[:-1] == v_scale.size()[:-1] + ), f"Expected value and v_scale to have same size except last dimensions but got {value.size()} and {v_scale.size()}" + assert ( + value.size()[:-1] == v_zero_point.size()[:-1] + ), f"Expected value and v_zero_point to have same size except last dimensions but got {value.size()} and {v_zero_point.size()}" + + +@impl(custom_ops_lib, "custom_quantized_sdpa", "Meta") +def custom_quantized_sdpa_meta( + query, + key, + value, + start_pos, + attn_mask=None, + drpout_p=0.0, + is_causal=False, + scale=None, + q_zero_point=None, + q_scale=None, + k_zero_point=None, + k_scale=None, + v_zero_point=None, + v_scale=None, + is_seq_at_dim_2=False, +): + seq_len = query.size(1) + _validate_quantized_sdpa_params( + query, + key, + value, + start_pos, + seq_len, + attn_mask, + drpout_p, + is_causal, + scale, + q_scale, + q_zero_point, + k_scale, + k_zero_point, + v_scale, + v_zero_point, + is_seq_at_dim_2, + ) + + return torch.empty(query.size(), dtype=torch.float32, device="meta") diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index 202ff17188d..4a2c464eb56 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -44,7 +44,9 @@ bool validate_flash_attention_args( "scaled_dot_product_attention_flash_attention: Q/K/V should have the same head size"); ET_CHECK_OR_RETURN_FALSE( - (query.scalar_type() == ScalarType::Float), "Query must be Float type"); + (query.scalar_type() == ScalarType::Float) || + (query.scalar_type() == 
ScalarType::Char), + "Query must be Float type"); ET_CHECK_OR_RETURN_FALSE( (query.scalar_type() == key.scalar_type()) && @@ -262,14 +264,14 @@ Tensor& flash_attention_kernel_out( InvalidArgument, output); - auto q_seq_len = query.size(2); + auto seq_len = query.size(2); ET_SWITCH_FLOAT_TYPES( query.scalar_type(), ctx, "flash_attention", CTYPE, [&] { // TODO we need to re-evaluate this for ARM CPUs // And there can be many so instead of templatizing // we might consider another appraoch - if (q_seq_len >= 768) { + if (seq_len >= 768) { sdpa::impl::cpu_flash_attention( output, query, @@ -285,7 +287,7 @@ Tensor& flash_attention_kernel_out( nullopt, nullopt, nullopt); - } else if (q_seq_len >= 192) { + } else if (seq_len >= 192) { sdpa::impl::cpu_flash_attention( output, query, @@ -339,7 +341,8 @@ Tensor& custom_sdpa_out_impl( const optional& k_zero_points = nullopt, const optional& k_scales = nullopt, const optional& v_zero_points = nullopt, - const optional& v_scales = nullopt) { + const optional& v_scales = nullopt, + bool is_seq_at_dim_2 = false) { ET_KERNEL_CHECK_MSG( ctx, !attn_mask.has_value() || !is_causal, @@ -354,9 +357,16 @@ Tensor& custom_sdpa_out_impl( output, "Invalid arguments"); - bool is_seq_at_dim_1{true}; + int64_t seq_len = q.size(1); + SeqDim seq_dim{SeqDim::TWO}; + if (!is_seq_at_dim_2) { + seq_dim = SeqDim::ONE; + } + if (q.scalar_type() == ScalarType::Char) { - is_seq_at_dim_1 = false; + if (seq_dim == SeqDim::TWO) { + seq_len = q.size(2); + } ET_KERNEL_CHECK_MSG( ctx, q_scales.has_value() && q_zero_points.has_value() && @@ -390,10 +400,8 @@ Tensor& custom_sdpa_out_impl( ET_CHECK_MSG(q.dim() == 4, "query must be a 4D tensor"); - const int64_t seq_len = q.size(1); - auto q_seq_len = q.size(1); - - const int64_t num_keys_for_causal_attention = start_pos + seq_len; + const int64_t num_keys_for_causal_attention = + attn_mask.has_value() ? 
-1 : start_pos + seq_len; ET_KERNEL_CHECK( ctx, @@ -408,7 +416,7 @@ Tensor& custom_sdpa_out_impl( // TODO we need to re-evaluate this for ARM CPUs // And there can be many so instead of templatizing // we might consider another appraoch - if (q_seq_len >= 768) { + if (seq_len >= 768) { sdpa::impl::cpu_flash_attention( output, q, @@ -418,16 +426,16 @@ Tensor& custom_sdpa_out_impl( is_causal, attn_mask, scale, - nullopt, // q_zero_points - nullopt, // q_scales - nullopt, // k_zero_points - nullopt, // k_scales - nullopt, // v_zero_points - nullopt, // v_scales - is_seq_at_dim_1, /* is_seq_at_dim_1 */ + q_zero_points, // q_zero_points + q_scales, // q_scales + k_zero_points, // k_zero_points + k_scales, // k_scales + v_zero_points, // v_zero_points + v_scales, // v_scales + seq_dim, /* seq_dim */ start_pos, num_keys_for_causal_attention); - } else if (q_seq_len >= 192) { + } else if (seq_len >= 192) { sdpa::impl::cpu_flash_attention( output, q, @@ -437,13 +445,13 @@ Tensor& custom_sdpa_out_impl( is_causal, attn_mask, scale, - nullopt, // q_zero_points - nullopt, // q_scales - nullopt, // k_zero_points - nullopt, // k_scales - nullopt, // v_zero_points - nullopt, // v_scales - is_seq_at_dim_1, /* is_seq_at_dim_1 */ + q_zero_points, // q_zero_points + q_scales, // q_scales + k_zero_points, // k_zero_points + k_scales, // k_scales + v_zero_points, // v_zero_points + v_scales, // v_scales + seq_dim, /* seq_dim */ start_pos, num_keys_for_causal_attention); } else { @@ -456,13 +464,13 @@ Tensor& custom_sdpa_out_impl( is_causal, attn_mask, scale, - nullopt, // q_zero_points - nullopt, // q_scales - nullopt, // k_zero_points - nullopt, // k_scales - nullopt, // v_zero_points - nullopt, // v_scales - is_seq_at_dim_1, /* is_seq_at_dim_1 */ + q_zero_points, // q_zero_points + q_scales, // q_scales + k_zero_points, // k_zero_points + k_scales, // k_scales + v_zero_points, // v_zero_points + v_scales, // v_scales + seq_dim, /* seq_dim */ start_pos, num_keys_for_causal_attention); } @@ -470,6 +478,45 @@ Tensor& custom_sdpa_out_impl( return output; } +Tensor& custom_quantized_sdpa_out( + RuntimeContext& ctx, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional& q_zero_points, + const optional& q_scales, + const optional& k_zero_points, + const optional& k_scales, + const optional& v_zero_points, + const optional& v_scales, + const bool is_seq_at_dim_2, + Tensor& output) { + return custom_sdpa_out_impl( + ctx, + q, + k, + v, + start_pos, + attn_mask, + dropout_p, + is_causal, + scale, + output, + q_zero_points, + q_scales, + k_zero_points, + k_scales, + v_zero_points, + v_scales, + is_seq_at_dim_2); +} + /* Input params @param[in] q_projected Projected query with query weights. 
@@ -570,3 +617,8 @@ EXECUTORCH_LIBRARY( llama, "custom_sdpa.out", torch::executor::native::custom_sdpa_out); + +EXECUTORCH_LIBRARY( + llama, + "custom_quantized_sdpa.out", + torch::executor::native::custom_quantized_sdpa_out); diff --git a/extension/llm/custom_ops/op_sdpa.h b/extension/llm/custom_ops/op_sdpa.h index bc2202b9bd8..9d357eb6ea1 100644 --- a/extension/llm/custom_ops/op_sdpa.h +++ b/extension/llm/custom_ops/op_sdpa.h @@ -56,6 +56,25 @@ Tensor& flash_attention_kernel_out( const optional scale, Tensor& output); +Tensor& custom_quantized_sdpa_out( + RuntimeContext& ctx, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional& q_zero_points, + const optional& q_scales, + const optional& k_zero_points, + const optional& k_scales, + const optional& v_zero_points, + const optional& v_scales, + const bool is_seq_at_dim_1, + Tensor& output); } // namespace native } // namespace executor } // namespace torch diff --git a/extension/llm/custom_ops/op_sdpa_aot.cpp b/extension/llm/custom_ops/op_sdpa_aot.cpp index 213adf1c8ab..ff367c85c8a 100644 --- a/extension/llm/custom_ops/op_sdpa_aot.cpp +++ b/extension/llm/custom_ops/op_sdpa_aot.cpp @@ -77,6 +77,47 @@ at::Tensor custom_sdpa_aten( // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy const std::optional scale); +Tensor& custom_quantized_sdpa_out_no_context( + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional q_zero_points, + const optional q_scales, + const optional k_zero_points, + const optional k_scales, + const optional v_zero_points, + const optional v_scales, + const bool is_seq_at_dim_2, + Tensor& output); + +at::Tensor custom_quantized_sdpa_aten( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional scale, + const std::optional& q_zero_points, + const std::optional& q_scales, + const std::optional& k_zero_points, + const std::optional& k_scales, + const std::optional& v_zero_points, + const std::optional& v_scales, + const bool is_seq_at_dim_2); + Tensor& update_cache_out_no_context( const Tensor& value, Tensor& cache, @@ -198,6 +239,87 @@ at::Tensor custom_sdpa_aten( return output; } +Tensor& custom_quantized_sdpa_out_no_context( + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional q_zero_points, + const optional q_scales, + const optional k_zero_points, + const optional 
k_scales, + const optional v_zero_points, + const optional v_scales, + const bool is_seq_at_dim_2, + Tensor& output) { + executorch::aten::RuntimeContext context{}; + return torch::executor::native::custom_quantized_sdpa_out( + context, + q, + k, + v, + start_pos, + attn_mask, + dropout_p, + is_causal, + scale, + q_zero_points, + q_scales, + k_zero_points, + k_scales, + v_zero_points, + v_scales, + is_seq_at_dim_2, + output); +} + +at::Tensor custom_quantized_sdpa_aten( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional scale, + const std::optional& q_zero_points, + const std::optional& q_scales, + const std::optional& k_zero_points, + const std::optional& k_scales, + const std::optional& v_zero_points, + const std::optional& v_scales, + const bool is_seq_at_dim_2) { + auto output = at::empty(q.sizes()); + WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 15) + (q, + k, + v, + start_pos, + attn_mask, + dropout_p, + is_causal, + scale, + q_zero_points, + q_scales, + k_zero_points, + k_scales, + v_zero_points, + v_scales, + is_seq_at_dim_2, + output); + return output; +} + Tensor& update_cache_out_no_context( const Tensor& value, Tensor& cache, @@ -245,6 +367,18 @@ TORCH_LIBRARY_FRAGMENT(llama, m) { m.def( "update_cache.out(Tensor value, Tensor(a!) cache, " "SymInt start_pos, *, Tensor(b!) out) -> Tensor(b!)"); + m.def( + "custom_quantized_sdpa(Tensor query, Tensor key, Tensor value, SymInt start_pos, " + "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, " + "float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, " + "Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, " + "Tensor? v_scales=None, bool is_seq_at_dim_2=False) -> Tensor"); + m.def( + "custom_quantized_sdpa.out(Tensor query, Tensor key, Tensor value, SymInt start_pos, " + "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, " + "float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, " + "Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, " + "Tensor? v_scales=None, bool is_seq_at_dim_2=False, *, Tensor(a!) 
out) -> Tensor(a!)"); } // TODO: Rename this file to op_custom_ops_aot.cpp @@ -263,4 +397,11 @@ TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) { m.impl( "update_cache.out", WRAP_TO_ATEN(torch::executor::native::update_cache_out_no_context, 3)); + m.impl( + "custom_quantized_sdpa", + torch::executor::native::custom_quantized_sdpa_aten); + m.impl( + "custom_quantized_sdpa.out", + WRAP_TO_ATEN( + torch::executor::native::custom_quantized_sdpa_out_no_context, 15)); } diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h index 0639c539ed1..c907a84f14c 100644 --- a/extension/llm/custom_ops/op_sdpa_impl.h +++ b/extension/llm/custom_ops/op_sdpa_impl.h @@ -23,11 +23,15 @@ #endif #include +#include + namespace torch { namespace executor { namespace native { +enum class SeqDim { ONE = 1, TWO }; + namespace sdpa::impl { struct MaybeQuantizedMatrixData { @@ -35,6 +39,8 @@ struct MaybeQuantizedMatrixData { const int8_t* zero_points{nullptr}; const float* scales{nullptr}; int64_t m = 0, n = 0; + const int64_t zero_points_stride{1}; + const int64_t scales_stride{1}; ScalarType dtype{ScalarType::Float}; MaybeQuantizedMatrixData() = default; MaybeQuantizedMatrixData( @@ -43,12 +49,15 @@ struct MaybeQuantizedMatrixData { const float* scales_, int64_t m_, int64_t n_, + int64_t qparams_stride, ScalarType dtype_) : data(data_), zero_points(zero_points_), scales(scales_), m(m_), n(n_), + zero_points_stride(qparams_stride), + scales_stride(qparams_stride), dtype(dtype_) {} }; @@ -67,7 +76,32 @@ void _q_at_k_gemm( q_data.dtype == ScalarType::Char || q_data.dtype == ScalarType::Float, "q and k must be either int8 or float"); if (q_data.dtype == ScalarType::Char) { - ET_CHECK_MSG(false, "int8 not supported yet"); + if constexpr (std::is_same::value) { + int a_stride_m_tmp, b_stride_n_tmp; + auto kernel = torchao::kernels::cpu::quantized_matmul:: + get_int8_a_int8_b_channelwise_qmatmul( + q_m, k_n, qk_k, false, true, a_stride_m_tmp, b_stride_n_tmp); + kernel( + q_m, + k_n, + qk_k, + static_cast(q_data.data), + q_stride_m, + static_cast(k_data.data), + k_stride_n, + qk_data, + k_n, + static_cast(q_data.zero_points), + static_cast(k_data.zero_points), + static_cast(q_data.scales), + static_cast(k_data.scales), + // LHS and RHS are assumed to have same stride for qparams + q_data.zero_points_stride, + k_data.zero_points_stride); + } else { + ET_CHECK_MSG( + false, "Accumulation in dtype other than float not supported yet"); + } } else { ::executorch::cpublas::gemm( ::executorch::cpublas::TransposeType::Transpose, @@ -86,6 +120,131 @@ void _q_at_k_gemm( } } +// Refactor op_dequantize.cpp to avoid code duplication +void dequantize_optimized( + const int8_t* in, + const float scale, + const int8_t zero_point, + float* out, + int64_t quant_min, + int64_t quant_max, + size_t numel) { + size_t i = 0; +#if defined(__aarch64__) || defined(__ARM_NEON) + int8x8_t zero_point_vec = vdup_n_s8(zero_point); + float32x4_t scales = vdupq_n_f32(static_cast(scale)); + constexpr int32_t kVecSize = 16; + const size_t num_vecs = numel / kVecSize; + const int8_t* in_copy = in; + float* out_copy = out; + for (; i < num_vecs; i++) { + int8x16_t in_vec = vld1q_s8(in_copy); + int16x8_t sub_vec_0_7 = vsubl_s8(vget_low_s8(in_vec), zero_point_vec); + int32x4_t sub_vec_0_3 = vmovl_s16(vget_low_s16(sub_vec_0_7)); + int32x4_t sub_vec_4_7 = vmovl_s16(vget_high_s16(sub_vec_0_7)); + float32x4_t out_vec_0_3 = vmulq_f32(vcvtq_f32_s32(sub_vec_0_3), scales); + float32x4_t out_vec_4_7 = 
vmulq_f32(vcvtq_f32_s32(sub_vec_4_7), scales); + + int16x8_t sub_vec_8_15 = vsubl_s8(vget_high_s8(in_vec), zero_point_vec); + int32x4_t sub_vec_8_11 = vmovl_s16(vget_low_s16(sub_vec_8_15)); + int32x4_t sub_vec_12_15 = vmovl_s16(vget_high_s16(sub_vec_8_15)); + float32x4_t out_vec_8_11 = vmulq_f32(vcvtq_f32_s32(sub_vec_8_11), scales); + float32x4_t out_vec_12_15 = vmulq_f32(vcvtq_f32_s32(sub_vec_12_15), scales); + vst1q_f32(out_copy + 0, out_vec_0_3); + vst1q_f32(out_copy + 4, out_vec_4_7); + vst1q_f32(out_copy + 8, out_vec_8_11); + vst1q_f32(out_copy + 12, out_vec_12_15); + in_copy += kVecSize; + out_copy += kVecSize; + } + i = i * kVecSize; +#endif + for (; i < numel; i++) { + out[i] = (static_cast(in[i]) - static_cast(zero_point)) * + scale; + } +} + +void dequantize_per_channel_optimized( + const int8_t* in_data, + const float* scales_data, + const int8_t* zero_points_data, + float* out_data, + int64_t quant_min, + int64_t quant_max, + size_t outer_size, + size_t in_outer_stride, + size_t out_outer_stride, + size_t num_channels, + size_t in_channel_stride, + size_t out_channel_stride, + size_t channel_size, + size_t qparams_stride) { + for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + // Loop through dim + for (size_t channel_idx = 0; channel_idx < num_channels; ++channel_idx) { + const int8_t* in_data_local = in_data + outer_idx * in_outer_stride + + channel_idx * in_channel_stride; + const float scale = *(scales_data + channel_idx * qparams_stride); + const int8_t zero_point = + *(zero_points_data + channel_idx * qparams_stride); + float* out_data_local = out_data + outer_idx * out_outer_stride + + channel_idx * out_channel_stride; + dequantize_optimized( + in_data_local, + scale, + zero_point, + out_data_local, + quant_min, + quant_max, + channel_size); + } + } +} + +void dequant_and_gemm( + const int64_t m, + const int64_t n, + const int64_t k, + float* qk_data, + const int64_t qk_stride_m, + const MaybeQuantizedMatrixData& v_data, + const int64_t v_stride_n, + float* o_data, + const int64_t o_stride_m, + const float beta) { + std::vector dequantized_v_data(v_data.m * v_data.n); + dequantize_per_channel_optimized( + static_cast(v_data.data), + static_cast(v_data.scales), + static_cast(v_data.zero_points), + dequantized_v_data.data(), + -128, + 127, + 1, + 0, + 0, + v_data.m, + v_stride_n, + v_data.n, + v_data.n, + v_data.zero_points_stride); + ::executorch::cpublas::gemm( + ::executorch::cpublas::TransposeType::NoTranspose, + ::executorch::cpublas::TransposeType::NoTranspose, + n, + m, + k, + static_cast(1), + dequantized_v_data.data(), + v_data.n, + qk_data, + qk_stride_m, + beta, + o_data, + o_stride_m); +} + template void _qk_at_v_gemm( const int64_t m, @@ -99,7 +258,46 @@ void _qk_at_v_gemm( const int64_t o_stride_m, const accum_t beta) { if (v_data.dtype == ScalarType::Char) { - ET_CHECK_MSG(false, "int8 not supported yet"); + if constexpr (std::is_same::value) { + if (m > 4) { + // For larger batch sizes, dequantize and use BLAS for better + // performance + dequant_and_gemm( + m, + n, + k, + const_cast(qk_data), + qk_stride_m, + v_data, + v_stride_n, + o_data, + o_stride_m, + beta); + } else { + // For smaller batch sizes, use quantized gemm + int a_stride_m_tmp, b_stride_n_tmp; + auto kernel = torchao::kernels::cpu::quantized_matmul:: + get_fp32_a_input_channelwise_8bit_b_f32_c_matmul( + m, n, k, false, false, a_stride_m_tmp, b_stride_n_tmp); + kernel( + m, + n, + k, + qk_data, + qk_stride_m /*lhs_stride_m*/, + static_cast(v_data.data), + v_stride_n 
/*rhs_stride_n*/, + o_data, + o_stride_m /*out_stride_n*/, + static_cast(v_data.zero_points), + static_cast(v_data.scales), + beta, + v_data.zero_points_stride); + } + } else { + ET_CHECK_MSG( + false, "Accumulation in dtype other than float not supported yet"); + } } else { ::executorch::cpublas::gemm( ::executorch::cpublas::TransposeType::NoTranspose, @@ -289,6 +487,40 @@ sdpa_with_kv_cache does not use attn_mask. TODO: Just handle conversion of bool mask to float */ +/** + * @brief Implements Flash Attention algorithm on CPU + * + * This function computes scaled dot-product attention with optimizations for + CPU. + * It supports both regular and quantized attention computation. + * + * @tparam scalar_t The data type for computation (e.g., float) + * @tparam q_split_size Block size for query matrix in tiling algorithm + * @tparam kv_split_size Block size for key/value matrices in tiling algorithm + * + * @param output Output tensor to store attention results + * @param query Query tensor [Batch x Num_heads x Q_seq_len x Dim_per_head] + * @param key Key tensor [Batch x Num_heads_kv x KV_seq_len x Dim_per_head] + * @param value Value tensor [Batch x Num_heads_kv x KV_seq_len x Dim_per_head] + * @param dropout_p Dropout probability (not used in current implementation) + * @param is_causal Whether to apply causal mask (lower triangular) + * @param attn_mask Optional explicit attention mask + * @param scale Optional custom scaling factor (default: 1/sqrt(head_dim)) + * @param q_zero_points Optional zero points for quantized query + * @param q_scales Optional scales for quantized query + * @param k_zero_points Optional zero points for quantized key + * @param k_scales Optional scales for quantized key + * @param v_zero_points Optional zero points for quantized value + * @param v_scales Optional scales for quantized value + * @param seq_dim Which dimension is sequence dimension. 
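The int8 paths added to _q_at_k_gemm and _qk_at_v_gemm rely on the standard expansion for channelwise int8 matmul: accumulate the raw integer product, correct it with row sums and zero points, and only then apply the scales. This is the same identity the new test's _int_matmul reference uses. A small self-contained check of that identity, with toy sizes that are purely illustrative:

import torch

torch.manual_seed(0)
m, n, d = 3, 5, 8  # toy block sizes
q = torch.randint(-128, 128, (m, d), dtype=torch.int32)
k = torch.randint(-128, 128, (n, d), dtype=torch.int32)
q_zp = torch.randint(-10, 10, (m, 1), dtype=torch.int32)  # per-token zero points
k_zp = torch.randint(-10, 10, (n, 1), dtype=torch.int32)
q_s, k_s = torch.rand(m, 1).double(), torch.rand(n, 1).double()  # per-token scales

# Reference: dequantize each operand, then matmul in floating point.
ref = ((q - q_zp).double() * q_s) @ ((k - k_zp).double() * k_s).t()

# Kernel-style computation: exact integer accumulation of q @ k^T, corrected
# with row sums and zero points, scales applied only at the end.
int_acc = (q.unsqueeze(1) * k.unsqueeze(0)).sum(-1)  # q @ k^T without dequantizing
int_acc = (
    int_acc
    - q.sum(-1, keepdim=True) * k_zp.t()
    - q_zp * k.sum(-1, keepdim=True).t()
    + d * q_zp * k_zp.t()
)
out = int_acc.double() * (q_s @ k_s.t())

assert torch.allclose(ref, out)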
+ If SeqDim::One, then query, key, value are + expected to be in shape [Batch x Q_seq_len x Dim_per_head x Num_heads] and + output is expected to be in shape [Batch x Q_seq_len x Dim_per_head x + Num_heads] + * @param start_pos Starting position for causal masking in generation + * @param num_keys_for_causal_attention Number of keys to consider for causal + attention (-1 for all) + */ template void cpu_flash_attention( Tensor& output, @@ -305,22 +537,10 @@ void cpu_flash_attention( const optional& k_scales, const optional& v_zero_points, const optional& v_scales, - bool is_seq_at_dim_1 = false, + const SeqDim seq_dim = SeqDim::TWO, const int64_t start_pos = 0, const int64_t num_keys_for_causal_attention = -1) { (void)dropout_p; - // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) - // Key (Batch x Num_heads x KV_seq_len x Dim_per_head) - // Value (Batch x Num_heads x KV_seq_len x Dim_per_head) - - /* - // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) - at::Tensor query = q.transpose(1, 2); - // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) - at::Tensor key = k.transpose(1, 2); - // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) - at::Tensor value = v.transpose(1, 2); - */ // Without this we have out-of-bounds writes for // causal masking @@ -346,7 +566,7 @@ void cpu_flash_attention( int64_t kvSize = value.size(2); int64_t num_heads_kv = key.size(1); - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { num_head = query.size(2); num_heads_kv = key.size(2); qSize = query.size(1); @@ -385,7 +605,11 @@ void cpu_flash_attention( */ ET_CHECK_MSG(attn_mask.value().dim() == 2, "attn_mask must be 2D"); ET_CHECK_MSG( - attn_mask.value().size(0) == qSize, "attn_mask shape mismatch"); + attn_mask.value().size(0) == qSize, + "attn_mask shape mismatch" + "attn_mask.size(0)=%zd qSize=%" PRId64, + attn_mask.value().size(0), + qSize); ET_CHECK_MSG( attn_mask.value().size(1) == kvSize, "attn_mask shape mismatch" @@ -394,14 +618,15 @@ void cpu_flash_attention( kvSize); } - bool is_quantized_sdpa = query.scalar_type() == ScalarType::Char; + bool is_quantized_sdpa = false; + is_quantized_sdpa = query.scalar_type() == ScalarType::Char; auto strides = query.strides(); int64_t qStrideB = strides[0]; int64_t qStrideH = strides[1]; int64_t qStrideM = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { qStrideH = strides[2]; qStrideM = strides[1]; } @@ -411,7 +636,7 @@ void cpu_flash_attention( int64_t kStrideH = strides[1]; int64_t kStrideN = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { kStrideH = strides[2]; kStrideN = strides[1]; } @@ -421,17 +646,60 @@ void cpu_flash_attention( int64_t vStrideH = strides[1]; int64_t vStrideN = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { vStrideH = strides[2]; vStrideN = strides[1]; } + int64_t q_quant_params_StrideB = 0; + int64_t q_quant_params_StrideH = 0; + int64_t q_quant_params_StrideM = 0; + int64_t k_quant_params_StrideB = 0; + int64_t k_quant_params_StrideH = 0; + int64_t k_quant_params_StrideN = 0; + int64_t v_quant_params_StrideB = 0; + int64_t v_quant_params_StrideH = 0; + int64_t v_quant_params_StrideN = 0; + + if (is_quantized_sdpa) { + auto q_strides = q_zero_points.value().strides(); + q_quant_params_StrideB = q_strides[0]; + q_quant_params_StrideH = q_strides[1]; + q_quant_params_StrideM = q_strides[2]; + + auto k_strides = k_zero_points.value().strides(); + k_quant_params_StrideB = k_strides[0]; + k_quant_params_StrideH = k_strides[1]; + k_quant_params_StrideN = k_strides[2]; 
+ + auto v_strides = v_zero_points.value().strides(); + v_quant_params_StrideB = v_strides[0]; + v_quant_params_StrideH = v_strides[1]; + v_quant_params_StrideN = v_strides[2]; + + ET_CHECK_MSG( + (v_quant_params_StrideN == k_quant_params_StrideN) && + (v_quant_params_StrideN == q_quant_params_StrideM), + "Quant params strides must be same for seq dim"); + + if (seq_dim == SeqDim::ONE) { + q_quant_params_StrideH = q_strides[2]; + q_quant_params_StrideM = q_strides[1]; + + k_quant_params_StrideH = k_strides[2]; + k_quant_params_StrideN = k_strides[1]; + + v_quant_params_StrideH = v_strides[2]; + v_quant_params_StrideN = v_strides[1]; + } + } + strides = output.strides(); int64_t oStrideB = strides[0]; int64_t oStrideH = strides[1]; int64_t oStrideM = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { oStrideH = strides[2]; oStrideM = strides[1]; } @@ -473,7 +741,11 @@ void cpu_flash_attention( /* qk_sum */ qSplitSize + /* dst */ qSplitSize * headSize; - int64_t size_bytes = size_per_thread * num_thread * query.element_size(); + // Since all intermediate compute is accum_t, we need to + // allocate a buffer accordingly. + int64_t size_of_intermediate_precision = sizeof(accum_t); + int64_t size_bytes = size_per_thread * num_thread * query.element_size() * + size_of_intermediate_precision; std::vector buf_vec(size_bytes); void* buf = reinterpret_cast(buf_vec.data()); // Need to double check the following @@ -559,14 +831,18 @@ void cpu_flash_attention( int64_t q_offset = i * qStrideB + j * qStrideH + m * qStrideM; int64_t k_offset = i * kStrideB + j_kv * kStrideH + n * kStrideN; if (is_quantized_sdpa) { - ET_CHECK_MSG( - !is_seq_at_dim_1, "For quantized SDPA, seq_len must be at dim 2"); - q_scales_ptr = q_scales.value().const_data_ptr() + q_offset; - k_scales_ptr = k_scales.value().const_data_ptr() + k_offset; - q_zero_points_ptr = - q_zero_points.value().const_data_ptr() + q_offset; - k_zero_points_ptr = - k_zero_points.value().const_data_ptr() + k_offset; + int64_t q_quant_params_offset = i * q_quant_params_StrideB + + j * q_quant_params_StrideH + m * q_quant_params_StrideM; + int64_t k_quant_params_offset = i * k_quant_params_StrideB + + j_kv * k_quant_params_StrideH + n * k_quant_params_StrideN; + q_scales_ptr = + q_scales.value().const_data_ptr() + q_quant_params_offset; + k_scales_ptr = + k_scales.value().const_data_ptr() + k_quant_params_offset; + q_zero_points_ptr = q_zero_points.value().const_data_ptr() + + q_quant_params_offset; + k_zero_points_ptr = k_zero_points.value().const_data_ptr() + + k_quant_params_offset; q_sub_matrix_data_ptr = (const int8_t*)(q_data) + q_offset; k_sub_matrix_data_ptr = (const int8_t*)(k_data) + k_offset; } else { @@ -579,6 +855,7 @@ void cpu_flash_attention( q_scales_ptr, qBlockSize, headSize, + q_quant_params_StrideM, query.scalar_type()); MaybeQuantizedMatrixData k_sub_matrix_data = MaybeQuantizedMatrixData( static_cast(k_sub_matrix_data_ptr), @@ -586,6 +863,7 @@ void cpu_flash_attention( k_scales_ptr, kvBlockSize, headSize, + k_quant_params_StrideN, key.scalar_type()); _q_at_k_gemm( qBlockSize, @@ -719,11 +997,12 @@ void cpu_flash_attention( const int8_t* v_zero_points_ptr = nullptr; int64_t v_offset = i * vStrideB + j_kv * vStrideH + n * vStrideN; if (is_quantized_sdpa) { - ET_CHECK_MSG( - !is_seq_at_dim_1, "For quantized SDPA, seq_len must be at dim 2"); - v_scales_ptr = v_scales.value().const_data_ptr() + v_offset; - v_zero_points_ptr = - v_zero_points.value().const_data_ptr() + v_offset; + int64_t v_quant_params_offset = i * 
v_quant_params_StrideB + + j_kv * v_quant_params_StrideH + n * v_quant_params_StrideN; + v_scales_ptr = + v_scales.value().const_data_ptr() + v_quant_params_offset; + v_zero_points_ptr = v_zero_points.value().const_data_ptr() + + v_quant_params_offset; v_sub_matrix_data_ptr = (const int8_t*)(v_data) + v_offset; } else { v_sub_matrix_data_ptr = (const scalar_t*)(v_data) + v_offset; @@ -734,6 +1013,7 @@ void cpu_flash_attention( v_scales_ptr, kvBlockSize, headSize, + v_quant_params_StrideN, value.scalar_type()); // Calculate Softmax(q @ k.T) @ v _qk_at_v_gemm( diff --git a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp index 435cf44e66f..6c0496af32d 100644 --- a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp +++ b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp @@ -524,289 +524,6 @@ TEST(OpScaledDotProductAttentionTest, LargerTest) { EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_5, 1e-4, 1e-4); } -TEST(OpScaledDotProductAttentionTest, BasicTestWithAttnMask) { - TensorFactory tfFloat; - - executorch::aten::Tensor query = tfFloat.make( - {1, 1, 4, 4}, - {0.8823, - 0.9150, - 0.3829, - 0.9593, - 0.3904, - 0.6009, - 0.2566, - 0.7936, - 0.9408, - 0.1332, - 0.9346, - 0.5936, - 0.8694, - 0.5677, - 0.7411, - 0.4294}); - executorch::aten::Tensor key = tfFloat.make( - {1, 1, 4, 4}, - {0.8854, - 0.5739, - 0.2666, - 0.6274, - 0.2696, - 0.4414, - 0.2969, - 0.8317, - 0.1053, - 0.2695, - 0.3588, - 0.1994, - 0.5472, - 0.0062, - 0.9516, - 0.0753}); - executorch::aten::Tensor value = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - executorch::aten::Tensor attn_mask = tfFloat.make({1, 1}, {0}); - executorch::aten::Tensor key_cache_0 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor value_cache_0 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor key_cache_1 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor value_cache_1 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor key_cache_2 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor value_cache_2 = tfFloat.zeros({1, 5, 4, 4}); - double dropout_p = 0; - bool is_causal = false; - executorch::aten::optional scale; - - // start pos: 0 layer id 0 - executorch::aten::Tensor ret_expected_0 = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - - std::vector out_size = {1, 1, 4, 4}; - executorch::aten::Tensor out = tfFloat.zeros(out_size); - executorch::aten::Tensor ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_0, - value_cache_0, - 0, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_0, 1e-4, 1e-4); - - // start pos: 0 layer id 2 - executorch::aten::Tensor ret_expected_1 = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_2, - value_cache_2, - 0, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_1, 1e-4, 1e-4); - - attn_mask = tfFloat.make({1, 2}, {0, 0}); - // start pos: 1 layer id 0 - 
executorch::aten::Tensor ret_expected_2 = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_0, - value_cache_0, - 1, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_2, 1e-4, 1e-4); - - // start pos: 1 layer id 1 - executorch::aten::Tensor ret_expected_3 = tfFloat.make( - {1, 1, 4, 4}, - {0.6486, - 0.4270, - 0.2472, - 0.5922, - 0.3669, - 0.5740, - 0.3522, - 0.2173, - 0.3635, - 0.2088, - 0.4071, - 0.5423, - 0.5110, - 0.1822, - 0.5107, - 0.3817}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_1, - value_cache_1, - 1, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_3, 1e-4, 1e-4); - - attn_mask = tfFloat.make({1, 3}, {0, 0, 0}); - // start pos: 2 layer id 1 - executorch::aten::Tensor ret_expected_4 = tfFloat.make( - {1, 1, 4, 4}, - {0.7490, - 0.4930, - 0.2854, - 0.6838, - 0.4489, - 0.7021, - 0.4308, - 0.2659, - 0.4622, - 0.2655, - 0.5176, - 0.6895, - 0.6202, - 0.2212, - 0.6199, - 0.4634}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_1, - value_cache_1, - 2, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_4, 1e-4, 1e-4); - - // start pos: 2 layer id 2 - executorch::aten::Tensor ret_expected_5 = tfFloat.make( - {1, 1, 4, 4}, - {0.7490, - 0.4930, - 0.2854, - 0.6838, - 0.4489, - 0.7021, - 0.4308, - 0.2659, - 0.4622, - 0.2655, - 0.5176, - 0.6895, - 0.6202, - 0.2212, - 0.6199, - 0.4634}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_2, - value_cache_2, - 2, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_5, 1e-4, 1e-4); -} - TEST(OpScaledDotProductAttentionTest, SequenceTest) { TensorFactory tfFloat; diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index 5b68715e401..545f6516bb7 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -9,6 +9,18 @@ load( "get_compiler_optimization_flags", ) +def _get_quantized_sdpa_deps(): + if runtime.is_oss: + return [] + else: + return ["//pytorch/ao/torchao/experimental/kernels/cpu/interface:interface"] + +def _get_quantized_preproc_flags(): + if runtime.is_oss: + return [] + else: + return ["-DENABLE_CUSTOM_QUANTIZED_SDPA"] + def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -33,7 +45,8 @@ def define_common_targets(): headers = [ "op_sdpa_impl.h", ], - preprocessor_flags = get_vec_preprocessor_flags(), + exported_preprocessor_flags = get_vec_preprocessor_flags() + + _get_quantized_preproc_flags(), exported_deps = [ "//executorch/runtime/kernel:kernel_includes", "//executorch/kernels/portable/cpu:scalar_utils", @@ -45,8 +58,12 @@ def define_common_targets(): deps = [ "//executorch/kernels/portable/cpu/util:reduce_util", "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform", - ] + get_vec_deps(), - compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags(), + ] + get_vec_deps() + _get_quantized_sdpa_deps(), + compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags() + + select({ + "DEFAULT": [], + "ovr_config//cpu:arm64": ["-march=armv8.2-a+dotprod"], + }), visibility = [ "//executorch/...", "//executorch/extension/llm/custom_ops/...", diff --git a/extension/llm/custom_ops/test_quantized_sdpa.py b/extension/llm/custom_ops/test_quantized_sdpa.py new file mode 100644 index 00000000000..f7b28e1508f --- /dev/null +++ b/extension/llm/custom_ops/test_quantized_sdpa.py @@ -0,0 +1,536 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import unittest + +import torch +import torch.nn.functional as F + +from .custom_ops import custom_ops_lib # noqa + + +class SDPATestForCustomQuantizedSDPA(unittest.TestCase): + """ + This test is to test the custom quantized SDPA op + Tensors are in [B, H, S, D] format + """ + + def setUp(self): + from torch.ao.quantization.fx._decomposed import ( # noqa: F401 + quantized_decomposed_lib, + ) + + torch.manual_seed(42) + self.n_batch = 1 + self.n_heads_kv = 32 + self.n_heads_q = 32 + self.head_dim = 128 + self.max_seq_len = 2048 + self.quantized_dtype = torch.int8 + self.float_dtype = torch.float32 + self.q_shape = None + self.kv_shape = None + self.is_seq_at_dim_2 = True + + def _scale_tensor(self, tensor, min_value, max_value, scale=True): + normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min()) + + scaled_tensor = normalized_tensor * (max_value - min_value) + min_value + + return scaled_tensor if scale else tensor + + def setup_caches_and_mask(self, tensor_scale_max, tensor_scale_min, scale_tensors): + self.mask = torch.full( + (self.max_seq_len, self.max_seq_len), + float("-inf"), + ) + self.mask = torch.triu(self.mask, diagonal=1) + + self.k = self._scale_tensor( + torch.rand(self.kv_shape), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + self.v = self._scale_tensor( + torch.rand(self.kv_shape), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + + def _sdpa_ref( + self, + q_quantized, + k_quantized, + v_quantized, + start_pos, + q_zero_point, + q_scale, + k_zero_point, + k_scale, + v_zero_point, + v_scale, + attn_mask, + ): + q = torch.ops.quantized_decomposed.dequantize_per_token( + q_quantized, + q_scale, + q_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + self.float_dtype, + ) + k = torch.ops.quantized_decomposed.dequantize_per_token( + k_quantized, + k_scale, + k_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + self.float_dtype, + ) + v = 
torch.ops.quantized_decomposed.dequantize_per_token( + v_quantized, + v_scale, + v_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + self.float_dtype, + ) + + if not self.is_seq_at_dim_2: + q = q.transpose(1, 2).contiguous() + k = k.transpose(1, 2).contiguous() + v = v.transpose(1, 2).contiguous() + num_heads_q = q.size(1) + num_heads_kv = k.size(1) + seq_len = q.size(2) + k = torch.narrow(k, 2, 0, start_pos + seq_len) + v = torch.narrow(v, 2, 0, start_pos + seq_len) + if num_heads_q != num_heads_kv: + assert ( + num_heads_q % num_heads_kv == 0 + ), f"{num_heads_q} not divisible by {num_heads_kv}" + n_reps = num_heads_q // num_heads_kv + if n_reps > 1: + k = k.repeat_interleave(n_reps, dim=1) + v = v.repeat_interleave(n_reps, dim=1) + out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) + if not self.is_seq_at_dim_2: + out = out.transpose(1, 2).contiguous() + return out + + def _int_matmul( + self, quantized_q, quantized_k, q_zero_points, q_scale, k_zero_point, k_scale + ): + row_sum_q = torch.sum(quantized_q, dim=-1, keepdim=True) + row_sum_k = torch.sum(quantized_k, dim=-1, keepdim=True) + q_at_k = torch.matmul(quantized_q, quantized_k.transpose(-2, -1)) + row_sum_q_scaled = row_sum_q * k_zero_point.squeeze(-1).unsqueeze(0) + row_sum_k_scaled = q_zero_points * row_sum_k.squeeze(-1).unsqueeze(0) + zero_points_product = ( + quantized_q.size(-1) * q_zero_points * k_zero_point.squeeze(-1).unsqueeze(0) + ) + res = q_at_k - row_sum_q_scaled - row_sum_k_scaled + zero_points_product + q_scale_mul_k_scale = q_scale * k_scale.squeeze(-1).unsqueeze(0) + res = res.to(torch.float32) * q_scale_mul_k_scale + return res + + def _quantized_sdpa_ref( + self, + quantized_q, + quantized_k, + quantized_v, + q_zero_points, + q_scale, + k_scale, + k_zero_point, + v_scale, + v_zero_point, + attn_mask, + ): + import math + + quantized_q = quantized_q.to(torch.int32) + quantized_k = quantized_k.to(torch.int32) + quantized_v = quantized_v.to(torch.int32) + batch_size = quantized_q.size(0) + num_heads_q = quantized_q.size(1) + num_heads_kv = quantized_k.size(1) + q_scale = q_scale.to(torch.float32) + k_scale = k_scale.to(torch.float32) + q_zero_points = q_zero_points.to(torch.int32) + k_zero_point = k_zero_point.to(torch.int32) + if num_heads_q != num_heads_kv: + assert ( + num_heads_q % num_heads_kv == 0 + ), f"{num_heads_q} not divisible by {num_heads_kv}" + n_reps = num_heads_q // num_heads_kv + if n_reps > 1: + quantized_k = quantized_k.repeat_interleave(n_reps, dim=1) + quantized_v = quantized_v.repeat_interleave(n_reps, dim=1) + res_b = [] + scale_factor = 1 / math.sqrt(quantized_k.size(-1)) + dequantized_v = torch.ops.quantized_decomposed.dequantize_per_token( + quantized_v, + v_scale, + v_zero_point, + torch.iinfo(torch.int8).min, + torch.iinfo(torch.int8).max, + torch.int8, + torch.float32, + ) + for b in range(batch_size): + res_h = [] + for h in range(num_heads_q): + q_at_k = self._int_matmul( + quantized_q[b][h], + quantized_k[b][h], + q_zero_points[b][h], + q_scale[b][h], + k_zero_point[b][h], + k_scale[b][h], + ) + q_at_k = q_at_k * scale_factor + q_at_k += attn_mask + attn_weight = torch.softmax(q_at_k, dim=-1) + y = torch.matmul(attn_weight, dequantized_v[b][h]) + res_h.append(y) + res = torch.stack(res_h, dim=0) + res_b.append(res.unsqueeze(0)) + res = torch.cat(res_b, dim=0) + return res + + def _test_sdpa_common( + self, + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + 
scale_tensors=False, + atol=1e-5, + is_seq_at_dim_2=False, + ): + # Range arbitrarily chosen to reproduce a numerical error on x86 in some of the long context tests + tensor_scale_max = 15 + tensor_scale_min = -15 + self.n_heads_kv = n_heads_kv + self.n_heads_q = n_heads_q + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.is_seq_at_dim_2 = is_seq_at_dim_2 + seq_dim = 2 + self.q_shape = (self.n_batch, self.n_heads_q, seq_len, self.head_dim) + self.kv_shape = (self.n_batch, self.n_heads_kv, self.max_seq_len, self.head_dim) + if not is_seq_at_dim_2: + seq_dim = 1 + self.q_shape = (self.n_batch, seq_len, self.n_heads_q, self.head_dim) + self.kv_shape = ( + self.n_batch, + self.max_seq_len, + self.n_heads_kv, + self.head_dim, + ) + + q = self._scale_tensor( + torch.rand(self.q_shape), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + self.setup_caches_and_mask(tensor_scale_max, tensor_scale_min, scale_tensors) + k = self.k + v = self.v + + quantized_dtype = self.quantized_dtype + q_scale, q_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + q, quantized_dtype + ) + ) + k_scale, k_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + k, quantized_dtype + ) + ) + v_scale, v_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + v, quantized_dtype + ) + ) + + q_quantized = torch.ops.quantized_decomposed.quantize_per_token( + q, + q_scale, + q_zero_point, + torch.iinfo(quantized_dtype).min, + torch.iinfo(quantized_dtype).max, + quantized_dtype, + ) + k_quantized = torch.ops.quantized_decomposed.quantize_per_token( + k, + k_scale, + k_zero_point, + torch.iinfo(quantized_dtype).min, + torch.iinfo(quantized_dtype).max, + quantized_dtype, + ) + v_quantized = torch.ops.quantized_decomposed.quantize_per_token( + v, + v_scale, + v_zero_point, + torch.iinfo(quantized_dtype).min, + torch.iinfo(quantized_dtype).max, + quantized_dtype, + ) + + seq_len = q.size(seq_dim) + attn_mask = self.mask[start_pos : start_pos + seq_len, :] + attn_mask = attn_mask[:, : start_pos + seq_len] + + # quantized_sdpa_ref_output = self._quantized_sdpa_ref(q_quantized, k_quantized, v_quantized, q_zero_point, q_scale, k_scale, k_zero_point, v_scale, v_zero_point, attn_mask) + + from torch.nn.attention import SDPBackend + + with torch.nn.attention.sdpa_kernel( + [SDPBackend.FLASH_ATTENTION] + ), torch.no_grad(): + ref_output = self._sdpa_ref( + q_quantized, + k_quantized, + v_quantized, + start_pos, + q_zero_point, + q_scale, + k_zero_point, + k_scale, + v_zero_point, + v_scale, + attn_mask, + ) + + q_zero_point_int8 = q_zero_point.to(dtype=torch.int8) + k_zero_point_int8 = k_zero_point.to(dtype=torch.int8) + v_zero_point_int8 = v_zero_point.to(dtype=torch.int8) + q_scale_fp32 = q_scale.to(dtype=torch.float32) + k_scale_fp32 = k_scale.to(dtype=torch.float32) + v_scale_fp32 = v_scale.to(dtype=torch.float32) + + op_output = torch.ops.llama.custom_quantized_sdpa( + q_quantized, + k_quantized, + v_quantized, + start_pos, + None, + 0, + True, + None, + q_zero_point_int8, + q_scale_fp32, + k_zero_point_int8, + k_scale_fp32, + v_zero_point_int8, + v_scale_fp32, + is_seq_at_dim_2, + ) + self.assertTrue(torch.allclose(ref_output, op_output, atol=atol)) + # Following line crashes due to some weird issues in mkldnn with crash in mkl_sgemm with `wild jump` + # self.assertTrue(torch.allclose(ref_output, quantized_sdpa_ref_output, atol=1e-3)) + + start_pos = seq_len + seq_len = q.size(seq_dim) 
+ attn_mask = self.mask[start_pos : start_pos + seq_len, :] + attn_mask = attn_mask[:, : start_pos + seq_len] + with torch.nn.attention.sdpa_kernel( + [SDPBackend.FLASH_ATTENTION] + ), torch.no_grad(): + ref_output = self._sdpa_ref( + q_quantized, + k_quantized, + v_quantized, + start_pos, + q_zero_point, + q_scale, + k_zero_point, + k_scale, + v_zero_point, + v_scale, + attn_mask, + ) + op_output = torch.ops.llama.custom_quantized_sdpa( + q_quantized, + k_quantized, + v_quantized, + start_pos, + None, + 0, + True, + None, + q_zero_point_int8, + q_scale_fp32, + k_zero_point_int8, + k_scale_fp32, + v_zero_point_int8, + v_scale_fp32, + is_seq_at_dim_2, + ) + self.assertTrue(torch.allclose(ref_output, op_output, atol=atol)) + + def test_sdpa_with_custom_quantized(self): + n_heads_kv = 8 + n_heads_q = 8 + head_dim = 128 + max_seq_len = 2048 + seq_len = 24 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + True, + atol=1e-4, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + True, + atol=1e-4, + is_seq_at_dim_2=False, + ) + + def test_sdpa_with_custom_quantized_seq_len_1(self): + n_heads_kv = 4 + n_heads_q = 4 + head_dim = 4 + max_seq_len = 8 + seq_len = 1 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=False, + ) + + def test_sdpa_with_custom_quantized_seq_len_small(self): + n_heads_kv = 4 + n_heads_q = 4 + head_dim = 4 + max_seq_len = 8 + seq_len = 4 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=False, + ) + + def test_sdpa_with_custom_quantized_seq_len_llava_example(self): + n_heads_kv = 32 + n_heads_q = 32 + head_dim = 128 + max_seq_len = 2048 + seq_len = 634 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, n_heads_q, head_dim, max_seq_len, start_pos, seq_len + ) + + def test_sdpa_with_custom_quantized_seq_len_130_gqa(self): + n_heads_kv = 8 + n_heads_q = 32 + head_dim = 128 + max_seq_len = 2048 + seq_len = 130 + start_pos = 0 + # For some reason when scaling tensors, the test fails with smaller atol + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + True, + atol=1e-3, + ) + + def test_sdpa_with_custom_quantized_seq_len_llava_example_gqa(self): + n_heads_kv = 16 + n_heads_q = 32 + head_dim = 128 + max_seq_len = 2048 + seq_len = 634 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, n_heads_q, head_dim, max_seq_len, start_pos, seq_len + ) + + def test_sdpa_with_cache_mqa(self): + n_heads_kv = 1 + n_heads_q = 8 + head_dim = 128 + max_seq_len = 2048 + seq_len = 24 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=False, + ) diff --git a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py index a1f054a153e..334e53c437f 100644 --- 
a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py +++ b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py @@ -67,12 +67,14 @@ def test_sdpa_with_cache_no_mqa_1(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -108,12 +110,14 @@ def test_sdpa_with_cache_no_mqa_2(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -150,12 +154,14 @@ def test_sdpa_with_cache_no_mqa_3(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -191,12 +197,14 @@ def test_sdpa_with_cache_no_mqa_4(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -489,11 +497,11 @@ def _test_sdpa_common( class SDPATestForLargeSeqLength(SDPATestCommon): def test_sdpa_with_cache_seq_len_130(self): - n_heads_kv = 32 - n_heads_q = 32 + n_heads_kv = 8 + n_heads_q = 8 head_dim = 128 max_seq_len = 2048 - seq_len = 130 + seq_len = 24 self._test_sdpa_common( n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, True ) diff --git a/extension/llm/export/TARGETS b/extension/llm/export/TARGETS index 40f8599e9e0..298a64ba328 100644 --- a/extension/llm/export/TARGETS +++ b/extension/llm/export/TARGETS @@ -22,6 +22,7 @@ runtime.python_library( "//bento/...", "//bento_kernels/...", "//executorch/examples/...", + "//executorch/extension/llm/...", "//meta_intern_odllm/...", ], deps = [ diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index cf3a1087cfb..2dee6b0954a 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -13,7 +13,7 @@ import contextlib import logging from enum import Enum -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple from unittest.mock import patch import torch @@ -41,6 +41,7 @@ from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer from torch.export import export_for_training, ExportedProgram from torch.nn.attention import SDPBackend +from torchao.utils import unwrap_tensor_subclass FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -80,14 +81,13 @@ class LLMEdgeManager: def __init__( self, - model, - modelname, - max_seq_len, - dtype, - use_kv_cache, - example_inputs, + model: torch.nn.Module, + modelname: str, + max_seq_len: int, + use_kv_cache: bool, + 
example_inputs: Tuple[torch.Tensor, ...], + dtype: Optional[DType] = None, example_kwarg_inputs: Optional[Dict] = None, - args: Optional[Any] = None, enable_dynamic_shape: bool = False, generate_full_logits: bool = False, calibration_tasks: Optional[List[str]] = None, @@ -98,36 +98,42 @@ def __init__( verbose: bool = False, metadata: Optional[dict] = None, dynamic_shapes: Optional[Any] = None, + use_legacy_export: bool = False, + save_exported_program: bool = False, ): + # Store necessary constructor arguments. self.model = model - # Note: treat this as the source of truth for the result of - # torch.export'ing a model. If the overall ExportedProgram is needed, - # make sure to re-export this graph module to persist any changes. See - # https://github.com/pytorch/pytorch/blob/main/torch/export/exported_program.py#L921 - self.pre_autograd_graph_module: Optional[torch.nn.Module] = None self.modelname = modelname self.max_seq_len = max_seq_len - self.dtype = dtype + self.use_kv_cache = use_kv_cache self.example_inputs = example_inputs + self.dtype = dtype self.example_kwarg_inputs = example_kwarg_inputs - self.use_kv_cache = use_kv_cache - self.generate_full_logits = generate_full_logits self.enable_dynamic_shape = enable_dynamic_shape - self.verbose = verbose - self.metadata = metadata - self.applied_source_transforms = [] - self.edge_manager: Optional[EdgeProgramManager] = None - self.export_program = None - self.output_dir = "." - self.dynamic_shapes = dynamic_shapes - self._saved_pte_filename = None - self.args = args + self.generate_full_logits = generate_full_logits self.calibration_tasks = calibration_tasks self.calibration_limit = calibration_limit self.calibration_seq_length = calibration_seq_length self.calibration_data = calibration_data self.tokenizer_path = tokenizer_path - self.canonical_passes = [RemoveRedundantTransposes()] + self.verbose = verbose + self.metadata = metadata + self.dynamic_shapes = dynamic_shapes + self.use_legacy_export = use_legacy_export + self.save_exported_program = save_exported_program + + # Note: treat this as the source of truth for the result of + # torch.export'ing a model. If the overall ExportedProgram is needed, + # make sure to re-export this graph module to persist any changes. See + # https://github.com/pytorch/pytorch/blob/main/torch/export/exported_program.py#L921 + self.pre_autograd_graph_module: Optional[torch.nn.Module] = None + self.edge_manager: Optional[EdgeProgramManager] = None + self.canonical_passes = [ + RemoveRedundantTransposes() + ] # Graph transformations optimizations. + self.export_program = None # Final result of lowering to executorch. + self.output_dir = "." 
+ self._saved_pte_filename = None def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -166,10 +172,9 @@ def source_transform( """ for transform in transforms: self.model = transform(self.model) - self.applied_source_transforms.extend(transforms) if self.verbose: - logging.info(f"Applied source transforms: {self.applied_source_transforms}") + logging.info(f"Applied source transforms: {transforms}") logging.info(f"Model after source transforms: {self.model}") return self @@ -178,13 +183,13 @@ def _get_dynamic_shape(self) -> Any: return self.dynamic_shapes dim = torch.export.Dim("token_dim", max=self.max_seq_len - 1) - - if not self.use_kv_cache: - # Only one input argument: tokens - self.dynamic_shapes = ({1: dim},) - elif self.enable_dynamic_shape: - # Two input arguments: tokens and input_pos but input_pos is static shape - self.dynamic_shapes = ({1: dim}, {"input_pos": {0: 1}}) + if self.enable_dynamic_shape: + if not self.use_kv_cache: + # Only one input argument: tokens + self.dynamic_shapes = ({1: dim},) + else: + # Two input arguments: tokens and input_pos but input_pos is static shape + self.dynamic_shapes = ({1: dim}, {"input_pos": {0: 1}}) else: # Two input arguments: tokens and input_pos but both are of static shape self.dynamic_shapes = None @@ -199,12 +204,17 @@ def _get_edge_config(self) -> EdgeCompileConfig: return edge_config def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram: + if module is not None: + unwrap_tensor_subclass(module) + else: + unwrap_tensor_subclass(self.model) + dynamic_shape = self._get_dynamic_shape() # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): - if hasattr(self.args, "qnn") and self.args.qnn: - # TODO: this is temporary, as qnn flow does not work with new, non-functional export IR. + if self.use_legacy_export: + # TODO: for use cases such as qnn, which does not work with new, non-functional export IR. # See issue: https://github.com/pytorch/executorch/issues/7373 with patch.object( @@ -234,6 +244,7 @@ def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram: self.example_inputs, kwargs=self.example_kwarg_inputs, dynamic_shapes=dynamic_shape, + strict=True, ) return exported_module @@ -249,8 +260,12 @@ def export(self) -> "LLMEdgeManager": # Persisting those changes back to an ExportedProgram will require # an additional export(). 
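The reworked _get_dynamic_shape now only emits a dynamic-shapes spec when enable_dynamic_shape is set: dim 1 of the first input (the token dimension) gets a torch.export.Dim capped at max_seq_len - 1, and with a KV cache the extra input_pos argument stays static. A self-contained toy sketch of the no-KV-cache spec; the module below is illustrative, not the actual model:

import torch
from torch.export import Dim, export_for_training


class ToyTokenModel(torch.nn.Module):
    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        return tokens.float().sum(dim=1)


max_seq_len = 2048
token_dim = Dim("token_dim", max=max_seq_len - 1)

# One positional input whose dim 1 (the token/sequence dimension) is dynamic,
# mirroring self.dynamic_shapes = ({1: dim},) above.
ep = export_for_training(
    ToyTokenModel(),
    (torch.zeros((1, 10), dtype=torch.long),),
    dynamic_shapes=({1: token_dim},),
    strict=True,
)
# The exported module now accepts any sequence length up to max_seq_len - 1.
print(ep.module()(torch.zeros((1, 33), dtype=torch.long)).shape)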
self.pre_autograd_graph_module = exported_module.module() - if hasattr(self.args, "export_only") and self.args.export_only: - torch.export.save(exported_module, self.args.output_name) + if self.save_exported_program: + export_output = f"{self.modelname}.pt2" + logging.info( + f"Saving torch.export()/export_for_training() result to {export_output}" + ) + torch.export.save(exported_module, export_output) return self def run_canonical_optimizations(self): @@ -414,7 +429,7 @@ def export_to_edge(self) -> "LLMEdgeManager": self.export() override_export_behaviour = contextlib.nullcontext() - if hasattr(self.args, "qnn") and self.args.qnn: + if self.use_legacy_export: override_export_behaviour = patch.object( torch._utils_internal, "export_training_ir_rollout_check", diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 76e8c357119..20604bbf635 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -57,7 +57,7 @@ def get_mps_partitioner(use_kv_cache: bool = False): ) except ImportError: raise ImportError( - "Please install the MPS backend follwing https://pytorch.org/executorch/main/build-run-mps.html" + "Please install the MPS backend follwing https://pytorch.org/executorch/main/backends-mps" ) compile_specs = [CompileSpec("use_fp16", bytes([True]))] @@ -81,7 +81,7 @@ def get_coreml_partitioner( ) except ImportError: raise ImportError( - "Please install the CoreML backend follwing https://pytorch.org/executorch/main/build-run-coreml.html" + "Please install the CoreML backend follwing https://pytorch.org/executorch/main/backends-coreml" + "; for buck users, please add example dependancies: //executorch/backends/apple/coreml:backend, and etc" ) @@ -195,7 +195,7 @@ def get_qnn_partitioner( ) except ImportError: raise ImportError( - "Please install the Qualcomm backend following https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" + "Please install the Qualcomm backend following https://pytorch.org/executorch/main/backends-qualcomm" ) use_fp16 = True diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 40d81075d9f..24c3be2e802 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -158,7 +158,7 @@ def get_qnn_quantizer( except ImportError: raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html" + "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/backends-qualcomm" ) backend, quant_config = pt2e_quantize.split("_") @@ -166,30 +166,39 @@ def get_qnn_quantizer( backend == "qnn" ), f"The quantization config is for backend {backend} instead of qnn." qnn_quantizer = QnnQuantizer() # pyre-fixme[16] - qnn_quantizer.set_per_channel_conv_quant(enable=True) - qnn_quantizer.set_per_channel_linear_quant(enable=True) + # more custom quantization are supported including 16a4w etc. 
default to 8bit quantized custom_annotations = () if quant_config == "8a8w": quant_dtype = QuantDtype.use_8a8w # pyre-fixme[16] - qnn_quantizer.set_quant_config(quant_dtype, is_qat=is_qat) + qnn_quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=True, + is_linear_per_channel=True, + ) elif quant_config == "16a16w": - quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] # Due to the error with 16a16w in Qnn Htp, we need to disable per channel linear quantization when use 16a16w # TODO: enable it after the issue is fixed logging.warning( "Disable per channel quantization for linear and conv due to the error with QNN HTP 16a16w." ) - qnn_quantizer.set_per_channel_conv_quant(enable=False) - qnn_quantizer.set_per_channel_linear_quant(enable=False) - qnn_quantizer.set_quant_config( - quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver + quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] + qnn_quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=False, + is_linear_per_channel=False, + act_observer=MinMaxObserver, ) elif quant_config == "16a4w": - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. - quant_dtype = QuantDtype.use_16a4w - qnn_quantizer.set_quant_config( - quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver + quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] + qnn_quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=True, + is_linear_per_channel=True, + act_observer=MinMaxObserver, ) # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. custom_annotations = (custom_annotate_llama_matmul_16a8w,) @@ -217,7 +226,7 @@ def get_coreml_quantizer(pt2e_quantize: str): from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer except ImportError: raise ImportError( - "Please install the CoreML backend follwing https://pytorch.org/executorch/main/build-run-coreml.html" + "Please install the CoreML backend follwing https://pytorch.org/executorch/main/backends-coreml" ) if pt2e_quantize == "coreml_8a_c8w": diff --git a/extension/llm/export/test/TARGETS b/extension/llm/export/test/TARGETS new file mode 100644 index 00000000000..63efce84119 --- /dev/null +++ b/extension/llm/export/test/TARGETS @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_test( + name = "test_builder", + srcs = ["test_builder.py"], + deps = [ + "//executorch/extension/llm/export:export_lib", + "//caffe2:torch", + ], +) diff --git a/extension/llm/export/test/__init__.py b/extension/llm/export/test/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/extension/llm/export/test/test_builder.py b/extension/llm/export/test/test_builder.py new file mode 100644 index 00000000000..7883480c1e7 --- /dev/null +++ b/extension/llm/export/test/test_builder.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
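The QNN quantizer hookup above now routes the per-channel conv/linear choices through a single set_default_quant_config call instead of the separate set_per_channel_*_quant toggles. A hedged sketch of the 8a8w case follows; the import paths are assumptions (they are not visible in this hunk) and resolving them requires the Qualcomm backend to be installed.

# Import path assumed; adjust to your install of the Qualcomm backend.
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype

quantizer = QnnQuantizer()
quantizer.set_default_quant_config(
    QuantDtype.use_8a8w,
    is_qat=False,
    is_conv_per_channel=True,
    is_linear_per_channel=True,
)
# The 16a16w path shown above instead disables per-channel conv/linear and
# passes act_observer=MinMaxObserver.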
+ +# pyre-strict +import unittest +from unittest.mock import MagicMock + +import torch + +from executorch.extension.llm.export.builder import DType, LLMEdgeManager + + +class TestLLMEdgeManager(unittest.TestCase): + def setUp(self) -> None: + # Create a mock model + self.mock_model = MagicMock() + self.modelname = "test_model" + self.max_seq_len = 2048 + self.dtype = DType.fp32 + self.example_inputs = (torch.zeros((1, 10), dtype=torch.long),) + self.example_kwarg_inputs = {"input_pos": torch.tensor([0])} + + def test_get_dynamic_shape_with_preset_dynamic_shapes(self) -> None: + """Test that _get_dynamic_shape returns preset dynamic_shapes if available.""" + # Create a manager with preset dynamic_shapes + preset_dynamic_shapes = {"preset": "shapes"} + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=False, + example_inputs=self.example_inputs, + dynamic_shapes=preset_dynamic_shapes, + ) + + # Call _get_dynamic_shape and verify it returns the preset value + result = manager._get_dynamic_shape() + self.assertEqual(result, preset_dynamic_shapes) + + def test_get_dynamic_shape_with_dynamic_shape_enabled_no_kv_cache(self) -> None: + """Test _get_dynamic_shape when enable_dynamic_shape=True and use_kv_cache=False.""" + # Create a manager with enable_dynamic_shape=True and use_kv_cache=False + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=False, + example_inputs=self.example_inputs, + enable_dynamic_shape=True, + ) + + # Call _get_dynamic_shape + result = manager._get_dynamic_shape() + + # Verify the result has the expected structure + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 1) + self.assertIsInstance(result[0], dict) + self.assertIn(1, result[0]) + # Check that the value at key 1 is a torch.export.Dim with the correct max value + self.assertEqual(result[0][1].max, self.max_seq_len - 1) + + def test_get_dynamic_shape_with_dynamic_shape_enabled_with_kv_cache(self) -> None: + """Test _get_dynamic_shape when enable_dynamic_shape=True and use_kv_cache=True.""" + # Create a manager with enable_dynamic_shape=True and use_kv_cache=True + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=True, + example_inputs=self.example_inputs, + enable_dynamic_shape=True, + ) + + # Call _get_dynamic_shape + result = manager._get_dynamic_shape() + + # Verify the result has the expected structure + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 2) + + # Check first element (tokens dimension) + self.assertIsInstance(result[0], dict) + self.assertIn(1, result[0]) + self.assertEqual(result[0][1].max, self.max_seq_len - 1) + + # Check second element (input_pos dimension) + self.assertIsInstance(result[1], dict) + self.assertIn("input_pos", result[1]) + self.assertIsInstance(result[1]["input_pos"], dict) + self.assertIn(0, result[1]["input_pos"]) + self.assertEqual(result[1]["input_pos"][0], 1) + + def test_get_dynamic_shape_with_dynamic_shape_disabled(self) -> None: + """Test _get_dynamic_shape when enable_dynamic_shape=False.""" + # Create a manager with enable_dynamic_shape=False + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=True, # Doesn't matter for this test + 
example_inputs=self.example_inputs, + enable_dynamic_shape=False, + ) + + # Call _get_dynamic_shape + result = manager._get_dynamic_shape() + + # Verify the result is None + self.assertIsNone(result) diff --git a/extension/llm/export/test_export_passes.py b/extension/llm/export/test_export_passes.py index 12ce18ebb79..b0c5af7e65f 100644 --- a/extension/llm/export/test_export_passes.py +++ b/extension/llm/export/test_export_passes.py @@ -10,10 +10,7 @@ class RemoveRedundantTransposesPassTest(unittest.TestCase): def _export(self, model, example_inputs): - exported_module = export_for_training( - model, - example_inputs, - ) + exported_module = export_for_training(model, example_inputs, strict=True) return exported_module.module() def _check(self, model, example_inputs, key, before_count, after_count): diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 993314ccd63..75c30cff71b 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -53,3 +53,7 @@ target_include_directories( extension_llm_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/llm/runner/irunner.h b/extension/llm/runner/irunner.h index 35d87e997a0..c3ed668a4be 100644 --- a/extension/llm/runner/irunner.h +++ b/extension/llm/runner/irunner.h @@ -6,42 +6,124 @@ * LICENSE file in the root directory of this source tree. */ -// An interface for LLM runners. Developers can create their own runner that -// implements their own load and generation logic to run the model. +// Interface for text generation runners. #pragma once +#include #include +#include #include #include -#include +#include namespace executorch { namespace extension { namespace llm { +// Configuration struct for generation parameters, fields should be sorted in +// alphabetic order +struct GenerationConfig { + // Whether to echo the input prompt in the output + bool echo = true; + + // Maximum number of new tokens to generate + // If the max_context_len metadata that's serialized in the .pte file exists, + // then the number of prompt tokens + max_new_tokens won't exceed + // max_context_len. If this field is -1, it means we will rely on + // max_context_len metadata and seq_len value. Check resolve_max_new_tokens + // for details. + int32_t max_new_tokens = -1; + + // Whether this is a warmup run (affects perf benchmarking) + bool warming = false; + + // Maximum number of total tokens + // If the .pte file contains the max_context_len metadata, it will override + // this value if it's smaller. If this field is -1, we will use the + // max_context_len metadata directly. Check resolve_max_new_tokens for + // details. + int32_t seq_len = -1; + + // Temperature for sampling (higher = more random) + float temperature = 0.8f; + + /** + * Resolve the maximum number of new tokens to generate based on constraints. + * + * This method calculates the maximum number of new tokens that can be + * generated considering both seq_len and max_new_tokens constraints, as well + * as the model's maximum context length and the number of tokens in the + * prompt. 
+ * + * @param max_context_len The maximum context length supported by the model + * @param num_prompt_tokens The number of tokens in the input prompt + * @return The resolved maximum number of new tokens to generate + */ + int32_t resolve_max_new_tokens( + int32_t max_context_len, + int32_t num_prompt_tokens) const { + int32_t result; + + if (seq_len == -1 && max_new_tokens == -1) { + // Both are -1, use max context len minus prompt tokens + result = max_context_len - num_prompt_tokens; + } else if (seq_len == -1 && max_new_tokens != -1) { + // Only max_new_tokens is specified + result = std::min(max_new_tokens, max_context_len - num_prompt_tokens); + } else if (seq_len != -1 && max_new_tokens == -1) { + // Only seq_len is specified + result = std::min(seq_len, max_context_len) - num_prompt_tokens; + } else { + // Both are specified + result = std::min( + std::min(seq_len, max_context_len) - num_prompt_tokens, + max_new_tokens); + } + + // Ensure result is not negative + return std::max(0, result); + } +}; + +// Base interface for LLM runners class ET_EXPERIMENTAL IRunner { public: virtual ~IRunner() = default; - // Checks if the model is loaded. + /** + * Check if the runner is loaded and ready for inference. + * + * @return true if the runner is loaded, false otherwise + */ virtual bool is_loaded() const = 0; - // Load the model and tokenizer. - virtual ::executorch::runtime::Error load() = 0; + /** + * Load the model and prepare for inference. + * + * @return Error::Ok if successful, an error otherwise + */ + virtual runtime::Error load() = 0; - // Generate the output tokens. - virtual ::executorch::runtime::Error generate( + /** + * Generate text based on the provided prompt and generation config. + * + * @param prompt The input prompt to generate from + * @param config Generation configuration parameters + * @param token_callback Callback function called for each generated token + * @param stats_callback Callback function for generation statistics + * @return Error::Ok if successful, an error otherwise + */ + virtual runtime::Error generate( const std::string& prompt, - int32_t seq_len, - std::function token_callback = {}, - std::function - stats_callback = {}, - bool echo = true, - bool warming = false) = 0; - - // Stop the generation. + const GenerationConfig& config, + std::function token_callback, + std::function stats_callback) = 0; + + /** + * Stop the generation process. + */ virtual void stop() = 0; }; diff --git a/extension/llm/runner/test/CMakeLists.txt b/extension/llm/runner/test/CMakeLists.txt new file mode 100644 index 00000000000..b17a318a080 --- /dev/null +++ b/extension/llm/runner/test/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +cmake_minimum_required(VERSION 3.19) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) 
+ +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) + +set(_test_srcs generation_config_test.cpp) + +et_cxx_test( + generation_config_test + SOURCES + ${_test_srcs} + EXTRA_LIBS + executorch +) diff --git a/extension/llm/runner/test/TARGETS b/extension/llm/runner/test/TARGETS new file mode 100644 index 00000000000..97de7abe9b1 --- /dev/null +++ b/extension/llm/runner/test/TARGETS @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/llm/runner/test/generation_config_test.cpp b/extension/llm/runner/test/generation_config_test.cpp new file mode 100644 index 00000000000..061f982c684 --- /dev/null +++ b/extension/llm/runner/test/generation_config_test.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +using namespace ::testing; +using executorch::extension::llm::GenerationConfig; + +class GenerationConfigTest : public Test {}; + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensBothDefault) { + // Test when both seq_len and max_new_tokens are -1 (default) + GenerationConfig config; + // Default values: seq_len = -1, max_new_tokens = -1 + + // max_context_len = 100, num_prompt_tokens = 20 + // Expected: max_context_len - num_prompt_tokens = 100 - 20 = 80 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 80); + + // max_context_len = 50, num_prompt_tokens = 30 + // Expected: max_context_len - num_prompt_tokens = 50 - 30 = 20 + EXPECT_EQ(config.resolve_max_new_tokens(50, 30), 20); + + // Edge case: num_prompt_tokens equals max_context_len + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(40, 40), 0); + + // Edge case: num_prompt_tokens exceeds max_context_len + // Expected: 0 (no tokens left, and we ensure non-negative result) + EXPECT_EQ(config.resolve_max_new_tokens(30, 50), 0); +} + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensOnlyMaxNewTokens) { + // Test when only max_new_tokens is specified (seq_len = -1) + GenerationConfig config; + config.seq_len = -1; + config.max_new_tokens = 25; + + // max_context_len = 100, num_prompt_tokens = 20 + // Available tokens: 100 - 20 = 80 + // Expected: min(max_new_tokens, available) = min(25, 80) = 25 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 25); + + // max_context_len = 50, num_prompt_tokens = 40 + // Available tokens: 50 - 40 = 10 + // Expected: min(max_new_tokens, available) = min(25, 10) = 10 + EXPECT_EQ(config.resolve_max_new_tokens(50, 40), 10); + + // Edge case: num_prompt_tokens equals max_context_len + // Available tokens: 0 + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(40, 40), 0); +} + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensOnlySeqLen) { + // Test when only seq_len is specified (max_new_tokens = -1) + GenerationConfig config; + config.seq_len = 50; + config.max_new_tokens = -1; + + // max_context_len = 100, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 100) = 
50 + // Expected: effective_seq_len - num_prompt_tokens = 50 - 20 = 30 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 30); + + // max_context_len = 40, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 40) = 40 + // Expected: effective_seq_len - num_prompt_tokens = 40 - 20 = 20 + EXPECT_EQ(config.resolve_max_new_tokens(40, 20), 20); + + // Edge case: num_prompt_tokens equals effective seq_len + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(100, 50), 0); + + // Edge case: num_prompt_tokens exceeds effective seq_len + // Expected: 0 (no tokens left, and we ensure non-negative result) + EXPECT_EQ(config.resolve_max_new_tokens(100, 60), 0); +} + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensBothSpecified) { + // Test when both seq_len and max_new_tokens are specified + GenerationConfig config; + config.seq_len = 50; + config.max_new_tokens = 25; + + // max_context_len = 100, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 100) = 50 + // Available tokens: effective_seq_len - num_prompt_tokens = 50 - 20 = 30 + // Expected: min(max_new_tokens, available) = min(25, 30) = 25 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 25); + + // max_context_len = 40, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 40) = 40 + // Available tokens: effective_seq_len - num_prompt_tokens = 40 - 20 = 20 + // Expected: min(max_new_tokens, available) = min(25, 20) = 20 + EXPECT_EQ(config.resolve_max_new_tokens(40, 20), 20); + + // Edge case: num_prompt_tokens equals effective seq_len + // Available tokens: 0 + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(40, 40), 0); + + // Edge case: max_new_tokens is very small + config.max_new_tokens = 5; + // Available tokens: 50 - 20 = 30 + // Expected: min(max_new_tokens, available) = min(5, 30) = 5 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 5); +} diff --git a/extension/llm/runner/test/targets.bzl b/extension/llm/runner/test/targets.bzl new file mode 100644 index 00000000000..9cdaad990bb --- /dev/null +++ b/extension/llm/runner/test/targets.bzl @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_test( + name = "generation_config_test", + srcs = ["generation_config_test.cpp"], + deps = [ + "//executorch/extension/llm/runner:irunner", + "//executorch/extension/llm/runner:stats", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 7a546574e37..8705dfeb842 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -21,18 +21,8 @@ namespace llm { // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. 
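The hunks above replace IRunner's old flat argument list (prompt, seq_len, callbacks, echo, warming) with a single GenerationConfig. As a minimal sketch of how a caller might drive the new interface; the concrete runner instance and the exact callback parameter types are assumptions for illustration, not taken from this patch:

```cpp
#include <iostream>
#include <string>

#include <executorch/extension/llm/runner/irunner.h>

using executorch::extension::llm::GenerationConfig;
using executorch::extension::llm::IRunner;

// Sketch only: assumes `runner` is some concrete IRunner implementation and
// that the token/stats callbacks accept the argument types shown here.
void run_prompt(IRunner& runner, const std::string& prompt) {
  GenerationConfig config;
  config.max_new_tokens = 128;  // cap on newly generated tokens
  config.seq_len = -1;          // defer to the model's max_context_len metadata
  config.temperature = 0.8f;    // sampling temperature is now a per-request knob
  config.echo = true;           // stream the prompt back before new tokens

  if (runner.load() != executorch::runtime::Error::Ok) {
    return;  // a real caller would surface the error
  }
  runner.generate(
      prompt,
      config,
      [](const std::string& piece) { std::cout << piece; },  // token callback
      {});  // stats callback left empty
}
```

Grouping the knobs in a config struct also means future options can be added without touching every generate() call site.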
-TextDecoderRunner::TextDecoderRunner( - Module* module, - bool use_kv_cache, - int32_t vocab_size, - float temperature) - : module_(module), - sampler_(std::make_unique( - vocab_size, - temperature, - kTopp, - static_cast(std::time(nullptr)))), - use_kv_cache_(use_kv_cache) {} +TextDecoderRunner::TextDecoderRunner(Module* module, bool use_kv_cache) + : module_(module), use_kv_cache_(use_kv_cache) {} // This function is functional, meaning it shouldn't modify any state of the // input. It should be safe to call multiple times with the same inputs. The diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index ca4d127e516..b0db48ee75e 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -22,11 +22,7 @@ namespace llm { class ET_EXPERIMENTAL TextDecoderRunner { public: - TextDecoderRunner( - Module* module, - bool use_kv_cache, - int32_t vocab_size, - float temperature); + TextDecoderRunner(Module* module, bool use_kv_cache); virtual ~TextDecoderRunner() = default; @@ -64,10 +60,13 @@ class ET_EXPERIMENTAL TextDecoderRunner { /** * Sample the next token from the logits tensor. * @param logits_tensor The logits tensor. + * @param temperature The temperature parameter used to control randomness in + * sampling. * @return The next token. */ inline int32_t logits_to_token( - const executorch::aten::Tensor& logits_tensor) { + const executorch::aten::Tensor& logits_tensor, + const float temperature = 0.0f) { int32_t result = 0; ET_SWITCH_THREE_TYPES( Float, @@ -82,15 +81,14 @@ class ET_EXPERIMENTAL TextDecoderRunner { // vocab_size], get the last logits, sample and return. Else the model // outputs the last logit, directly sample and return. auto* logits = logits_tensor.mutable_data_ptr(); + ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); if (logits_tensor.dim() == 3) { auto num_tokens = logits_tensor.size(1); - auto vocab_size = logits_tensor.size(2); - auto* logits_last = logits; - logits_last += (num_tokens - 1) * vocab_size; - result = sampler_->sample(logits_last); - } else { - result = sampler_->sample(logits); + logits += (num_tokens - 1) * vocab_size; } + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + Sampler sampler(vocab_size, temperature); + result = sampler.sample(logits); }); return result; } @@ -98,7 +96,6 @@ class ET_EXPERIMENTAL TextDecoderRunner { protected: // TODO: use shared_ptr for module Module* module_; - std::unique_ptr sampler_; bool use_kv_cache_; bool should_stop_{false}; }; diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index e8bf891f8ec..1b928de1717 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -38,16 +38,20 @@ class ET_EXPERIMENTAL TextTokenGenerator { * prefill. * @param start_pos the start position of the new tokens, based on how many * prompt tokens is prefilled. - * @param seq_len the total sequence length, including the prompt tokens, next - * token from prefill and new tokens. + * @param max_new_tokens Maximum number of new tokens to generate. + * @param temperature controls the randomness of predictions by scaling the + * logits before applying softmax. A higher temperature results in more + * random predictions, while a lower temperature results in more deterministic + * predictions. * @param token_callback what to do after a token is generated. * @return how many tokens are generated. 
*/ inline ::executorch::runtime::Result generate( std::vector tokens, int64_t start_pos, - int32_t seq_len, - std::function token_callback) { + int32_t max_new_tokens, + float temperature = 0.0f, + const std::function& token_callback = {}) { ET_CHECK_MSG( !tokens.empty(), "Token generation loop shouldn't take empty tokens"); int64_t pos = start_pos; // position in the sequence @@ -78,7 +82,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { should_stop_ = false; // Generate our tokens - while (pos < seq_len - 1) { + while (pos < start_pos + max_new_tokens) { // Run the model auto logits_res = text_decoder_runner_->step(tokens_managed, start_pos_managed); @@ -89,7 +93,8 @@ class ET_EXPERIMENTAL TextTokenGenerator { prev_token = cur_token; stats_->on_sampling_begin(); - cur_token = text_decoder_runner_->logits_to_token(logits_tensor); + cur_token = + text_decoder_runner_->logits_to_token(logits_tensor, temperature); stats_->on_sampling_end(); pos++; diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index f7b48433cfb..0cb2463d163 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -68,9 +68,9 @@ ET_EXPERIMENTAL void inline safe_printf(const char* piece) { ET_EXPERIMENTAL long inline time_in_ms() { // return time in milliseconds, for benchmarking the model speed struct timespec time; - // The `timespec_get` function is only available on Android API levels - // 29 or later. -#if defined(__ANDROID_API__) && __ANDROID_API__ < 29 + // The `timespec_get` function is for windows time access. Some AOSP OS does + // not have timespec_get support. +#if defined(__ANDROID_API__) clock_gettime(CLOCK_REALTIME, &time); #else timespec_get(&time, TIME_UTC); diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index 71167bf9cf4..295ee78e4b0 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit 71167bf9cf4bed861eb9547d1d77e993fd1004f1 +Subproject commit 295ee78e4b0d99d4527bbe81bc3156341366de11 diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 400a2c45049..6c534b8d560 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -37,6 +37,9 @@ namespace executorch { namespace extension { +using ET_RUNTIME_NAMESPACE::MethodMeta; +using ET_RUNTIME_NAMESPACE::Program; + namespace { runtime::Result> load_file( const std::string& file_path, @@ -113,7 +116,7 @@ Module::Module( } Module::Module( - std::shared_ptr program, + std::shared_ptr program, std::unique_ptr memory_allocator, std::unique_ptr temp_allocator, std::unique_ptr event_tracer, @@ -131,7 +134,7 @@ Module::Module( runtime::runtime_init(); } -runtime::Error Module::load(const runtime::Program::Verification verification) { +runtime::Error Module::load(const Program::Verification verification) { if (!is_loaded()) { // Load the program if (!data_loader_) { @@ -156,10 +159,10 @@ runtime::Error Module::load(const runtime::Program::Verification verification) { } // else: either the map itself was provided or we have no data map, either // way no work to do. 
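To make the new token-budget semantics above concrete: the generator's loop bound changes from `pos < seq_len - 1` to `pos < start_pos + max_new_tokens`, with the budget coming from `GenerationConfig::resolve_max_new_tokens`. A small self-contained sketch (the numbers are arbitrary; only the header-only GenerationConfig from this patch is used):

```cpp
#include <cstdint>
#include <cstdio>

#include <executorch/extension/llm/runner/irunner.h>

int main() {
  executorch::extension::llm::GenerationConfig config;
  config.seq_len = 128;        // caller asks for at most 128 total tokens
  config.max_new_tokens = -1;  // no explicit cap on new tokens

  const int32_t max_context_len = 2048;  // e.g. read from .pte metadata
  const int32_t num_prompt_tokens = 20;

  // Budget the runner would hand to the token generator:
  // min(seq_len, max_context_len) - num_prompt_tokens = 128 - 20 = 108.
  const int32_t max_new_tokens =
      config.resolve_max_new_tokens(max_context_len, num_prompt_tokens);
  std::printf("max_new_tokens = %d\n", static_cast<int>(max_new_tokens));

  // The generation loop then runs while pos < start_pos + max_new_tokens,
  // i.e. up to position 20 + 108 = 128 in this example.
  const int32_t start_pos = num_prompt_tokens;
  std::printf("loop bound = %d\n", static_cast<int>(start_pos + max_new_tokens));
  return 0;
}
```

The per-request temperature rides the same path: the caller's `config.temperature` is forwarded into `generate()`, and `logits_to_token()` constructs a Sampler with it at each step instead of fixing the temperature when the decoder runner is built.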
- auto program = ET_UNWRAP_UNIQUE( - runtime::Program::load(data_loader_.get(), verification)); - program_ = std::shared_ptr( - program.release(), [](runtime::Program* pointer) { delete pointer; }); + auto program = + ET_UNWRAP_UNIQUE(Program::load(data_loader_.get(), verification)); + program_ = std::shared_ptr( + program.release(), [](Program* pointer) { delete pointer; }); } return runtime::Error::Ok; } @@ -224,7 +227,7 @@ runtime::Error Module::load_method( return runtime::Error::Ok; } -runtime::Result Module::method_meta( +runtime::Result Module::method_meta( const std::string& method_name) { ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); return methods_.at(method_name).method->method_meta(); diff --git a/extension/module/module.h b/extension/module/module.h index 45d2cc1d14b..73c7328ee0a 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -19,6 +19,11 @@ namespace executorch { namespace extension { +using ET_RUNTIME_NAMESPACE::Method; +using ET_RUNTIME_NAMESPACE::MethodMeta; +using ET_RUNTIME_NAMESPACE::NamedDataMap; +using ET_RUNTIME_NAMESPACE::Program; + /** * A facade class for loading programs and executing methods within them. */ @@ -95,7 +100,7 @@ class Module { * @param[in] data_map_loader A DataLoader used for loading external weights. */ explicit Module( - std::shared_ptr program, + std::shared_ptr program, std::unique_ptr memory_allocator = nullptr, std::unique_ptr temp_allocator = nullptr, std::unique_ptr event_tracer = nullptr, @@ -116,8 +121,8 @@ class Module { */ ET_NODISCARD runtime::Error load( - const runtime::Program::Verification verification = - runtime::Program::Verification::Minimal); + const Program::Verification verification = + Program::Verification::Minimal); /** * Checks if the program is loaded. @@ -134,7 +139,7 @@ class Module { * * @returns Shared pointer to the program or nullptr if it's not yet loaded. */ - inline std::shared_ptr program() const { + inline std::shared_ptr program() const { return program_; } @@ -224,8 +229,7 @@ class Module { * @returns A method metadata, or an error if the program or method failed to * load. 
*/ - runtime::Result method_meta( - const std::string& method_name); + runtime::Result method_meta(const std::string& method_name); /** * Execute a specific method with the given input values and retrieve the @@ -473,20 +477,20 @@ class Module { std::vector> planned_spans; std::unique_ptr planned_memory; std::unique_ptr memory_manager; - std::unique_ptr method; + std::unique_ptr method; std::vector inputs; }; std::string file_path_; std::string data_map_path_; LoadMode load_mode_{LoadMode::MmapUseMlock}; - std::shared_ptr program_; + std::shared_ptr program_; std::unique_ptr data_loader_; std::unique_ptr memory_allocator_; std::unique_ptr temp_allocator_; std::unique_ptr event_tracer_; std::unique_ptr data_map_loader_; - std::unique_ptr data_map_; + std::unique_ptr data_map_; protected: std::unordered_map methods_; diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 09a610a1fca..d8019ce9c4e 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -25,7 +25,7 @@ def define_common_targets(): "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/data_loader:file_data_loader", "//executorch/extension/data_loader:mmap_data_loader", - "//executorch/extension/flat_tensor:flat_tensor_data_map", + "//executorch/extension/flat_tensor:flat_tensor_data_map" + aten_suffix, ], exported_deps = [ "//executorch/runtime/executor:program" + aten_suffix, diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index a998e591f30..69952c5d173 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -84,27 +84,27 @@ void et_pal_emit_log_message( } namespace py = pybind11; -using executorch::bundled_program::verify_method_outputs; +using executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs; +using ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface; +using ::executorch::ET_RUNTIME_NAMESPACE::get_backend_class; +using ::executorch::ET_RUNTIME_NAMESPACE::get_backend_name; +using ::executorch::ET_RUNTIME_NAMESPACE::get_num_registered_backends; +using ::executorch::ET_RUNTIME_NAMESPACE::get_registered_kernels; +using ::executorch::ET_RUNTIME_NAMESPACE::Kernel; +using ::executorch::ET_RUNTIME_NAMESPACE::Method; +using ::executorch::ET_RUNTIME_NAMESPACE::Program; using ::executorch::extension::BufferDataLoader; using ::executorch::extension::MallocMemoryAllocator; using ::executorch::extension::MmapDataLoader; using ::executorch::runtime::ArrayRef; -using ::executorch::runtime::BackendInterface; using ::executorch::runtime::DataLoader; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerDebugLogLevel; -using ::executorch::runtime::get_backend_class; -using ::executorch::runtime::get_backend_name; -using ::executorch::runtime::get_num_registered_backends; -using ::executorch::runtime::get_registered_kernels; using ::executorch::runtime::HierarchicalAllocator; -using ::executorch::runtime::Kernel; using ::executorch::runtime::MemoryAllocator; using ::executorch::runtime::MemoryManager; -using ::executorch::runtime::Method; using ::executorch::runtime::prof_result_t; -using ::executorch::runtime::Program; using ::executorch::runtime::Result; using ::executorch::runtime::Span; using ::executorch::runtime::Tag; @@ -826,7 +826,7 @@ struct PyModule final { const std::string method_name, size_t testset_idx) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = 
executorch::bundled_program::load_bundled_input( + Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( module_->get_method(method_name), bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, @@ -842,14 +842,14 @@ struct PyModule final { double atol = 1e-8) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); auto& method = module_->get_method(method_name); - Error status = executorch::bundled_program::load_bundled_input( + Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( method, bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); py::list outputs = plan_execute(method_name); - status = executorch::bundled_program::verify_method_outputs( + status = executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs( method, bundled_program_ptr, testset_idx, rtol, atol); THROW_IF_ERROR( status, diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index 64ea14f08ff..7aede1c29a9 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -161,7 +161,7 @@ def _load_for_executorch( Args: path: File path to the ExecuTorch program as a string. enable_etdump: If true, enables an ETDump which can store profiling information. - See documentation at https://pytorch.org/executorch/stable/etdump.html + See documentation at https://pytorch.org/executorch/main/etdump for how to use it. debug_buffer_size: If non-zero, enables a debug buffer which can store intermediate results of each instruction in the ExecuTorch program. @@ -192,7 +192,7 @@ def _load_for_executorch_from_bundled_program( ) -> ExecuTorchModule: """Same as _load_for_executorch, but takes a bundled program instead of a file path. - See https://pytorch.org/executorch/stable/bundled-io.html for documentation. + See https://pytorch.org/executorch/main/bundled-io for documentation. .. warning:: diff --git a/extension/runner_util/inputs.cpp b/extension/runner_util/inputs.cpp index 11cd176b5d1..842ba25532f 100644 --- a/extension/runner_util/inputs.cpp +++ b/extension/runner_util/inputs.cpp @@ -12,12 +12,12 @@ #include #include +using executorch::ET_RUNTIME_NAMESPACE::Method; +using executorch::ET_RUNTIME_NAMESPACE::MethodMeta; +using executorch::ET_RUNTIME_NAMESPACE::TensorInfo; using executorch::runtime::Error; -using executorch::runtime::Method; -using executorch::runtime::MethodMeta; using executorch::runtime::Result; using executorch::runtime::Tag; -using executorch::runtime::TensorInfo; namespace executorch { namespace extension { diff --git a/extension/runner_util/inputs.h b/extension/runner_util/inputs.h index 73722c0d7bf..214b76d67e3 100644 --- a/extension/runner_util/inputs.h +++ b/extension/runner_util/inputs.h @@ -15,6 +15,9 @@ namespace executorch { namespace extension { +using ::executorch::ET_RUNTIME_NAMESPACE::Method; +using ::executorch::ET_RUNTIME_NAMESPACE::TensorInfo; + /** * RAII helper that frees a set of buffers when destroyed. Movable. */ @@ -80,7 +83,7 @@ struct PrepareInputTensorsOptions { * @returns An error on failure. */ executorch::runtime::Result prepare_input_tensors( - executorch::runtime::Method& method, + Method& method, PrepareInputTensorsOptions options = {}); namespace internal { @@ -89,8 +92,8 @@ namespace internal { * fills it with ones, and sets the input at `input_index`. 
*/ executorch::runtime::Error fill_and_set_input( - executorch::runtime::Method& method, - executorch::runtime::TensorInfo& tensor_meta, + Method& method, + TensorInfo& tensor_meta, size_t input_index, void* data_ptr); } // namespace internal diff --git a/extension/runner_util/inputs_aten.cpp b/extension/runner_util/inputs_aten.cpp index 83d12dac42d..b89562a2f69 100644 --- a/extension/runner_util/inputs_aten.cpp +++ b/extension/runner_util/inputs_aten.cpp @@ -15,8 +15,8 @@ #include using executorch::runtime::Error; -using executorch::runtime::Method; -using executorch::runtime::TensorInfo; +using executorch::runtime::aten::Method; +using executorch::runtime::aten::TensorInfo; namespace executorch { namespace extension { diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index c1742fc599b..8a35e83a526 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -188,7 +188,7 @@ TensorPtr clone_tensor_ptr(const executorch::aten::Tensor& tensor) { runtime::Error resize_tensor_ptr( TensorPtr& tensor, const std::vector& sizes) { - return runtime::resize_tensor( + return ET_RUNTIME_NAMESPACE::resize_tensor( *tensor, executorch::aten::ArrayRef( sizes.data(), sizes.size())); diff --git a/install_executorch.py b/install_executorch.py index 1d3fe8af1fb..4c7b51ef239 100644 --- a/install_executorch.py +++ b/install_executorch.py @@ -120,8 +120,10 @@ def check_folder(folder: str, file: str) -> bool: if missing_submodules: logger.warning("Some required submodules are missing. Updating submodules...") try: - subprocess.check_call(["git", "submodule", "sync"]) - subprocess.check_call(["git", "submodule", "update", "--init"]) + subprocess.check_call(["git", "submodule", "sync", "--recursive"]) + subprocess.check_call( + ["git", "submodule", "update", "--init", "--recursive"] + ) except subprocess.CalledProcessError as e: logger.error(f"Error updating submodules: {e}") exit(1) @@ -130,13 +132,10 @@ def check_folder(folder: str, file: str) -> bool: for path, file in missing_submodules.items(): if not check_folder(path, file): logger.error(f"{file} not found in {path}.") - logger.error("Please run `git submodule update --init`.") + logger.error( + "Submodule update failed. Please run `git submodule update --init --recursive` manually." + ) exit(1) - # Go into tokenizers submodule and install its submodules - tokenizers_path = get_required_submodule_paths().get("tokenizers", None) - if tokenizers_path: - with pushd(tokenizers_path): - subprocess.check_call(["git", "submodule", "update", "--init"]) logger.info("All required submodules are present.") diff --git a/kernels/README.md b/kernels/README.md index e1c6d02afa8..58931beb984 100644 --- a/kernels/README.md +++ b/kernels/README.md @@ -351,9 +351,11 @@ Once you have your operator and corresponding tests in place, we can try it out. cmake . 
\ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -369,6 +371,7 @@ cmake --build cmake-out -j9 --target install ``` ./cmake-out/kernels/test/portable_kernels_test ./cmake-out/kernels/test/optimized_kernels_test +./cmake-out/kernels/test/quantized_kernels_test ``` #### Implementation restrictions diff --git a/kernels/aten/cpu/op__empty_dim_order.cpp b/kernels/aten/cpu/op__empty_dim_order.cpp index e75963a9c4e..654b29c778d 100644 --- a/kernels/aten/cpu/op__empty_dim_order.cpp +++ b/kernels/aten/cpu/op__empty_dim_order.cpp @@ -102,7 +102,7 @@ Tensor& _empty_dim_order_out( IntArrayRef size, OptionalIntArrayRef dim_order, Tensor& out) { - executorch::runtime::KernelRuntimeContext ctx{}; + KernelRuntimeContext ctx{}; return _empty_dim_order_out(ctx, size, dim_order, out); } diff --git a/kernels/aten/cpu/op__to_dim_order_copy.cpp b/kernels/aten/cpu/op__to_dim_order_copy.cpp index 10793d24db5..a8216c9a8e9 100644 --- a/kernels/aten/cpu/op__to_dim_order_copy.cpp +++ b/kernels/aten/cpu/op__to_dim_order_copy.cpp @@ -116,7 +116,7 @@ Tensor& _to_dim_order_copy_out( bool non_blocking, OptionalArrayRef dim_order, Tensor& out) { - executorch::runtime::KernelRuntimeContext ctx{}; + KernelRuntimeContext ctx{}; return _to_dim_order_copy_out(ctx, self, non_blocking, dim_order, out); } diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index a8fa6611478..28f1a215562 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -6,6 +6,8 @@ - op: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out +- op: _fft_c2r.out + - op: _fft_r2c.out - op: _linalg_det.result @@ -423,6 +425,8 @@ - op: var.out +- op: view_as_real_copy.out + - op: view_copy.out - op: where.self_out diff --git a/kernels/optimized/cpu/fft_utils.h b/kernels/optimized/cpu/fft_utils.h new file mode 100644 index 00000000000..2225e8ddfa7 --- /dev/null +++ b/kernels/optimized/cpu/fft_utils.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace torch::executor::native { + +// TODO: contents of this anonymous namespace are copy/pasted from +// PyTorch core (aten/src/ATen/native/mkl/SpectralOps.cpp). Small +// portions (the parts that don't depend on Tensor) could be reused; +// refactor to enable that once we can share headers from PyTorch +// core. +namespace { +inline pocketfft::stride_t stride_from_tensor(const Tensor& t) { + pocketfft::stride_t stride(t.strides().begin(), t.strides().end()); + for (auto& s : stride) { + s *= t.element_size(); + } + return stride; +} + +inline pocketfft::shape_t shape_from_tensor(const Tensor& t) { + return pocketfft::shape_t(t.sizes().begin(), t.sizes().end()); +} + +// NOTE: The reinterpret_cast in tensor_cdata is UB, but it's what +// PyTorch core does and I'm not aware of a portable way to do this +// that doesn't rely on UB. 
+template +inline std::complex* tensor_cdata(Tensor& t) { + return reinterpret_cast*>( + t.data_ptr>()); +} + +template +inline const std::complex* tensor_cdata(const Tensor& t) { + return reinterpret_cast*>( + t.const_data_ptr>()); +} + +// NOTE: in particular this is in ATen/native/SpectralOpsUtils.h and +// could be shared immediately. +enum class fft_norm_mode { + none, // No normalization + by_root_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +// NOTE: slight fork from upstream PyTorch to use ET_KERNEL_CHECK; +// upstream with TORCH_CHECK will be fine to use once we have code +// sharing. +template +std::optional +compute_fct(KernelRuntimeContext& ctx, int64_t size, int64_t normalization) { + constexpr auto one = static_cast(1); + switch (static_cast(normalization)) { + case fft_norm_mode::none: + return one; + case fft_norm_mode::by_n: + return one / static_cast(size); + case fft_norm_mode::by_root_n: + return one / std::sqrt(static_cast(size)); + } + ET_KERNEL_CHECK_MSG( + ctx, + false, + InvalidArgument, + std::nullopt, + "Unsupported normalization type: %" PRId64, + normalization); +} + +template +std::optional compute_fct( + KernelRuntimeContext& ctx, + const Tensor& t, + IntArrayRef dim, + int64_t normalization) { + if (static_cast(normalization) == fft_norm_mode::none) { + return static_cast(1); + } + const auto& sizes = t.sizes(); + int64_t n = 1; + for (auto idx : dim) { + n *= sizes[idx]; + } + return compute_fct(ctx, n, normalization); +} +} // namespace + +} // namespace torch::executor::native diff --git a/kernels/optimized/cpu/op_bmm.cpp b/kernels/optimized/cpu/op_bmm.cpp index 5e7fa1dd839..11697f9b0de 100644 --- a/kernels/optimized/cpu/op_bmm.cpp +++ b/kernels/optimized/cpu/op_bmm.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ -#include - #include +#include +#include // Performs a batch matrix-matrix product of matrices stored in input and mat2. @@ -136,33 +136,32 @@ Error resize_out_tensor(const Tensor& self, const Tensor& mat2, Tensor& out) { // bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) Tensor& opt_bmm_out( - KernelRuntimeContext& context, + KernelRuntimeContext& ctx, const Tensor& self, const Tensor& mat2, Tensor& out) { - (void)context; + (void)ctx; ET_KERNEL_CHECK( - context, + ctx, resize_out_tensor(self, mat2, out) == Error::Ok, InvalidArgument, out); ET_KERNEL_CHECK( - context, check_bmm_out_args(self, mat2, out), InvalidArgument, out); - -#define BMM_TENSOR(ctype, dtype) \ - case ScalarType::dtype: \ - bmm_kernel(self, mat2, out); \ - break; - - auto scalar_type = self.scalar_type(); - switch (scalar_type) { - ET_FORALL_REAL_TYPES_AND(Half, BMM_TENSOR) - default: - ET_CHECK_MSG( - false, "Unhandled dtype %" PRId8, static_cast(scalar_type)); + ctx, check_bmm_out_args(self, mat2, out), InvalidArgument, out); + + constexpr auto name = "bmm.out"; + auto self_type = self.scalar_type(); + + if (executorch::runtime::isComplexType(self_type)) { + ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, name, CTYPE, [&]() { + internal::bmm_out_impl(self, mat2, out); + }); + } else { + ET_SWITCH_REALH_TYPES(self_type, ctx, name, CTYPE, [&]() { + bmm_kernel(self, mat2, out); + }); } -#undef BMM_TENSOR return out; } diff --git a/kernels/optimized/cpu/op_fft_c2r.cpp b/kernels/optimized/cpu/op_fft_c2r.cpp new file mode 100644 index 00000000000..f595b5f7299 --- /dev/null +++ b/kernels/optimized/cpu/op_fft_c2r.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace torch::executor::native { +Tensor& opt_fft_c2r_out( + KernelRuntimeContext& ctx, + const Tensor& in, + IntArrayRef dim, + int64_t normalization, + int64_t last_dim_size, + Tensor& out) { + auto in_sizes = in.sizes(); + ET_KERNEL_CHECK(ctx, in.dim() <= kTensorDimensionLimit, InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, !dim.empty(), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, last_dim_size >= 1, InvalidArgument, out); + + // Determine the output size + std::array out_sizes_storage{}; + executorch::runtime::Span out_sizes( + out_sizes_storage.data(), in_sizes.size()); + std::copy(in_sizes.begin(), in_sizes.end(), out_sizes.begin()); + out_sizes[dim.back()] = last_dim_size; + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK_MSG( + ctx, + in.scalar_type() == executorch::runtime::toComplexType(out.scalar_type()), + InvalidArgument, + out, + "the input type for _fft_c2r must be the Complex type corresponding to the output type"); + + for (auto d : dim) { + ET_KERNEL_CHECK_MSG( + ctx, + d >= 0 && d < in.dim(), + InvalidArgument, + out, + "dims must be in bounds (got %" PRId64 ")", + d); + } + + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor( + out, + executorch::runtime::ArrayRef( + out_sizes.data(), out_sizes.size())) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor (last dim %d).", + out_sizes[dim.back()]); + + pocketfft::shape_t axes(dim.begin(), dim.end()); + auto out_shape = shape_from_tensor(out); + // TODO: if arbitrary strides are a possibility, we need to validate + // these, because pocketfft README says "Strides that lead to + // multiple accesses of the same memory address are not allowed." + auto in_stride = stride_from_tensor(in); + auto out_stride = stride_from_tensor(out); + // NOTE: as of this writing, upstream PyTorch only supports + // float/double, so we follow suit. + ET_SWITCH_FLOAT_TYPES(out.scalar_type(), ctx, "_fft_c2r.out", CTYPE_OUT, [&] { + auto fct = compute_fct(ctx, out, dim, normalization); + if (!fct) { + // Check failed, just bail out of the lambda. + return; + } + pocketfft::c2r( + out_shape, + in_stride, + out_stride, + axes, + false /* forward */, + tensor_cdata(in), + out.mutable_data_ptr(), + *fct); + }); + return out; +} +} // namespace torch::executor::native diff --git a/kernels/optimized/cpu/op_fft_r2c.cpp b/kernels/optimized/cpu/op_fft_r2c.cpp index 45d3d9acb42..750a7e8f0a2 100644 --- a/kernels/optimized/cpu/op_fft_r2c.cpp +++ b/kernels/optimized/cpu/op_fft_r2c.cpp @@ -6,99 +6,10 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include - -#include - -#include namespace torch::executor::native { - -// TODO: contents of this anonymous namespace are copy/pasted from -// PyTorch core (aten/src/ATen/native/mkl/SpectralOps.cpp). Small -// portions (the parts that don't depend on Tensor) could be reused; -// refactor to enable that once we can share headers from PyTorch -// core. 
-namespace { -pocketfft::stride_t stride_from_tensor(const Tensor& t) { - pocketfft::stride_t stride(t.strides().begin(), t.strides().end()); - for (auto& s : stride) { - s *= t.element_size(); - } - return stride; -} - -pocketfft::shape_t shape_from_tensor(const Tensor& t) { - return pocketfft::shape_t(t.sizes().begin(), t.sizes().end()); -} - -// NOTE: The reinterpret_cast in tensor_cdata is UB, but it's what -// PyTorch core does and I'm not aware of a portable way to do this -// that doesn't rely on UB. -template -inline std::complex* tensor_cdata(Tensor& t) { - return reinterpret_cast*>( - t.data_ptr>()); -} - -template -inline const std::complex* tensor_cdata(const Tensor& t) { - return reinterpret_cast*>( - t.const_data_ptr>()); -} - -// NOTE: in particular this is in ATen/native/SpectralOpsUtils.h and -// could be shared immediately. -enum class fft_norm_mode { - none, // No normalization - by_root_n, // Divide by sqrt(signal_size) - by_n, // Divide by signal_size -}; - -// NOTE: slight fork from upstream PyTorch to use ET_KERNEL_CHECK; -// upstream with TORCH_CHECK will be fine to use once we have code -// sharing. -template -std::optional -compute_fct(KernelRuntimeContext& ctx, int64_t size, int64_t normalization) { - constexpr auto one = static_cast(1); - switch (static_cast(normalization)) { - case fft_norm_mode::none: - return one; - case fft_norm_mode::by_n: - return one / static_cast(size); - case fft_norm_mode::by_root_n: - return one / std::sqrt(static_cast(size)); - } - ET_KERNEL_CHECK_MSG( - ctx, - false, - InvalidArgument, - std::nullopt, - "Unsupported normalization type: %" PRId64, - normalization); -} - -template -std::optional compute_fct( - KernelRuntimeContext& ctx, - const Tensor& t, - IntArrayRef dim, - int64_t normalization) { - if (static_cast(normalization) == fft_norm_mode::none) { - return static_cast(1); - } - const auto& sizes = t.sizes(); - int64_t n = 1; - for (auto idx : dim) { - n *= sizes[idx]; - } - return compute_fct(ctx, n, normalization); -} - -} // namespace - Tensor& opt_fft_r2c_out( KernelRuntimeContext& ctx, const Tensor& in, diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index c9da2584f08..7406cc21b53 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -15,6 +15,7 @@ _OPTIMIZED_ATEN_OPS = ( name = "op_bmm", deps = [ "//executorch/kernels/optimized:libblas", + "//executorch/kernels/portable/cpu/util:matmul_ops_util", ], ), op_target( @@ -34,13 +35,21 @@ _OPTIMIZED_ATEN_OPS = ( ], ), op_target(name = "op_exp"), + op_target( + name = "op_fft_c2r", + compiler_flags = [] if runtime.is_oss else [ + "-Wno-global-constructors", + "-Wno-shadow", + ], + deps = [":fft_utils"], + ), op_target( name = "op_fft_r2c", compiler_flags = [] if runtime.is_oss else [ "-Wno-global-constructors", "-Wno-shadow", ], - deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"], + deps = [":fft_utils"], ), op_target(name = "op_sigmoid"), op_target( @@ -142,6 +151,14 @@ def define_common_targets(): exported_deps = ["//executorch/runtime/core:core"], ) + runtime.cxx_library( + name = "fft_utils", + srcs = [], + exported_headers = ["fft_utils.h"], + visibility = ["//executorch/kernels/optimized/cpu/..."], + exported_deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"], + ) + runtime.cxx_library( name = "binary_ops", exported_headers = ["binary_ops.h"], diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml index 
864c3ed5780..42a065f63ed 100644 --- a/kernels/optimized/optimized.yaml +++ b/kernels/optimized/optimized.yaml @@ -2,6 +2,11 @@ # # This yaml file contains operators that have optimized kernels available. +- op: _fft_c2r.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_fft_c2r_out + - op: _fft_r2c.out kernels: - arg_meta: null diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp index 40ce86e8fdc..70fc3507f05 100644 --- a/kernels/portable/cpu/op__to_dim_order_copy.cpp +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -125,7 +125,7 @@ Tensor& _to_dim_order_copy_out( bool non_blocking, OptionalArrayRef dim_order, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return _to_dim_order_copy_out(context, self, non_blocking, dim_order, out); } diff --git a/kernels/portable/cpu/op_bmm.cpp b/kernels/portable/cpu/op_bmm.cpp index b9f9d4f2c94..a887cd3c926 100644 --- a/kernels/portable/cpu/op_bmm.cpp +++ b/kernels/portable/cpu/op_bmm.cpp @@ -7,7 +7,6 @@ */ #include -#include #include namespace torch { @@ -37,26 +36,19 @@ Tensor& bmm_out( InvalidArgument, out); - ET_SWITCH_REAL_TYPES_AND( - Half, in.scalar_type(), ctx, "bmm.out", CTYPE, [&]() { - const CTYPE* in_data = in.const_data_ptr(); - const CTYPE* mat2_data = mat2.const_data_ptr(); - CTYPE* out_data = out.mutable_data_ptr(); + constexpr auto name = "bmm.out"; - int64_t batch_size = in.size(0); - int64_t m = in.size(1); - int64_t n = in.size(2); - int64_t p = mat2.size(2); + auto in_type = in.scalar_type(); - for (int i = 0; i < batch_size; ++i) { - const CTYPE* in_data_offset = in_data + i * m * n; - const CTYPE* mat2_data_offset = mat2_data + i * n * p; - CTYPE* out_data_offset = out_data + i * m * p; - - vec_matmul( - out_data_offset, in_data_offset, mat2_data_offset, m, n, p); - } - }); + if (executorch::runtime::isComplexType(in_type)) { + ET_SWITCH_COMPLEXH_TYPES(in_type, ctx, name, CTYPE, [&]() { + internal::bmm_out_impl(in, mat2, out); + }); + } else { + ET_SWITCH_REALH_TYPES(in_type, ctx, name, CTYPE, [&]() { + internal::bmm_out_impl(in, mat2, out); + }); + } return out; } diff --git a/kernels/portable/cpu/op_view_as_real_copy.cpp b/kernels/portable/cpu/op_view_as_real_copy.cpp new file mode 100644 index 00000000000..4a2803eded0 --- /dev/null +++ b/kernels/portable/cpu/op_view_as_real_copy.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; + +namespace { + +template +inline void _to_impl(const Tensor& self, Tensor& out) { + auto self_data = self.mutable_data_ptr(); + auto out_data = out.mutable_data_ptr(); + + for (size_t i = 0, e = self.numel(); i < e; i++) { + auto val_in = self_data[i]; + out_data[2 * i] = static_cast(val_in.real_); + out_data[2 * i + 1] = static_cast(val_in.imag_); + } +} + +} // namespace + +// view_as_real_copy(Tensor self) -> Tensor +Tensor& view_as_real_copy_out( + KernelRuntimeContext& ctx, + const Tensor& self, + Tensor& out) { + (void)ctx; + + // Get the output shape + Tensor::SizesType expected_output_size[kTensorDimensionLimit]; + get_view_as_real_copy_out_target_size(self, expected_output_size); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor( + out, {expected_output_size, static_cast(out.dim())}) == + Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + // The input tensor must be complex type + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::isComplexType(self.scalar_type()), + InvalidArgument, + out, + "Input tensor must be complex type"); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + + constexpr auto op_name = "view_as_real_copy.out"; + + ET_SWITCH_COMPLEXH_TYPES(self.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + _to_impl(self, out); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index 93725d92dab..02b2910fc88 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -1018,5 +1018,14 @@ void get_unfold_copy_out_target_size( *out_ndim = self.dim() + 1; } +void get_view_as_real_copy_out_target_size( + const Tensor& self, + executorch::aten::SizesType* out_sizes) { + for (auto i : c10::irange(self.dim())) { + out_sizes[i] = self.size(i); + } + out_sizes[self.dim()] = 2; +} + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index edcc6eb0021..cef2b3d4ee1 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -247,5 +247,9 @@ void get_unfold_copy_out_target_size( executorch::aten::SizesType* out_sizes, size_t* out_ndim); +void get_view_as_real_copy_out_target_size( + const Tensor& self, + executorch::aten::SizesType* out_sizes); + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/matmul_ops_util.h b/kernels/portable/cpu/util/matmul_ops_util.h index d2991868e95..2d2799eaa59 100644 --- a/kernels/portable/cpu/util/matmul_ops_util.h +++ b/kernels/portable/cpu/util/matmul_ops_util.h @@ -45,5 +45,36 @@ void get_linear_out_target_size( Tensor::SizesType* out_sizes, size_t* out_ndim); +namespace internal { + +template +void bmm_out_impl(const Tensor& in, const Tensor& mat2, Tensor& out) { + const CTYPE* in_data = in.const_data_ptr(); + const CTYPE* mat2_data = mat2.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + int64_t batch_size = in.size(0); + int64_t m = in.size(1); + int64_t n = in.size(2); + int64_t p = mat2.size(2); + + for (int b = 0; b < batch_size; ++b) { + const CTYPE* 
in_data_offset = in_data + b * m * n; + const CTYPE* mat2_data_offset = mat2_data + b * n * p; + CTYPE* out_data_offset = out_data + b * m * p; + + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(p)) { + CTYPE sum = static_cast(0.0); + for (const auto k : c10::irange(n)) { + sum += in_data_offset[i * n + k] * mat2_data_offset[k * p + j]; + } + out_data_offset[i * p + j] = sum; + } + } + } +} + +} // namespace internal } // namespace executor } // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 5e45a210a70..ab04d3b26ac 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -957,6 +957,11 @@ - arg_meta: null kernel_name: torch::executor::var_out +- op: view_as_real_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::view_as_real_copy_out + - op: view_copy.out kernels: - arg_meta: null diff --git a/kernels/portable/test/op_gelu_test.cpp b/kernels/portable/test/op_gelu_test.cpp index 19e757b4bd0..2e5cad55c35 100644 --- a/kernels/portable/test/op_gelu_test.cpp +++ b/kernels/portable/test/op_gelu_test.cpp @@ -25,7 +25,7 @@ using torch::executor::testing::TensorFactory; // executorch/kernels/test/op_gelu_test.cpp instead. Tensor& op_gelu_out(const Tensor& self, string_view approximate, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::native::gelu_out(context, self, approximate, out); } diff --git a/kernels/prim_ops/et_view.cpp b/kernels/prim_ops/et_view.cpp index 44ac7470193..f32c43ee3a4 100644 --- a/kernels/prim_ops/et_view.cpp +++ b/kernels/prim_ops/et_view.cpp @@ -93,7 +93,14 @@ void et_view(KernelRuntimeContext& context, EValue** stack) { "Failed to resize output tensor."); // Do some checks - ET_KERNEL_CHECK(context, self.numel() == out.numel(), InvalidArgument, ); + ET_KERNEL_CHECK_MSG( + context, + self.numel() == out.numel(), + InvalidArgument, + , + "self.numel(): %" ET_PRIsize_t ", out.numel(): %" ET_PRIsize_t, + static_cast(self.numel()), + static_cast(out.numel())); // Update data ptr ET_KERNEL_CHECK_MSG( diff --git a/kernels/prim_ops/register_prim_ops.cpp b/kernels/prim_ops/register_prim_ops.cpp index 1d197b63584..62aead8978f 100644 --- a/kernels/prim_ops/register_prim_ops.cpp +++ b/kernels/prim_ops/register_prim_ops.cpp @@ -381,14 +381,13 @@ static Kernel prim_ops[] = { }; -executorch::runtime::Span kernel_span( - prim_ops, - prim_ops + sizeof(prim_ops) / sizeof(Kernel)); +executorch::runtime::Span + kernel_span(prim_ops, prim_ops + sizeof(prim_ops) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // operators in static initialization time. 
auto success_with_kernel_reg = - executorch::runtime::register_kernels(kernel_span); + executorch::ET_RUNTIME_NAMESPACE::register_kernels(kernel_span); } // namespace } // namespace function diff --git a/kernels/prim_ops/targets.bzl b/kernels/prim_ops/targets.bzl index c1af21a7e73..d2cff10a194 100644 --- a/kernels/prim_ops/targets.bzl +++ b/kernels/prim_ops/targets.bzl @@ -56,7 +56,7 @@ def define_common_targets(): ":et_copy_index" + aten_suffix, ":et_view" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/runtime/kernel:kernel_includes" + aten_suffix, ], ) diff --git a/kernels/quantized/cpu/op_quantize.cpp b/kernels/quantized/cpu/op_quantize.cpp index 5079109683f..632bddd58c4 100644 --- a/kernels/quantized/cpu/op_quantize.cpp +++ b/kernels/quantized/cpu/op_quantize.cpp @@ -22,6 +22,7 @@ namespace native { using Tensor = executorch::aten::Tensor; using Scalar = executorch::aten::Scalar; using ScalarType = executorch::aten::ScalarType; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; namespace { @@ -214,7 +215,7 @@ Tensor& quantize_per_tensor_tensor_args_out( int64_t quant_max, ScalarType dtype, Tensor& out) { - auto context = executorch::runtime::KernelRuntimeContext(); + auto context = KernelRuntimeContext(); auto& res = quantize_per_tensor_tensor_args_out( context, input, scale, zero_point, quant_min, quant_max, dtype, out); ET_CHECK(context.failure_state() == Error::Ok); diff --git a/kernels/quantized/targets.bzl b/kernels/quantized/targets.bzl index a2533cb003a..7bd8f6852a7 100644 --- a/kernels/quantized/targets.bzl +++ b/kernels/quantized/targets.bzl @@ -61,6 +61,10 @@ def define_common_targets(): name = "all_quantized_ops", ops_schema_yaml_target = ":quantized.yaml", define_static_targets = True, + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], ) # On Windows we can only compile these two ops currently, so adding a diff --git a/kernels/quantized/test/op_add_test.cpp b/kernels/quantized/test/op_add_test.cpp index 17dd1cfb3fc..3f258827973 100644 --- a/kernels/quantized/test/op_add_test.cpp +++ b/kernels/quantized/test/op_add_test.cpp @@ -24,7 +24,7 @@ using executorch::aten::optional; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::add_out; using torch::executor::native::dequantize_per_tensor_out; using torch::executor::native::quantize_per_tensor_out; diff --git a/kernels/quantized/test/op_embedding2b_test.cpp b/kernels/quantized/test/op_embedding2b_test.cpp index bf48fa4227b..a350b77ec0d 100644 --- a/kernels/quantized/test/op_embedding2b_test.cpp +++ b/kernels/quantized/test/op_embedding2b_test.cpp @@ -21,7 +21,7 @@ using executorch::aten::ArrayRef; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_embedding_2bit_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/quantized/test/op_embedding4b_test.cpp b/kernels/quantized/test/op_embedding4b_test.cpp index 9f205be80e3..6ab10376b88 100644 --- a/kernels/quantized/test/op_embedding4b_test.cpp +++ 
b/kernels/quantized/test/op_embedding4b_test.cpp @@ -21,7 +21,7 @@ using executorch::aten::ArrayRef; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_embedding_4bit_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/quantized/test/op_embedding_test.cpp b/kernels/quantized/test/op_embedding_test.cpp index 252aca41314..6c949bd6e69 100644 --- a/kernels/quantized/test/op_embedding_test.cpp +++ b/kernels/quantized/test/op_embedding_test.cpp @@ -24,7 +24,7 @@ using executorch::aten::optional; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::dequantize_per_tensor_out; using torch::executor::native::embedding_out; using torch::executor::native::quantize_per_tensor_out; diff --git a/kernels/quantized/test/op_mixed_linear_test.cpp b/kernels/quantized/test/op_mixed_linear_test.cpp index 6b86b199f60..833fc766ffd 100644 --- a/kernels/quantized/test/op_mixed_linear_test.cpp +++ b/kernels/quantized/test/op_mixed_linear_test.cpp @@ -20,7 +20,7 @@ using namespace ::testing; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_mixed_linear_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/quantized/test/op_mixed_mm_test.cpp b/kernels/quantized/test/op_mixed_mm_test.cpp index e20ac96d610..4d81089fa91 100644 --- a/kernels/quantized/test/op_mixed_mm_test.cpp +++ b/kernels/quantized/test/op_mixed_mm_test.cpp @@ -20,7 +20,7 @@ using namespace ::testing; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_mixed_mm_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 2d497dfc124..deb61410b10 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -242,6 +242,7 @@ set(all_test_sources "op_upsample_bilinear2d_test.cpp" "op_upsample_nearest2d_test.cpp" "op_var_test.cpp" + "op_view_as_real_copy_test.cpp" "op_view_copy_test.cpp" "op_where_test.cpp" "op_zeros_test.cpp" @@ -276,6 +277,7 @@ set(_optimized_kernels_test_sources "op_div_test.cpp" "op_elu_test.cpp" "op_exp_test.cpp" + "op_fft_c2r_test.cpp" "op_fft_r2c_test.cpp" "op_gelu_test.cpp" "op_le_test.cpp" diff --git a/kernels/test/TestUtil.h b/kernels/test/TestUtil.h index aa220f5bfd5..7ec20c11bef 100644 --- a/kernels/test/TestUtil.h +++ b/kernels/test/TestUtil.h @@ -116,6 +116,6 @@ class OperatorTest : public ::testing::Test { } protected: - executorch::runtime::KernelRuntimeContext context_; + ::torch::executor::KernelRuntimeContext context_; bool expect_failure_; }; diff --git a/kernels/test/custom_kernel_example/op_relu.cpp b/kernels/test/custom_kernel_example/op_relu.cpp index 2cc3eefe0a8..074ebe6b900 100644 --- a/kernels/test/custom_kernel_example/op_relu.cpp +++ b/kernels/test/custom_kernel_example/op_relu.cpp @@ -17,8 +17,8 @@ 
namespace native { using executorch::aten::ScalarType; using executorch::aten::Tensor; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using executorch::runtime::Error; -using executorch::runtime::KernelRuntimeContext; using executorch::runtime::resize_tensor; using executorch::runtime::tensors_have_same_shape_and_dtype; diff --git a/kernels/test/op_abs_test.cpp b/kernels/test/op_abs_test.cpp index 0d022d0a839..eb4a808b321 100644 --- a/kernels/test/op_abs_test.cpp +++ b/kernels/test/op_abs_test.cpp @@ -44,12 +44,7 @@ class OpAbsTest : public OperatorTest { TensorFactory tf; constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); TensorFactory tf_out; - using REAL_CTYPE = - typename executorch::runtime::ScalarTypeToCppType::type; - Tensor in = tf.make( - {1, 2}, - {CTYPE{REAL_CTYPE(3), REAL_CTYPE(4)}, - CTYPE{REAL_CTYPE(5), REAL_CTYPE(12)}}); + Tensor in = tf.make({1, 2}, {CTYPE(3, 4), CTYPE(5, 12)}); Tensor out = tf_out.zeros({1, 2}); Tensor expected = tf_out.make({1, 2}, {5, 13}); Tensor ret = op_abs_out(in, out); diff --git a/kernels/test/op_atan2_test.cpp b/kernels/test/op_atan2_test.cpp index e69ea0e90c8..436826e2b6d 100644 --- a/kernels/test/op_atan2_test.cpp +++ b/kernels/test/op_atan2_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_atan2_out(const Tensor& self, const Tensor& other, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::atan2_outf(context, self, other, out); } diff --git a/kernels/test/op_bmm_test.cpp b/kernels/test/op_bmm_test.cpp index 88671467f46..70a5f37946d 100644 --- a/kernels/test/op_bmm_test.cpp +++ b/kernels/test/op_bmm_test.cpp @@ -43,6 +43,61 @@ class OpBmmOutTest : public OperatorTest { EXPECT_TENSOR_EQ(out, expected); } + + template + void test_complex_dtype() { + TensorFactory tf; + Tensor x = tf.make( + {2, 2, 3}, + {CTYPE(1, 1), + CTYPE(2, 2), + CTYPE(3, 3), + CTYPE(4, 4), + CTYPE(5, 5), + CTYPE(6, 6), + CTYPE(7, 7), + CTYPE(8, 8), + CTYPE(9, 9), + CTYPE(10, 10), + CTYPE(11, 11), + CTYPE(12, 12)}); + Tensor y = tf.make( + {2, 3, 2}, + {CTYPE(2, 1), + CTYPE(4, 2), + CTYPE(6, 3), + CTYPE(8, 4), + CTYPE(10, 5), + CTYPE(12, 6), + CTYPE(14, 7), + CTYPE(16, 8), + CTYPE(18, 9), + CTYPE(20, 10), + CTYPE(22, 11), + CTYPE(24, 12)}); + Tensor out = tf.make( + {2, 2, 2}, + {CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0)}); + Tensor expected = tf.make( + {2, 2, 2}, + {CTYPE(22, 66), + CTYPE(28, 84), + CTYPE(49, 147), + CTYPE(64, 192), + CTYPE(220, 660), + CTYPE(244, 732), + CTYPE(301, 903), + CTYPE(334, 1002)}); + op_bmm_out(x, y, out); + EXPECT_TENSOR_CLOSE(out, expected); + } }; TEST_F(OpBmmOutTest, OutputDim) { @@ -132,7 +187,7 @@ TEST_F(OpBmmOutTest, OutputDimFloat) { /// A generic smoke test that works for any dtype that supports ones() and /// zeros(). -TEST_F(OpBmmOutTest, AllDtypesSupported) { +TEST_F(OpBmmOutTest, AllRealDtypesSupported) { #define TEST_ENTRY(ctype, dtype) test_dtype(); ET_FORALL_REAL_TYPES(TEST_ENTRY); #undef TEST_ENTRY @@ -141,6 +196,16 @@ TEST_F(OpBmmOutTest, AllDtypesSupported) { // for those types. 
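// A hand check of where the hard-coded expectations in test_complex_dtype()
// above come from (a sketch of the first entry only, assuming the naive
// batched-matmul loop added earlier in this patch): out[0][0][0] is the dot
// product of x's first row and y's first column in batch 0,
//   (1+1i)(2+1i) + (2+2i)(6+3i) + (3+3i)(10+5i)
//     = (1+3i) + (6+18i) + (15+45i)
//     = 22 + 66i,
// which matches the expected CTYPE(22, 66); the remaining entries follow the
// same pattern.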
} +TEST_F(OpBmmOutTest, AllComplexDtypesSupported) { +#define TEST_ENTRY(ctype, dtype) test_complex_dtype(); + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + ET_FORALL_COMPLEX_TYPES(TEST_ENTRY); + } else { + ET_FORALL_COMPLEXH_TYPES(TEST_ENTRY); + } +#undef TEST_ENTRY +} + TEST_F(OpBmmOutTest, EmptyInputWithEmptyOutTensorPasses) { TensorFactory tf; diff --git a/kernels/test/op_cdist_forward_test.cpp b/kernels/test/op_cdist_forward_test.cpp index 32465ca439b..9ddab4c3c49 100644 --- a/kernels/test/op_cdist_forward_test.cpp +++ b/kernels/test/op_cdist_forward_test.cpp @@ -21,6 +21,7 @@ using executorch::aten::ArrayRef; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; Tensor& op_cdist_forward_out( @@ -29,7 +30,7 @@ Tensor& op_cdist_forward_out( double p, optional compute_mode, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + KernelRuntimeContext context{}; return torch::executor::aten::_cdist_forward_outf( context, x1, x2, p, compute_mode, out); } diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index a1003e892e0..8a021c70303 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -260,7 +260,7 @@ class OpClampTensorOutTest : public OperatorTest { const optional& min, const optional& max, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::clamp_outf(context, self, min, max, out); } }; diff --git a/kernels/test/op_diagonal_copy_test.cpp b/kernels/test/op_diagonal_copy_test.cpp index cc0bd02e1a5..080b0d70645 100644 --- a/kernels/test/op_diagonal_copy_test.cpp +++ b/kernels/test/op_diagonal_copy_test.cpp @@ -27,7 +27,7 @@ Tensor& op_diagonal_copy_out( int64_t dim1, int64_t dim2, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::diagonal_copy_outf( context, input, offset, dim1, dim2, out); } @@ -54,31 +54,22 @@ class OpDiagonalCopyOutTest : public ::testing::Test { template void run_2d_complex_dtype() { TensorFactory tf; - constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); - using REAL_CTYPE = - typename executorch::runtime::ScalarTypeToCppType::type; Tensor input = tf.make( {3, 4}, - {CTYPE{REAL_CTYPE(1), REAL_CTYPE(1)}, - CTYPE{REAL_CTYPE(2), REAL_CTYPE(2)}, - CTYPE{REAL_CTYPE(3), REAL_CTYPE(3)}, - CTYPE{REAL_CTYPE(4), REAL_CTYPE(4)}, - CTYPE{REAL_CTYPE(5), REAL_CTYPE(5)}, - CTYPE{REAL_CTYPE(6), REAL_CTYPE(6)}, - CTYPE{REAL_CTYPE(7), REAL_CTYPE(7)}, - CTYPE{REAL_CTYPE(8), REAL_CTYPE(8)}, - CTYPE{REAL_CTYPE(9), REAL_CTYPE(9)}, - CTYPE{REAL_CTYPE(10), REAL_CTYPE(10)}, - CTYPE{REAL_CTYPE(11), REAL_CTYPE(11)}, - CTYPE{REAL_CTYPE(12), REAL_CTYPE(12)}}); - Tensor out = tf.make( - {2}, - {CTYPE{REAL_CTYPE(0), REAL_CTYPE(0)}, - CTYPE{REAL_CTYPE(0), REAL_CTYPE(0)}}); - Tensor out_expected = tf.make( - {2}, - {CTYPE{REAL_CTYPE(5), REAL_CTYPE(5)}, - CTYPE{REAL_CTYPE(10), REAL_CTYPE(10)}}); + {CTYPE(1, 1), + CTYPE(2, 2), + CTYPE(3, 3), + CTYPE(4, 4), + CTYPE(5, 5), + CTYPE(6, 6), + CTYPE(7, 7), + CTYPE(8, 8), + CTYPE(9, 9), + CTYPE(10, 10), + CTYPE(11, 11), + CTYPE(12, 12)}); + Tensor out = tf.make({2}, {CTYPE(0, 0), CTYPE(0, 0)}); + Tensor out_expected = tf.make({2}, {CTYPE(5, 5), CTYPE(10, 
10)}); op_diagonal_copy_out(input, 1, 1, 0, out); EXPECT_TENSOR_CLOSE(out, out_expected); } diff --git a/kernels/test/op_fft_c2r_test.cpp b/kernels/test/op_fft_c2r_test.cpp new file mode 100644 index 00000000000..58c8a216e42 --- /dev/null +++ b/kernels/test/op_fft_c2r_test.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using executorch::aten::IntArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::testing::TensorFactory; + +class OpFftC2rOutTest : public OperatorTest { + protected: + Tensor& op_fft_c2r_out( + const Tensor& in, + IntArrayRef dim, + int64_t normalization, + int64_t last_dim_size, + Tensor& out) { + return torch::executor::aten::_fft_c2r_outf( + context_, in, dim, normalization, last_dim_size, out); + } + + template < + class CTYPE_OUT, + executorch::aten::ScalarType DTYPE_OUT, + bool expect_failure = false> + void test_dtype(int64_t norm, int64_t dim = 0) { + TensorFactory tf_out; + constexpr auto DTYPE_IN = executorch::runtime::toComplexType(DTYPE_OUT); + TensorFactory tf_in; + + using CTYPE_IN = + typename executorch::runtime::ScalarTypeToCppType::type; + + std::vector input_data = { + CTYPE_IN{24, 4}, + CTYPE_IN{4, -8}, + CTYPE_IN{0, 4}, + + CTYPE_IN{8, -16}, + CTYPE_IN{-4, 0}, + CTYPE_IN{0, 32}, + + CTYPE_IN{12, 0}, + CTYPE_IN{0, 4}, + CTYPE_IN{-8, 4}, + + CTYPE_IN{0, 8}, + CTYPE_IN{-4, 8}, + CTYPE_IN{8, 0}, + }; + + Tensor in = tf_in.make({4, 3}, input_data); + Tensor out = tf_out.full({4, 3}, 0); + + int64_t last_dim_size = + (dim >= 0 && dim < out.dim()) ? 
out.sizes()[dim] : 0; + op_fft_c2r_out(in, {dim}, norm, last_dim_size, out); + + double norm_factor = 1; + if (norm == 1) { + norm_factor = 2; + } else if (norm == 2) { + norm_factor = 4; + } + std::vector expected_data = { + 52., -4., -8., 44., 4., -56., 20., 12., -8., -20., 4., 72.}; + for (auto& elem : expected_data) { + elem /= norm_factor; + } + Tensor expected = tf_out.make({4, 3}, expected_data); + + if (!expect_failure) { + EXPECT_TENSOR_CLOSE(out, expected); + } + } + + template + void test_dtype_multiple_axes() { + TensorFactory tf_out; + constexpr auto DTYPE_IN = executorch::runtime::toComplexType(DTYPE_OUT); + TensorFactory tf_in; + + using CTYPE_IN = + typename executorch::runtime::ScalarTypeToCppType::type; + + std::vector input_data = { + CTYPE_IN{16, 4}, + CTYPE_IN{4, -8}, + CTYPE_IN{0, 4}, + + CTYPE_IN{8, -16}, + CTYPE_IN{-4, 0}, + CTYPE_IN{0, 36}, + + CTYPE_IN{32, 0}, + CTYPE_IN{0, 4}, + CTYPE_IN{-8, 4}, + + CTYPE_IN{0, 8}, + CTYPE_IN{-4, 8}, + CTYPE_IN{8, 0}, + }; + + Tensor in = tf_in.make({4, 3}, input_data); + Tensor out = tf_out.full({4, 4}, 0); + + int64_t last_dim_size = out.sizes()[0]; + std::array dim = {0, 1}; + op_fft_c2r_out(in, dim, 1, last_dim_size, out); + + std::vector expected_data = { + 12., + 12., + 16., + 16., + 1., + 15., + -11., + 3., + 12., + 20., + 0., + 8., + -1., + -15., + 3., + -27.}; + Tensor expected = tf_out.make({4, 4}, expected_data); + EXPECT_TENSOR_CLOSE(out, expected); + } +}; + +TEST_F(OpFftC2rOutTest, AllDtypesSupported) { +#define TEST_ENTRY(ctype, dtype) \ + test_dtype(0); \ + test_dtype(1); \ + test_dtype(2); + ET_FORALL_FLOAT_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpFftC2rOutTest, MultipleDims) { +#define TEST_ENTRY(ctype, dtype) \ + test_dtype_multiple_axes(); + ET_FORALL_FLOAT_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpFftC2rOutTest, InvalidNorm) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen MKL path does not validate norm"; + return; + } + auto invalid_norm = [this](int64_t norm) { + test_dtype(norm); + }; + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(3)); + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(4)); + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(-1)); + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(9999999)); +} + +TEST_F(OpFftC2rOutTest, InvalidDim) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen fails UBSAN"; + return; + } + auto negative_dim = [this]() { + test_dtype(0, -1); + test_dtype(0, 3); + test_dtype(0, 9001); + }; + ET_EXPECT_KERNEL_FAILURE(context_, negative_dim()); +} diff --git a/kernels/test/op_flip_test.cpp b/kernels/test/op_flip_test.cpp index f240dfd4ad3..be06e397be2 100644 --- a/kernels/test/op_flip_test.cpp +++ b/kernels/test/op_flip_test.cpp @@ -22,7 +22,7 @@ using executorch::aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_flip_out(const Tensor& input, IntArrayRef dims, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::flip_outf(context, input, dims, out); } diff --git a/kernels/test/op_ge_test.cpp b/kernels/test/op_ge_test.cpp index 4b21644a5c5..a79502b266e 100644 --- a/kernels/test/op_ge_test.cpp +++ b/kernels/test/op_ge_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using 
executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpGeTensorOutTest : public OperatorTest { diff --git a/kernels/test/op_gt_test.cpp b/kernels/test/op_gt_test.cpp index 29a2fb0e8b8..96c0e95f950 100644 --- a/kernels/test/op_gt_test.cpp +++ b/kernels/test/op_gt_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpGtScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp index 49ef5235d0f..bcd40d24d89 100644 --- a/kernels/test/op_le_test.cpp +++ b/kernels/test/op_le_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpLeScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_lt_test.cpp b/kernels/test/op_lt_test.cpp index 51ccb310e4a..eee12c50521 100644 --- a/kernels/test/op_lt_test.cpp +++ b/kernels/test/op_lt_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpLtScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_maximum_test.cpp b/kernels/test/op_maximum_test.cpp index 9c701e208eb..faa18fa56cd 100644 --- a/kernels/test/op_maximum_test.cpp +++ b/kernels/test/op_maximum_test.cpp @@ -21,7 +21,7 @@ using executorch::aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_maximum_out(const Tensor& self, const Tensor& other, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::maximum_outf(context, self, other, out); } diff --git a/kernels/test/op_native_batch_norm_test.cpp b/kernels/test/op_native_batch_norm_test.cpp index 67e46b27508..bf05a87312d 100644 --- a/kernels/test/op_native_batch_norm_test.cpp +++ b/kernels/test/op_native_batch_norm_test.cpp @@ -173,7 +173,7 @@ class OpNativeBatchNormLegitOutTest : public OperatorTest { executorch::aten::Tensor& out0, executorch::aten::Tensor& out1, executorch::aten::Tensor& out2) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::_native_batch_norm_legit_outf( context, input, diff --git a/kernels/test/op_native_group_norm_test.cpp b/kernels/test/op_native_group_norm_test.cpp index ea742e97231..7452350ad29 100644 --- a/kernels/test/op_native_group_norm_test.cpp +++ b/kernels/test/op_native_group_norm_test.cpp @@ -32,7 +32,7 @@ ::std::tuple op_native_group_norm_out( Tensor& out0, Tensor& out1, Tensor& out2) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::native_group_norm_outf( context, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2); } diff --git a/kernels/test/op_ne_test.cpp 
b/kernels/test/op_ne_test.cpp index 6cb0217ec0f..fe4e6c3621c 100644 --- a/kernels/test/op_ne_test.cpp +++ b/kernels/test/op_ne_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpNeTest : public OperatorTest { diff --git a/kernels/test/op_pdist_forward_test.cpp b/kernels/test/op_pdist_forward_test.cpp index e6c0d472517..2b28591f7fc 100644 --- a/kernels/test/op_pdist_forward_test.cpp +++ b/kernels/test/op_pdist_forward_test.cpp @@ -23,7 +23,7 @@ using executorch::aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_pdist_forward_out(const Tensor& input, double p, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::_pdist_forward_outf(context, input, p, out); } diff --git a/kernels/test/op_prod_test.cpp b/kernels/test/op_prod_test.cpp index f9cf53ded57..11a7e3fae4f 100644 --- a/kernels/test/op_prod_test.cpp +++ b/kernels/test/op_prod_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::TensorFactory; Tensor& op_prod_out(const Tensor& self, optional dtype, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::prod_outf(context, self, dtype, out); } @@ -33,7 +33,7 @@ Tensor& op_prod_int_out( bool keepdim, optional dtype, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::prod_outf( context, self, dim, keepdim, dtype, out); } diff --git a/kernels/test/op_reflection_pad1d_test.cpp b/kernels/test/op_reflection_pad1d_test.cpp index 5f3b2a1c273..aebf057326a 100644 --- a/kernels/test/op_reflection_pad1d_test.cpp +++ b/kernels/test/op_reflection_pad1d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad1d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad1d_outf( context, input, padding, out); } diff --git a/kernels/test/op_reflection_pad2d_test.cpp b/kernels/test/op_reflection_pad2d_test.cpp index 8696b5dff7b..01e0619b9f1 100644 --- a/kernels/test/op_reflection_pad2d_test.cpp +++ b/kernels/test/op_reflection_pad2d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad2d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad2d_outf( context, input, padding, out); } diff --git a/kernels/test/op_reflection_pad3d_test.cpp b/kernels/test/op_reflection_pad3d_test.cpp index 7d5cc84c6bc..55ed906a958 100644 --- a/kernels/test/op_reflection_pad3d_test.cpp +++ b/kernels/test/op_reflection_pad3d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad3d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad3d_outf( context, input, padding, out); } diff --git 
a/kernels/test/op_replication_pad1d_test.cpp b/kernels/test/op_replication_pad1d_test.cpp index 9a6d3b2285e..f8a3fc0a48b 100644 --- a/kernels/test/op_replication_pad1d_test.cpp +++ b/kernels/test/op_replication_pad1d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad1d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad1d_outf( context, input, padding, out); } diff --git a/kernels/test/op_replication_pad2d_test.cpp b/kernels/test/op_replication_pad2d_test.cpp index 00bc76ac093..7f62f5c9b6e 100644 --- a/kernels/test/op_replication_pad2d_test.cpp +++ b/kernels/test/op_replication_pad2d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad2d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad2d_outf( context, input, padding, out); } diff --git a/kernels/test/op_replication_pad3d_test.cpp b/kernels/test/op_replication_pad3d_test.cpp index 010870298d9..5b931fee3f9 100644 --- a/kernels/test/op_replication_pad3d_test.cpp +++ b/kernels/test/op_replication_pad3d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad3d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad3d_outf( context, input, padding, out); } diff --git a/kernels/test/op_roll_test.cpp b/kernels/test/op_roll_test.cpp index fc5baaad4a7..4407e395db6 100644 --- a/kernels/test/op_roll_test.cpp +++ b/kernels/test/op_roll_test.cpp @@ -26,7 +26,7 @@ Tensor& op_roll_out( ArrayRef shifts, ArrayRef dims, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::roll_outf(context, input, shifts, dims, out); } diff --git a/kernels/test/op_topk_test.cpp b/kernels/test/op_topk_test.cpp index 46098a81b68..bdd185daaae 100644 --- a/kernels/test/op_topk_test.cpp +++ b/kernels/test/op_topk_test.cpp @@ -106,7 +106,8 @@ std::tuple op_topk_values( Tensor& values, Tensor& indices) { TempMemoryAllocator allocator = TempMemoryAllocator(); - executorch::runtime::KernelRuntimeContext context(nullptr, &allocator); + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context( + nullptr, &allocator); return torch::executor::aten::topk_outf( context, input, k, dim, largest, sorted, values, indices); } diff --git a/kernels/test/op_view_as_real_copy_test.cpp b/kernels/test/op_view_as_real_copy_test.cpp new file mode 100644 index 00000000000..8e959c3db8c --- /dev/null +++ b/kernels/test/op_view_as_real_copy_test.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include + +#include + +using namespace ::testing; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpViewAsRealTest : public OperatorTest { + protected: + Tensor& view_as_real_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::aten::view_as_real_copy_outf(context_, self, out); + } + + template + void run_complex_smoke_test() { + TensorFactory tf; + constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); + TensorFactory tf_out; + + Tensor in = tf.make( + {2, 2}, + {CTYPE(3, 4), CTYPE(-1.7, 7.4), CTYPE(5, -12), CTYPE(8.3, 0.1)}); + Tensor out = tf_out.zeros({2, 2, 2}); + Tensor expected = + tf_out.make({2, 2, 2}, {3, 4, -1.7, 7.4, 5, -12, 8.3, 0.1}); + Tensor ret = view_as_real_copy_out(in, out); + + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); + } + + // Tests on tensors with 0 size + template + void test_empty_input() { + TensorFactory tf; + constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); + TensorFactory tf_out; + + Tensor in = tf.make(/*sizes=*/{3, 0, 4}, /*data=*/{}); + Tensor out = tf_out.zeros({3, 0, 4, 2}); + Tensor expected = tf_out.make(/*sizes=*/{3, 0, 4, 2}, /*data=*/{}); + Tensor ret = view_as_real_copy_out(in, out); + + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); + } + + // Tests on 0-dim input tensors + template + void zero_dim_input() { + TensorFactory tf; + constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); + TensorFactory tf_out; + + Tensor in = tf.make(/*sizes=*/{}, {CTYPE(0, 0)}); + Tensor out = tf_out.zeros({2}); + Tensor expected = tf_out.zeros(/*sizes=*/{2}); + Tensor ret = view_as_real_copy_out(in, out); + + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); + } +}; + +TEST_F(OpViewAsRealTest, ComplexSmokeTest) { +#define RUN_SMOKE_TEST(ctype, dtype) \ + run_complex_smoke_test(); \ + test_empty_input(); \ + zero_dim_input(); + ET_FORALL_COMPLEXH_TYPES(RUN_SMOKE_TEST); +#undef RUN_SMOKE_TEST +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 05e678c6229..b9e1d3d6dac 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -223,6 +223,7 @@ def define_common_targets(): _common_op_test("op_exp_test", ["aten", "portable", "optimized"]) _common_op_test("op_expand_copy_test", ["aten", "portable"]) _common_op_test("op_expm1_test", ["aten", "portable"]) + _common_op_test("op_fft_c2r_test", ["aten", "optimized"]) _common_op_test("op_fft_r2c_test", ["aten", "optimized"]) _common_op_test("op_fill_test", ["aten", "portable"]) _common_op_test("op_flip_test", ["aten", "portable"]) @@ -331,6 +332,7 @@ def define_common_targets(): _common_op_test("op_upsample_bilinear2d_test", ["aten", "portable"]) _common_op_test("op_upsample_nearest2d_test", ["aten", "portable"]) _common_op_test("op_var_test", ["aten", "portable"]) + _common_op_test("op_view_as_real_copy_test", ["aten", "portable"]) _common_op_test("op_view_copy_test", ["aten", "portable"]) _common_op_test("op_where_test", ["aten", "portable"]) _common_op_test("op_zeros_test", ["aten", "portable"]) diff --git a/pytest.ini b/pytest.ini index cd647c43a1c..8c661aa9ee4 100644 --- a/pytest.ini +++ b/pytest.ini @@ -63,8 +63,6 @@ addopts = --ignore=exir/backend/test/demos --ignore=exir/backend/test/test_backends.py --ignore=exir/backend/test/test_backends_lifted.py - --ignore=exir/backend/test/test_compatibility.py - 
--ignore=exir/backend/test/test_lowered_backend_module.py --ignore=exir/backend/test/test_partitioner.py --ignore=exir/tests/test_common.py --ignore=exir/tests/test_memory_format_ops_pass_aten.py diff --git a/runtime/COMPATIBILITY.md b/runtime/COMPATIBILITY.md index 7d9fd47c590..583dab172cc 100644 --- a/runtime/COMPATIBILITY.md +++ b/runtime/COMPATIBILITY.md @@ -1,7 +1,7 @@ # Runtime Compatibility Policy This document describes the compatibility guarantees between the [PTE file -format](https://pytorch.org/executorch/stable/pte-file-format.html) and the +format](https://pytorch.org/executorch/main/pte-file-format) and the ExecuTorch runtime. > [!IMPORTANT] diff --git a/runtime/backend/backend_execution_context.h b/runtime/backend/backend_execution_context.h index d2790b158ef..7ee41d8e5b1 100644 --- a/runtime/backend/backend_execution_context.h +++ b/runtime/backend/backend_execution_context.h @@ -12,7 +12,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * BackendExecutionContext will be used to inject run time context. @@ -68,13 +68,13 @@ class BackendExecutionContext final { const char* method_name_ = nullptr; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::BackendExecutionContext; +using ::executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; } // namespace executor } // namespace torch diff --git a/runtime/backend/backend_init_context.h b/runtime/backend/backend_init_context.h index de1661c3af0..71c5182f401 100644 --- a/runtime/backend/backend_init_context.h +++ b/runtime/backend/backend_init_context.h @@ -7,12 +7,12 @@ */ #pragma once +#include #include #include namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { /** * BackendInitContext will be used to inject runtime info for to initialize * delegate. @@ -70,13 +70,13 @@ class BackendInitContext final { const NamedDataMap* named_data_map_ = nullptr; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::BackendInitContext; +using ::executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; } // namespace executor } // namespace torch diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp index 4fb1eadfa87..ffeb133fbf2 100644 --- a/runtime/backend/interface.cpp +++ b/runtime/backend/interface.cpp @@ -9,7 +9,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { // Pure-virtual dtors still need an implementation. 
BackendInterface::~BackendInterface() {} @@ -66,5 +66,5 @@ Result get_backend_name(size_t index) { return registered_backends[index].name; } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/backend/interface.h b/runtime/backend/interface.h index 0a3c069a201..95705d48f92 100644 --- a/runtime/backend/interface.h +++ b/runtime/backend/interface.h @@ -22,7 +22,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { struct SizedBuffer { void* buffer; @@ -150,19 +150,20 @@ size_t get_num_registered_backends(); */ Result get_backend_name(size_t index); -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::Backend; -using ::executorch::runtime::CompileSpec; -using ::executorch::runtime::DelegateHandle; -using ::executorch::runtime::get_backend_class; -using ::executorch::runtime::register_backend; -using ::executorch::runtime::SizedBuffer; -using PyTorchBackendInterface = ::executorch::runtime::BackendInterface; +using ::executorch::ET_RUNTIME_NAMESPACE::Backend; +using ::executorch::ET_RUNTIME_NAMESPACE::CompileSpec; +using ::executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; +using ::executorch::ET_RUNTIME_NAMESPACE::get_backend_class; +using ::executorch::ET_RUNTIME_NAMESPACE::register_backend; +using ::executorch::ET_RUNTIME_NAMESPACE::SizedBuffer; +using PyTorchBackendInterface = + ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface; } // namespace executor } // namespace torch diff --git a/runtime/core/event_tracer.h b/runtime/core/event_tracer.h index 77d7fc64102..5bcdd0cfb1f 100644 --- a/runtime/core/event_tracer.h +++ b/runtime/core/event_tracer.h @@ -313,8 +313,11 @@ class EventTracer { * @param[in] evalue The value to be logged. * @param[in] evalue_type Indicates what type of output this is logging e.g. * an intermediate output, program output etc. + * @return A Result indicating the status of the logging operation. + * - True if the evalue output was successfully logged. + * - An error code if an error occurs during logging. */ - virtual void log_evalue( + virtual Result log_evalue( const EValue& evalue, LoggedEValueType evalue_type) = 0; @@ -439,6 +442,12 @@ class EventTracer { DelegateDebugIntId delegate_debug_index, const double& output) = 0; + /** + * Set the filter of event tracer for delegation intermediate outputs. + */ + virtual void set_delegation_intermediate_output_filter( + EventTracerFilterBase* event_tracer_filter) = 0; + /** * Helper function to set the chain id ands debug handle. Users have two * options, the first is that they can directly pass in the chain id and debug @@ -513,12 +522,6 @@ class EventTracer { event_tracer_profiling_level_ = profiling_level; } - /** - * Set the filter of event tracer for delegation intermediate outputs. - */ - void set_delegation_intermediate_output_filter( - EventTracerFilterBase* event_tracer_filter); - /** * Return the current level of event tracer profiling. 
*/ diff --git a/runtime/core/event_tracer_hooks.h b/runtime/core/event_tracer_hooks.h index 40754160c41..cd74b447ca8 100644 --- a/runtime/core/event_tracer_hooks.h +++ b/runtime/core/event_tracer_hooks.h @@ -30,7 +30,7 @@ */ namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /** @@ -305,7 +305,7 @@ inline void event_tracer_set_bundled_input_index( } } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { @@ -313,18 +313,27 @@ namespace executor { namespace internal { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::internal::event_tracer_begin_profiling_event; -using ::executorch::runtime::internal::event_tracer_create_event_block; -using ::executorch::runtime::internal::event_tracer_end_profiling_event; -using ::executorch::runtime::internal::event_tracer_log_evalue; -using ::executorch::runtime::internal::event_tracer_log_evalue_output; -using ::executorch::runtime::internal::event_tracer_set_bundled_input_index; -using ::executorch::runtime::internal::event_tracer_track_allocation; -using ::executorch::runtime::internal::event_tracer_track_allocator; -using ::executorch::runtime::internal::EventTracerProfileInstructionScope; -using ::executorch::runtime::internal::EventTracerProfileMethodScope; -using ::executorch::runtime::internal::EventTracerProfileOpScope; -using ::executorch::runtime::internal::EventTracerProfileScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_begin_profiling_event; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_create_event_block; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_end_profiling_event; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::event_tracer_log_evalue; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_log_evalue_output; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_set_bundled_input_index; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_track_allocation; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_track_allocator; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + EventTracerProfileInstructionScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + EventTracerProfileMethodScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::EventTracerProfileOpScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::EventTracerProfileScope; } // namespace internal } // namespace executor diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 704bb868abd..10075ab5920 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -47,6 +47,21 @@ #endif +/** + * This hack is for separating out ATen mode vs non-ATen mode. In ATen mode, + * we use the ATen types directly. In non-ATen mode, we use the portable types. + * To avoid duplicate symbols and/or duplicate operator registration, when a + * user depends on both the ATen mode and non-ATen mode versions of the + * ExecuTorch library. 
+ */ +#ifndef ET_RUNTIME_NAMESPACE +#if defined(USE_ATEN_LIB) +#define ET_RUNTIME_NAMESPACE runtime::aten +#else +#define ET_RUNTIME_NAMESPACE runtime +#endif +#endif + namespace executorch { namespace aten { diff --git a/runtime/core/exec_aten/testing_util/tensor_factory.h b/runtime/core/exec_aten/testing_util/tensor_factory.h index 367db09285a..1e29b220251 100644 --- a/runtime/core/exec_aten/testing_util/tensor_factory.h +++ b/runtime/core/exec_aten/testing_util/tensor_factory.h @@ -133,7 +133,7 @@ inline bool check_dim_order( size_t gauss_sum = 0; std::vector count(dim_order.size(), 0); for (int i = 0; i < dim_order.size(); i++) { - if (dim_order[i] < 0 || dim_order[i] >= sizes.size()) { + if (dim_order[i] >= sizes.size()) { return false; } gauss_sum += static_cast(dim_order[i]) + 1; diff --git a/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp b/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp index ed8cc00f4ef..feb00f79b8f 100644 --- a/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp +++ b/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp @@ -26,8 +26,8 @@ using executorch::aten::SizesType; using executorch::aten::StridesType; using executorch::aten::Tensor; using executorch::aten::TensorList; +using executorch::ET_RUNTIME_NAMESPACE::resize_tensor; using executorch::runtime::Error; -using executorch::runtime::resize_tensor; using executorch::runtime::TensorShapeDynamism; using executorch::runtime::testing::TensorFactory; using executorch::runtime::testing::TensorListFactory; diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index d07052c2ec2..6f81146e925 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -921,55 +921,7 @@ struct promote_types { } \ }() -#define ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...) 
\ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Half, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits1x8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits2x4, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits16, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...) \ +#define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ @@ -979,12 +931,73 @@ struct promote_types { ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_UINT_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt32, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt64, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...) 
\ + ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_BITS_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits1x8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits2x4, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits4x2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits16, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Half, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_BITS_TYPES(CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ @@ -1008,29 +1021,11 @@ struct promote_types { ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) - #define ET_INTERNAL_SWITCH_CASE_INT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) - #define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ @@ -1050,32 +1045,6 @@ struct promote_types { ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) - #define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1204,26 +1173,15 @@ struct promote_types { ET_SWITCH_REAL_TYPES_AND3( \ Half, Bool, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) -#define ET_SWITCH_REALHBBF16_AND_UINT_TYPES( \ - TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH( \ - TYPE, \ - CONTEXT, \ - NAME, \ - ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ - Half, Bool, BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::UInt16, \ - CTYPE_ALIAS, \ - __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::UInt32, \ - CTYPE_ALIAS, \ - __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::UInt64, \ - CTYPE_ALIAS, \ - __VA_ARGS__)) +#define ET_SWITCH_REALHBBF16_AND_UINT_TYPES( \ + TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + Half, Bool, BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_UINT_TYPES(CTYPE_ALIAS, __VA_ARGS__)) #define ET_SWITCH_INT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH( \ diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index 4e5a0cebb07..b0b79882361 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -396,8 +396,7 @@ #scalar_tensor " could not be extracted: wrong type or out of range"); namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { // // Utility functions for checking tensor attributes // @@ -446,10 +445,10 @@ inline bool tensor_can_cast_to( executorch::aten::Tensor a, executorch::aten::ScalarType dtype) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::canCast(a.scalar_type(), dtype), + ::torch::executor::canCast(a.scalar_type(), dtype), "Tensor of dtype %s cannot cast to dtype %s", - torch::executor::toString(a.scalar_type()), - torch::executor::toString(dtype)); + ::torch::executor::toString(a.scalar_type()), + ::torch::executor::toString(dtype)); return true; } @@ -458,7 +457,7 @@ inline bool tensor_is_bool_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( t.scalar_type() == executorch::aten::ScalarType::Bool, "Expected to find bool type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -469,8 +468,8 @@ inline bool tensor_is_type( ET_CHECK_OR_RETURN_FALSE( t.scalar_type() == dtype, "Expected to find %s type, but tensor has type %s", - torch::executor::toString(dtype), - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(dtype), + ::torch::executor::toString(t.scalar_type())); return true; } @@ -482,9 +481,9 @@ inline bool tensor_is_type( ET_LOG_MSG_AND_RETURN_IF_FALSE( t.scalar_type() == dtype || t.scalar_type() == dtype2, "Expected to find %s or %s type, but tensor has type %s", - torch::executor::toString(dtype), - torch::executor::toString(dtype2), - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(dtype), + ::torch::executor::toString(dtype2), + ::torch::executor::toString(t.scalar_type())); return true; } @@ -498,10 +497,10 @@ inline bool tensor_is_type( t.scalar_type() == dtype || t.scalar_type() == dtype2 || t.scalar_type() == dtype3, "Expected to find %s, %s, or %s type, but tensor has type %s", - torch::executor::toString(dtype), - torch::executor::toString(dtype2), - torch::executor::toString(dtype3), - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(dtype), + ::torch::executor::toString(dtype2), + ::torch::executor::toString(dtype3), + ::torch::executor::toString(t.scalar_type())); return true; } @@ -510,36 +509,36 @@ inline bool tensor_is_integral_type( executorch::aten::Tensor t, bool includeBool = false) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isIntegralType(t.scalar_type(), includeBool), + ::torch::executor::isIntegralType(t.scalar_type(), includeBool), "Expected to find a integral type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_floating_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isFloatingType(t.scalar_type()), + ::torch::executor::isFloatingType(t.scalar_type()), "Expected to find a floating type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_real_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - 
torch::executor::isRealType(t.scalar_type()), + ::torch::executor::isRealType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_realh_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isRealHType(t.scalar_type()), + ::torch::executor::isRealHType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -548,16 +547,16 @@ inline bool tensor_is_realhbf16_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( executorch::runtime::isRealHBF16Type(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_realhb_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isRealHBType(t.scalar_type()), + ::torch::executor::isRealHBType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -566,25 +565,25 @@ inline bool tensor_is_realhbbf16_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( executorch::runtime::isRealHBBF16Type(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_complex_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isComplexType(t.scalar_type()), + ::torch::executor::isComplexType(t.scalar_type()), "Expected to find a complex type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_bits_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isBitsType(t.scalar_type()), + ::torch::executor::isBitsType(t.scalar_type()), "Expected to find a bits type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -595,8 +594,8 @@ inline bool tensors_have_same_dtype( ET_CHECK_OR_RETURN_FALSE( a.scalar_type() == b.scalar_type(), ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s}", - torch::executor::toString(a.scalar_type()), - torch::executor::toString(b.scalar_type())); + ::torch::executor::toString(a.scalar_type()), + ::torch::executor::toString(b.scalar_type())); return true; } @@ -607,9 +606,9 @@ inline bool tensors_have_same_dtype( ET_CHECK_OR_RETURN_FALSE( a.scalar_type() == b.scalar_type() && b.scalar_type() == c.scalar_type(), ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s, %s}", - torch::executor::toString(a.scalar_type()), - torch::executor::toString(b.scalar_type()), - torch::executor::toString(c.scalar_type())); + ::torch::executor::toString(a.scalar_type()), + ::torch::executor::toString(b.scalar_type()), + ::torch::executor::toString(c.scalar_type())); return true; } @@ -1349,60 +1348,61 @@ inline size_t calculate_linear_index( return index; } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new 
`::executorch` namespaces. -using ::executorch::runtime::calculate_linear_index; -using ::executorch::runtime::coordinateToIndex; -using ::executorch::runtime::dim_is_valid; -using ::executorch::runtime::extract_scalar_tensor; -using ::executorch::runtime::get_dim_order; -using ::executorch::runtime::getLeadingDims; -using ::executorch::runtime::getTrailingDims; -using ::executorch::runtime::indexToCoordinate; +using ::executorch::ET_RUNTIME_NAMESPACE::calculate_linear_index; +using ::executorch::ET_RUNTIME_NAMESPACE::coordinateToIndex; +using ::executorch::ET_RUNTIME_NAMESPACE::dim_is_valid; +using ::executorch::ET_RUNTIME_NAMESPACE::extract_scalar_tensor; +using ::executorch::ET_RUNTIME_NAMESPACE::get_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::getLeadingDims; +using ::executorch::ET_RUNTIME_NAMESPACE::getTrailingDims; +using ::executorch::ET_RUNTIME_NAMESPACE::indexToCoordinate; +using ::executorch::ET_RUNTIME_NAMESPACE::nonempty_size; +using ::executorch::ET_RUNTIME_NAMESPACE::nonzero_dim; +using ::executorch::ET_RUNTIME_NAMESPACE::resize; +using ::executorch::ET_RUNTIME_NAMESPACE::resize_tensor; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_can_cast_to; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_dim_has_index; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_dim; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_expected_size; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_non_empty_dim; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_rank_greater_or_equal_to; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_rank_smaller_or_equal_to; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_valid_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_bits_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_bool_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_complex_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_contiguous; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_default_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE:: + tensor_is_default_or_channels_last_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_floating_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_integral_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_rank; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_real_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_realh_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_realhb_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_scalar; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dtype; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_rank; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape_and_dtype; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_size_at_dims; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_strides; using ::executorch::runtime::kTensorDimensionLimit; -using ::executorch::runtime::nonempty_size; -using ::executorch::runtime::nonzero_dim; -using ::executorch::runtime::resize; -using ::executorch::runtime::resize_tensor; -using ::executorch::runtime::tensor_can_cast_to; -using ::executorch::runtime::tensor_dim_has_index; -using ::executorch::runtime::tensor_has_dim; -using ::executorch::runtime::tensor_has_expected_size; -using ::executorch::runtime::tensor_has_non_empty_dim; -using 
::executorch::runtime::tensor_has_rank_greater_or_equal_to; -using ::executorch::runtime::tensor_has_rank_smaller_or_equal_to; -using ::executorch::runtime::tensor_has_valid_dim_order; -using ::executorch::runtime::tensor_is_bits_type; -using ::executorch::runtime::tensor_is_bool_type; -using ::executorch::runtime::tensor_is_complex_type; -using ::executorch::runtime::tensor_is_contiguous; -using ::executorch::runtime::tensor_is_default_dim_order; -using ::executorch::runtime::tensor_is_default_or_channels_last_dim_order; -using ::executorch::runtime::tensor_is_floating_type; -using ::executorch::runtime::tensor_is_integral_type; -using ::executorch::runtime::tensor_is_rank; -using ::executorch::runtime::tensor_is_real_type; -using ::executorch::runtime::tensor_is_realh_type; -using ::executorch::runtime::tensor_is_realhb_type; -using ::executorch::runtime::tensor_is_scalar; -using ::executorch::runtime::tensors_have_same_dim_order; -using ::executorch::runtime::tensors_have_same_dtype; -using ::executorch::runtime::tensors_have_same_rank; -using ::executorch::runtime::tensors_have_same_shape; -using ::executorch::runtime::tensors_have_same_shape_and_dtype; -using ::executorch::runtime::tensors_have_same_size_at_dims; -using ::executorch::runtime::tensors_have_same_strides; namespace internal { -using ::executorch::runtime::internal::copy_tensor_data; -using ::executorch::runtime::internal::reset_data_ptr; -using ::executorch::runtime::internal::resize_tensor_impl; -using ::executorch::runtime::internal::set_tensor_data; -using ::executorch::runtime::internal::share_tensor_data; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::copy_tensor_data; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::reset_data_ptr; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::resize_tensor_impl; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::set_tensor_data; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::share_tensor_data; } // namespace internal } // namespace executor } // namespace torch diff --git a/runtime/core/exec_aten/util/tensor_util_aten.cpp b/runtime/core/exec_aten/util/tensor_util_aten.cpp index 4df273d4dbb..ddfd0560a69 100644 --- a/runtime/core/exec_aten/util/tensor_util_aten.cpp +++ b/runtime/core/exec_aten/util/tensor_util_aten.cpp @@ -12,7 +12,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Implementation for ATen tensor util, should only be included in * `_aten` target and only be used in ATen mode. 
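The alias block above is what keeps existing call sites building while the definitions move under executorch::ET_RUNTIME_NAMESPACE. A minimal sketch of both spellings, assuming the default (non-ATen) build; check_tensor is a hypothetical helper, and the include path follows the file's location in this diff.

#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

// Hypothetical helper: `t` is any executorch::aten::Tensor obtained elsewhere.
bool check_tensor(const executorch::aten::Tensor& t) {
  // New spelling, resolved through the ET_RUNTIME_NAMESPACE macro.
  bool ok = executorch::ET_RUNTIME_NAMESPACE::tensor_is_rank(t, 3) &&
      executorch::ET_RUNTIME_NAMESPACE::tensor_is_floating_type(t);

  // Deprecated spelling, still available via the using-aliases kept above.
  bool legacy_ok = torch::executor::tensor_is_rank(t, 3);

  return ok && legacy_ok;
}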
Explicitly taking @@ -214,6 +214,5 @@ Error resize_tensor_impl( } } // namespace internal - -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/core/exec_aten/util/test/tensor_util_test.cpp b/runtime/core/exec_aten/util/test/tensor_util_test.cpp index 7d30b0bbdbe..cdc391adf20 100644 --- a/runtime/core/exec_aten/util/test/tensor_util_test.cpp +++ b/runtime/core/exec_aten/util/test/tensor_util_test.cpp @@ -17,7 +17,7 @@ using namespace ::testing; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::extract_scalar_tensor; +using executorch::ET_RUNTIME_NAMESPACE::extract_scalar_tensor; using executorch::runtime::testing::TensorFactory; class TensorUtilTest : public ::testing::Test { @@ -148,13 +148,13 @@ TEST_F(TensorUtilTest, GetLeadingDimsSmokeTest) { Tensor t = tf_int_.ones({2, 3, 4}); // getLeadingDims(t, 1) => t.size(0) - EXPECT_EQ(executorch::runtime::getLeadingDims(t, 1), 2); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 1), 2); // getLeadingDims(t, 2) => t.size(0) * t.size(1) - EXPECT_EQ(executorch::runtime::getLeadingDims(t, 2), 6); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 2), 6); // getLeadingDims(t, 3) => t.size(0) * t.size(1) * t.size(2) - EXPECT_EQ(executorch::runtime::getLeadingDims(t, 3), 24); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 3), 24); } TEST_F(TensorUtilTest, GetLeadingDimsInputOutOfBoundDies) { @@ -162,9 +162,9 @@ TEST_F(TensorUtilTest, GetLeadingDimsInputOutOfBoundDies) { Tensor t = tf_int_.ones({2, 3, 4}); // dim needs to be in the range [0, t.dim()] - ET_EXPECT_DEATH(executorch::runtime::getLeadingDims(t, -2), ""); - ET_EXPECT_DEATH(executorch::runtime::getLeadingDims(t, -1), ""); - ET_EXPECT_DEATH(executorch::runtime::getLeadingDims(t, 4), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -2), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -1), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 4), ""); } TEST_F(TensorUtilTest, GetTrailingDimsSmokeTest) { @@ -172,13 +172,13 @@ TEST_F(TensorUtilTest, GetTrailingDimsSmokeTest) { Tensor t = tf_int_.ones({2, 3, 4}); // getTrailingDims(t, 1) => t.size(2) - EXPECT_EQ(executorch::runtime::getTrailingDims(t, 1), 4); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 1), 4); // getTrailingDims(t, 0) => t.size(1) * t.size(2) - EXPECT_EQ(executorch::runtime::getTrailingDims(t, 0), 12); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 0), 12); // getTrailingDims(t, -1) => t.size(0) * t.size(1) * t.size(2) - EXPECT_EQ(executorch::runtime::getTrailingDims(t, -1), 24); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, -1), 24); } TEST_F(TensorUtilTest, GetTrailingDimsInputOutOfBoundDies) { @@ -186,9 +186,9 @@ TEST_F(TensorUtilTest, GetTrailingDimsInputOutOfBoundDies) { Tensor t = tf_int_.ones({2, 3, 4}); // dim needs to be in the range [-1, t.dim() - 1) - ET_EXPECT_DEATH(executorch::runtime::getTrailingDims(t, -2), ""); - ET_EXPECT_DEATH(executorch::runtime::getTrailingDims(t, 3), ""); - ET_EXPECT_DEATH(executorch::runtime::getTrailingDims(t, 4), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, -2), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 3), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 4), ""); } TEST_F(TensorUtilTest, ContiguousCheckSupported) { @@ -421,7 
+421,7 @@ TEST_F(TensorUtilTest, BoolTensorNotScalarFails) { // TEST_F(TensorUtilTest, TensorIsRankTest) { - using executorch::runtime::tensor_is_rank; + using executorch::ET_RUNTIME_NAMESPACE::tensor_is_rank; Tensor a = tf_float_.ones({2, 3, 5}); EXPECT_TRUE(tensor_is_rank(a, 3)); @@ -430,7 +430,7 @@ TEST_F(TensorUtilTest, TensorIsRankTest) { } TEST_F(TensorUtilTest, TensorHasDimTest) { - using executorch::runtime::tensor_has_dim; + using executorch::ET_RUNTIME_NAMESPACE::tensor_has_dim; Tensor a = tf_float_.ones({2, 3, 5}); EXPECT_TRUE(tensor_has_dim(a, 2)); @@ -445,7 +445,7 @@ TEST_F(TensorUtilTest, TensorHasDimTest) { } TEST_F(TensorUtilTest, TensorsHaveSameDtypeTest) { - using executorch::runtime::tensors_have_same_dtype; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dtype; Tensor a = tf_float_.ones({2, 3}); Tensor b = tf_float_.ones({2, 3}); Tensor c = tf_float_.ones({3, 3}); @@ -458,7 +458,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameDtypeTest) { } TEST_F(TensorUtilTest, TensorsHaveSameSizeAtDimTest) { - using executorch::runtime::tensors_have_same_size_at_dims; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_size_at_dims; Tensor a = tf_float_.ones({2, 3, 4, 5}); Tensor b = tf_float_.ones({5, 4, 3, 2}); @@ -470,7 +470,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameSizeAtDimTest) { } TEST_F(TensorUtilTest, TensorsHaveSameShapeTest) { - using executorch::runtime::tensors_have_same_shape; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape; Tensor a = tf_float_.ones({2, 3}); Tensor b = tf_int_.ones({2, 3}); Tensor c = tf_byte_.ones({2, 3}); @@ -493,7 +493,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameShapeTest) { } TEST_F(TensorUtilTest, TensorsHaveSameShapeAndDtypeTest) { - using executorch::runtime::tensors_have_same_shape_and_dtype; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape_and_dtype; Tensor a = tf_float_.ones({2, 3}); Tensor b = tf_float_.ones({2, 3}); Tensor c = tf_float_.ones({2, 3}); @@ -515,7 +515,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameShapeAndDtypeTest) { } TEST_F(TensorUtilTest, TensorsHaveSameStridesTest) { - using executorch::runtime::tensors_have_same_strides; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_strides; Tensor a = tf_float_.full_channels_last({4, 5, 2, 3}, 1); Tensor b = tf_float_.full_channels_last({4, 5, 2, 3}, 2); Tensor c = tf_float_.full_channels_last({4, 5, 2, 3}, 3); @@ -530,7 +530,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameStridesTest) { } TEST_F(TensorUtilTest, TensorIsContiguous) { - using executorch::runtime::tensor_is_contiguous; + using executorch::ET_RUNTIME_NAMESPACE::tensor_is_contiguous; // Note that the strides.size() == 0 case is not tested, since Tensor a = tf_float_.full_channels_last({4, 5, 2, 3}, 1); Tensor b = tf_float_.ones({4, 5, 2, 3}); @@ -547,7 +547,7 @@ TEST_F(TensorUtilTest, ResizeZeroDimTensor) { Tensor a = tf_float_.ones({}); EXPECT_EQ( - executorch::runtime::resize_tensor(a, {}), + executorch::ET_RUNTIME_NAMESPACE::resize_tensor(a, {}), executorch::runtime::Error::Ok); EXPECT_EQ(a.dim(), 0); } diff --git a/runtime/core/named_data_map.h b/runtime/core/named_data_map.h index e79c7035989..14179d22795 100644 --- a/runtime/core/named_data_map.h +++ b/runtime/core/named_data_map.h @@ -22,8 +22,7 @@ #include namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { /** * Interface to access and retrieve data via name. * See executorch/extension/flat_tensor/ for an example. 
@@ -37,8 +36,8 @@ class ET_EXPERIMENTAL NamedDataMap { * @param key The name of the tensor. * @return Result containing TensorLayout with tensor metadata. */ - ET_NODISCARD virtual Result - get_metadata(const char* key) const = 0; + ET_NODISCARD virtual Result get_metadata( + const char* key) const = 0; /** * Get data by key. * @@ -78,7 +77,7 @@ class ET_EXPERIMENTAL NamedDataMap { ET_NODISCARD virtual Result get_key(size_t index) const = 0; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch #ifdef __GNUC__ diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h index 919eb6c8567..1429eda2acb 100644 --- a/runtime/core/portable_type/c10/c10/macros/Macros.h +++ b/runtime/core/portable_type/c10/c10/macros/Macros.h @@ -241,7 +241,7 @@ using namespace c10::xpu; #ifdef __HIPCC__ // Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. // We do this #include here so that C10_HOST_DEVICE and friends will Just Work. -// See https://github.com/ROCm-Developer-Tools/HIP/issues/441 +// See https://github.com/ROCm/hip/issues/441 #include #endif diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index d9d72b5be3f..4555d42a567 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -25,6 +25,9 @@ def define_common_targets(): "util/Half-inl.h", "util/TypeSafeSignMath.h", "util/bit_cast.h", + "util/complex.h", + "util/complex_math.h", + "util/complex_utils.h", "util/floating_point_utils.h", "util/irange.h", ], @@ -36,6 +39,7 @@ def define_common_targets(): ]), visibility = [ "//executorch/...", + "@EXECUTORCH_CLIENTS", ], deps = select({ "DEFAULT": [], diff --git a/runtime/core/portable_type/c10/c10/util/complex.h b/runtime/core/portable_type/c10/c10/util/complex.h new file mode 100644 index 00000000000..b63710d9458 --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/complex.h @@ -0,0 +1,668 @@ +#pragma once + +#include + +#include +#include + +#if defined(__CUDACC__) || defined(__HIPCC__) +#include +#endif + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif +#if C10_CLANG_HAS_WARNING("-Wfloat-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wfloat-conversion") +#endif + +namespace c10 { + +// c10::complex is an implementation of complex numbers that aims +// to work on all devices supported by PyTorch +// +// Most of the APIs duplicates std::complex +// Reference: https://en.cppreference.com/w/cpp/numeric/complex +// +// [NOTE: Complex Operator Unification] +// Operators currently use a mix of std::complex, thrust::complex, and +// c10::complex internally. The end state is that all operators will use +// c10::complex internally. Until then, there may be some hacks to support all +// variants. 
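Because the header aims to duplicate the std::complex API, a short usage sketch may be helpful. This is illustrative only; it assumes the header is reachable as <c10/util/complex.h> (consistent with the targets.bzl entry above), and main is just a stand-in driver.

#include <complex>

#include <c10/util/complex.h>

int main() {
  c10::complex<float> a(1.0f, 2.0f);  // (re, im), same shape as std::complex
  c10::complex<float> b(3.0f, -1.0f);

  c10::complex<float> sum = a + b;   // (4, 1)
  c10::complex<float> prod = a * b;  // (1*3 - 2*(-1)) + (1*(-1) + 2*3)i = (5, 5)

  // Casts to and from std::complex are explicit.
  std::complex<float> s = static_cast<std::complex<float>>(prod);
  c10::complex<float> back(s);

  return (sum.real() == 4.0f && s.imag() == 5.0f && back.real() == 5.0f) ? 0 : 1;
}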
+// +// +// [Note on Constructors] +// +// The APIs of constructors are mostly copied from C++ standard: +// https://en.cppreference.com/w/cpp/numeric/complex/complex +// +// Since C++14, all constructors are constexpr in std::complex +// +// There are three types of constructors: +// - initializing from real and imag: +// `constexpr complex( const T& re = T(), const T& im = T() );` +// - implicitly-declared copy constructor +// - converting constructors +// +// Converting constructors: +// - std::complex defines converting constructor between float/double/long +// double, +// while we define converting constructor between float/double. +// - For these converting constructors, upcasting is implicit, downcasting is +// explicit. +// - We also define explicit casting from std::complex/thrust::complex +// - Note that the conversion from thrust is not constexpr, because +// thrust does not define them as constexpr ???? +// +// +// [Operator =] +// +// The APIs of operator = are mostly copied from C++ standard: +// https://en.cppreference.com/w/cpp/numeric/complex/operator%3D +// +// Since C++20, all operator= are constexpr. Although we are not building with +// C++20, we also obey this behavior. +// +// There are three types of assign operator: +// - Assign a real value from the same scalar type +// - In std, this is templated as complex& operator=(const T& x) +// with specialization `complex& operator=(T x)` for float/double/long +// double Since we only support float and double, on will use `complex& +// operator=(T x)` +// - Copy assignment operator and converting assignment operator +// - There is no specialization of converting assignment operators, which type +// is +// convertible is solely dependent on whether the scalar type is convertible +// +// In addition to the standard assignment, we also provide assignment operators +// with std and thrust +// +// +// [Casting operators] +// +// std::complex does not have casting operators. We define casting operators +// casting to std::complex and thrust::complex +// +// +// [Operator ""] +// +// std::complex has custom literals `i`, `if` and `il` defined in namespace +// `std::literals::complex_literals`. We define our own custom literals in the +// namespace `c10::complex_literals`. Our custom literals does not follow the +// same behavior as in std::complex, instead, we define _if, _id to construct +// float/double complex literals. +// +// +// [real() and imag()] +// +// In C++20, there are two overload of these functions, one it to return the +// real/imag, another is to set real/imag, they are both constexpr. We follow +// this design. +// +// +// [Operator +=,-=,*=,/=] +// +// Since C++20, these operators become constexpr. In our implementation, they +// are also constexpr. +// +// There are two types of such operators: operating with a real number, or +// operating with another complex number. For the operating with a real number, +// the generic template form has argument type `const T &`, while the overload +// for float/double/long double has `T`. We will follow the same type as +// float/double/long double in std. +// +// [Unary operator +-] +// +// Since C++20, they are constexpr. 
We also make them expr +// +// [Binary operators +-*/] +// +// Each operator has three versions (taking + as example): +// - complex + complex +// - complex + real +// - real + complex +// +// [Operator ==, !=] +// +// Each operator has three versions (taking == as example): +// - complex == complex +// - complex == real +// - real == complex +// +// Some of them are removed on C++20, but we decide to keep them +// +// [Operator <<, >>] +// +// These are implemented by casting to std::complex +// +// +// +// TODO(@zasdfgbnm): c10::complex is not currently supported, +// because: +// - lots of members and functions of c10::Half are not constexpr +// - thrust::complex only support float and double + +template +struct alignas(sizeof(T) * 2) complex { + using value_type = T; + + T real_ = T(0); + T imag_ = T(0); + + constexpr complex() = default; + C10_HOST_DEVICE constexpr complex(const T& re, const T& im = T()) + : real_(re), imag_(im) {} + template + explicit constexpr complex(const std::complex& other) + : complex(other.real(), other.imag()) {} +#if defined(__CUDACC__) || defined(__HIPCC__) + template + explicit C10_HOST_DEVICE complex(const thrust::complex& other) + : real_(other.real()), imag_(other.imag()) {} +// NOTE can not be implemented as follow due to ROCm bug: +// explicit C10_HOST_DEVICE complex(const thrust::complex &other): +// complex(other.real(), other.imag()) {} +#endif + + // Use SFINAE to specialize casting constructor for c10::complex and + // c10::complex + template + C10_HOST_DEVICE explicit constexpr complex( + const std::enable_if_t, complex>& other) + : real_(other.real_), imag_(other.imag_) {} + template + C10_HOST_DEVICE constexpr complex( + const std::enable_if_t, complex>& other) + : real_(other.real_), imag_(other.imag_) {} + + constexpr complex& operator=(T re) { + real_ = re; + imag_ = 0; + return *this; + } + + constexpr complex& operator+=(T re) { + real_ += re; + return *this; + } + + constexpr complex& operator-=(T re) { + real_ -= re; + return *this; + } + + constexpr complex& operator*=(T re) { + real_ *= re; + imag_ *= re; + return *this; + } + + constexpr complex& operator/=(T re) { + real_ /= re; + imag_ /= re; + return *this; + } + + template + constexpr complex& operator=(const complex& rhs) { + real_ = rhs.real(); + imag_ = rhs.imag(); + return *this; + } + + template + constexpr complex& operator+=(const complex& rhs) { + real_ += rhs.real(); + imag_ += rhs.imag(); + return *this; + } + + template + constexpr complex& operator-=(const complex& rhs) { + real_ -= rhs.real(); + imag_ -= rhs.imag(); + return *this; + } + + template + constexpr complex& operator*=(const complex& rhs) { + // (a + bi) * (c + di) = (a*c - b*d) + (a * d + b * c) i + T a = real_; + T b = imag_; + U c = rhs.real(); + U d = rhs.imag(); + real_ = a * c - b * d; + imag_ = a * d + b * c; + return *this; + } + +#ifdef __APPLE__ +#define FORCE_INLINE_APPLE __attribute__((always_inline)) +#else +#define FORCE_INLINE_APPLE +#endif + template + constexpr FORCE_INLINE_APPLE complex& operator/=(const complex& rhs) + __ubsan_ignore_float_divide_by_zero__ { + // (a + bi) / (c + di) = (ac + bd)/(c^2 + d^2) + (bc - ad)/(c^2 + d^2) i + // the calculation below follows numpy's complex division + T a = real_; + T b = imag_; + U c = rhs.real(); + U d = rhs.imag(); + +#if defined(__GNUC__) && !defined(__clang__) + // std::abs is already constexpr by gcc + auto abs_c = std::abs(c); + auto abs_d = std::abs(d); +#else + auto abs_c = c < 0 ? -c : c; + auto abs_d = d < 0 ? 
-d : d; +#endif + + if (abs_c >= abs_d) { + if (abs_c == U(0) && abs_d == U(0)) { + /* divide by zeros should yield a complex inf or nan */ + real_ = a / abs_c; + imag_ = b / abs_d; + } else { + auto rat = d / c; + auto scl = U(1.0) / (c + d * rat); + real_ = (a + b * rat) * scl; + imag_ = (b - a * rat) * scl; + } + } else { + auto rat = c / d; + auto scl = U(1.0) / (d + c * rat); + real_ = (a * rat + b) * scl; + imag_ = (b * rat - a) * scl; + } + return *this; + } +#undef FORCE_INLINE_APPLE + + template + constexpr complex& operator=(const std::complex& rhs) { + real_ = rhs.real(); + imag_ = rhs.imag(); + return *this; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + template + C10_HOST_DEVICE complex& operator=(const thrust::complex& rhs) { + real_ = rhs.real(); + imag_ = rhs.imag(); + return *this; + } +#endif + + template + explicit constexpr operator std::complex() const { + return std::complex(std::complex(real(), imag())); + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + template + C10_HOST_DEVICE explicit operator thrust::complex() const { + return static_cast>(thrust::complex(real(), imag())); + } +#endif + + // consistent with NumPy behavior + explicit constexpr operator bool() const { + return real() || imag(); + } + + C10_HOST_DEVICE constexpr T real() const { + return real_; + } + constexpr void real(T value) { + real_ = value; + } + C10_HOST_DEVICE constexpr T imag() const { + return imag_; + } + constexpr void imag(T value) { + imag_ = value; + } +}; + +namespace complex_literals { + +constexpr complex operator""_if(long double imag) { + return complex(0.0f, static_cast(imag)); +} + +constexpr complex operator""_id(long double imag) { + return complex(0.0, static_cast(imag)); +} + +constexpr complex operator""_if(unsigned long long imag) { + return complex(0.0f, static_cast(imag)); +} + +constexpr complex operator""_id(unsigned long long imag) { + return complex(0.0, static_cast(imag)); +} + +} // namespace complex_literals + +template +constexpr complex operator+(const complex& val) { + return val; +} + +template +constexpr complex operator-(const complex& val) { + return complex(-val.real(), -val.imag()); +} + +template +constexpr complex operator+(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result += rhs; +} + +template +constexpr complex operator+(const complex& lhs, const T& rhs) { + complex result = lhs; + return result += rhs; +} + +template +constexpr complex operator+(const T& lhs, const complex& rhs) { + return complex(lhs + rhs.real(), rhs.imag()); +} + +template +constexpr complex operator-(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result -= rhs; +} + +template +constexpr complex operator-(const complex& lhs, const T& rhs) { + complex result = lhs; + return result -= rhs; +} + +template +constexpr complex operator-(const T& lhs, const complex& rhs) { + complex result = -rhs; + return result += lhs; +} + +template +constexpr complex operator*(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result *= rhs; +} + +template +constexpr complex operator*(const complex& lhs, const T& rhs) { + complex result = lhs; + return result *= rhs; +} + +template +constexpr complex operator*(const T& lhs, const complex& rhs) { + complex result = rhs; + return result *= lhs; +} + +template +constexpr complex operator/(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result /= rhs; +} + +template +constexpr complex operator/(const complex& lhs, const T& rhs) 
{ + complex result = lhs; + return result /= rhs; +} + +template +constexpr complex operator/(const T& lhs, const complex& rhs) { + complex result(lhs, T()); + return result /= rhs; +} + +// Define operators between integral scalars and c10::complex. std::complex does +// not support this when T is a floating-point number. This is useful because it +// saves a lot of "static_cast" when operate a complex and an integer. This +// makes the code both less verbose and potentially more efficient. +#define COMPLEX_INTEGER_OP_TEMPLATE_CONDITION \ + typename std::enable_if_t< \ + std::is_floating_point_v && std::is_integral_v, \ + int> = 0 + +template +constexpr c10::complex operator+(const c10::complex& a, const iT& b) { + return a + static_cast(b); +} + +template +constexpr c10::complex operator+(const iT& a, const c10::complex& b) { + return static_cast(a) + b; +} + +template +constexpr c10::complex operator-(const c10::complex& a, const iT& b) { + return a - static_cast(b); +} + +template +constexpr c10::complex operator-(const iT& a, const c10::complex& b) { + return static_cast(a) - b; +} + +template +constexpr c10::complex operator*(const c10::complex& a, const iT& b) { + return a * static_cast(b); +} + +template +constexpr c10::complex operator*(const iT& a, const c10::complex& b) { + return static_cast(a) * b; +} + +template +constexpr c10::complex operator/(const c10::complex& a, const iT& b) { + return a / static_cast(b); +} + +template +constexpr c10::complex operator/(const iT& a, const c10::complex& b) { + return static_cast(a) / b; +} + +#undef COMPLEX_INTEGER_OP_TEMPLATE_CONDITION + +template +constexpr bool operator==(const complex& lhs, const complex& rhs) { + return (lhs.real() == rhs.real()) && (lhs.imag() == rhs.imag()); +} + +template +constexpr bool operator==(const complex& lhs, const T& rhs) { + return (lhs.real() == rhs) && (lhs.imag() == T()); +} + +template +constexpr bool operator==(const T& lhs, const complex& rhs) { + return (lhs == rhs.real()) && (T() == rhs.imag()); +} + +template +constexpr bool operator!=(const complex& lhs, const complex& rhs) { + return !(lhs == rhs); +} + +template +constexpr bool operator!=(const complex& lhs, const T& rhs) { + return !(lhs == rhs); +} + +template +constexpr bool operator!=(const T& lhs, const complex& rhs) { + return !(lhs == rhs); +} + +template +std::basic_ostream& operator<<( + std::basic_ostream& os, + const complex& x) { + return (os << static_cast>(x)); +} + +template +std::basic_istream& operator>>( + std::basic_istream& is, + complex& x) { + std::complex tmp; + is >> tmp; + x = tmp; + return is; +} + +} // namespace c10 + +// std functions +// +// The implementation of these functions also follow the design of C++20 + +namespace std { + +template +constexpr T real(const c10::complex& z) { + return z.real(); +} + +template +constexpr T imag(const c10::complex& z) { + return z.imag(); +} + +template +C10_HOST_DEVICE T abs(const c10::complex& z) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return thrust::abs(static_cast>(z)); +#else + return std::abs(static_cast>(z)); +#endif +} + +#if defined(USE_ROCM) +#define ROCm_Bug(x) +#else +#define ROCm_Bug(x) x +#endif + +template +C10_HOST_DEVICE T arg(const c10::complex& z) { + return ROCm_Bug(std)::atan2(std::imag(z), std::real(z)); +} + +#undef ROCm_Bug + +template +constexpr T norm(const c10::complex& z) { + return z.real() * z.real() + z.imag() * z.imag(); +} + +// For std::conj, there are other versions of it: +// constexpr std::complex conj( float z ); +// 
template< class DoubleOrInteger > +// constexpr std::complex conj( DoubleOrInteger z ); +// constexpr std::complex conj( long double z ); +// These are not implemented +// TODO(@zasdfgbnm): implement them as c10::conj +template +constexpr c10::complex conj(const c10::complex& z) { + return c10::complex(z.real(), -z.imag()); +} + +// Thrust does not have complex --> complex version of thrust::proj, +// so this function is not implemented at c10 right now. +// TODO(@zasdfgbnm): implement it by ourselves + +// There is no c10 version of std::polar, because std::polar always +// returns std::complex. Use c10::polar instead; + +} // namespace std + +namespace c10 { + +template +C10_HOST_DEVICE complex polar(const T& r, const T& theta = T()) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::polar(r, theta)); +#else + // std::polar() requires r >= 0, so spell out the explicit implementation to + // avoid a branch. + return complex(r * std::cos(theta), r * std::sin(theta)); +#endif +} + +template <> +struct alignas(4) complex { + Half real_; + Half imag_; + + // Constructors + complex() = default; + // Half constructor is not constexpr so the following constructor can't + // be constexpr + C10_HOST_DEVICE explicit inline complex(const Half& real, const Half& imag) + : real_(real), imag_(imag) {} + C10_HOST_DEVICE inline complex(const c10::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + // Conversion operator + inline C10_HOST_DEVICE operator c10::complex() const { + return {real_, imag_}; + } + + constexpr C10_HOST_DEVICE Half real() const { + return real_; + } + constexpr C10_HOST_DEVICE Half imag() const { + return imag_; + } + + C10_HOST_DEVICE complex& operator+=(const complex& other) { + real_ = static_cast(real_) + static_cast(other.real_); + imag_ = static_cast(imag_) + static_cast(other.imag_); + return *this; + } + + C10_HOST_DEVICE complex& operator-=(const complex& other) { + real_ = static_cast(real_) - static_cast(other.real_); + imag_ = static_cast(imag_) - static_cast(other.imag_); + return *this; + } + + C10_HOST_DEVICE complex& operator*=(const complex& other) { + auto a = static_cast(real_); + auto b = static_cast(imag_); + auto c = static_cast(other.real()); + auto d = static_cast(other.imag()); + real_ = a * c - b * d; + imag_ = a * d + b * c; + return *this; + } +}; + +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() + +#define C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H +// math functions are included in a separate file +#include // IWYU pragma: keep +// utilities for complex types +#include // IWYU pragma: keep +#undef C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H diff --git a/runtime/core/portable_type/c10/c10/util/complex_math.h b/runtime/core/portable_type/c10/c10/util/complex_math.h new file mode 100644 index 00000000000..2b591026c94 --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/complex_math.h @@ -0,0 +1,406 @@ +#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H) +#error \ + "c10/util/complex_math.h is not meant to be individually included. Include c10/util/complex.h instead." 
+#endif + +namespace c10_complex_math { + +// Exponential functions + +template +C10_HOST_DEVICE inline c10::complex exp(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::exp(static_cast>(x))); +#else + return static_cast>( + std::exp(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::log(static_cast>(x))); +#else + return static_cast>( + std::log(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log10(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::log10(static_cast>(x))); +#else + return static_cast>( + std::log10(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log2(const c10::complex& x) { + const c10::complex log2 = c10::complex(::log(2.0), 0.0); + return c10_complex_math::log(x) / log2; +} + +// Power functions +// +#if defined(_LIBCPP_VERSION) || \ + (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)) +namespace _detail { +C10_API c10::complex sqrt(const c10::complex& in); +C10_API c10::complex sqrt(const c10::complex& in); +C10_API c10::complex acos(const c10::complex& in); +C10_API c10::complex acos(const c10::complex& in); +} // namespace _detail +#endif + +template +C10_HOST_DEVICE inline c10::complex sqrt(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sqrt(static_cast>(x))); +#elif !( \ + defined(_LIBCPP_VERSION) || \ + (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX))) + return static_cast>( + std::sqrt(static_cast>(x))); +#else + return _detail::sqrt(x); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::pow( + static_cast>(x), static_cast>(y))); +#else + return static_cast>(std::pow( + static_cast>(x), static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const T& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(static_cast>(x), y)); +#else + return static_cast>( + std::pow(static_cast>(x), y)); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const T& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(x, static_cast>(y))); +#else + return static_cast>( + std::pow(x, static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::pow( + static_cast>(x), static_cast>(y))); +#else + return static_cast>(std::pow( + static_cast>(x), static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const U& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(static_cast>(x), y)); +#else + return static_cast>( + std::pow(static_cast>(x), y)); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const T& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(x, static_cast>(y))); +#else + return static_cast>( + std::pow(x, static_cast>(y))); +#endif +} + +// Trigonometric functions + 
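As a quick host-side illustration of the wrappers defined above (they forward to std:: on CPU and to thrust:: under CUDA/HIP), the sketch below checks an exp/log round trip and the log2(z) = log(z)/log(2) identity. complex_math_smoke_test is a hypothetical helper and the include path is assumed as in the previous example.

#include <cmath>

#include <c10/util/complex.h>  // also pulls in complex_math.h (see the end of complex.h)

bool complex_math_smoke_test() {
  c10::complex<double> z(0.5, 1.25);

  // exp and log are inverses of each other (up to rounding) for this z.
  c10::complex<double> roundtrip =
      c10_complex_math::exp(c10_complex_math::log(z));

  // log2(z) is implemented above as log(z) / log(2).
  c10::complex<double> l2 = c10_complex_math::log2(z);
  c10::complex<double> expected = c10_complex_math::log(z) / std::log(2.0);

  auto close = [](c10::complex<double> x, c10::complex<double> y) {
    return std::abs(x - y) < 1e-12;
  };
  return close(roundtrip, z) && close(l2, expected);
}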
+template +C10_HOST_DEVICE inline c10::complex sin(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sin(static_cast>(x))); +#else + return static_cast>( + std::sin(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex cos(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::cos(static_cast>(x))); +#else + return static_cast>( + std::cos(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex tan(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::tan(static_cast>(x))); +#else + return static_cast>( + std::tan(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex asin(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::asin(static_cast>(x))); +#else + return static_cast>( + std::asin(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex acos(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::acos(static_cast>(x))); +#elif !defined(_LIBCPP_VERSION) + return static_cast>( + std::acos(static_cast>(x))); +#else + return _detail::acos(x); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex atan(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::atan(static_cast>(x))); +#else + return static_cast>( + std::atan(static_cast>(x))); +#endif +} + +// Hyperbolic functions + +template +C10_HOST_DEVICE inline c10::complex sinh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sinh(static_cast>(x))); +#else + return static_cast>( + std::sinh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex cosh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::cosh(static_cast>(x))); +#else + return static_cast>( + std::cosh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex tanh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::tanh(static_cast>(x))); +#else + return static_cast>( + std::tanh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex asinh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::asinh(static_cast>(x))); +#else + return static_cast>( + std::asinh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex acosh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::acosh(static_cast>(x))); +#else + return static_cast>( + std::acosh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex atanh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::atanh(static_cast>(x))); +#else + return static_cast>( + std::atanh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log1p(const c10::complex& z) { +#if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \ + defined(__HIPCC__) + // For Mac, the new implementation yielded a high relative error. Falling back + // to the old version for now. 
+ // See https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 + // For CUDA we also use this one, as thrust::log(thrust::complex) takes + // *forever* to compile + + // log1p(z) = log(1 + z) + // Let's define 1 + z = r * e ^ (i * a), then we have + // log(r * e ^ (i * a)) = log(r) + i * a + // With z = x + iy, the term r can be written as + // r = ((1 + x) ^ 2 + y ^ 2) ^ 0.5 + // = (1 + x ^ 2 + 2 * x + y ^ 2) ^ 0.5 + // So, log(r) is + // log(r) = 0.5 * log(1 + x ^ 2 + 2 * x + y ^ 2) + // = 0.5 * log1p(x * (x + 2) + y ^ 2) + // we need to use the expression only on certain condition to avoid overflow + // and underflow from `(x * (x + 2) + y ^ 2)` + T x = z.real(); + T y = z.imag(); + T zabs = std::abs(z); + T theta = std::atan2(y, x + T(1)); + if (zabs < 0.5) { + T r = x * (T(2) + x) + y * y; + if (r == 0) { // handle underflow + return {x, theta}; + } + return {T(0.5) * std::log1p(r), theta}; + } else { + T z0 = std::hypot(x + 1, y); + return {std::log(z0), theta}; + } +#else + // CPU path + // Based on https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 + c10::complex u = z + T(1); + if (u == T(1)) { + return z; + } else { + auto log_u = log(u); + if (u - T(1) == z) { + return log_u; + } + return log_u * (z / (u - T(1))); + } +#endif +} + +template +C10_HOST_DEVICE inline c10::complex expm1(const c10::complex& z) { + // expm1(z) = exp(z) - 1 + // Define z = x + i * y + // f = e ^ (x + i * y) - 1 + // = e ^ x * e ^ (i * y) - 1 + // = (e ^ x * cos(y) - 1) + i * (e ^ x * sin(y)) + // = (e ^ x - 1) * cos(y) - (1 - cos(y)) + i * e ^ x * sin(y) + // = expm1(x) * cos(y) - 2 * sin(y / 2) ^ 2 + i * e ^ x * sin(y) + T x = z.real(); + T y = z.imag(); + T a = std::sin(y / 2); + T er = std::expm1(x) * std::cos(y) - T(2) * a * a; + T ei = std::exp(x) * std::sin(y); + return {er, ei}; +} + +} // namespace c10_complex_math + +using c10_complex_math::acos; +using c10_complex_math::acosh; +using c10_complex_math::asin; +using c10_complex_math::asinh; +using c10_complex_math::atan; +using c10_complex_math::atanh; +using c10_complex_math::cos; +using c10_complex_math::cosh; +using c10_complex_math::exp; +using c10_complex_math::expm1; +using c10_complex_math::log; +using c10_complex_math::log10; +using c10_complex_math::log1p; +using c10_complex_math::log2; +using c10_complex_math::pow; +using c10_complex_math::sin; +using c10_complex_math::sinh; +using c10_complex_math::sqrt; +using c10_complex_math::tan; +using c10_complex_math::tanh; + +namespace std { + +using c10_complex_math::acos; +using c10_complex_math::acosh; +using c10_complex_math::asin; +using c10_complex_math::asinh; +using c10_complex_math::atan; +using c10_complex_math::atanh; +using c10_complex_math::cos; +using c10_complex_math::cosh; +using c10_complex_math::exp; +using c10_complex_math::expm1; +using c10_complex_math::log; +using c10_complex_math::log10; +using c10_complex_math::log1p; +using c10_complex_math::log2; +using c10_complex_math::pow; +using c10_complex_math::sin; +using c10_complex_math::sinh; +using c10_complex_math::sqrt; +using c10_complex_math::tan; +using c10_complex_math::tanh; + +} // namespace std diff --git a/runtime/core/portable_type/c10/c10/util/complex_utils.h b/runtime/core/portable_type/c10/c10/util/complex_utils.h new file mode 100644 index 00000000000..1ca105f1d0a --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/complex_utils.h @@ -0,0 +1,46 @@ +#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H) +#error \ + "c10/util/complex_utils.h is not meant to be individually 
included. Include c10/util/complex.h instead." +#endif + +#include + +namespace c10 { + +template +struct is_complex : public std::false_type {}; + +template +struct is_complex> : public std::true_type {}; + +template +struct is_complex> : public std::true_type {}; + +// Extract double from std::complex; is identity otherwise +// TODO: Write in more idiomatic C++17 +template +struct scalar_value_type { + using type = T; +}; +template +struct scalar_value_type> { + using type = T; +}; +template +struct scalar_value_type> { + using type = T; +}; + +} // namespace c10 + +namespace std { + +template +class numeric_limits> : public numeric_limits {}; + +template +bool isnan(const c10::complex& v) { + return std::isnan(v.real()) || std::isnan(v.imag()); +} + +} // namespace std diff --git a/runtime/core/portable_type/complex.h b/runtime/core/portable_type/complex.h index e89a19e54d7..faf13a0432f 100644 --- a/runtime/core/portable_type/complex.h +++ b/runtime/core/portable_type/complex.h @@ -8,39 +8,14 @@ #pragma once -#include +#include -namespace executorch { -namespace runtime { -namespace etensor { +namespace executorch::runtime::etensor { +using c10::complex; +} // namespace executorch::runtime::etensor -/** - * An implementation of complex numbers, compatible with c10/util/complex.h from - * pytorch core. - */ -template -struct alignas(sizeof(T) * 2) complex { - T real_ = T(0); - T imag_ = T(0); -}; - -/** - * Specialization for Half, which is not a primitive C numeric type. - */ -template <> -struct alignas(4) complex { - Half real_; - Half imag_; -}; - -} // namespace etensor -} // namespace runtime -} // namespace executorch - -namespace torch { -namespace executor { +namespace torch::executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
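The traits in complex_utils.h and the new portable_type alias above are small but load-bearing for kernel code; here is a minimal sketch of what they provide, assuming the include paths used elsewhere in this diff (has_nan is a hypothetical helper).

#include <type_traits>

#include <c10/util/complex.h>
#include <executorch/runtime/core/portable_type/complex.h>

// is_complex<T> detects complex element types; scalar_value_type<T> unwraps them.
static_assert(c10::is_complex<c10::complex<float>>::value, "");
static_assert(!c10::is_complex<float>::value, "");
static_assert(
    std::is_same<c10::scalar_value_type<c10::complex<double>>::type, double>::value,
    "");

// After the portable_type change above, etensor::complex is just c10::complex.
static_assert(
    std::is_same<executorch::runtime::etensor::complex<float>,
                 c10::complex<float>>::value,
    "");

// std::isnan is overloaded for c10::complex in complex_utils.h.
bool has_nan(const c10::complex<double>& z) {
  return std::isnan(z);
}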
using ::executorch::runtime::etensor::complex; -} // namespace executor -} // namespace torch +} // namespace torch::executor diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index 3195e727d96..d3e02b1afb5 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -95,9 +95,9 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], exported_deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + ":core", ":tag", + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, ], ) @@ -119,6 +119,37 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "named_data_map" + aten_suffix, + exported_headers = [ + "named_data_map.h", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + ":tensor_layout" + aten_suffix, + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + ], + ) + + + runtime.cxx_library( + name = "tensor_layout" + aten_suffix, + srcs = ["tensor_layout.cpp"], + exported_headers = ["tensor_layout.h"], + deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", + ], + exported_deps = [ + ":core", + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, + ], + visibility = ["//executorch/..."], + ) + runtime.cxx_library( name = "tag", srcs = ["tag.cpp"], @@ -133,31 +164,3 @@ def define_common_targets(): "//executorch/...", ], ) - - runtime.cxx_library( - name = "named_data_map", - exported_headers = [ - "named_data_map.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - ":tensor_layout", - ], - ) - - runtime.cxx_library( - name = "tensor_layout", - srcs = ["tensor_layout.cpp"], - exported_headers = ["tensor_layout.h"], - deps = [ - "//executorch/runtime/core/portable_type/c10/c10:c10", - ], - exported_deps = [ - ":core", - "//executorch/runtime/core/exec_aten:lib", - ], - visibility = ["//executorch/..."], - ) diff --git a/runtime/core/tensor_layout.cpp b/runtime/core/tensor_layout.cpp index 2b862e6dc14..d33f79f27c4 100644 --- a/runtime/core/tensor_layout.cpp +++ b/runtime/core/tensor_layout.cpp @@ -13,7 +13,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace { Result calculate_nbytes( @@ -51,5 +51,5 @@ Result TensorLayout::create( } return TensorLayout(sizes, dim_order, scalar_type, nbytes.get()); } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/core/tensor_layout.h b/runtime/core/tensor_layout.h index c2c3833f528..42131e6506e 100644 --- a/runtime/core/tensor_layout.h +++ b/runtime/core/tensor_layout.h @@ -14,7 +14,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Describes the layout of a tensor. 
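A hedged sketch of using TensorLayout::create as wired up above. The Span element types are elided in this hunk, so int32_t sizes and uint8_t dim order are assumed here (matching their use elsewhere in the runtime), as is the usual nbytes() accessor; layout_nbytes_example is a hypothetical helper.

#include <cstdint>

#include <executorch/runtime/core/tensor_layout.h>

executorch::runtime::Result<size_t> layout_nbytes_example() {
  static const int32_t sizes[] = {2, 3, 4};
  static const uint8_t dim_order[] = {0, 1, 2};

  auto layout = executorch::ET_RUNTIME_NAMESPACE::TensorLayout::create(
      executorch::runtime::Span<const int32_t>(sizes, 3),
      executorch::runtime::Span<const uint8_t>(dim_order, 3),
      executorch::aten::ScalarType::Float);
  if (!layout.ok()) {
    return layout.error();
  }
  // 2 * 3 * 4 elements at 4 bytes each for Float -> 96 bytes.
  return layout->nbytes();
}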
@@ -89,5 +89,5 @@ class ET_EXPERIMENTAL TensorLayout final { const size_t nbytes_; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/core/test/event_tracer_test.cpp b/runtime/core/test/event_tracer_test.cpp index 224e87cc2b1..1c9e1a446b9 100644 --- a/runtime/core/test/event_tracer_test.cpp +++ b/runtime/core/test/event_tracer_test.cpp @@ -28,6 +28,7 @@ using executorch::runtime::EValue; using executorch::runtime::EventTracer; using executorch::runtime::EventTracerDebugLogLevel; using executorch::runtime::EventTracerEntry; +using executorch::runtime::EventTracerFilterBase; using executorch::runtime::kUnsetChainId; using executorch::runtime::kUnsetDebugHandle; using executorch::runtime::kUnsetDelegateDebugIntId; @@ -90,6 +91,11 @@ class DummyEventTracer : public EventTracer { (void)metadata_len; } + void set_delegation_intermediate_output_filter( + EventTracerFilterBase* event_tracer_filter) override { + (void)event_tracer_filter; + } + void log_profiling_delegate( const char* name, DelegateDebugIntId delegate_debug_id, @@ -155,9 +161,11 @@ class DummyEventTracer : public EventTracer { return true; } - void log_evalue(const EValue& evalue, LoggedEValueType evalue_type) override { + Result log_evalue(const EValue& evalue, LoggedEValueType evalue_type) + override { logged_evalue_ = evalue; logged_evalue_type_ = evalue_type; + return true; } EValue logged_evalue() { diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 41d44522a22..f09af8ac2e7 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -32,9 +32,8 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { -using deserialization::NamedData; using internal::PlatformMemoryAllocator; /** @@ -1594,6 +1593,37 @@ EValue& Method::mutable_input(size_t i) { return mutable_value(get_input_index(i)); } +Result Method::get_attribute( + executorch::aten::string_view name) { + auto flatbuffer_values = serialization_plan_->values(); + size_t counter = 0; + + for (size_t i = 0; i < flatbuffer_values->size(); ++i) { + auto serialization_value = flatbuffer_values->Get(i); + if (serialization_value->val_type() == + executorch_flatbuffer::KernelTypes::Tensor) { + const auto s_tensor = static_cast( + serialization_value->val()); + if (s_tensor->extra_tensor_info() != nullptr && + s_tensor->extra_tensor_info()->fully_qualified_name() != nullptr && + strcmp( + s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(), + name.data()) == 0) { + if (!this->values_[counter].isTensor()) { + ET_LOG( + Error, + "Attribute tensor not at the expected location. The .pte is likely malformed. Please file a bug report on https://github.com/pytorch/executorch/issues"); + return Error::Internal; + } + return this->values_[counter].toTensor(); + } + } + ++counter; + } + + return Error::NotFound; +} + size_t Method::outputs_size() const { const auto* outputs = serialization_plan_->outputs(); return outputs == nullptr ? 0 : outputs->size(); @@ -1640,5 +1670,5 @@ Method::~Method() { } // All other fields are trivially destructible. 
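A usage sketch for the Method::get_attribute API added above. The method object is assumed to be loaded elsewhere, "model.linear.weight" is a placeholder fully qualified name, and log_attribute_rank is a hypothetical helper; the error handling mirrors the NotFound/Internal codes used in the implementation.

#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/platform/log.h>

using executorch::ET_RUNTIME_NAMESPACE::Method;
using executorch::runtime::Error;

void log_attribute_rank(Method& method) {
  auto attr = method.get_attribute("model.linear.weight");
  if (attr.ok()) {
    const executorch::aten::Tensor& weight = attr.get();
    ET_LOG(Info, "attribute rank: %d", static_cast<int>(weight.dim()));
  } else if (attr.error() == Error::NotFound) {
    ET_LOG(Info, "no attribute tensor with that name");
  }
}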
} -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/method.h b/runtime/executor/method.h index 0ca2df440ad..0cf7164c98e 100644 --- a/runtime/executor/method.h +++ b/runtime/executor/method.h @@ -32,7 +32,7 @@ struct EValue; } // namespace executorch_flatbuffer namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { // Forward declare NamedData. This is a public header and must not include // internal data types. @@ -192,6 +192,18 @@ class Method final { */ ET_NODISCARD Error get_inputs(EValue* input_evalues, size_t length); + /** + * + * Retrieves the attribute tensor associated with the given name. + * + * @param[in] name The name of the attribute tensor to retrieve. + * + * @returns Result containing the attribute tensor on success, non-Ok on + * failure. + */ + ET_NODISCARD Result get_attribute( + executorch::aten::string_view name); + /** * Execute the method. * @@ -394,14 +406,14 @@ class Method final { void log_outputs(); }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::Method; +using ::executorch::ET_RUNTIME_NAMESPACE::Method; } // namespace executor } // namespace torch diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index 8f84fea940f..e810d195370 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -16,7 +16,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace { Result get_tag( @@ -69,9 +69,11 @@ TensorInfo::TensorInfo( Span sizes, Span dim_order, executorch::aten::ScalarType scalar_type, - const bool is_memory_planned) + const bool is_memory_planned, + executorch::aten::string_view name) : sizes_(sizes), dim_order_(dim_order), + name_(name), scalar_type_(scalar_type), is_memory_planned_(is_memory_planned), nbytes_(calculate_nbytes(sizes_, scalar_type_)) {} @@ -96,6 +98,10 @@ size_t TensorInfo::nbytes() const { return nbytes_; } +executorch::aten::string_view TensorInfo::name() const { + return name_; +} + MethodMeta::MethodMeta(const executorch_flatbuffer::ExecutionPlan* s_plan) : s_plan_(s_plan) {} @@ -149,8 +155,9 @@ Result MethodMeta::input_tensor_meta(size_t index) const { tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), static_cast(tensor_value->scalar_type()), tensor_value->allocation_info() != nullptr || - tensor_value->data_buffer_idx() != - 0); // Count constant returns as memory planned. + tensor_value->data_buffer_idx() != 0 /* is_memory_planned */, + executorch::aten::string_view{nullptr, 0}); // Count constant returns as + // memory planned. } size_t MethodMeta::num_outputs() const { @@ -200,8 +207,60 @@ Result MethodMeta::output_tensor_meta(size_t index) const { tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), static_cast(tensor_value->scalar_type()), tensor_value->allocation_info() != nullptr || - tensor_value->data_buffer_idx() != - 0); // Count constant returns as memory planned. + tensor_value->data_buffer_idx() != 0 /* is_memory_planned */, + executorch::aten::string_view{nullptr, 0}); // Count constant returns as + // memory planned. 
+} + +size_t MethodMeta::num_attributes() const { + size_t counter = 0; + auto values = s_plan_->values(); + for (size_t i = 0; i < values->size(); ++i) { + auto value = values->Get(i); + if (value->val_type() == executorch_flatbuffer::KernelTypes::Tensor) { + auto tensor_value = value->val_as_Tensor(); + if (tensor_value->extra_tensor_info() != nullptr && + tensor_value->extra_tensor_info()->fully_qualified_name()->c_str() != + nullptr) { + ++counter; + } + } + } + return counter; +} + +Result MethodMeta::attribute_tensor_meta(size_t index) const { + size_t counter = 0; + auto values = s_plan_->values(); + for (size_t i = 0; i < values->size(); ++i) { + auto value = values->Get(i); + if (value->val_type() == executorch_flatbuffer::KernelTypes::Tensor) { + auto tensor_value = value->val_as_Tensor(); + if (tensor_value->extra_tensor_info() != nullptr && + tensor_value->extra_tensor_info()->fully_qualified_name()->c_str() != + nullptr) { + if (counter == index) { + auto t_name = + tensor_value->extra_tensor_info()->fully_qualified_name(); + // Count constant returns as memory planned + return TensorInfo( + Span( + tensor_value->sizes()->data(), tensor_value->sizes()->size()), + Span( + tensor_value->dim_order()->data(), + tensor_value->dim_order()->size()), + static_cast( + tensor_value->scalar_type()), + tensor_value->allocation_info() != nullptr || + tensor_value->data_buffer_idx() != 0 /* is_memory_planned */, + executorch::aten::string_view{t_name->c_str(), t_name->size()}); + } + ++counter; + } + } + } + ET_LOG(Error, "No attribute tensor found at index %zu", index); + return Error::InvalidArgument; } size_t MethodMeta::num_memory_planned_buffers() const { @@ -279,6 +338,5 @@ size_t MethodMeta::num_instructions() const { } return num_instructions; } - -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h index d9bb64d68a7..ec910f9f6e4 100644 --- a/runtime/executor/method_meta.h +++ b/runtime/executor/method_meta.h @@ -20,7 +20,7 @@ struct ExecutionPlan; } // namespace executorch_flatbuffer namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Metadata about a specific tensor of an ExecuTorch Program. @@ -62,6 +62,12 @@ class TensorInfo final { */ size_t nbytes() const; + /** + * Returns the fully qualified name of the Tensor might be empty if the tensor + * is nameless. + */ + executorch::aten::string_view name() const; + private: // Let MethodMeta create TensorInfo. friend class MethodMeta; @@ -70,7 +76,8 @@ class TensorInfo final { Span sizes, Span dim_order, executorch::aten::ScalarType scalar_type, - const bool is_memory_planned); + const bool is_memory_planned, + executorch::aten::string_view name); /** * The sizes of the tensor. @@ -88,6 +95,9 @@ class TensorInfo final { */ Span dim_order_; + /// The fully qualified name of the Tensor. + executorch::aten::string_view name_; + /// The scalar type of the tensor. executorch::aten::ScalarType scalar_type_; @@ -170,6 +180,21 @@ class MethodMeta final { */ Result output_tensor_meta(size_t index) const; + /** + * Get the number of attribute tensors in this method. + * + * @returns The number of attribute tensors. + */ + size_t num_attributes() const; + + /** + * Get metadata about the specified attribute tensor. + * + * @param[in] index The index of the attribute tensor to look up. + * @returns The metadata on success, or an error on failure. 
+ */ + Result attribute_tensor_meta(size_t index) const; + /** * Get the number of memory-planned buffers this method requires. * @@ -240,14 +265,14 @@ class MethodMeta final { const executorch_flatbuffer::ExecutionPlan* s_plan_; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::MethodMeta; -using ::executorch::runtime::TensorInfo; +using ::executorch::ET_RUNTIME_NAMESPACE::MethodMeta; +using ::executorch::ET_RUNTIME_NAMESPACE::TensorInfo; } // namespace executor } // namespace torch diff --git a/runtime/executor/platform_memory_allocator.h b/runtime/executor/platform_memory_allocator.h index 09195a460ac..7ab58bf0f3d 100644 --- a/runtime/executor/platform_memory_allocator.h +++ b/runtime/executor/platform_memory_allocator.h @@ -17,7 +17,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /** @@ -107,5 +107,5 @@ class PlatformMemoryAllocator final : public MemoryAllocator { }; } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp index 14e0b83d8aa..238c806b1d6 100644 --- a/runtime/executor/program.cpp +++ b/runtime/executor/program.cpp @@ -28,8 +28,7 @@ #endif namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { namespace { /** @@ -535,5 +534,5 @@ Error Program::load_mutable_subsegment_into( segment_base_offset_ + segment->offset() + offset, size, info, buffer); } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/program.h b/runtime/executor/program.h index 0932e51619f..9670fd7c79f 100644 --- a/runtime/executor/program.h +++ b/runtime/executor/program.h @@ -36,8 +36,7 @@ struct Program; } // namespace executorch_flatbuffer namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { namespace testing { // Provides test access to private Program methods. class ProgramTestFriend; @@ -313,14 +312,14 @@ class Program final { std::optional pte_data_map_; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
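As a companion to the MethodMeta declarations above, the following is a short, hedged sketch of enumerating attribute tensors through the new metadata API. The method name "forward" and the logging are assumptions for illustration; only num_attributes(), attribute_tensor_meta(), name(), nbytes(), and is_memory_planned() come from this diff.

// Sketch only; assumes the default build where ET_RUNTIME_NAMESPACE
// resolves to `runtime` and a Program that has already been loaded.
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>

void log_attributes(const executorch::runtime::Program& program) {
  auto meta = program.method_meta("forward");  // assumed method name
  if (!meta.ok()) {
    return;
  }
  for (size_t i = 0; i < meta->num_attributes(); ++i) {
    auto info = meta->attribute_tensor_meta(i);
    if (!info.ok()) {
      continue;
    }
    // name() is the fully qualified buffer name recorded at export time.
    auto name = info->name();
    ET_LOG(
        Info,
        "attribute %zu: %.*s, %zu bytes, memory planned: %d",
        i,
        static_cast<int>(name.size()),
        name.data(),
        info->nbytes(),
        static_cast<int>(info->is_memory_planned()));
  }
}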
-using ::executorch::runtime::Program; +using ::executorch::ET_RUNTIME_NAMESPACE::Program; } // namespace executor } // namespace torch diff --git a/runtime/executor/pte_data_map.cpp b/runtime/executor/pte_data_map.cpp index 5829395028a..fd064cb8256 100644 --- a/runtime/executor/pte_data_map.cpp +++ b/runtime/executor/pte_data_map.cpp @@ -10,7 +10,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /* static */ executorch::runtime::Result PteDataMap::create( @@ -83,5 +83,5 @@ ET_NODISCARD executorch::runtime::Result PteDataMap::get_key( } } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/pte_data_map.h b/runtime/executor/pte_data_map.h index 01c15555786..b26c0ac42f9 100644 --- a/runtime/executor/pte_data_map.h +++ b/runtime/executor/pte_data_map.h @@ -46,7 +46,7 @@ using FlatbufferDataSegment = flatbuffers:: #endif namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /** @@ -147,5 +147,5 @@ class PteDataMap final : public NamedDataMap { }; } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index cfb6c607359..649b2c13cc1 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -42,30 +42,33 @@ def define_common_targets(): ], ) - runtime.cxx_library( - name = "pte_data_map", - srcs = [ - "pte_data_map.cpp", - ], - exported_headers = [ - "pte_data_map.h", - ], - visibility = [ - "//executorch/runtime/executor/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core:named_data_map", - ], - deps = [ - "//executorch/schema:program", - ], - exported_preprocessor_flags = [] if runtime.is_oss else ["-DEXECUTORCH_INTERNAL_FLATBUFFERS=1"], - ) for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" + + runtime.cxx_library( + name = "pte_data_map" + aten_suffix, + srcs = [ + "pte_data_map.cpp", + ], + exported_headers = [ + "pte_data_map.h", + ], + visibility = [ + "//executorch/runtime/executor/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core:named_data_map" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, + ], + deps = [ + "//executorch/schema:program", + ], + exported_preprocessor_flags = [] if runtime.is_oss else ["-DEXECUTORCH_INTERNAL_FLATBUFFERS=1"], + ) + runtime.cxx_library( name = "program" + aten_suffix, exported_deps = [ @@ -103,17 +106,17 @@ def define_common_targets(): preprocessor_flags = _program_preprocessor_flags(), exported_deps = [ ":memory_manager", - ":pte_data_map", - "//executorch/runtime/backend:interface", + ":pte_data_map" + aten_suffix, + "//executorch/runtime/backend:interface" + aten_suffix, "//executorch/runtime/core:core", - "//executorch/runtime/core:named_data_map", + "//executorch/runtime/core:named_data_map" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/runtime/core:event_tracer" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, - 
"//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/runtime/platform:platform", "//executorch/schema:extended_header", ], diff --git a/runtime/executor/tensor_parser.h b/runtime/executor/tensor_parser.h index 1fae84cfb05..e2b5ff8d6ea 100644 --- a/runtime/executor/tensor_parser.h +++ b/runtime/executor/tensor_parser.h @@ -21,7 +21,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace deserialization { /// Data structure to hold key and data buffer for external data used @@ -142,7 +142,7 @@ ET_NODISCARD Result getTensorDataPtr( Span external_constants = {}); } // namespace deserialization -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { @@ -150,10 +150,11 @@ namespace executor { namespace deserialization { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::deserialization::getTensorDataPtr; -using ::executorch::runtime::deserialization::parseListOptionalType; -using ::executorch::runtime::deserialization::parseTensor; -using ::executorch::runtime::deserialization::parseTensorList; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization::getTensorDataPtr; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization:: + parseListOptionalType; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization::parseTensor; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization::parseTensorList; } // namespace deserialization } // namespace executor } // namespace torch diff --git a/runtime/executor/tensor_parser_aten.cpp b/runtime/executor/tensor_parser_aten.cpp index d1a2f712853..2d454d15be5 100644 --- a/runtime/executor/tensor_parser_aten.cpp +++ b/runtime/executor/tensor_parser_aten.cpp @@ -19,7 +19,9 @@ #include // @donotremove @manual=//caffe2/aten:ATen-core namespace executorch { +// This file is only used in ATen mode, so we use the runtime_aten namespace. namespace runtime { +namespace aten { namespace deserialization { namespace { @@ -126,5 +128,6 @@ Result parseTensor( } } // namespace deserialization +} // namespace aten } // namespace runtime } // namespace executorch diff --git a/runtime/executor/tensor_parser_exec_aten.cpp b/runtime/executor/tensor_parser_exec_aten.cpp index 14ba5e0d42c..aa27bbf929d 100644 --- a/runtime/executor/tensor_parser_exec_aten.cpp +++ b/runtime/executor/tensor_parser_exec_aten.cpp @@ -16,11 +16,10 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace deserialization { using executorch::aten::ScalarType; -using executorch::runtime::TensorLayout; // Provides access to private Program methods. 
class TensorParser final { public: @@ -256,5 +255,5 @@ ET_NODISCARD Result getTensorDataPtr( } } // namespace deserialization -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp index 787af8b506b..e1f09d557ac 100644 --- a/runtime/executor/tensor_parser_portable.cpp +++ b/runtime/executor/tensor_parser_portable.cpp @@ -18,13 +18,13 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace deserialization { using executorch::runtime::Span; -using torch::executor::ScalarType; -using torch::executor::Tensor; -using torch::executor::TensorImpl; +using ::torch::executor::ScalarType; +using ::torch::executor::Tensor; +using ::torch::executor::TensorImpl; Result parseTensor( const Program* program, @@ -176,5 +176,5 @@ Result parseTensor( } } // namespace deserialization -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/test/CMakeLists.txt b/runtime/executor/test/CMakeLists.txt index 2de32c9176a..512f832858f 100644 --- a/runtime/executor/test/CMakeLists.txt +++ b/runtime/executor/test/CMakeLists.txt @@ -27,9 +27,10 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" COMMAND python3 -m test.models.export_program --modules - "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain" + "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null COMMAND python3 -m test.models.export_program --modules "ModuleLinear" @@ -51,6 +52,7 @@ add_custom_target( "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" ) set(test_env @@ -64,6 +66,7 @@ set(test_env "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "ET_MODULE_MULTI_ENTRY_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "ET_MODULE_SIMPLE_TRAIN_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" + "ET_MODULE_STATEFUL_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" ) et_cxx_test( @@ -152,3 +155,23 @@ target_include_directories( PRIVATE "${CMAKE_INSTALL_PREFIX}/schema/include" "${EXECUTORCH_ROOT}/third-party/flatbuffers/include" ) + +list(TRANSFORM _test_backend_compiler_lib__srcs PREPEND "${EXECUTORCH_ROOT}/") +add_library( + test_backend_compiler_lib + STATIC + ${_test_backend_compiler_lib__srcs} +) + +target_link_libraries( + test_backend_compiler_lib + PUBLIC + executorch_core +) + +target_link_options_shared_lib(test_backend_compiler_lib) + +install( + TARGETS test_backend_compiler_lib + DESTINATION lib +) diff --git a/runtime/executor/test/executor_test.cpp b/runtime/executor/test/executor_test.cpp index 328b23a8df3..e2a44429941 100644 --- a/runtime/executor/test/executor_test.cpp +++ b/runtime/executor/test/executor_test.cpp @@ -22,14 +22,14 @@ using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::SizesType; using executorch::aten::Tensor; +using 
executorch::ET_RUNTIME_NAMESPACE::get_op_function_from_registry; +using executorch::ET_RUNTIME_NAMESPACE::Kernel; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::OpFunction; +using executorch::ET_RUNTIME_NAMESPACE::register_kernel; +using executorch::ET_RUNTIME_NAMESPACE::registry_has_op_function; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::get_op_function_from_registry; -using executorch::runtime::Kernel; -using executorch::runtime::KernelRuntimeContext; -using executorch::runtime::OpFunction; -using executorch::runtime::register_kernel; -using executorch::runtime::registry_has_op_function; using executorch::runtime::Result; using executorch::runtime::testing::TensorFactory; diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp index e9f09c38a59..211800d5dff 100644 --- a/runtime/executor/test/method_meta_test.cpp +++ b/runtime/executor/test/method_meta_test.cpp @@ -26,26 +26,34 @@ using torch::executor::util::FileDataLoader; class MethodMetaTest : public ::testing::Test { protected: - void SetUp() override { - // Create a loader for the serialized ModuleAdd program. - const char* path = std::getenv("ET_MODULE_ADD_PATH"); + void load_program(const char* path, const char* module_name) { + // Create a loader for the serialized program. Result loader = FileDataLoader::from(path); ASSERT_EQ(loader.error(), Error::Ok); - loader_ = std::make_unique(std::move(loader.get())); + loaders_.insert( + {module_name, + std::make_unique(std::move(loader.get()))}); // Use it to load the program. Result program = Program::load( - loader_.get(), Program::Verification::InternalConsistency); + loaders_[module_name].get(), + Program::Verification::InternalConsistency); ASSERT_EQ(program.error(), Error::Ok); - program_ = std::make_unique(std::move(program.get())); + programs_.insert( + {module_name, std::make_unique(std::move(program.get()))}); + } + + void SetUp() override { + load_program(std::getenv("ET_MODULE_ADD_PATH"), "add"); + load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful"); } private: // Must outlive program_, but tests shouldn't need to touch it. 
- std::unique_ptr loader_; + std::unordered_map> loaders_; protected: - std::unique_ptr program_; + std::unordered_map> programs_; }; namespace { @@ -67,7 +75,7 @@ void check_tensor(const TensorInfo& tensor_info) { } // namespace TEST_F(MethodMetaTest, MethodMetaApi) { - Result method_meta = program_->method_meta("forward"); + Result method_meta = programs_["add"]->method_meta("forward"); ASSERT_EQ(method_meta.error(), Error::Ok); // Appropriate amount of inputs @@ -97,11 +105,12 @@ TEST_F(MethodMetaTest, MethodMetaApi) { // Missing method fails EXPECT_EQ( - program_->method_meta("not_a_method").error(), Error::InvalidArgument); + programs_["add"]->method_meta("not_a_method").error(), + Error::InvalidArgument); } TEST_F(MethodMetaTest, TensorInfoApi) { - Result method_meta = program_->method_meta("forward"); + Result method_meta = programs_["add"]->method_meta("forward"); ASSERT_EQ(method_meta.error(), Error::Ok); // Input 1 @@ -138,3 +147,19 @@ TEST_F(MethodMetaTest, TensorInfoApi) { EXPECT_EQ( method_meta->output_tensor_meta(-1).error(), Error::InvalidArgument); } + +TEST_F(MethodMetaTest, MethodMetaAttribute) { + Result method_meta = + programs_["stateful"]->method_meta("forward"); + ASSERT_EQ(method_meta.error(), Error::Ok); + + ASSERT_EQ(method_meta->num_attributes(), 1); + auto state = method_meta->attribute_tensor_meta(0); + ASSERT_TRUE(state.ok()); + + ASSERT_EQ(state->name(), "state"); + ASSERT_FALSE(state->is_memory_planned()); + + auto bad_access = method_meta->attribute_tensor_meta(1); + ASSERT_EQ(bad_access.error(), Error::InvalidArgument); +} diff --git a/runtime/executor/test/method_test.cpp b/runtime/executor/test/method_test.cpp index 0c6a2db94b7..5324ff5916d 100644 --- a/runtime/executor/test/method_test.cpp +++ b/runtime/executor/test/method_test.cpp @@ -79,6 +79,7 @@ class MethodTest : public ::testing::Test { load_program( std::getenv("ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH"), "cat"); load_program(std::getenv("ET_MODULE_LINEAR_PATH"), "linear"); + load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful"); load_program( std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), "linear_constant_buffer"); @@ -339,6 +340,31 @@ TEST_F(MethodTest, ProgramDataSeparationTest) { ASSERT_EQ(err, Error::Ok); } +TEST_F(MethodTest, MethodGetAttributeTest) { + ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); + Result method = + programs_["stateful"]->load_method("forward", &mmm.get()); + ASSERT_EQ(method.error(), Error::Ok); + + auto res = method->get_attribute("state"); + ASSERT_TRUE(res.ok()); + // expect data to be empty + EXPECT_EQ(res->const_data_ptr(), nullptr); + + int32_t data = 0; + res->set_data(&data); + + // expect data to be set + EXPECT_EQ(res->const_data_ptr(), &data); + + // Can execute the method. 
+ Error err = method->execute(); + ASSERT_EQ(err, Error::Ok); + + // Expect the state to be incremented + EXPECT_EQ(res->const_data_ptr()[0], 1); +} + /* * TODO(T161163608): Test is disabled due to a resize bug in tensor_index_out of * the portable op lib diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index dd5262b5ac6..75ea2674aa7 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -70,7 +70,7 @@ def define_common_targets(is_fbcode = False): "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/runtime/platform:platform", ], ) @@ -122,6 +122,7 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_LINEAR_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", "ET_MODULE_MULTI_ENTRY_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleMultipleEntry.pte])", "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", + "ET_MODULE_STATEFUL_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleStateful.pte])", "ET_MODULE_LINEAR_PROGRAM_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.pte])", "ET_MODULE_LINEAR_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.ptd])", } @@ -233,9 +234,9 @@ def define_common_targets(is_fbcode = False): # Uses an fbcode target path because the authoring/export tools # intentionally don't work in xplat (since they're host-only # tools). 
- "ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH": "$(location fbcode//executorch/test/models:exported_delegated_programs[ModuleAddMul-nosegments-da1024.pte])", - "ET_MODULE_ADD_MUL_NOSEGMENTS_PATH": "$(location fbcode//executorch/test/models:exported_delegated_programs[ModuleAddMul-nosegments.pte])", - "ET_MODULE_ADD_MUL_PATH": "$(location fbcode//executorch/test/models:exported_delegated_programs[ModuleAddMul.pte])", + "ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH": "$(location fbcode//executorch/test/models:exported_delegated_add_mul[ModuleAddMul-nosegments-da1024.pte])", + "ET_MODULE_ADD_MUL_NOSEGMENTS_PATH": "$(location fbcode//executorch/test/models:exported_delegated_add_mul[ModuleAddMul-nosegments.pte])", + "ET_MODULE_ADD_MUL_PATH": "$(location fbcode//executorch/test/models:exported_delegated_add_mul[ModuleAddMul.pte])", }, ) diff --git a/runtime/executor/test/test_backend_compiler_lib.cpp b/runtime/executor/test/test_backend_compiler_lib.cpp index 9eea6384d6f..ce631eb4f57 100644 --- a/runtime/executor/test/test_backend_compiler_lib.cpp +++ b/runtime/executor/test/test_backend_compiler_lib.cpp @@ -13,13 +13,13 @@ #include #include /* strtol */ +using executorch::ET_RUNTIME_NAMESPACE::Backend; +using executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInterface; +using executorch::ET_RUNTIME_NAMESPACE::CompileSpec; +using executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::BackendInterface; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; diff --git a/runtime/executor/test/test_backend_with_delegate_mapping.cpp b/runtime/executor/test/test_backend_with_delegate_mapping.cpp index e6d84aca189..a0b79b09c6d 100644 --- a/runtime/executor/test/test_backend_with_delegate_mapping.cpp +++ b/runtime/executor/test/test_backend_with_delegate_mapping.cpp @@ -14,13 +14,13 @@ #include /* strtol */ #include +using executorch::ET_RUNTIME_NAMESPACE::Backend; +using executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInterface; +using executorch::ET_RUNTIME_NAMESPACE::CompileSpec; +using executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::BackendInterface; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; diff --git a/runtime/kernel/kernel_runtime_context.h b/runtime/kernel/kernel_runtime_context.h index ad269f5dd4b..6facecc7632 100644 --- a/runtime/kernel/kernel_runtime_context.h +++ b/runtime/kernel/kernel_runtime_context.h @@ -15,7 +15,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Runtime state and functionality for kernel implementations. 
@@ -107,7 +107,7 @@ class KernelRuntimeContext { Error failure_state_ = Error::Ok; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch // TODO(T197294990): Remove these deprecated aliases once all users have moved @@ -115,15 +115,15 @@ class KernelRuntimeContext { namespace torch { namespace executor { /// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. -using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; /// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. -using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; +using RuntimeContext = ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; } // namespace executor } // namespace torch namespace executorch { namespace aten { /// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. -using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; +using RuntimeContext = ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; } // namespace aten } // namespace executorch // DEPRECATED: The executorch::aten:: namespace is deprecated. Use diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp index 85705e5b3fd..d7e7b298c10 100644 --- a/runtime/kernel/operator_registry.cpp +++ b/runtime/kernel/operator_registry.cpp @@ -15,7 +15,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace { @@ -258,5 +258,5 @@ Span get_registered_kernels() { return {registered_kernels, num_registered_kernels}; } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/kernel/operator_registry.h b/runtime/kernel/operator_registry.h index a3cdcd66cee..f7a62208dd8 100644 --- a/runtime/kernel/operator_registry.h +++ b/runtime/kernel/operator_registry.h @@ -40,7 +40,7 @@ } namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { class KernelRuntimeContext; // Forward declaration using OpFunction = void (*)(KernelRuntimeContext&, EValue**); @@ -258,38 +258,41 @@ ET_NODISCARD inline Error register_kernel(const Kernel& kernel) { return register_kernels({&kernel, 1}); }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
-using ::executorch::runtime::Kernel; -using ::executorch::runtime::KernelKey; -using ::executorch::runtime::KernelRuntimeContext; -using ::executorch::runtime::OpFunction; -using ::executorch::runtime::TensorMeta; -using KernelRuntimeContext = ::executorch::runtime::KernelRuntimeContext; +using ::executorch::ET_RUNTIME_NAMESPACE::Kernel; +using ::executorch::ET_RUNTIME_NAMESPACE::KernelKey; +using ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; +using ::executorch::ET_RUNTIME_NAMESPACE::OpFunction; +using ::executorch::ET_RUNTIME_NAMESPACE::TensorMeta; +using KernelRuntimeContext = + ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; inline ::executorch::runtime::Error register_kernels(ArrayRef kernels) { - return ::executorch::runtime::register_kernels( + return ::executorch::ET_RUNTIME_NAMESPACE::register_kernels( {kernels.data(), kernels.size()}); } inline OpFunction getOpsFn( const char* name, ArrayRef meta_list = {}) { - auto result = ::executorch::runtime::get_op_function_from_registry( - name, {meta_list.data(), meta_list.size()}); + auto result = + ::executorch::ET_RUNTIME_NAMESPACE::get_op_function_from_registry( + name, {meta_list.data(), meta_list.size()}); ET_CHECK(result.ok()); // get_op_function_from_registry() logs details. return *result; } inline bool hasOpsFn(const char* name, ArrayRef meta_list = {}) { - return ::executorch::runtime::registry_has_op_function( + return ::executorch::ET_RUNTIME_NAMESPACE::registry_has_op_function( name, {meta_list.data(), meta_list.size()}); } inline ArrayRef get_kernels() { - Span kernels = ::executorch::runtime::get_registered_kernels(); + Span kernels = + ::executorch::ET_RUNTIME_NAMESPACE::get_registered_kernels(); return ArrayRef(kernels.data(), kernels.size()); } } // namespace executor diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index b6aa9d7a95e..8a945f19881 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -21,21 +21,6 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - runtime.cxx_library( - name = "operator_registry", - srcs = ["operator_registry.cpp"], - exported_headers = ["operator_registry.h"], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue", - ], - preprocessor_flags = _operator_registry_preprocessor_flags(), - ) - runtime.cxx_library( name = "operator_registry_MAX_NUM_KERNELS_TEST_ONLY", srcs = ["operator_registry.cpp"], @@ -68,6 +53,21 @@ def define_common_targets(): for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" + runtime.cxx_library( + name = "operator_registry" + aten_suffix, + srcs = ["operator_registry.cpp"], + exported_headers = ["operator_registry.h"], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core:evalue" + aten_suffix, + ], + preprocessor_flags = _operator_registry_preprocessor_flags(), + ) + runtime.cxx_library( name = "kernel_runtime_context" + aten_suffix, exported_headers = [ diff --git a/runtime/kernel/test/kernel_runtime_context_test.cpp b/runtime/kernel/test/kernel_runtime_context_test.cpp index 50bd860fb9c..2c3b536b0d4 100644 --- a/runtime/kernel/test/kernel_runtime_context_test.cpp +++ b/runtime/kernel/test/kernel_runtime_context_test.cpp @@ -13,8 +13,8 @@ #include using namespace ::testing; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using executorch::runtime::Error; -using executorch::runtime::KernelRuntimeContext; using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; diff --git a/runtime/kernel/test/targets.bzl b/runtime/kernel/test/targets.bzl index bd66fc05b6f..4b3ed0f3139 100644 --- a/runtime/kernel/test/targets.bzl +++ b/runtime/kernel/test/targets.bzl @@ -101,3 +101,16 @@ def define_common_targets(): ":specialized_kernel_generated_lib", ], ) + + if aten_mode: + # Make sure we can depend on both generated_lib and generated_lib_aten + # in the same binary. + runtime.cxx_test( + name = "test_generated_lib_and_aten", + srcs = ["test_generated_lib_and_aten.cpp"], + deps = [ + "//executorch/kernels/portable:generated_lib", + "//executorch/kernels/portable:generated_lib_aten", + "//executorch/runtime/kernel:operator_registry_aten", + ], + ) diff --git a/runtime/kernel/test/test_generated_lib_and_aten.cpp b/runtime/kernel/test/test_generated_lib_and_aten.cpp new file mode 100644 index 00000000000..f9bfebc4a80 --- /dev/null +++ b/runtime/kernel/test/test_generated_lib_and_aten.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace ::testing; +using executorch::aten::ScalarType; +using executorch::runtime::Error; +using executorch::runtime::EValue; + +class GeneratedLibAndAtenTest : public ::testing::Test { + public: + void SetUp() override { + executorch::runtime::runtime_init(); + } +}; + +TEST_F(GeneratedLibAndAtenTest, GetKernelsFromATenRegistry) { + // Check if the kernel exists in the ATen registry + bool has_kernel = + executorch::runtime::aten::registry_has_op_function("aten::add.out"); + EXPECT_TRUE(has_kernel) + << "Kernel 'aten::add.out' not found in ATen registry"; + + // Get the kernel from the ATen registry + auto result = + executorch::runtime::aten::get_op_function_from_registry("aten::add.out"); + EXPECT_EQ(result.error(), Error::Ok) + << "Failed to get kernel from ATen registry"; + EXPECT_NE(*result, nullptr) << "Kernel function from ATen registry is null"; +} diff --git a/scripts/build_android_library.sh b/scripts/build_android_library.sh index 8a385ad6876..cbde7ae3d43 100755 --- a/scripts/build_android_library.sh +++ b/scripts/build_android_library.sh @@ -12,11 +12,6 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then fi which "${PYTHON_EXECUTABLE}" -copy_src() { - cp -r extension/android/build.gradle extension/android/settings.gradle extension/android/gradlew extension/android/gradle extension/android/gradlew.bat extension/android/gradle.properties "${BUILD_AAR_DIR}" - cp -r extension/android/executorch_android "${BUILD_AAR_DIR}/executorch_android" -} - build_android_native_library() { ANDROID_ABI="$1" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" @@ -70,11 +65,6 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" - # Update tokenizers submodule - pushd extension/llm/tokenizers - echo "Update tokenizers submodule" - git submodule update --init - popd cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -93,54 +83,52 @@ build_android_native_library() { cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" # Copy artifacts to ABI specific directory - mkdir -p "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}" - cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + local SO_STAGE_DIR="cmake-out-android-so/${ANDROID_ABI}" + mkdir -p ${SO_STAGE_DIR} + cp "${CMAKE_OUT}"/extension/android/*.so "${SO_STAGE_DIR}/libexecutorch.so" # Copy QNN related so library if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then - cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - 
cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${SO_STAGE_DIR} fi # Copy MTK related so library if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ -n "$NEURON_USDK_ADAPTER_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then - cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ - cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ - cp "${NEURON_USDK_ADAPTER_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ + cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${SO_STAGE_DIR} + cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${SO_STAGE_DIR} + cp "${NEURON_USDK_ADAPTER_LIB}" ${SO_STAGE_DIR} fi } build_aar() { - pushd "${BUILD_AAR_DIR}" - # Rename libexecutorch_jni.so to libexecutorch.so for soname consistency - # between Java and JNI - find . -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; if [ "$EXECUTORCH_CMAKE_BUILD_TYPE" == "Release" ]; then - find . -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; + find cmake-out-android-so -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; fi + pushd extension/android/ ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build - cp executorch_android/build/outputs/aar/executorch_android-debug.aar executorch.aar + # Use java unit test as sanity check + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest popd + if [ ! -z $BUILD_AAR_DIR ]; then + cp extension/android/executorch_android/build/outputs/aar/executorch_android-debug.aar "${BUILD_AAR_DIR}/executorch.aar" + fi } main() { - if [[ -z "${BUILD_AAR_DIR:-}" ]]; then - BUILD_AAR_DIR="$(mktemp -d)" - fi - export BUILD_AAR_DIR if [ -z "$ANDROID_ABIS" ]; then ANDROID_ABIS=("arm64-v8a" "x86_64") fi export ANDROID_ABIS - copy_src + mkdir -p cmake-out-android-so/ for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do build_android_native_library ${ANDROID_ABI} done diff --git a/scripts/check_urls.sh b/scripts/check_urls.sh new file mode 100755 index 00000000000..ad6c1440ebe --- /dev/null +++ b/scripts/check_urls.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +status=0 +green='\e[1;32m'; red='\e[1;31m'; cyan='\e[1;36m'; yellow='\e[1;33m'; reset='\e[0m' +user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36" +last_filepath= + +while IFS=: read -r filepath url; do + if [ "$filepath" != "$last_filepath" ]; then + printf '\n%s:\n' "$filepath" + last_filepath=$filepath + fi + code=$(curl -gsLm30 --retry 3 --retry-delay 3 --retry-connrefused -o /dev/null -w "%{http_code}" -I "$url") || code=000 + if [ "$code" -lt 200 ] || [ "$code" -ge 400 ]; then + code=$(curl -gsLm30 --retry 3 --retry-delay 3 --retry-connrefused -o /dev/null -w "%{http_code}" -r 0-0 -A "$user_agent" "$url") || code=000 + fi + if [ "$code" -lt 200 ] || [ "$code" -ge 400 ]; then + request_id=$(curl -sS -H 'Accept: application/json' "https://check-host.net/check-http?host=$url&max_nodes=1&node=us3.node.check-host.net" \ + | jq -r .request_id) + for _ in {1..3}; do + code=$(curl -sS -H 'Accept: application/json' "https://check-host.net/check-result/$request_id" \ + | jq -r -e '.[][0][3]') || code=000 + [[ "$code" =~ ^[0-9]+$ ]] || code=000 + sleep 3 + done + fi + if [ "$code" -lt 200 ] || [ "$code" -ge 400 ]; then + printf "${red}%s${reset} ${yellow}%s${reset}\n" "$code" "$url" >&2 + status=1 + else + printf "${green}%s${reset} ${cyan}%s${reset}\n" "$code" "$url" + fi +done < <( + git --no-pager grep --no-color -I -P -o \ + '(?\")]*[\{\}\$])[^[:space:]<>\")\[\]\(]+' \ + -- '*' \ + ':(exclude).*' \ + ':(exclude)**/.*' \ + ':(exclude)**/*.lock' \ + ':(exclude)**/*.svg' \ + ':(exclude)**/*.xml' \ + ':(exclude)**/*.gradle*' \ + ':(exclude)**/*gradle*' \ + ':(exclude)**/third-party/**' \ + | sed -E 's/[^/[:alnum:]]+$//' \ + | grep -Ev '://(0\.0\.0\.0|127\.0\.0\.1|localhost)([:/])' \ + | grep -Ev 'fwdproxy:8080' \ + || true +) + +exit $status diff --git a/scripts/check_xrefs.sh b/scripts/check_xrefs.sh new file mode 100755 index 00000000000..69e083a8a67 --- /dev/null +++ b/scripts/check_xrefs.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +status=0 +green='\e[1;32m'; red='\e[1;31m'; cyan='\e[1;36m'; yellow='\e[1;33m'; reset='\e[0m' +last_filepath= + +while IFS=: read -r filepath link; do + if [ "$filepath" != "$last_filepath" ]; then + printf '\n%s:\n' "$filepath" + last_filepath=$filepath + fi + if [ -e "$(dirname "$filepath")/${link%%#*}" ]; then + printf " ${green}OK${reset} ${cyan}%s${reset}\n" "$link" + else + printf "${red}FAIL${reset} ${yellow}%s${reset}\n" "$link" >&2 + status=1 + fi +done < <( + git --no-pager grep --no-color -I -o -E \ + '\[[^]]+\]\([^[:space:])]*/[^[:space:])]*\)|href="[^"]*/[^"]*"|src="[^"]*/[^"]*"' \ + -- '*' \ + ':(exclude).*' \ + ':(exclude)**/.*' \ + ':(exclude)**/*.lock' \ + ':(exclude)**/*.svg' \ + ':(exclude)**/*.xml' \ + ':(exclude)**/third-party/**' \ + | grep -Ev 'https?://' \ + | sed -E \ + -e 's#([^:]+):\[[^]]+\]\(([^)]+)\)#\1:\2#' \ + -e 's#([^:]+):href="([^"]+)"#\1:\2#' \ + -e 's#([^:]+):src="([^"]+)"#\1:\2#' \ + -e 's/[[:punct:]]*$//' \ + | grep -Ev '\{\{' \ + || true +) + +exit $status diff --git a/scripts/run_android_emulator.sh b/scripts/run_android_emulator.sh index fe73ec8a1d7..29c2425cd0e 100755 --- a/scripts/run_android_emulator.sh +++ b/scripts/run_android_emulator.sh @@ -18,17 +18,14 @@ $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; d echo "List all running emulators" $ADB_PATH devices -adb uninstall com.example.executorchllamademo || true -adb uninstall com.example.executorchllamademo.test || true -adb install -t app-debug.apk -adb install -t app-debug-androidTest.apk - -adb shell mkdir -p /data/local/tmp/llama -adb push model.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -adb shell am instrument -w -r com.example.executorchllamademo.test/androidx.test.runner.AndroidJUnitRunner - adb uninstall org.pytorch.executorch.test || true adb install -t android-test-debug-androidTest.apk -adb shell am instrument -w -r org.pytorch.executorch.test/androidx.test.runner.AndroidJUnitRunner +adb logcat -c +adb shell am instrument -w -r \ + org.pytorch.executorch.test/androidx.test.runner.AndroidJUnitRunner >result.txt 2>&1 +adb logcat -d > logcat.txt +cat logcat.txt +grep -q FAILURES result.txt && cat result.txt +grep -q FAILURES result.txt && exit -1 +exit 0 diff --git a/scripts/test_ios.sh b/scripts/test_ios.sh index 385c85f3dfe..245f7b06f7a 100755 --- a/scripts/test_ios.sh +++ b/scripts/test_ios.sh @@ -15,7 +15,7 @@ set -e OUTPUT="${1:-executorch}" EXIT_STATUS=0 -APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" +APP_PATH="executorch-examples/apple/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" @@ -44,26 +44,17 @@ say() { echo -e "\033[1m\n\t** $1 **\n\033[0m" } -say "Cloning the Code" - -pushd . 
> /dev/null -git clone -b viable/strict https://github.com/pytorch/executorch.git "$OUTPUT" -cd "$OUTPUT" - -say "Updating the Submodules" - -git submodule update --init - say "Activating a Virtual Environment" -python3 -m venv .venv -source .venv/bin/activate +python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip say "Installing Requirements" -pip install --upgrade cmake pip setuptools wheel zstd +./install_executorch.sh -./install_executorch.sh --pybind coreml mps xnnpack +say "Cloning the Demo App" + +git clone --depth 1 https://github.com/pytorch-labs/executorch-examples.git say "Installing CoreML Backend Requirements" @@ -88,11 +79,6 @@ say "Downloading Labels" curl https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt \ -o "$APP_PATH/Resources/Models/MobileNet/imagenet_classes.txt" -say "Building Frameworks" - -./scripts/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack -mv cmake-out "$APP_PATH/Frameworks" - say "Creating Simulator" xcrun simctl create "$SIMULATOR_NAME" "iPhone 15" diff --git a/setup.py b/setup.py index 44fb9a712a3..2c5f5578bcf 100644 --- a/setup.py +++ b/setup.py @@ -606,8 +606,8 @@ def run(self): # be found in the pip package. This is the subset of headers that are # essential for building custom ops extensions. # TODO: Use cmake to gather the headers instead of hard-coding them here. - # For example: https://discourse.cmake.org/t/installing-headers-the-modern- - # way-regurgitated-and-revisited/3238/3 + # For example: + # https://discourse.cmake.org/t/installing-headers-the-modern-way-regurgitated-and-revisited/3238/3 for include_dir in [ "runtime/core/", "runtime/kernel/", @@ -718,6 +718,7 @@ def run(self): # enabled. TODO(dbort): Remove this override once this option is # managed by cmake itself. 
"-DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF", + "-DEXECUTORCH_BUILD_TESTS=ON", ] build_args = [f"-j{self.parallel}"] diff --git a/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl b/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl index 5cb801489ed..bd011748786 100644 --- a/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl +++ b/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl @@ -1,2 +1,2 @@ -def get_qnn_library_verision(): +def get_qnn_library_version(): return "2.28" diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl index a6d6d59e0c2..e1cebaa1140 100644 --- a/shim_et/xplat/executorch/codegen/codegen.bzl +++ b/shim_et/xplat/executorch/codegen/codegen.bzl @@ -688,7 +688,7 @@ def executorch_generated_lib( "ovr_config//os:windows": [], }) + compiler_flags, deps = [ - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/codegen:macros", diff --git a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl index 61eeaf7c179..1616304c3ea 100644 --- a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl @@ -21,9 +21,9 @@ PORTABLE_MODULE_DEPS = [ ] + get_all_cpu_backend_targets() ATEN_MODULE_DEPS = [ - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry_aten", "//executorch/runtime/executor:program_aten", - "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten:lib_aten", "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index a1ffdc1eed3..d0c39bcf17f 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -372,7 +372,6 @@ ATEN_OPS = ( name = "op_bmm", deps = [ "//executorch/kernels/portable/cpu/util:matmul_ops_util", - ":vec_ops", ], ), op_target( @@ -1269,6 +1268,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:reduce_util", ], ), + op_target( + name = "op_view_as_real_copy", + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + ], + ), op_target( name = "op_view_copy", deps = [ diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index d3dcc229100..a8124d62dd4 100644 --- a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -70,6 +70,7 @@ def export( skip_type_promotion: bool = False, export_joint_graph: bool = False, external_constants: bool = False, + export_state_names: bool = False, ) -> "ExportedModule": """ Creates a new ExportedModule for the specified module class. 
@@ -148,7 +149,9 @@ def return_wrapper(): for method in methods: method_name_to_dynamic_shapes[method] = trace_dynamic_shapes - memory_planning_pass = MemoryPlanningPass() + memory_planning_pass = MemoryPlanningPass( + alloc_mutable_buffers=not export_state_names + ) if hasattr(eager_module, "get_memory_planning_pass"): memory_planning_pass = eager_module.get_memory_planning_pass() # type: ignore[operator] @@ -208,6 +211,7 @@ def __init__(self, method): memory_planning_pass=memory_planning_pass, to_out_var_pass=ToOutVarPass(ignore_to_out_var_failure), external_constants=external_constants, + emit_mutable_buffer_names=export_state_names, ) ) diff --git a/test/models/export_delegated_program.py b/test/models/export_delegated_program.py index 4f4429aca88..44ae8df544f 100644 --- a/test/models/export_delegated_program.py +++ b/test/models/export_delegated_program.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import argparse import inspect import os @@ -19,6 +21,7 @@ from executorch.exir.backend.test.backend_with_compiler_demo import ( BackendWithCompilerDemo, ) +from executorch.exir.program import ExecutorchProgramManager from torch import nn from torch.export import export @@ -111,10 +114,10 @@ def export_module_to_program( *, backend_id: str, extract_delegate_segments: bool, - constant_tensor_alignemnt: Optional[int] = None, + constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, method: str = "forward", -) -> bytes: +) -> ExecutorchProgramManager: eager_module = module_class().eval() inputs = () if hasattr(eager_module, "get_random_inputs"): @@ -135,7 +138,7 @@ def forward(self, *args, **kwargs): edge_config = EdgeCompileConfig(_check_ir_validity=False) et_config = exir.ExecutorchBackendConfig( extract_delegate_segments=extract_delegate_segments, - constant_tensor_alignment=constant_tensor_alignemnt, + constant_tensor_alignment=constant_tensor_alignment, delegate_alignment=delegate_alignment, ) @@ -170,7 +173,7 @@ def forward(self, *args, **kwargs): export(composite_module, args=inputs, strict=True) ).to_executorch(config=et_config) - return executorch_program.buffer + return executorch_program def main() -> None: @@ -199,6 +202,14 @@ def main() -> None: help="ID of the backend to use for delegation; " + f"one of {known_backend_ids}", ) + parser.add_argument( + "--inline_delegate_segments", + action="store_true", + help="Store delegate data inside the flatbuffer.", + ) + parser.add_argument( + "--delegate_alignment", type=int, default=None, help="Delegate alignment." + ) parser.add_argument( "--outdir", type=str, @@ -219,25 +230,22 @@ def main() -> None: # Export and write to the output files. os.makedirs(args.outdir, exist_ok=True) + suffix = "" for module_name, module_class in module_names_to_classes.items(): - for extract_delegate_segments in (True, False): - suffix = "" if extract_delegate_segments else "-nosegments" - # Create files with the default alignment, and a large alignment. - # This alignment should be so large that it's extremely unlikely for - # the data to accidentally be aligned to it in the default case. 
- for delegate_alignment in (None, 1024): - suffix += f"-da{delegate_alignment}" if delegate_alignment else "" - outfile = os.path.join(args.outdir, f"{module_name}{suffix}.pte") - with open(outfile, "wb") as fp: - fp.write( - export_module_to_program( - module_class, - backend_id=args.backend_id, - extract_delegate_segments=extract_delegate_segments, - delegate_alignment=delegate_alignment, - ) - ) - print(f"Exported {module_name} and wrote program data to {outfile}") + if args.inline_delegate_segments: + suffix += "-nosegments" + if args.delegate_alignment is not None: + suffix += f"-da{args.delegate_alignment}" + outfile = os.path.join(args.outdir, f"{module_name}{suffix}.pte") + executorch_program = export_module_to_program( + module_class, + backend_id=args.backend_id, + extract_delegate_segments=not args.inline_delegate_segments, + delegate_alignment=args.delegate_alignment, + ) + with open(outfile, "wb") as fp: + fp.write(executorch_program.buffer) + print(f"Exported {module_name} and wrote program data to {outfile}") if __name__ == "__main__": diff --git a/test/models/export_program.py b/test/models/export_program.py index ccf8a965eb2..5ed9cba4f8e 100644 --- a/test/models/export_program.py +++ b/test/models/export_program.py @@ -183,6 +183,23 @@ def export_joint(): return True +class ModuleStateful(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("state", torch.zeros(1, dtype=torch.int32)) + + def forward(self, x): + self.state.add_(1) + return x + self.state + + def get_random_inputs(self): + return (torch.ones(1),) + + @staticmethod + def export_state_names(): + return True + + # # Main logic. # @@ -201,8 +218,11 @@ def export_module_to_program( # pyre-ignore[16]: pyre doesn't know about get_export_kwargs. export_kwargs = module_class.get_export_kwargs() export_joint = False + export_state_names = False if hasattr(module_class, "export_joint"): export_joint = module_class.export_joint() # pyre-ignore + if hasattr(module_class, "export_state_names"): + export_state_names = module_class.export_state_names() if hasattr(module_class, "get_method_names_to_export"): # pyre-ignore[16]: pyre doesn't know about get_export_kwargs. methods = module_class.get_method_names_to_export() @@ -214,6 +234,7 @@ def export_module_to_program( skip_type_promotion=skip_type_promotion, export_joint_graph=export_joint, external_constants=external_constants, + export_state_names=export_state_names, **export_kwargs, ) return module.executorch_program diff --git a/test/models/targets.bzl b/test/models/targets.bzl index 6d5b6753f3f..6538302c507 100644 --- a/test/models/targets.bzl +++ b/test/models/targets.bzl @@ -67,6 +67,7 @@ def define_common_targets(): "ModuleIndex", "ModuleDynamicCatUnallocatedIO", "ModuleSimpleTrain", + "ModuleStateful", ] # Generates Executorch .pte program files for various modules at build time. @@ -150,7 +151,7 @@ def define_common_targets(): visibility = [], # Private ) - # Class names of nn.Modules for :exported_delegated_programs to export. + # Class names of nn.Modules available in export_delegated_program.py. DELEGATED_MODULES_TO_EXPORT = [ "ModuleAddMul", "ModuleAddLarge", @@ -160,23 +161,23 @@ def define_common_targets(): # Name of the backend to use when exporting delegated programs. BACKEND_ID = "StubBackend" - # Generates Executorch .pte program files for various modules at build time. + # Generates Executorch .pte program files for the AddMul module at build time. 
# To use one, depend on a target like - # ":exported_delegated_programs[ModuleAdd.pte]" or - # ":exported_delegated_programs[ModuleAdd-nosegments.pte]" (which does not + # ":exported_delegated_add_mul[ModuleAdd.pte]" or + # ":exported_delegated_add_mul[ModuleAdd-nosegments.pte]" (which does not # extract the delegate data blobs into segments). runtime.genrule( - name = "exported_delegated_programs", - cmd = "$(exe :export_delegated_program)" + - " --modules " + ",".join(DELEGATED_MODULES_TO_EXPORT) + - " --backend_id " + BACKEND_ID + - " --outdir $OUT", + name = "exported_delegated_add_mul", + cmd = "$(exe :export_delegated_program) --modules ModuleAddMul --backend_id " + BACKEND_ID + " --outdir $OUT" + + " && $(exe :export_delegated_program) --modules ModuleAddMul --backend_id " + BACKEND_ID + " --inline_delegate_segments --outdir $OUT" + + # Create files with a large alignment as well as the default. + # This alignment should be so large that it's extremely unlikely for + # the data to accidentally be aligned to it in the default case. + " && $(exe :export_delegated_program) --modules ModuleAddMul --backend_id " + BACKEND_ID + " --inline_delegate_segments --delegate_alignment 1024 --outdir $OUT", outs = { - fname + seg_suffix + da_suffix + ".pte": [fname + seg_suffix + da_suffix + ".pte"] - for fname in DELEGATED_MODULES_TO_EXPORT - for seg_suffix in ["", "-nosegments"] - # "da" = delegate alignment - for da_suffix in ["", "-da1024"] + "ModuleAddMul.pte": ["ModuleAddMul.pte"], + "ModuleAddMul-nosegments.pte": ["ModuleAddMul-nosegments.pte"], + "ModuleAddMul-nosegments-da1024.pte": ["ModuleAddMul-nosegments-da1024.pte"], }, default_outs = ["."], visibility = [ @@ -188,7 +189,7 @@ def define_common_targets(): runtime.genrule( name = "exported_xnnp_delegated_programs", cmd = "$(exe :export_delegated_program)" + - " --modules " + ",".join(DELEGATED_MODULES_TO_EXPORT) + + " --modules ModuleAddLarge,ModuleSubLarge" + " --backend_id " + "XnnpackBackend" + " --outdir $OUT", outs = { diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index ff2ed048257..422cd579d04 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -40,6 +40,7 @@ build_executorch() { -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ diff --git a/test/utils/DeathTest.h b/test/utils/DeathTest.h index 2ba9bd61bd9..be83593adf1 100644 --- a/test/utils/DeathTest.h +++ b/test/utils/DeathTest.h @@ -15,6 +15,10 @@ #include +#ifndef ET_BUILD_MODE_COV +#define ET_BUILD_MODE_COV 0 +#endif // ET_BUILD_MODE_COV + #if ET_BUILD_MODE_COV /** diff --git a/third-party/ao b/third-party/ao index 923242e22b5..7fa9c69dc09 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 923242e22b5fb67646473605ab959b90cc450abc +Subproject commit 7fa9c69dc0999023add31d000d4750e0ac2cd799 diff --git a/tools/cmake/cmake_deps.toml b/tools/cmake/cmake_deps.toml index ee810c2bfd5..9913a02c4d5 100644 --- a/tools/cmake/cmake_deps.toml +++ b/tools/cmake/cmake_deps.toml @@ -150,6 +150,20 @@ deps = [ "optimized_cpublas", "portable_kernels", ] + +[targets.test_backend_compiler_lib] +buck_targets = [ + "//runtime/executor/test:test_backend_compiler_lib", +] +filters = [ + ".cpp$", +] +excludes = [ +] +deps = [ + "executorch", + "executorch_core", +] # 
---------------------------------- core end ---------------------------------- # ---------------------------------- extension start ---------------------------------- [targets.extension_data_loader] diff --git a/util/collect_env.py b/util/collect_env.py index 7d35c0636ce..ec44c9d6149 100644 --- a/util/collect_env.py +++ b/util/collect_env.py @@ -220,8 +220,7 @@ def get_cudnn_version(run_lambda): cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) elif get_platform() == "darwin": # CUDA libraries and drivers can be found in /usr/local/cuda/. See - # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install - # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # https://docs.nvidia.com/cuda/archive/10.1/cuda-installation-guide-mac-os-x/index.html#3.2-Install # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*" else: diff --git a/util/python_profiler.py b/util/python_profiler.py index 8993beb9429..c62b0ffafe0 100644 --- a/util/python_profiler.py +++ b/util/python_profiler.py @@ -44,7 +44,9 @@ def _from_pstat_to_static_html(stats: Stats, html_filename: str): html_filename: Output filename in which populated template is rendered """ RESTR = r'(?