diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index b93b7b0283c..ff0277c2056 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -17,6 +17,7 @@ parameterized==0.9.0 # Doc build requirements, same as https://github.com/pytorch/pytorch/blob/main/.ci/docker/requirements-docs.txt sphinx==5.3.0 +sphinx-reredirects==0.1.4 sphinx-gallery==0.14.0 breathe==4.34.0 exhale==0.2.3 diff --git a/.ci/scripts/build_android_instrumentation.sh b/.ci/scripts/build_android_instrumentation.sh deleted file mode 100644 index 5e074d9e215..00000000000 --- a/.ci/scripts/build_android_instrumentation.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ex - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi -which "${PYTHON_EXECUTABLE}" - -mkdir -p "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources -cp extension/module/test/resources/add.pte "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources - -pushd "${BUILD_AAR_DIR}" -ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest -ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest -popd diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py index b2126f84e78..9a4723d7e56 100755 --- a/.ci/scripts/gather_benchmark_configs.py +++ b/.ci/scripts/gather_benchmark_configs.py @@ -23,6 +23,7 @@ "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa", "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db", "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a", + "google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d", } # Predefined benchmark configurations diff --git a/.ci/scripts/test_ios_ci.sh b/.ci/scripts/test_ios_ci.sh index 50c6448d4b2..e87dbec8444 100755 --- a/.ci/scripts/test_ios_ci.sh +++ b/.ci/scripts/test_ios_ci.sh @@ -7,7 +7,7 @@ set -e -APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" +APP_PATH="executorch-examples/apple/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" @@ -34,6 +34,10 @@ say() { echo -e "\033[1m\n\t** $1 **\n\033[0m" } +say "Cloning the Demo App" + +git clone --depth 1 https://github.com/pytorch-labs/executorch-examples.git + say "Installing CoreML Backend Requirements" ./backends/apple/coreml/scripts/install_requirements.sh diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 15df725f9c1..8a1d5683b33 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -154,7 +154,7 @@ run_and_verify() { EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various" else # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. 
- EXPECTED_PREFIX="ASSISTANT:" + EXPECTED_PREFIX="ASSISTANT: image" fi if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then echo "Expected result prefix: ${EXPECTED_PREFIX}" diff --git a/.github/release.yml b/.github/release.yml index 8caa4ede084..fc4accd252a 100644 --- a/.github/release.yml +++ b/.github/release.yml @@ -15,57 +15,82 @@ changelog: - title: ARM labels: - "release notes: arm" + - "module: arm" + - "partner: arm" - title: NXP - labels: + labels: - "release notes: nxp" + - "module: nxp" - title: Exir - labels: + labels: - "release notes: exir" + - "module: exir" - title: Misc - labels: + labels: - "release notes: misc" - title: Apple - labels: + labels: - "release notes: apple" + - "module: coreml" + - "module: mps" + - title: Android + labels: + - "module: android" + - title: IOS + labels: + - "module: ios" - title: Build - labels: + labels: - "release notes: build" - title: Vulkan - labels: + labels: - "release notes: vulkan" + - "module: vulkan" - title: Cadence - labels: + labels: - "release notes: cadence" + - "module: cadence" - title: Runtime - labels: + labels: - "release notes: runtime" + - "module: runtime" - title: XNNPACK - labels: + labels: - "release notes: xnnpack" + - "module: xnnpack" - title: Devtools - labels: + labels: - "release notes: devtools" + - "module: devtools" - title: Examples - labels: + labels: - "release notes: examples" + - title: LLM + labels: + - "module: llm" - title: Mediatek - labels: + labels: - "release notes: mediatek" + - "partner: mediatek" - title: Openvino - labels: + labels: - "release notes: openvino" - title: Qualcomm - labels: + labels: - "release notes: qualcomm" + - "partner: qualcomm" + - "module: qnn" - title: Training - labels: + labels: - "release notes: training" + - "module: training" - title: Quantization - labels: + labels: - "release notes: quantization" - title: Ops & kernels - labels: - - "release notes: ops & kernels" + labels: + - "release notes: ops & kernels" + - "module: kernels" - title: Other Changes labels: - "*" diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index e29833015d3..630ae2747bf 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -14,7 +14,7 @@ jobs: with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' + submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps @@ -22,6 +22,10 @@ jobs: script: | set -eux + # Use sccache for NDK compiler as well + export CMAKE_CXX_COMPILER_LAUNCHER=sccache + export CMAKE_C_COMPILER_LAUNCHER=sccache + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" @@ -36,8 +40,9 @@ jobs: cp ${BUILD_AAR_DIR}/executorch.aar $ARTIFACTS_DIR_NAME mkdir -p ${ARTIFACTS_DIR_NAME}/library_test_dir - bash .ci/scripts/build_android_instrumentation.sh - cp ${BUILD_AAR_DIR}/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir" + bash extension/android/executorch_android/android_test_setup.sh + (cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest) + cp extension/android/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir" mkdir 
-p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom bash examples/models/llama/install_requirements.sh @@ -130,7 +135,8 @@ jobs: # https://github.com/ReactiveCircus/android-emulator-runner. The max number # of cores we can set is 6, any higher number will be reduced to 6. cores: 6 - ram-size: 12288M + ram-size: 16384M + heap-size: 12288M force-avd-creation: false disable-animations: true emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none diff --git a/.github/workflows/android-perf-private-device-experiment.yml b/.github/workflows/android-perf-private-device-experiment.yml new file mode 100644 index 00000000000..0108ab119ca --- /dev/null +++ b/.github/workflows/android-perf-private-device-experiment.yml @@ -0,0 +1,62 @@ +name: android-perf (private devices) + +on: + schedule: + - cron: 0 0,4,8,12,16,20 * * * + pull_request: + paths: + - .github/workflows/android-perf-private-device-experiment.yml + push: + branches: + - main + paths: + - .github/workflows/android-perf-private-device-experiment.yml + # Note: GitHub has an upper limit of 10 inputs + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + devices: + description: Target devices to run benchmark + required: false + type: string + default: google_pixel_3_private_rooted + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8 + devices: + description: Target devices to run benchmark + required: false + type: string + default: google_pixel_3_private_rooted + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + +concurrency: + group: android-perf-private-devices-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + android: + uses: ./.github/workflows/android-perf.yml + secrets: inherit + permissions: + id-token: write + contents: read + with: + models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }} + devices: google_pixel_3_private_rooted + benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 09a6453094f..5245d2f4f12 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -345,7 +345,7 @@ jobs: with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' + submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps @@ -353,6 +353,10 @@ jobs: script: | set -eux + # Use sccache for NDK compiler as well + export CMAKE_CXX_COMPILER_LAUNCHER=sccache + export CMAKE_C_COMPILER_LAUNCHER=sccache + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" @@ 
-392,7 +396,7 @@ jobs: fail-fast: false with: # Due to scheduling a job may be pushed beyond the default 60m threshold - timeout: 120 + timeout: 240 device-type: android runner: linux.2xlarge test-infra-ref: '' diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml index 24aa6c1ad27..b31ff644d94 100644 --- a/.github/workflows/android-release-artifacts.yml +++ b/.github/workflows/android-release-artifacts.yml @@ -7,6 +7,17 @@ on: description: Version name to be uploaded for AAR release required: false type: string + upload_to_maven: + description: Upload the AAR to maven staging repository + required: false + type: boolean + flavor: + type: choice + options: + - "xnnpack" + - "vulkan+xnnpack" + schedule: + - cron: 0 10 * * * concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -22,6 +33,10 @@ jobs: shell: bash run: | VERSION="${{ inputs.version }}" + if [ -z "$VERSION" ]; then + echo "No version name specified. Will create a snapshot AAR" + exit 0 + fi if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" | grep "200 OK"; then echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" echo "Will skip build/upload" @@ -31,14 +46,18 @@ jobs: build-aar: name: build-aar needs: check-if-aar-exists - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.7 + secrets: inherit permissions: id-token: write contents: read with: - runner: linux.2xlarge + secrets-env: EXECUTORCH_MAVEN_SIGNING_KEYID EXECUTORCH_MAVEN_SIGNING_PASSWORD EXECUTORCH_MAVEN_CENTRAL_PASSWORD EXECUTORCH_MAVEN_CENTRAL_USERNAME EXECUTORCH_MAVEN_SIGNING_GPG_KEY_CONTENTS + # As this job has access to Maven credential, run this on a fresh ephemeral runner + runner: ephemeral.linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' + submodules: 'recursive' ref: ${{ github.sha }} timeout: 90 upload-artifact: android-apps @@ -46,12 +65,37 @@ jobs: script: | set -eux + # Use sccache for NDK compiler as well + export CMAKE_CXX_COMPILER_LAUNCHER=sccache + export CMAKE_C_COMPILER_LAUNCHER=sccache + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool buck2 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + mkdir -p ~/.gradle + touch ~/.gradle/gradle.properties + echo "signing.keyId=${SECRET_EXECUTORCH_MAVEN_SIGNING_KEYID}" >> ~/.gradle/gradle.properties + echo "signing.password=${SECRET_EXECUTORCH_MAVEN_SIGNING_PASSWORD}" >> ~/.gradle/gradle.properties + echo "mavenCentralUsername=${SECRET_EXECUTORCH_MAVEN_CENTRAL_USERNAME}" >> ~/.gradle/gradle.properties + echo "mavenCentralPassword=${SECRET_EXECUTORCH_MAVEN_CENTRAL_PASSWORD}" >> ~/.gradle/gradle.properties + echo "signing.secretKeyRingFile=/tmp/secring.gpg" >> ~/.gradle/gradle.properties + + echo -n "$SECRET_EXECUTORCH_MAVEN_SIGNING_GPG_KEY_CONTENTS" | base64 -d > /tmp/secring.gpg + + # Update the version name in build.gradle in case of maven publish + VERSION="${{ inputs.version }}" + if [ ! 
-z "$VERSION" ]; then + sed -i "s/\(coordinates(\"org.pytorch\", \"executorch-android\", \"\)\([0-9]\+.[0-9]\+.[0-9]\+\)\(\")\)/\1$VERSION\3/" extension/android/executorch_android/build.gradle + fi + + FLAVOR="${{ inputs.flavor }}" + if [[ "$FLAVOR" == "vulkan+xnnpack" ]]; then + export EXECUTORCH_BUILD_VULKAN=ON + fi + # Build AAR Package mkdir aar-out export BUILD_AAR_DIR=aar-out @@ -61,6 +105,12 @@ jobs: shasum -a 256 "${ARTIFACTS_DIR_NAME}/executorch.aar" + # Publish to maven staging + UPLOAD_TO_MAVEN="${{ inputs.upload_to_maven }}" + if [[ "$UPLOAD_TO_MAVEN" == "true" ]]; then + (cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:publishToMavenCentral) + fi + upload-release-aar: name: upload-release-aar needs: build-aar @@ -84,6 +134,8 @@ jobs: pip install awscli==1.32.18 AWS_CMD="aws s3 cp" VERSION="${{ inputs.version }}" - VERSION_NAME="${VERSION:-temp_snapshot}" - ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar --acl public-read - ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar.sha256sums --acl public-read + if [ -z "$VERSION" ]; then + VERSION="snapshot-$(date +"%Y%m%d")" + fi + ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}/executorch.aar --acl public-read + ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}/executorch.aar.sha256sums --acl public-read diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index b8b63078643..7fa40e3ea75 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -14,6 +14,20 @@ on: - cron: '0 0 * * *' jobs: + check-urls: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check URLs + run: bash ./scripts/check_urls.sh + + check-xrefs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check Links + run: bash ./scripts/check_xrefs.sh + build: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: diff --git a/.mypy.ini b/.mypy.ini index 8c1c9dbcadc..5ee07ddb2bf 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -80,6 +80,9 @@ ignore_missing_imports = True [mypy-serializer.*] ignore_missing_imports = True +[mypy-tosa_tools.*] +ignore_missing_imports = True + [mypy-setuptools.*] ignore_missing_imports = True diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dbb66afdaa..34538d1e5ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -761,12 +761,16 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor) endif() +if(EXECUTORCH_BUILD_EXTENSION_MODULE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) +endif() + if(EXECUTORCH_BUILD_EXTENSION_LLM) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizers) endif() -if(EXECUTORCH_BUILD_EXTENSION_MODULE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) endif() if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) @@ -810,6 +814,10 @@ if(EXECUTORCH_BUILD_PYBIND) torch ) + if(EXECUTORCH_BUILD_TESTS) + list(APPEND _dep_libs test_backend_compiler_lib) + endif() + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) list(APPEND _dep_libs optimized_native_cpu_ops_lib) else() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ed1e2b30323..c0df9cefebe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ 
-1,19 +1,109 @@ Thank you for your interest in contributing to ExecuTorch! We want to make it easy to contribute to this project. -  ## Dev Install Set up your environment by following the instructions at -https://pytorch.org/executorch/stable/getting-started-setup.html to clone +https://pytorch.org/executorch/main/getting-started-setup to clone the repo and install the necessary requirements. +Refer to this [document](docs/source/using-executorch-building-from-source.md) to build ExecuTorch from source. + +### Dev Setup for Android +For Android, please refer to the [Android documentation](docs/source/using-executorch-android.md). + +### Dev Setup for Apple +For Apple, please refer to the [iOS documentation](docs/source/using-executorch-ios.md). +  + +## Codebase structure + +
+
+executorch
+├── backends - Backend delegate implementations for various hardware targets. Each backend uses a partitioner to split the graph into subgraphs that can be executed on specific hardware, a quantizer to optimize model precision, and runtime components to execute the graph on the target hardware. Refer to the backend documentation and the Export and Lowering tutorial for more information.
+│   ├── apple - Apple-specific backends.
+│   │   ├── coreml - CoreML backend for Apple devices. See doc.
+│   │   └── mps - Metal Performance Shaders backend for Apple devices. See doc.
+│   ├── arm - ARM architecture backends. See doc.
+│   ├── cadence - Cadence-specific backends. See doc.
+│   ├── example - Example backend implementations.
+│   ├── mediatek - MediaTek-specific backends. See doc.
+│   ├── openvino - OpenVINO backend for Intel hardware.
+│   ├── qualcomm - Qualcomm-specific backends. See doc.
+│   ├── transforms - Transformations for backend optimization.
+│   ├── vulkan - Vulkan backend for cross-platform GPU support. See doc.
+│   └── xnnpack - XNNPACK backend for optimized neural network operations. See doc.
+├── codegen - Tooling to autogenerate bindings between kernels and the runtime.
+├── configurations - Configuration files.
+├── devtools - Model profiling, debugging, and inspection. Please refer to the tools documentation for more information.
+│   ├── bundled_program - A tool for validating ExecuTorch models. See doc.
+│   ├── etdump - ETDump, a format for saving profiling and debugging data from the runtime. See doc.
+│   ├── etrecord - ETRecord, the AOT debug artifact for ExecuTorch. See doc.
+│   ├── inspector - Python API to inspect ETDump and ETRecord. See doc.
+│   └── visualization - Visualization tools for representing model structure and performance metrics.
+├── docs - Static docs tooling and documentation source files.
+├── examples - Examples of various user flows, such as model export, delegates, and runtime execution.
+├── exir - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of torch.export. This directory contains utilities and passes for lowering EXIR graphs into different dialects until they are suitable to run on target hardware (a minimal export sketch follows this tree).
+│   ├── _serialize - Serialize final export artifact.
+│   ├── backend - Backend delegate ahead of time APIs.
+│   ├── capture - Program capture.
+│   ├── dialects - Op sets for various dialects in the export process. Please refer to the EXIR spec and the backend dialect doc for more details.
+│   ├── emit - Conversion from ExportedProgram to ExecuTorch execution instructions.
+│   ├── operator - Operator node manipulation utilities.
+│   ├── passes - Built-in compiler passes.
+│   ├── program - Export artifacts.
+│   ├── serde - Graph module serialization/deserialization.
+│   └── verification - IR verification.
+├── extension - Extensions built on top of the runtime.
+│   ├── android - ExecuTorch wrappers for Android apps. Please refer to the Android documentation and Javadoc for more information.
+│   ├── apple - ExecuTorch wrappers for iOS apps. Please refer to the iOS documentation for how to integrate with Apple platforms.
+│   ├── aten_util - Converts to and from PyTorch ATen types.
+│   ├── data_loader - 1st party data loader implementations.
+│   ├── evalue_util - Helpers for working with EValue objects.
+│   ├── gguf_util - Tools to convert from the GGUF format.
+│   ├── kernel_util - Helpers for registering kernels.
+│   ├── llm - Library to run LLMs on ExecuTorch, including common optimization passes and runtime C++ components. Please refer to the LLM documentation for more information.
+│   ├── memory_allocator - 1st party memory allocator implementations.
+│   ├── module - A simplified C++ wrapper for the runtime that deserializes and executes an ExecuTorch artifact (.pte file). Refer to the module documentation for more information.
+│   ├── parallel - C++ threadpool integration.
+│   ├── pybindings - Python bindings for the ExecuTorch runtime; these power the runtime Python API.
+│   ├── pytree - C++ and Python flattening and unflattening lib for pytrees.
+│   ├── runner_util - Helpers for writing C++ PTE-execution tools.
+│   ├── tensor - Tensor maker and TensorPtr; see this documentation for details. For how to use TensorPtr and Module, please refer to the "Using ExecuTorch with C++" doc.
+│   ├── testing_util - Helpers for writing C++ tests.
+│   ├── threadpool - Threadpool.
+│   └── training - Experimental libraries for on-device training.
+├── kernels - 1st party kernel implementations.
+│   ├── aten - ATen kernel implementations.
+│   ├── optimized - Optimized kernel implementations.
+│   ├── portable - Reference implementations of ATen operators.
+│   ├── prim_ops - Special ops used in the ExecuTorch runtime for control flow and symbolic primitives.
+│   └── quantized - Quantized kernel implementations.
+├── profiler - Utilities for profiling runtime execution.
+├── runtime - Core C++ runtime. These components are used to execute the ExecuTorch program. Please refer to the runtime documentation for more information.
+│   ├── backend - Backend delegate runtime APIs.
+│   ├── core - Core structures used across all levels of the runtime: basic components such as Tensor, EValue, Error, and Result.
+│   ├── executor - Model loading, initialization, and execution. Runtime components that execute the ExecuTorch program, such as Program and Method. Refer to the runtime API documentation for more information.
+│   ├── kernel - Kernel registration and management.
+│   └── platform - Layer between architecture specific code and portable C++.
+├── schema - ExecuTorch PTE file format flatbuffer schemas.
+├── scripts - Utility scripts for building libs, size management, dependency management, etc.
+├── shim_et - Compatibility layer between OSS and Internal builds.
+├── test - Broad scoped end-to-end tests.
+├── third-party - Third-party dependencies.
+├── tools - Tools for building ExecuTorch from source with different build systems (CMake, Buck).
+└── util - Various helpers and scripts.
+
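
To make the ahead-of-time flow described for `exir` above concrete, here is a minimal export sketch. It mirrors the export/lowering pattern used elsewhere in the ExecuTorch examples in this change; the tiny model, input shape, and output file name are illustrative placeholders rather than anything prescribed by this tree.

```python
# Minimal sketch: capture a PyTorch module with torch.export, lower it through
# EXIR, and serialize the result as a .pte artifact. Model, inputs, and file
# name are illustrative placeholders.
import torch
import executorch.exir


class TinyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.sin(x)


example_inputs = (torch.ones(1),)

# Capture the model into an ExportedProgram (ATen dialect).
aten_program = torch.export.export(TinyModel(), example_inputs)

# Lower into the EXIR Edge dialect; backend delegation (to_backend with a
# partitioner) would happen at this stage when targeting a specific backend.
edge_program_manager = executorch.exir.to_edge(aten_program)

# Convert to the ExecuTorch dialect and serialize the PTE file.
executorch_program = edge_program_manager.to_executorch()
with open("model.pte", "wb") as f:
    f.write(executorch_program.buffer)
```

The resulting `model.pte` is what the runtime pieces listed above load and execute, for example through `extension/module` in C++ or `extension/pybindings` in Python.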
+   ## Contributing workflow We actively welcome your pull requests (PRs). +If you're completely new to open-source projects, GitHub, or ExecuTorch, please see our [New Contributor Guide](docs/source/new-contributor-guide.md) for a step-by-step walkthrough on making your first contribution. Otherwise, read on. + 1. [Claim an issue](#claiming-issues), if present, before starting work. If an issue doesn't cover the work you plan to do, consider creating one to provide context about it, and to build consensus about the scope and solution. @@ -24,7 +114,7 @@ We actively welcome your pull requests (PRs). 1. If you've changed APIs or added a new tool or feature, [update the documentation](#updating-documentation). 1. If you added an experimental API or deprecated an existing API, follow the - [API Life Cycle and Deprecation Policy](/docs/source/api-life-cycle.md). + [API Life Cycle and Deprecation Policy](docs/source/api-life-cycle.md). 1. Make sure your code follows the [style guides](#coding-style) and passes the [lint checks](#lintrunner). 1. If you haven't already, complete the [Contributor License Agreement ("CLA")](#contributor-license-agreement-cla). @@ -103,9 +193,6 @@ in the Github repo. ## Coding Style -Goal: Encourage standards that make it easier to read, edit, maintain, and debug -the ExecuTorch code. - ### lintrunner We use [`lintrunner`](https://pypi.org/project/lintrunner/) to help make sure the @@ -158,7 +245,7 @@ modifications to the Google C++ style guide. ### C++ Portability Guidelines -See also [Portable C++ Programming](/docs/source/portable-cpp-programming.md) +See also [Portable C++ Programming](docs/source/portable-cpp-programming.md) for detailed advice. #### C++ language version @@ -170,7 +257,7 @@ toolchains, and having access to relatively modern C++ features. #### C/C++ standard library usage -**Restricted usage of the C++ standard library.** +**Restricted usage of the C++ standard library** Rationale: ExecuTorch is intended to be portable to bare-metal systems that lack certain features, like dynamic memory, threading, and locking, required by parts @@ -191,7 +278,7 @@ careful to also manually destroy objects initialized in this way. #### C++ language features -**Exceptions: Do not use.** +**Exceptions: Do not use** - Rationale: Exceptions are not widely supported on some classes of microcontrollers and DSPs, and they can significantly increase binary size. @@ -200,12 +287,12 @@ must work with threading** - Rationale: The core runtime must work on systems that do not have threading support. -**RTTI, dynamic_cast, and ``: Do not use.** +**RTTI, dynamic_cast, and ``: Do not use** - Rationale: RTTI adds extra data to every virtual class. ExecuTorch doesn't have a strong need for `dynamic_cast` and friends, so it's better to reduce the binary size. -**Templates and template metaprogramming: Be careful and avoid if possible.** +**Templates and template metaprogramming: Be careful and avoid if possible** - Rationale: Most templating results in code generation, and is one of the most common sources of binary bloat. Some use of templates is fine (e.g. an `ArrayRef`, or code that handles multiple `ScalarType` types), but for the @@ -221,7 +308,7 @@ CI is run automatically on all pull requests. However, if you want to run tests - The `sh test/build_size_test.sh` script will compile the C++runtime along with portable kernels. - The `test/run_oss_cpp_tests.sh` script will build and run C++ tests locally -- Running `pytest` from the root directory will run Python tests locally. 
+- Running `pytest` from the root directory will run Python tests locally. Make sure to run this after finishing [Dev Install](#dev-install). ### Writing Tests To help keep code quality high, ExecuTorch uses a combination of unit tests and @@ -270,7 +357,7 @@ docs](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/ for basics. 1. Push your branch to your fork of `pytorch/executorch`. Most people do not - have permission to push a branch directoy to the upstream repo. + have permission to push a branch directory to the upstream repo. 1. Create your PR - Use the `main` branch as the base. - Give the PR a clear and descriptive title. It will become the title of the @@ -279,7 +366,8 @@ for basics. - Good title: "Add XYZ method to ABC" - Give the PR a clear and thorough description. Don't just describe what the PR does: the diff will do that. Explain *why* you are making this change, in a - way that will make sense to someone years from now. + way that will make sense to someone years from now. If the PR is a bug fix, + include the issue number at the beginning of the description: "Fixes #1234" - Explain how you have tested your changes by including repeatable instructions for testing the PR. - If you added tests, this can be as simple as the command you used to run the @@ -321,26 +409,17 @@ for basics. - If the reviewers have requests or questions, follow up with them. - The goal of the reviewer is to ensure that the code in the `main` branch of the repo is consistent, maintainable, and of high quality. -1. Once the PR has been approved, - - If you have the "write permission" in this repo, you can merge it yourself - by clicking the "Squash and merge" button once it is green and all CI - signals are passing. - - If you don't have "write permission" in this repo, the reviewer will take - care of the PR. The reviewer may import the PR into Meta's internal system - to validate it against internal CI. - - If the PR is approved but not merged within 5 business days, please comment - on the PR to ask about its status. - - Note that if the `main` [CI](#continuous-integration) jobs are broken, we - will only merge PRs that fix the broken jobs until all critical jobs are - fixed. +1. Once the PR has been approved, you can merge it yourself + by clicking the "Squash and merge" button once it is + green and all CI signals are passing.   ## For Backend Delegate Authors -- Use [this](/docs/source/backend-delegates-integration.md) guide when +- Use [this](docs/source/backend-delegates-integration.md) guide when integrating your delegate with ExecuTorch. -- Refer to [this](/docs/source/backend-delegates-dependencies.md) set of +- Refer to [this](docs/source/backend-delegates-dependencies.md) set of guidelines when including a third-party depenency for your delegate.   diff --git a/Package.swift b/Package.swift index 1322b918c07..b8a8b7d064b 100644 --- a/Package.swift +++ b/Package.swift @@ -15,7 +15,7 @@ // // For details on building frameworks locally or using prebuilt binaries, // see the documentation: -// https://pytorch.org/executorch/main/using-executorch-ios.html +// https://pytorch.org/executorch/main/using-executorch-ios import PackageDescription diff --git a/README-wheel.md b/README-wheel.md index 9f074ab5ee3..12906bfd382 100644 --- a/README-wheel.md +++ b/README-wheel.md @@ -10,32 +10,21 @@ The `executorch` pip package is in beta. 
The prebuilt `executorch.runtime` module included in this package provides a way to run ExecuTorch `.pte` files, with some restrictions: -* Only [core ATen - operators](https://pytorch.org/executorch/stable/ir-ops-set-definition.html) - are linked into the prebuilt module -* Only the [XNNPACK backend - delegate](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html) - is linked into the prebuilt module. -* \[macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) - and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend - delegates are also linked into the prebuilt module. +* Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module +* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module. +* \[macOS only] [Core ML](docs/source/backends-coreml.md) and [MPS](docs/source/backends-mps.md) backend + are also linked into the prebuilt module. -Please visit the [ExecuTorch website](https://pytorch.org/executorch/) for +Please visit the [ExecuTorch website](https://pytorch.org/executorch) for tutorials and documentation. Here are some starting points: -* [Getting - Started](https://pytorch.org/executorch/stable/getting-started-setup.html) +* [Getting Started](https://pytorch.org/executorch/main/getting-started-setup) * Set up the ExecuTorch environment and run PyTorch models locally. -* [Working with - local LLMs](https://pytorch.org/executorch/stable/llm/getting-started.html) +* [Working with local LLMs](docs/source/llm/getting-started.md) * Learn how to use ExecuTorch to export and accelerate a large-language model from scratch. -* [Exporting to - ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial.html) +* [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and optimizing its performance using quantization and hardware delegation. -* Running LLaMA on - [iOS](https://pytorch.org/executorch/stable/llm/llama-demo-ios.html) and - [Android](https://pytorch.org/executorch/stable/llm/llama-demo-android.html) - devices. +* Running LLaMA on [iOS](docs/source/llm/llama-demo-ios.md) and [Android](docs/source/llm/llama-demo-android.md) devices. * Build and run LLaMA in a demo mobile app, and learn how to integrate models with your own apps. diff --git a/README.md b/README.md index dd1fafe715b..c0d594e7733 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- Logo + Logo

ExecuTorch: A powerful on-device AI Framework

@@ -8,7 +8,7 @@ Contributors Stargazers Join our Discord community - Check out the documentation + Check out the documentation
@@ -49,9 +49,9 @@ Key value propositions of ExecuTorch are: ## Getting Started To get started you can: -- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index.html) on getting things running locally and deploy a model to a device -- Use this [Colab Notebook](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away -- Jump straight into LLMs use cases by following specific instructions for [Llama](./examples/models/llama/README.md) and [Llava](./examples/models/llava/README.md) +- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index) to get things running locally and deploy a model to a device +- Use this [Colab Notebook](https://pytorch.org/executorch/main/getting-started-setup#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away +- Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md) ## Feedback and Engagement @@ -65,62 +65,7 @@ We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md ## Directory Structure -``` -executorch -├── backends # Backend delegate implementations. -├── codegen # Tooling to autogenerate bindings between kernels and the runtime. -├── configurations -├── docs # Static docs tooling. -├── examples # Examples of various user flows, such as model export, delegates, and runtime execution. -├── exir # Ahead-of-time library: model capture and lowering APIs. -| ├── _serialize # Serialize final export artifact. -| ├── backend # Backend delegate ahead of time APIs -| ├── capture # Program capture. -| ├── dialects # Op sets for various dialects in the export process. -| ├── emit # Conversion from ExportedProgram to ExecuTorch execution instructions. -| ├── operator # Operator node manipulation utilities. -| ├── passes # Built-in compiler passes. -| ├── program # Export artifacts. -| ├── serde # Graph module serialization/deserialization. -| ├── verification # IR verification. -├── extension # Extensions built on top of the runtime. -| ├── android # ExecuTorch wrappers for Android apps. -| ├── apple # ExecuTorch wrappers for iOS apps. -| ├── aten_util # Converts to and from PyTorch ATen types. -| ├── data_loader # 1st party data loader implementations. -| ├── evalue_util # Helpers for working with EValue objects. -| ├── gguf_util # Tools to convert from the GGUF format. -| ├── kernel_util # Helpers for registering kernels. -| ├── memory_allocator # 1st party memory allocator implementations. -| ├── module # A simplified C++ wrapper for the runtime. -| ├── parallel # C++ threadpool integration. -| ├── pybindings # Python API for executorch runtime. -| ├── pytree # C++ and Python flattening and unflattening lib for pytrees. -| ├── runner_util # Helpers for writing C++ PTE-execution tools. -| ├── testing_util # Helpers for writing C++ tests. -| ├── training # Experimental libraries for on-device training -├── kernels # 1st party kernel implementations. -| ├── aten -| ├── optimized -| ├── portable # Reference implementations of ATen operators. -| ├── prim_ops # Special ops used in executorch runtime for control flow and symbolic primitives. -| ├── quantized -├── profiler # Utilities for profiling runtime execution. -├── runtime # Core C++ runtime. -| ├── backend # Backend delegate runtime APIs. -| ├── core # Core structures used across all levels of the runtime. 
-| ├── executor # Model loading, initialization, and execution. -| ├── kernel # Kernel registration and management. -| ├── platform # Layer between architecture specific code and portable C++. -├── schema # ExecuTorch PTE file format flatbuffer schemas. -├── scripts # Utility scripts for building libs, size management, dependency management, etc. -├── tools # Development tool management. -├── devtools # Model profiling, debugging, and introspection. -├── shim # Compatibility layer between OSS and Internal builds -├── test # Broad scoped end-to-end tests. -├── third-party # Third-party dependencies. -├── util # Various helpers and scripts. -``` +Please refer to the [Codebase structure](CONTRIBUTING.md#codebase-structure) section of the [Contributing Guidelines](CONTRIBUTING.md) for more details. ## License ExecuTorch is BSD licensed, as found in the LICENSE file. diff --git a/backends/apple/coreml/README.md b/backends/apple/coreml/README.md index e8a062774d0..d063dfc8b71 100644 --- a/backends/apple/coreml/README.md +++ b/backends/apple/coreml/README.md @@ -1,8 +1,7 @@ # ExecuTorch Core ML Delegate - This subtree contains the Core ML Delegate implementation for ExecuTorch. -Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. +Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends-coreml.md). ## Layout - `compiler/` : Lowers a module to Core ML backend. @@ -19,110 +18,6 @@ Core ML is an optimized framework for running machine learning models on Apple d - `workspace` : Xcode workspace for the runtime. - `third-party/`: External dependencies. -## Partition and Delegation - -To delegate a Program to the **Core ML** backend, the client must call `to_backend` with the **CoreMLPartitioner**. - -```python -import torch -import executorch.exir - -from executorch.backends.apple.coreml.compiler import CoreMLBackend -from executorch.backends.apple.coreml.partition import CoreMLPartitioner - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sin(x) - -source_model = Model() -example_inputs = (torch.ones(1), ) - -# Export the source model to Edge IR representation -aten_program = torch.export.export(source_model, example_inputs) -edge_program_manager = executorch.exir.to_edge(aten_program) - -# Delegate to Core ML backend -delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) - -# Serialize delegated program -executorch_program = delegated_program_manager.to_executorch() -with open("model.pte", "wb") as f: - f.write(executorch_program.buffer) -``` - -The module will be fully or partially delegated to **Core ML**, depending on whether all or part of ops are supported by the **Core ML** backend. User may force skip certain ops by `CoreMLPartitioner(skip_ops_for_coreml_delegation=...)` - -The `to_backend` implementation is a thin wrapper over [coremltools](https://apple.github.io/coremltools/docs-guides/), `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**. 
- -## Quantization - -To quantize a Program in a Core ML favored way, the client may utilize **CoreMLQuantizer**. - -```python -import torch -import executorch.exir - -from torch.export import export_for_training -from torch.ao.quantization.quantize_pt2e import ( - convert_pt2e, - prepare_pt2e, - prepare_qat_pt2e, -) - -from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer -from coremltools.optimize.torch.quantization.quantization_config import ( - LinearQuantizerConfig, - QuantizationScheme, -) - -class Model(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=3, out_channels=16, kernel_size=3, padding=1 - ) - self.relu = torch.nn.ReLU() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - a = self.conv(x) - return self.relu(a) - -source_model = Model() -example_inputs = (torch.randn((1, 3, 256, 256)), ) - -pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module() - -quantization_config = LinearQuantizerConfig.from_dict( - { - "global_config": { - "quantization_scheme": QuantizationScheme.symmetric, - "activation_dtype": torch.quint8, - "weight_dtype": torch.qint8, - "weight_per_channel": True, - } - } -) -quantizer = CoreMLQuantizer(quantization_config) - -# For post-training quantization, use `prepare_pt2e` -# For quantization-aware trainin,g use `prepare_qat_pt2e` -prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer) - -prepared_graph(*example_inputs) -converted_graph = convert_pt2e(prepared_graph) -``` - -The `converted_graph` is the quantized torch model, and can be delegated to **Core ML** similarly through **CoreMLPartitioner** - -## Runtime - -To execute a Core ML delegated program, the application must link to the `coremldelegate` library. Once linked there are no additional steps required, ExecuTorch when running the program would call the Core ML runtime to execute the Core ML delegated part of the program. - -Please follow the instructions described in the [Core ML setup](/backends/apple/coreml/setup.md) to link the `coremldelegate` library. - ## Help & Improvements If you have problems or questions or have suggestions for ways to make implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues). diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h index d97b3cf9b76..01655ca06c1 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.h @@ -7,7 +7,7 @@ #import -#import +#import "asset.h" NS_ASSUME_NONNULL_BEGIN diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm index 6b1723f7113..455edf89480 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAsset.mm @@ -5,15 +5,15 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "ETCoreMLAsset.h" + +#import "ETCoreMLLogging.h" +#import "objc_safe_cast.h" #import #import #import #import - -#import - namespace { using namespace executorchcoreml; @@ -85,6 +85,10 @@ - (void)dealloc { - (BOOL)_keepAliveAndReturnError:(NSError * __autoreleasing *)error { if (!_isValid) { + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "The asset with identifier = %@ is invalid. 
Some required asset files appear to be missing.", + _identifier); return NO; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h index 04fef204e1a..11d957044e9 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h @@ -7,7 +7,7 @@ #import -#import +#import "database.hpp" @class ETCoreMLAsset; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm index 73e9cc0f33b..256026e1f09 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm @@ -6,12 +6,14 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. #import "ETCoreMLAssetManager.h" -#import -#import -#import + +#import "ETCoreMLAsset.h" +#import "ETCoreMLLogging.h" +#import "database.hpp" +#import "json_key_value_store.hpp" +#import "serde_json.h" + #import -#import -#import #import namespace { @@ -365,8 +367,7 @@ - (void)cleanupAssetIfNeeded:(ETCoreMLAsset *)asset { NSError *cleanupError = nil; if (![self _removeAssetWithIdentifier:asset.identifier error:&cleanupError]) { ETCoreMLLogError(cleanupError, - "%@: Failed to remove asset with identifier = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to remove asset with identifier = %@", identifier); } }); @@ -440,9 +441,7 @@ - (void)triggerCompaction { dispatch_async(self.syncQueue, ^{ NSError *localError = nil; if (![weakSelf _compact:self.maxAssetsSizeInBytes error:&localError]) { - ETCoreMLLogError(localError, - "%@: Failed to compact asset store.", - NSStringFromClass(ETCoreMLAssetManager.class)); + ETCoreMLLogError(localError, "Failed to compact asset store."); } }); } @@ -486,11 +485,11 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier if ([result keepAliveAndReturnError:error]) { [self.assetsInUseMap setObject:result forKey:identifier]; - } else { - [self cleanupAssetIfNeeded:result]; - } + return result; + } - return result; + [self cleanupAssetIfNeeded:result]; + return nil; } - (BOOL)_containsAssetWithIdentifier:(NSString *)identifier @@ -587,8 +586,7 @@ - (BOOL)removeAssetWithIdentifier:(NSString *)identifier [assets addObject:asset]; } else if (localError) { ETCoreMLLogError(localError, - "%@: Failed to retrieve asset with identifier = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to retrieve asset with identifier = %@.", identifier); } @@ -647,8 +645,7 @@ - (NSUInteger)_compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing NSString *identifier = @(asset.identifier.c_str()); if (![self _removeAssetWithIdentifier:identifier error:&cleanupError] && cleanupError) { ETCoreMLLogError(cleanupError, - "%@: Failed to remove asset with identifier = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to remove asset with identifier = %@.", identifier); } } @@ -689,8 +686,7 @@ - (void)removeFilesInTrashDirectory { for (NSURL *itemURL in enumerator) { if (![fileManager removeItemAtURL:itemURL error:&localError]) { ETCoreMLLogError(localError, - "%@: Failed to remove item in trash directory with name = %@", - NSStringFromClass(ETCoreMLAssetManager.class), + "Failed to remove item in trash directory with name = %@", itemURL.lastPathComponent); } } @@ -720,9 +716,7 @@ - (BOOL)_purge:(NSError * __autoreleasing *)error { 
NSError *localError = nil; // Create the assets directory, if we fail here it's okay. if (![self.fileManager createDirectoryAtURL:self.assetsDirectoryURL withIntermediateDirectories:NO attributes:@{} error:&localError]) { - ETCoreMLLogError(localError, - "%@: Failed to create assets directory", - NSStringFromClass(ETCoreMLAssetManager.class)); + ETCoreMLLogError(localError, "Failed to create assets directory."); } return true; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h index 13b1023bcbc..3cf9e3df5f4 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.h @@ -1,13 +1,13 @@ // -// ETCoreMLDefaultModelExecutor.h -// executorchcoreml_tests +// ETCoreMLDefaultModelExecutor.h // -// Created by Gyan Sinha on 2/25/24. +// Copyright © 2024 Apple Inc. All rights reserved. // +// Please refer to the license found in the LICENSE file in the root directory of the source tree. #import -#import +#import "ETCoreMLModelExecutor.h" @class ETCoreMLModel; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 226307f3c8f..63bc60695ce 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -1,14 +1,14 @@ // -// ETCoreMLDefaultModelExecutor.m -// executorchcoreml_tests +// ETCoreMLDefaultModelExecutor.mm // -// Created by Gyan Sinha on 2/25/24. +// Copyright © 2024 Apple Inc. All rights reserved. // +// Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLDefaultModelExecutor.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" @implementation ETCoreMLDefaultModelExecutor @@ -27,7 +27,9 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable __unused)eventLogger error:(NSError * __autoreleasing *)error { if (self.ignoreOutputBackings) { - predictionOptions.outputBackings = @{}; + if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + predictionOptions.outputBackings = @{}; + } } id outputs = [self.model predictionFromFeatures:inputs @@ -44,8 +46,7 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { if (!featureValue.multiArrayValue) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorBrokenModel, - "%@: Model is broken, expected multiarray for output=%@.", - NSStringFromClass(self.class), + "Model is broken, expected multiarray for output=%@.", outputName); return nil; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h index d9c4d4ef638..d1bb7c2caa5 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.h @@ -6,9 +6,9 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. #import +#import #import -#import NS_ASSUME_NONNULL_BEGIN @@ -18,15 +18,15 @@ extern NSErrorDomain const ETCoreMLErrorDomain; /// The error codes that are exposed publicly. typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { ETCoreMLErrorCorruptedData = 1, // AOT blob can't be parsed. 
- ETCoreMLErrorCorruptedMetadata, // AOT blob has incorrect or missing metadata. - ETCoreMLErrorCorruptedModel, // AOT blob has incorrect or missing CoreML model. - ETCoreMLErrorBrokenModel, // CoreML model doesn't match the input and output specification. - ETCoreMLErrorCompilationFailed, // CoreML model failed to compile. - ETCoreMLErrorModelCompilationNotSupported, // CoreML model compilation is not supported by the target. - ETCoreMLErrorModelProfilingNotSupported, // Model profiling is not supported by the target. - ETCoreMLErrorModelSaveFailed, // Failed to save CoreML model to disk. - ETCoreMLErrorModelCacheCreationFailed, // Failed to create model cache. - ETCoreMLErrorInternalError, // Internal error. + ETCoreMLErrorCorruptedMetadata = 2, // AOT blob has incorrect or missing metadata. + ETCoreMLErrorCorruptedModel = 3, // AOT blob has incorrect or missing CoreML model. + ETCoreMLErrorBrokenModel = 4, // CoreML model doesn't match the input and output specification. + ETCoreMLErrorCompilationFailed = 5, // CoreML model failed to compile. + ETCoreMLErrorModelCompilationNotSupported = 6, // CoreML model compilation is not supported by the target. + ETCoreMLErrorModelProfilingNotSupported = 7, // Model profiling is not supported by the target. + ETCoreMLErrorModelSaveFailed = 8, // Failed to save CoreML model to disk. + ETCoreMLErrorModelCacheCreationFailed = 9, // Failed to create model cache. + ETCoreMLErrorInternalError = 10, // Internal error. }; @interface ETCoreMLErrorUtils : NSObject @@ -47,47 +47,47 @@ typedef NS_ERROR_ENUM(ETCoreMLErrorDomain, ETCoreMLError) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#if ET_LOG_ENABLED +#define ETCoreMLLogError(error, formatString, ...) \ + do { \ + NSString* message = error.localizedDescription; \ + message = [NSString stringWithFormat:@"[Core ML] " formatString " %@", ##__VA_ARGS__, message]; \ + ET_LOG(Error, "%s", message.UTF8String); \ + } while (0) +#else +#define ETCoreMLLogError(error, formatString, ...) \ + os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString " %@", ##__VA_ARGS__, error.localizedDescription) +#endif + +#if ET_LOG_ENABLED +#define ETCoreMLLogInfo(formatString, ...) \ + ET_LOG(Info, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String) +#else +#define ETCoreMLLogInfo(formatString, ...) os_log_info(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__) +#endif + /// Record the error with `os_log_error` and fills `*errorOut` with `NSError`. -#define ETCoreMLLogErrorAndSetNSError(errorOut, errorCode, formatString, ...) \ - if (ET_LOG_ENABLED) { \ - ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ - } else { \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, formatString, ##__VA_ARGS__); \ - } \ - if (errorOut) { \ - *errorOut = \ - [NSError errorWithDomain:ETCoreMLErrorDomain \ - code:errorCode \ - userInfo:@{ \ - NSLocalizedDescriptionKey : [NSString stringWithFormat:@formatString, ##__VA_ARGS__] \ - }]; \ - } +#define ETCoreMLLogErrorAndSetNSError(errorOut, errorCode, formatString, ...) 
\ + do { \ + NSDictionary* userInfo = \ + @{ NSLocalizedDescriptionKey : [NSString stringWithFormat:@formatString, ##__VA_ARGS__] }; \ + NSError* localError = [NSError errorWithDomain:ETCoreMLErrorDomain code:errorCode userInfo:userInfo]; \ + ETCoreMLLogError(localError, ""); \ + if (errorOut) { \ + *errorOut = localError; \ + } \ + } while (0) /// Record the error and its underlying error with `os_log_error` and fills `*errorOut` with `NSError`. #define ETCoreMLLogUnderlyingErrorAndSetNSError(errorOut, errorCode, underlyingNSError, formatString, ...) \ - if (ET_LOG_ENABLED) { \ - ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ - } else { \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with underlying error= %@.", \ - ##__VA_ARGS__, \ - (underlyingNSError).localizedDescription); \ - } \ - if (errorOut) { \ - *errorOut = [ETCoreMLErrorUtils errorWithCode:errorCode \ - underlyingError:underlyingNSError \ - format:@formatString, ##__VA_ARGS__]; \ - } - -#define ETCoreMLLogError(error, formatString, ...) \ - if (ET_LOG_ENABLED) { \ - ET_LOG(Error, "%s", [NSString stringWithFormat:@formatString, ##__VA_ARGS__].UTF8String); \ - } else { \ - os_log_error(ETCoreMLErrorUtils.loggingChannel, \ - formatString ", with error= %@.", \ - ##__VA_ARGS__, \ - (error).localizedDescription); \ - } + do { \ + ETCoreMLLogError(underlyingNSError, formatString, ##__VA_ARGS__); \ + if (errorOut) { \ + *errorOut = [ETCoreMLErrorUtils errorWithCode:errorCode \ + underlyingError:underlyingNSError \ + format:@formatString, ##__VA_ARGS__]; \ + } \ + } while (0) #pragma clang diagnostic pop diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm index 15d60d35704..f76b86a36b3 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLLogging.mm @@ -5,9 +5,9 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "ETCoreMLLogging.h" -#import +#import "ETCoreMLStrings.h" const NSErrorDomain ETCoreMLErrorDomain = @"com.apple.executorchcoreml"; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index 6b39ae5f920..4201293d1c5 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -5,7 +5,7 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "ETCoreMLModel.h" #import "ETCoreMLAsset.h" #import "ETCoreMLLogging.h" @@ -256,14 +256,23 @@ - (NSString *)identifier { } if (multiArrayArg && lCopyData) { - [multiArrayArg getMutableBytesWithHandler:^(void *_Nonnull mutableBytes, - NSInteger __unused size, - NSArray *strides) { - MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type(constraint.dataType).value(), + void (^copy_data)(void *, NSArray *) = ^(void *bytes, NSArray *strides) { + MultiArray buffer(bytes, MultiArray::MemoryLayout(to_multiarray_data_type(constraint.dataType).value(), layout.shape(), to_vector(strides))); arg.copy(buffer); - }]; + }; + + + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [multiArrayArg getMutableBytesWithHandler:^(void *_Nonnull mutableBytes, + NSInteger __unused size, + NSArray *strides) { + copy_data(mutableBytes, strides); + }]; + } else { + copy_data(multiArrayArg.dataPointer, multiArrayArg.strides); + } } [result addObject:multiArrayArg]; @@ -318,8 +327,7 @@ - (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { BOOL result = [self.mlModel prewarmUsingState:self.state error:error]; if (!result) { ETCoreMLLogError(localError, - "%@: Failed to prewarm model with identifier = %@", - NSStringFromClass(self.class), + "Failed to prewarm model with identifier = %@", self.identifier); } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm index c50bf3002fa..5b2c5a225a3 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm @@ -5,8 +5,10 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import -#import +#import "ETCoreMLModelCompiler.h" + +#import "ETCoreMLLogging.h" + #import @implementation ETCoreMLModelCompiler @@ -20,8 +22,7 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL (void)error; ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelCompilationNotSupported, - "%@: Model compilation is not supported on the target, please make sure to export a compiled model.", - NSStringFromClass(ETCoreMLModelCompiler.class)); + "Model compilation is not supported on the target, please make sure to export a compiled model."); return nil; #else __block NSError *localError = nil; @@ -37,11 +38,10 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC))); if (status != 0) { - ETCoreMLLogErrorAndSetNSError(error, - ETCoreMLErrorCompilationFailed, - "%@: Failed to compile model in %f seconds.", - NSStringFromClass(ETCoreMLModelCompiler.class), - maxWaitTimeInSeconds); + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCompilationFailed, + "Failed to compile model in %f seconds.", + maxWaitTimeInSeconds); return nil; } } else { @@ -50,10 +50,9 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL if (localError) { ETCoreMLLogErrorAndSetNSError(error, - ETCoreMLErrorCompilationFailed, - "%@: Failed to compile model, error: %@", - NSStringFromClass(ETCoreMLModelCompiler.class), - localError); + ETCoreMLErrorCompilationFailed, + "Failed to compile model, error = %@.", + localError); return nil; } else { return result; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm index 11690793baa..05aa910d954 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm @@ -5,14 +5,15 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import -#import -#import -#import -#import -#import +#import "ETCoreMLModelLoader.h" + +#import "asset.h" +#import "ETCoreMLAsset.h" +#import "ETCoreMLAssetManager.h" +#import "ETCoreMLDefaultModelExecutor.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "model_metadata.h" using namespace executorchcoreml; @@ -64,8 +65,7 @@ + (nullable ETCoreMLModel *)loadModelWithContentsOfURL:(NSURL *)compiledModelURL if (localError) { ETCoreMLLogError(localError, - "%@: Failed to load model from compiled asset with identifier = %@", - NSStringFromClass(ETCoreMLModelLoader.class), + "Failed to load model from compiled asset with identifier = %@", identifier); } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 3848f7c9b3c..c6da7750a11 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -5,6 +5,8 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
+#import "ETCoreMLModelManager.h" + #import "ETCoreMLAsset.h" #import "ETCoreMLAssetManager.h" #import "ETCoreMLDefaultModelExecutor.h" @@ -13,20 +15,20 @@ #import "ETCoreMLModelCompiler.h" #import "ETCoreMLModelExecutor.h" #import "ETCoreMLModelLoader.h" -#import "ETCoreMLModelManager.h" #import "ETCoreMLStrings.h" #import "MLModel_Prewarm.h" #import "MLMultiArray_Copy.h" -#import #import "inmemory_filesystem_utils.hpp" -#import -#import #import "model_metadata.h" #import "multiarray.h" #import "objc_array_util.h" +#import "serde_json.h" + +#import +#import +#import #import #import -#import "serde_json.h" #import #import #import @@ -73,11 +75,15 @@ BOOL is_backed_by_same_buffer(MLMultiArray *array1, MLMultiArray *array2) { __block BOOL result = NO; - [array1 getBytesWithHandler:^(const void *bytes1, NSInteger __unused size1){ - [array2 getBytesWithHandler:^(const void *bytes2, NSInteger __unused size2) { - result = (bytes1 == bytes2); + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [array1 getBytesWithHandler:^(const void *bytes1, NSInteger __unused size1){ + [array2 getBytesWithHandler:^(const void *bytes2, NSInteger __unused size2) { + result = (bytes1 == bytes2); + }]; }]; - }]; + } else { + result = (array1.dataPointer == array2.dataPointer); + } return result; } @@ -86,17 +92,19 @@ BOOL is_backed_by_same_buffer(MLMultiArray *array1, MLMultiArray *array2) { NSOrderedSet *output_names, NSError * __autoreleasing *error) { MLPredictionOptions *options = [MLPredictionOptions new]; - NSMutableDictionary *output_backings = [NSMutableDictionary new]; - NSEnumerator *enumerator = [output_names objectEnumerator]; - for (MLMultiArray *output in outputs) { - NSString *output_name = [enumerator nextObject]; - if (output_name.length == 0) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@: Model is broken.", NSStringFromClass(ETCoreMLModelManager.class)); - return nil; + if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + NSMutableDictionary *output_backings = [NSMutableDictionary dictionary]; + NSEnumerator *enumerator = [output_names objectEnumerator]; + for (MLMultiArray *output in outputs) { + NSString *output_name = [enumerator nextObject]; + if (output_name.length == 0) { + ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, "Model is broken."); + return nil; + } + output_backings[output_name] = output; } - output_backings[output_name] = output; + options.outputBackings = output_backings; } - options.outputBackings = output_backings; return options; } @@ -138,14 +146,25 @@ void set_outputs(NSArray *outputs, NSArray *mode } void copy(MLMultiArray *src, executorchcoreml::MultiArray& dst) { - [src getBytesWithHandler:^(const void * _Nonnull bytes, NSInteger size) { + void (^copy_data)(void *) = ^(void *bytes) { if (bytes == dst.data()) { return; } - - MultiArray::MemoryLayout src_layout(get_data_type(src.dataType).value(), to_vector(src.shape), to_vector(src.strides)); + + MultiArray::MemoryLayout src_layout( + get_data_type(src.dataType).value(), + to_vector(src.shape), + to_vector(src.strides) + ); MultiArray(const_cast(bytes), std::move(src_layout)).copy(dst); - }]; + }; + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [src getBytesWithHandler:^(const void * _Nonnull bytes, NSInteger size) { + copy_data(const_cast(bytes)); + }]; + } else { + copy_data(src.dataPointer); + } } void set_outputs(std::vector& outputs, @@ -212,8 +231,7 @@ void set_outputs(std::vector& outputs, ETCoreMLLogUnderlyingErrorAndSetNSError(error, 
ETCoreMLErrorModelSaveFailed, local_error, - "%@: Failed to create directory when saving model with identifier = %@.", - NSStringFromClass(ETCoreMLModelManager.class), + "Failed to create directory when saving model with identifier = %@.", identifier); return nil; } @@ -236,8 +254,7 @@ void set_outputs(std::vector& outputs, if (!inmemory_fs->write_item_to_disk(file_path, model_path, true, ec)) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelSaveFailed, - "%@: Failed to write model files to disk when saving model with identifier = %@.", - NSStringFromClass(ETCoreMLModelManager.class), + "Failed to write model files to disk when saving model with identifier = %@.", identifier); return nil; } @@ -395,8 +412,7 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier { modelAsset = [self.assetManager assetWithIdentifier:identifier error:&localError]; if (localError) { ETCoreMLLogError(localError, - "%@: Failed to retrieve asset with identifier = %@", - NSStringFromClass(self.assetManager.class), + "Failed to retrieve asset with identifier = %@.", identifier); } @@ -411,8 +427,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier if (!modelAssetType) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: AOT blob is missing model file.", - NSStringFromClass(ETCoreMLModelManager.class)); + "AOT blob is missing model file."); return nil; } @@ -420,11 +435,12 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier NSURL *modelURL = ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error); switch (modelAssetType.value()) { case ModelAssetType::CompiledModel: { + // Model is already compiled. return modelURL; } case ModelAssetType::Model: { - // we need to compiled the model. + // Compile the model. NSURL *compiledModelURL = [ETCoreMLModelCompiler compileModelAtURL:modelURL maxWaitTimeInSeconds:(5 * 60) error:error]; @@ -442,6 +458,12 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier NSString *identifier = @(metadata.identifier.c_str()); // Otherwise try to retrieve the compiled asset. ETCoreMLAsset *compiledModelAsset = [self assetWithIdentifier:identifier]; + if (compiledModelAsset) { + ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); + } else { + ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); + } + // Create a unique directory for writing model files. NSURL *dstURL = [self.assetManager.trashDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; auto modelAssetType = get_model_asset_type(inMemoryFS); @@ -499,9 +521,11 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier ETCoreMLAsset *asset = [self assetWithIdentifier:identifier]; ETCoreMLModel *model = asset ? get_model_from_asset(asset, configuration, metadata, error) : nil; if (model) { + ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); return [[ETCoreMLDefaultModelExecutor alloc] initWithModel:model]; } + ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); // Compile the model. 
NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier inMemoryFS:inMemoryFS @@ -531,8 +555,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier if (!inMemoryFS) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model data is corrupted.", - NSStringFromClass(ETCoreMLModelManager.class)); + "Model data is corrupted."); return nil; } @@ -540,8 +563,7 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier if (!metadata) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedMetadata, - "%@: Metadata is invalid or missing.", - NSStringFromClass(ETCoreMLModelManager.class)); + "Metadata is invalid or missing."); return nil; } @@ -607,9 +629,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { NSArray *assets = [self.assetManager mostRecentlyUsedAssetsWithMaxCount:maxCount error:&localError]; if (localError) { - ETCoreMLLogError(localError, - "%@: Failed to retrieve recently used assets.", - NSStringFromClass(self.assetManager.class)); + ETCoreMLLogError(localError, "Failed to retrieve recently used assets."); } if (assets.count == 0) { @@ -627,8 +647,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { NSError *prewarmError = nil; if (![asset prewarmAndReturnError:&prewarmError]) { ETCoreMLLogError(prewarmError, - "%@: Failed to prewarm asset with identifier = %@", - NSStringFromClass(strongSelf.assetManager.class), + "Failed to prewarm asset with identifier = %@", asset.identifier); return; } @@ -664,18 +683,20 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { NSArray *modelOutputs = [executor executeModelWithInputs:inputFeatures predictionOptions:predictionOptions - loggingOptions:loggingOptions + loggingOptions:loggingOptions eventLogger:eventLogger error:&localError]; // Try without output backings. 
- if (!modelOutputs && predictionOptions.outputBackings.count > 0) { - executor.ignoreOutputBackings = YES; - localError = nil; - modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:&localError]; + if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + if (!modelOutputs && predictionOptions.outputBackings.count > 0) { + executor.ignoreOutputBackings = YES; + localError = nil; + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; + } } if (error) { @@ -693,9 +714,8 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle id executor = [self executorWithHandle:handle]; if (!executor) { ETCoreMLLogErrorAndSetNSError(error, - 0, - "%@: Model is already unloaded.", - NSStringFromClass(self.class)); + ETCoreMLErrorInternalError, + "Model is already unloaded."); return NO; } @@ -703,8 +723,7 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle if (args.count != model.orderedInputNames.count + model.orderedOutputNames.count) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model is invalid, expected args count to be %lu but got %lu.", - NSStringFromClass(self.class), + "Model is invalid, expected args count to be %lu but got %lu.", static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), args.count); return NO; @@ -741,9 +760,8 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle id executor = [self executorWithHandle:handle]; if (!executor) { ETCoreMLLogErrorAndSetNSError(error, - 0, - "%@: Model is already unloaded.", - NSStringFromClass(self.class)); + ETCoreMLErrorInternalError, + "Model is already unloaded."); return NO; } @@ -751,8 +769,7 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle if (argsVec.size() != model.orderedInputNames.count + model.orderedOutputNames.count) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model is invalid, expected args count to be %lu but got %lu.", - NSStringFromClass(self.class), + "Model is invalid, expected args count to be %lu but got %lu.", static_cast(model.orderedInputNames.count + model.orderedOutputNames.count), argsVec.size()); return NO; diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm index d6f59666cf0..6a737d1e82b 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm @@ -5,10 +5,34 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "MLModel_Prewarm.h" +#include #import +namespace { + size_t get_number_of_bytes(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return 2; + } + case MLMultiArrayDataTypeFloat32: { + return 4; + } + case MLMultiArrayDataTypeInt32: { + return 4; + } + case MLMultiArrayDataTypeFloat64: { + return 8; + } + default: { + return 0; + } + } + } + +} + @interface MLMultiArray (Prewarm) + (nullable MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape @@ -28,11 +52,22 @@ + (MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape return nil; } - [multiArray getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { - uint8_t *start = reinterpret_cast(mutableBytes); - uint8_t *end = start + size; - std::fill(start, end, uint8_t(0)); - }]; + + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + void (^fill_zeroes)(void *, NSInteger) = ^(void *bytes, NSInteger size) { + uint8_t *start = reinterpret_cast(bytes); + uint8_t *end = start + size; + std::fill(start, end, uint8_t(0)); + }; + + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [multiArray getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { + fill_zeroes(mutableBytes, size); + }]; + } else { + fill_zeroes(multiArray.dataPointer, multiArray.count * get_number_of_bytes(multiArray.dataType)); + } + } return multiArray; } diff --git a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm index b8a10fcbbbc..313ee3edaf9 100644 --- a/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm +++ b/backends/apple/coreml/runtime/delegate/MLMultiArray_Copy.mm @@ -5,10 +5,10 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "MLMultiArray_Copy.h" -#import -#import +#import "objc_array_util.h" +#import "multiarray.h" namespace { using namespace executorchcoreml; @@ -27,13 +27,19 @@ MultiArray to_multi_array(void *data, @implementation MLMultiArray (Copy) - (void)copyInto:(MLMultiArray *)dstMultiArray { - [self getBytesWithHandler:^(const void *srcBytes, __unused NSInteger srcSize) { - [dstMultiArray getMutableBytesWithHandler:^(void *dstBytes, __unused NSInteger size, NSArray * strides) { - auto src = ::to_multi_array(const_cast(srcBytes), self.dataType, self.shape, self.strides); - auto dst = ::to_multi_array(dstBytes, dstMultiArray.dataType, dstMultiArray.shape, strides); - src.copy(dst); + if (@available(macOS 12.3, iOS 15.4, tvOS 15.4, watchOS 8.5, *)) { + [self getBytesWithHandler:^(const void *srcBytes, __unused NSInteger srcSize) { + [dstMultiArray getMutableBytesWithHandler:^(void *dstBytes, __unused NSInteger size, NSArray * strides) { + auto src = ::to_multi_array(const_cast(srcBytes), self.dataType, self.shape, self.strides); + auto dst = ::to_multi_array(dstBytes, dstMultiArray.dataType, dstMultiArray.shape, strides); + src.copy(dst); + }]; }]; - }]; + } else { + auto src = ::to_multi_array(self.dataPointer, self.dataType, self.shape, self.strides); + auto dst = ::to_multi_array(dstMultiArray.dataPointer, dstMultiArray.dataType, dstMultiArray.shape, dstMultiArray.strides); + src.copy(dst); + } } @end diff --git a/backends/apple/coreml/runtime/delegate/asset.mm b/backends/apple/coreml/runtime/delegate/asset.mm index c9a6e16d2af..6df2dfbd3c5 100644 --- a/backends/apple/coreml/runtime/delegate/asset.mm +++ b/backends/apple/coreml/runtime/delegate/asset.mm @@ -1,16 +1,16 @@ // -// ModelAsset.cpp +// asset.cpp // // Copyright © 2024 Apple Inc. All rights reserved. // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "asset.h" #import -#import +#import "objc_safe_cast.h" namespace { diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.h b/backends/apple/coreml/runtime/delegate/backend_delegate.h index 9af3df01af2..93c420e11d2 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.h +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.h @@ -7,7 +7,8 @@ #pragma once -#include +#include "model_logging_options.h" + #include #include #include diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm index d8096e16781..2cb274f0a89 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm @@ -6,13 +6,15 @@ // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
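
The copy paths touched above (MLMultiArray_Copy and the MultiArray copies in ETCoreMLModel / ETCoreMLModelManager) route everything through a MultiArray that carries dtype, shape and strides, so the pre-iOS 15.4 fallback via `dataPointer` still performs a layout-aware element copy rather than a raw byte copy. A rough sketch of the idea; numpy is used purely as an illustration and is not part of this change:

```python
import numpy as np

# Same logical shape and dtype, different memory layouts: src is row-major
# (C order), dst is column-major (Fortran order), so their strides differ.
src = np.arange(6, dtype=np.float32).reshape(2, 3)   # strides (12, 4)
dst = np.zeros((2, 3), dtype=np.float32, order="F")  # strides (4, 8)

# A raw memcpy of the two buffers would scramble elements; an element-wise
# copy that honours both layouts (conceptually what MultiArray::copy does)
# preserves the logical contents.
np.copyto(dst, src)
assert np.array_equal(dst, src)
print(src.strides, dst.strides)  # (12, 4) (4, 8)
```
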
-#import -#import -#import -#import -#import -#import -#import +#import "backend_delegate.h" + +#import "ETCoreMLAssetManager.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "ETCoreMLModelManager.h" +#import "ETCoreMLStrings.h" +#import "model_event_logger.h" +#import "multiarray.h" namespace { using namespace executorchcoreml; @@ -282,6 +284,9 @@ explicit BackendDelegateImpl(const Config& config) noexcept ModelHandle *modelHandle = [model_manager_ loadModelFromAOTData:data configuration:configuration error:&localError]; + if (localError != nil) { + ETCoreMLLogError(localError, "Model init failed"); + } return modelHandle; } @@ -290,13 +295,16 @@ bool execute(Handle* handle, const ModelLoggingOptions& logging_options, ModelEventLogger *event_logger, std::error_code& ec) const noexcept override { - NSError *error = nil; + NSError *localError = nil; if (![model_manager_ executeModelWithHandle:handle argsVec:args loggingOptions:logging_options eventLogger:event_logger - error:&error]) { - ec = static_cast(error.code); + error:&localError]) { + if (localError != nil) { + ETCoreMLLogError(localError, "Model execution failed"); + ec = static_cast(localError.code); + } return false; } diff --git a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm index 380ec52b7d7..028191ce497 100644 --- a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm @@ -5,22 +5,25 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import -#import -#import +#import "coreml_backend/delegate.h" + +#import "backend_delegate.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "ETCoreMLStrings.h" +#import "model_event_logger.h" +#import "model_logging_options.h" +#import "multiarray.h" +#import "objc_safe_cast.h" + #import #import #import + +#include #import -#import -#import -#import -#import #import #import -#include #ifdef ET_EVENT_TRACER_ENABLED #import diff --git a/backends/apple/coreml/runtime/delegate/model_metadata.h b/backends/apple/coreml/runtime/delegate/model_metadata.h index 275aa39dd3b..8d0c1f0914d 100644 --- a/backends/apple/coreml/runtime/delegate/model_metadata.h +++ b/backends/apple/coreml/runtime/delegate/model_metadata.h @@ -10,7 +10,7 @@ #import #import -#import +#import "serde_json.h" namespace executorchcoreml { diff --git a/backends/apple/coreml/runtime/delegate/multiarray.mm b/backends/apple/coreml/runtime/delegate/multiarray.mm index de705991780..d38ac377799 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.mm +++ b/backends/apple/coreml/runtime/delegate/multiarray.mm @@ -6,13 +6,14 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "multiarray.h" + +#import "objc_array_util.h" #import #import #import #import -#import #import #import diff --git a/backends/apple/coreml/runtime/delegate/serde_json.mm b/backends/apple/coreml/runtime/delegate/serde_json.mm index 3568ffe4ce8..e39df4d734e 100644 --- a/backends/apple/coreml/runtime/delegate/serde_json.mm +++ b/backends/apple/coreml/runtime/delegate/serde_json.mm @@ -5,11 +5,11 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import +#import "serde_json.h" -#import -#import -#import +#import "asset.h" +#import "objc_json_serde.h" +#import "model_metadata.h" namespace { struct FileInfoKeys { diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index 988b5d808a0..87e086c5bbd 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -65,9 +65,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod assetManager:assetManager error:&localError]; if (!model) { - ETCoreMLLogError(localError, - "%@: Failed to create model profiler.", - NSStringFromClass(ETCoreMLAssetManager.class)); + ETCoreMLLogError(localError, "Failed to create model profiler."); } self = [super init]; @@ -98,8 +96,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod if (!self.profiler) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelProfilingNotSupported, - "%@: Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4.", - NSStringFromClass(ETCoreMLModelAnalyzer.class)); + "Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4."); return nil; } @@ -125,8 +122,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod if (!self.modelAsset) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedData, - "%@: There is no mlpackage, mlpackage is required for debugging a model. Please check the export path.", - NSStringFromClass(ETCoreMLModelAnalyzer.class)); + "The AOT blob is missing an 'mlpackage', which is required for debugging the model. Please check the export path."); return nil; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm index 3be28b56d66..1cac0de40f3 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm @@ -7,7 +7,6 @@ #import "ETCoreMLModelDebugger.h" -#import #import "ETCoreMLAsset.h" #import "ETCoreMLAssetManager.h" #import "ETCoreMLLogging.h" @@ -16,12 +15,14 @@ #import "ETCoreMLModelStructurePath.h" #import "ETCoreMLPair.h" #import "ETCoreMLStrings.h" -#import -#import -#import -#import +#import "format/MIL.pb.h" +#import "format/Model.pb.h" #import "model_package_info.h" #import "objc_json_serde.h" + +#import +#import +#import #import #import @@ -43,13 +44,19 @@ const auto& info_value = info.value(); auto it = info_value.items.find(info_value.root_model_identifier); if (it == info_value.items.end()) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@ is broken, root model info doesn't exist.", model_package_url.lastPathComponent); + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "%@ is broken, root model info doesn't exist.", + model_package_url.lastPathComponent); return nil; } auto path = it->second.path; if (path.empty()) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@ is broken, root model path doesn't exist.", model_package_url.lastPathComponent); + ETCoreMLLogErrorAndSetNSError(error, + ETCoreMLErrorCorruptedModel, + "%@ is broken, root model path doesn't exist.", + model_package_url.lastPathComponent); return nil; } @@ -350,8 +357,8 @@ void set_model_outputs(id output_features, NSMutableArray *values = [NSMutableArray arrayWithCapacity:output_names.count]; for (NSString *output_name in 
output_names) { MLFeatureValue *feature_value = [output_features featureValueForName:output_name]; - NSCAssert(feature_value.multiArrayValue != nil, @"%@: Expected a multiarray value for output name=%@.", - NSStringFromClass(ETCoreMLModelDebugger.class), + NSCAssert(feature_value.multiArrayValue != nil, + @"Expected a multiarray value for output name=%@.", output_name); [values addObject:feature_value.multiArrayValue]; } @@ -570,8 +577,7 @@ - (nullable ETCoreMLAsset *)compiledModelAssetWithOutputsAtPaths:(NSArray #import diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm index 5998701eb0f..e381bbb03d1 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm @@ -14,9 +14,10 @@ #import "ETCoreMLOperationProfilingInfo.h" #import "ETCoreMLPair.h" #import "ETCoreMLStrings.h" +#import "program_path.h" + #import #import -#import "program_path.h" namespace { using namespace executorchcoreml::modelstructure; @@ -42,8 +43,7 @@ ETCoreMLLogUnderlyingErrorAndSetNSError(error, ETCoreMLErrorCompilationFailed, local_error, - "%@: Failed to get compute plan of model with name=%@.", - NSStringFromClass(ETCoreMLModelProfiler.class), + "Failed to get compute plan of model with name=%@.", model_url.lastPathComponent); return nil; } @@ -288,8 +288,7 @@ - (nullable instancetype)initWithModel:(ETCoreMLModel *)model #endif ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelProfilingNotSupported, - "%@: Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4.", - NSStringFromClass(self.class)); + "Model profiling is only available for macOS >= 14.4, iOS >= 17.4, tvOS >= 17.4 and watchOS >= 10.4."); return nil; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h index 80c49f8965e..13d4bb8e6ac 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.h @@ -7,7 +7,7 @@ #import -#import +#import "ETCoreMLComputeUnits.h" NS_ASSUME_NONNULL_BEGIN diff --git a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm index 12ac8ec15a3..be34e384b72 100644 --- a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm +++ b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm @@ -9,11 +9,13 @@ #import "ETCoreMLModelStructurePath.h" #import "ETCoreMLOperationProfilingInfo.h" -#import #import "objc_array_util.h" +#import "MLMultiArray_Copy.h" + +#import + #import #import -#import "MLMultiArray_Copy.h" namespace { diff --git a/backends/apple/coreml/runtime/sdk/model_package_info.mm b/backends/apple/coreml/runtime/sdk/model_package_info.mm index b7b26178fde..f4b13048718 100644 --- a/backends/apple/coreml/runtime/sdk/model_package_info.mm +++ b/backends/apple/coreml/runtime/sdk/model_package_info.mm @@ -66,7 +66,7 @@ static void from_json(id json, ModelPackageInfo& package_info) { NSURL *manifest_url = [model_package_url URLByAppendingPathComponent:@"manifest.json"].URLByStandardizingPath; BOOL is_directory = NO; if (![fm fileExistsAtPath:manifest_url.path isDirectory:&is_directory] || is_directory) { - ETCoreMLLogErrorAndSetNSError(error, 0, "%@ is broken, manifest doesn't exist.", model_package_url.lastPathComponent); + ETCoreMLLogErrorAndSetNSError(error, 
ETCoreMLErrorCorruptedModel, "%@ is broken, manifest doesn't exist.", model_package_url.lastPathComponent); return std::nullopt; } diff --git a/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm b/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm index 3c0908201ac..50b0f2ec766 100644 --- a/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm @@ -250,16 +250,14 @@ + (BOOL)extractModelAssetAndMetadataFromAOTData:(NSData *)data if (!inMemoryFS) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedModel, - "%@: Model data is corrupted.", - NSStringFromClass(ETCoreMLTestUtils.class)); + "Model data is corrupted."); return NO; } if (!extract_model_metadata(*inMemoryFS, metadata) || !metadata.is_valid()) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorCorruptedMetadata, - "%@: Model metadata is corrupted.", - NSStringFromClass(ETCoreMLTestUtils.class)); + "Model metadata is corrupted."); return NO; } @@ -269,8 +267,7 @@ + (BOOL)extractModelAssetAndMetadataFromAOTData:(NSData *)data if (![fileManager createDirectoryAtURL:modelURL withIntermediateDirectories:NO attributes:@{} error:error]) { ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorModelSaveFailed, - "%@: Failed to create directory when saving model with name = %@.", - NSStringFromClass(ETCoreMLTestUtils.class), + "Failed to create directory when saving model with name = %@.", modelURL.lastPathComponent); return NO; } diff --git a/backends/apple/coreml/runtime/test/setup.md b/backends/apple/coreml/runtime/test/setup.md index 9876dfc8a3d..1e038c306a4 100644 --- a/backends/apple/coreml/runtime/test/setup.md +++ b/backends/apple/coreml/runtime/test/setup.md @@ -4,18 +4,18 @@ This is a tutorial for setting up tests for the **Core ML** backend. ## Running tests -1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment. +1. Follow the instructions described in [Setting Up ExecuTorch](../../../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment. 2. Run `install_requirements.sh` to install dependencies required by the **Core ML** backend. ```bash cd executorch -sh backends/apple/coreml/scripts/install_requirements.sh +sh backends/apple/coreml/scripts/install_requirements.sh -``` +``` -3. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system. +3. Follow the instructions described in [Building with CMake](../../../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system. 4. Install [Xcode](https://developer.apple.com/xcode/). @@ -26,7 +26,7 @@ sh backends/apple/coreml/scripts/install_requirements.sh ```bash cd executorch -# Builds macOS universal test bundle. +# Builds macOS universal test bundle. sh backends/apple/coreml/srcipts/build_tests.sh @@ -40,7 +40,7 @@ cd executorch sh backends/apple/coreml/srcipts/run_tests.sh ``` - + ## Updating tests 1. Open the Xcode workspace. @@ -48,7 +48,7 @@ sh backends/apple/coreml/srcipts/run_tests.sh ```bash cd executorch -# Builds macOS universal test bundle. +# Builds macOS universal test bundle. open backends/apple/coreml/runtime/workspace/executorchcoreml.xcworkspace @@ -62,4 +62,4 @@ cd executorch # There is no need to build the tests. 
sh backends/apple/coreml/srcipts/run_tests.sh -``` \ No newline at end of file +``` diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index c6daae0d989..887873d4911 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -4,7 +4,7 @@ This is a tutorial for setting up the Core ML backend. ## AOT Setup -1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment. +1. Follow the instructions described in [Setting Up ExecuTorch](../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment. 2. Run the example script to validate that the **Core ML** backend is set up correctly. @@ -28,7 +28,7 @@ delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) ## Integrating Core ML delegate into runtime. -1. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system. +1. Follow the instructions described in [Building with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system. 2. Install [Xcode](https://developer.apple.com/xcode/). diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py index 749f32a04e5..2982ebc2e01 100644 --- a/backends/apple/mps/mps_preprocess.py +++ b/backends/apple/mps/mps_preprocess.py @@ -6,6 +6,7 @@ from typing import ClassVar, Dict, final, List, Tuple import torch +from executorch import exir from executorch.backends.apple.mps.operators.node_visitor import ( get_node_visitors, @@ -35,6 +36,7 @@ from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from executorch.exir.program._program import _transform +from executorch.exir.verification.verifier import EXIREdgeDialectVerifier from torch.export.exported_program import ExportedProgram FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" @@ -87,7 +89,19 @@ def preprocess( # the `output_ids` array in the schema. # TODO: Remove this once we have a better support for the dim-order ops. - edge_program = _transform(edge_program, DimOrderOpsRevertPass()) + # Need to override the verifier to skip the non dim-order ops from tripping the default verifier. + edge_program = _transform( + edge_program, + DimOrderOpsRevertPass(), + override_verifiers=[ + EXIREdgeDialectVerifier( + edge_compile_config=exir.EdgeCompileConfig( + _check_ir_validity=False, # Disable the edge dialect verifier, since we are in the mps backend. 
+ ), + class_only=True, + ) + ], + ) mps_graph = MPSGraph( version="0", diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index 5c14ad673df..bd688fe8b78 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -12,11 +12,11 @@ The MPS backend device maps machine learning computational graphs and primitives ::: :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) -* [ExecuTorch iOS Demo App](demo-apps-ios.md) -* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md) +* [Introduction to ExecuTorch](../../../docs/source/intro-how-it-works.md) +* [Setting up ExecuTorch](../../../docs/source/getting-started-setup.rst) +* [Building ExecuTorch with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake) +* [ExecuTorch iOS Demo App](../../../docs/source/demo-apps-ios.md) +* [ExecuTorch iOS LLaMA Demo App](../../../docs/source/llm/llama-demo-ios.md) ::: :::: @@ -40,7 +40,7 @@ In order to be able to successfully build and run a model using the MPS backend ## Setting up Developer Environment -***Step 1.*** Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup). +***Step 1.*** Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup). ***Step 2.*** Install dependencies needed to lower MPS delegate: @@ -111,12 +111,12 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp ``` ### Profiling: -1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model. +1. [Optional] Generate an [ETRecord](../../../docs/source/etrecord.rst) while you're exporting your model. ```bash cd executorch python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b ``` -2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md). +2. Run your Program on the ExecuTorch runtime and generate an [ETDump](../../../docs/source/etdump.md). 
``` ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs ``` diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index dd7f3d02518..ddca8ea4a06 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -20,13 +20,16 @@ from .convert_to_clamp import ConvertToClampPass # noqa from .decompose_batchnorm_pass import DecomposeBatchNormPass # noqa from .decompose_div_pass import DecomposeDivPass # noqa +from .decompose_gelu_pass import DecomposeGeluPass # noqa from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass # noqa from .decompose_linear_pass import DecomposeLinearPass # noqa from .decompose_meandim_pass import DecomposeMeanDimPass # noqa from .decompose_select import DecomposeSelectPass # noqa +from .decompose_silu_pass import DecomposeSiluPass # noqa from .decompose_softmax_pass import DecomposeSoftmaxPass # noqa from .decompose_softmax_unstable_pass import DecomposeSoftmaxUnstablePass # noqa +from .decompose_sqrt_pass import DecomposeSqrtPass # noqa from .decompose_var_pass import DecomposeVarPass # noqa from .fold_qdq_with_annotated_qparams_pass import ( # noqa FoldAndAnnotateQParamsPass, diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 703c6ff214c..dd4ca7ad7bd 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -25,13 +25,16 @@ ConvertToClampPass, DecomposeBatchNormPass, DecomposeDivPass, + DecomposeGeluPass, DecomposeLayerNormPass, DecomposeLeakyReLUPass, DecomposeLinearPass, DecomposeMeanDimPass, DecomposeSelectPass, + DecomposeSiluPass, DecomposeSoftmaxPass, DecomposeSoftmaxUnstablePass, + DecomposeSqrtPass, DecomposeVarPass, FoldAndAnnotateQParamsPass, FuseBatchnorm2DPass, @@ -115,6 +118,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul return self._transform(exported_program.graph_module) def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule: + self.add_pass(DecomposeSqrtPass()) self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI()) self.add_pass(FuseQuantizedActivationPass()) self.add_pass(RemoveGetItemPass()) @@ -130,6 +134,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(ConvertMeanDimToAveragePoolPass()) self.add_pass(DecomposeDivPass()) self.add_pass(DecomposeSoftmaxPass()) + self.add_pass(DecomposeGeluPass()) self.add_pass(ConvertFullLikeToFullPass()) self.add_pass(ConvertToClampPass()) self.add_pass(ConvertMinMaxPass()) @@ -162,12 +167,22 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul return self._transform(exported_program.graph_module) + def _tosa_1_0_int_quantized_pipeline(self, exported_program: ExportedProgram): + return self._tosa_080_BI_pipeline(exported_program) + + def _tosa_1_0_fp_pipeline(self, exported_program: ExportedProgram): + return self._tosa_080_MI_pipeline(exported_program) + def transform_to_backend_pipeline(self, exported_program: ExportedProgram): """Apply passes before transforming program to backend""" if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"): return self._tosa_080_BI_pipeline(exported_program) elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"): return self._tosa_080_MI_pipeline(exported_program) + elif self.tosa_spec == 
TosaSpecification.create_from_string("TOSA-1.0+FP"): + return self._tosa_1_0_fp_pipeline(exported_program) + elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-1.0+INT"): + return self._tosa_1_0_int_quantized_pipeline(exported_program) else: raise NotImplementedError( f"No pass pipeline implemented for {self.tosa_spec=}" @@ -181,6 +196,8 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(DecomposeMeanDimPass()) self.add_pass(DecomposeDivPass()) self.add_pass(DecomposeLeakyReLUPass()) + self.add_pass(DecomposeSqrtPass()) + self.add_pass(DecomposeSiluPass()) if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset: # Numerically stable softmax uses amax which is not supported on Ethos-U55 diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py index 3b97b944fd4..87512f9fb3c 100644 --- a/backends/arm/_passes/cast_int64_pass.py +++ b/backends/arm/_passes/cast_int64_pass.py @@ -12,7 +12,6 @@ from torch._export.utils import is_buffer logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) class CastInt64BuffersToInt32Pass(ExportPass): diff --git a/backends/arm/_passes/decompose_gelu_pass.py b/backends/arm/_passes/decompose_gelu_pass.py new file mode 100644 index 00000000000..6e72175e68b --- /dev/null +++ b/backends/arm/_passes/decompose_gelu_pass.py @@ -0,0 +1,149 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +torch_gelu = (torch.ops.aten.gelu.default,) + +edge_gelu = (exir_ops.edge.aten.gelu.default,) + + +def _get_gelu_ops(op) -> tuple: + """ + Returns the operators needed to decompose GELU + """ + + if op in edge_gelu: + return ( + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.tanh.default, + exir_ops.edge.aten.erf.default, + ) + if op in torch_gelu: + return ( + torch.ops.aten.full.default, + torch.ops.aten.add.Tensor, + torch.ops.aten.mul.Tensor, + torch.ops.aten.tanh.default, + torch.ops.aten.erf.default, + ) + raise RuntimeError(f"Can't get GeLU decomposition ops for op {op}") + + +class DecomposeGeluPass(ExportPass): + """ + This pass decomposes the GELU operator into primitive ops. + Aiming to adhere closely to the reference implementations built into + ExecuTorch. Including using the same pre-calculated constants. + + This operator has two formulae depending on the value of the + approximate argument. Examples below include the added full + operators necessary for the initialization for constants used in + each respective formula. 
+ + aten.gelu(x, approximate="none") becomes: + %FULL_0_5 = full() + %FULL_1 = full() + %FULL_SQRT1_2 = full() + %op1 = mul(x, %FULL_SQRT1_2) + %op2 = erf(%op1) + %op3 = add(%op2, %FULL_1) + %op4 = mul(%op3, %FULL_0_5) + %op5 = mul(%x, %op4) + + aten.gelu(x, approximate="tanh") becomes: + %FULL_0_5 = full() + %FULL_1 = full() + %FULL_SQRT2 = full() + %FULL_2_SQRTPI = full() + %FULL_CUBE_COEFF = full() + %SQRT_MUL = mul(%FULL_SQRT2, %FULL_2_SQRTPI) + %SQRT_2_PI = mul(%SQRT_MUL, %FULL_0_5) + %sqr_x = mul(x, x) + %cube_x = mul(sqr_x, x) + %op1 = mul(%cube_x, %FULL_CUBE_COEFF) + %op2 = add(%x, %op1) + %op3 = mul(%op2, %SQRT_2_PI) + %op4 = tanh(%op3) + %op5 = add(%op4, %FULL_1) + %op6 = mul(%x, %op5) + %op7 = mul(%op6, %FULL_0_5) + """ + + def call_operator(self, op, args, kwargs, meta): + if op not in torch_gelu + edge_gelu: + return super().call_operator(op, args, kwargs, meta) + + full_op, add_op, mul_op, tanh_op, erf_op = _get_gelu_ops(op) + + input = get_node_arg(args, 0) + # If approximate is default (none) it does not appear in kwargs + approximate = get_node_arg(kwargs, "approximate", "none") + + shape = meta["val"].size() + dtype = meta["val"].dtype + + FULL_0_5 = super().call_operator( + full_op, ([1] * len(shape), 0.5), {"dtype": dtype}, meta + ) + FULL_1 = super().call_operator( + full_op, ([1] * len(shape), 1), {"dtype": dtype}, meta + ) + + if approximate == "none": + # Constant mirrors ExecuTorch implementation for parity. + FULL_SQRT1_2 = super().call_operator( + full_op, ([1] * len(shape), 0.70710678118654752440), {}, meta + ) + + op1 = super().call_operator(mul_op, (input, FULL_SQRT1_2), {}, meta) + op2 = super().call_operator(erf_op, (op1,), {}, meta) + op3 = super().call_operator(add_op, (op2, FULL_1), {}, meta) + op4 = super().call_operator(mul_op, (op3, FULL_0_5), {}, meta) + return super().call_operator(mul_op, (input, op4), {}, meta) + + elif approximate == "tanh": + # Constants mirror ExecuTorch implementation for parity. + FULL_SQRT2 = super().call_operator( + full_op, + ([1] * len(shape), 1.41421356237309504880), + {"dtype": dtype}, + meta, + ) + FULL_2_SQRTPI = super().call_operator( + full_op, + ([1] * len(shape), 1.12837916709551257390), + {"dtype": dtype}, + meta, + ) + FULL_CUBE_COEFF = super().call_operator( + full_op, ([1] * len(shape), 0.044715), {"dtype": dtype}, meta + ) + + # Mirrors ExecuTorch implementations for calculating this value + SQRT_MUL = super().call_operator( + mul_op, (FULL_SQRT2, FULL_2_SQRTPI), {}, meta + ) + SQRT_2_PI = super().call_operator(mul_op, (SQRT_MUL, FULL_0_5), {}, meta) + + # Avoiding using POW in order to reduce pass order reliance. 
+ sqr_x = super().call_operator(mul_op, (input, input), {}, meta) + cube_x = super().call_operator(mul_op, (sqr_x, input), {}, meta) + op1 = super().call_operator(mul_op, (cube_x, FULL_CUBE_COEFF), {}, meta) + op2 = super().call_operator(add_op, (input, op1), {}, meta) + op3 = super().call_operator(mul_op, (op2, SQRT_2_PI), {}, meta) + op4 = super().call_operator(tanh_op, (op3,), {}, meta) + op5 = super().call_operator(add_op, (op4, FULL_1), {}, meta) + op6 = super().call_operator(mul_op, (input, op5), {}, meta) + return super().call_operator(mul_op, (op6, FULL_0_5), {}, meta) + else: + raise RuntimeError( + f"approximate argument expected 'none' or 'tanh' but got {approximate}" + ) diff --git a/backends/arm/_passes/decompose_silu_pass.py b/backends/arm/_passes/decompose_silu_pass.py new file mode 100644 index 00000000000..68ebb3f4515 --- /dev/null +++ b/backends/arm/_passes/decompose_silu_pass.py @@ -0,0 +1,34 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import torch +from executorch.exir.pass_base import ExportPass + +aten_silu_ops = (torch.ops.aten.silu.default, torch.ops.aten.silu_.default) + + +class DecomposeSiluPass(ExportPass): + """ + This pass decomposes silu into a mul and a sigmoid node. + + Example: + y = silu(a) + Becomes: + x = sigmoid(a) + y = mul(a,x) + """ + + def call_operator(self, op, args, kwargs, meta): + if op not in (aten_silu_ops): + return super().call_operator(op, args, kwargs, meta) + sigmoid_op = torch.ops.aten.sigmoid.default + mul_op = torch.ops.aten.mul.Tensor + + original = args[0] + sigmoid = super().call_operator(sigmoid_op, (original,), {}, meta) + + return super().call_operator(mul_op, (original, sigmoid), {}, meta) diff --git a/backends/arm/_passes/decompose_sqrt_pass.py b/backends/arm/_passes/decompose_sqrt_pass.py new file mode 100644 index 00000000000..d4a678affea --- /dev/null +++ b/backends/arm/_passes/decompose_sqrt_pass.py @@ -0,0 +1,39 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +edge_sqrt_ops = (exir_ops.edge.aten.sqrt.default,) +aten_sqrt_ops = ( + torch.ops.aten.sqrt.default, + torch.ops.aten.sqrt_.default, +) + + +def get_sqrt_decomposition(op) -> tuple: + # TODO : "MLETORCH-863 : Replace current sqrt -> pow.Tensor_Scalar workaround with pow.Tensor_Tensor" + if op in edge_sqrt_ops: + return exir_ops.edge.aten.pow.Tensor_Scalar + if op in aten_sqrt_ops: + return torch.ops.aten.pow.Tensor_Scalar + raise RuntimeError(f"Can't get sqrt decomposition for op {op}") + + +class DecomposeSqrtPass(ExportPass): + + def call_operator(self, op, args, kwargs, meta): + """ + Decomposes `sqrt(x)` into `pow(x, 0.5)` for backend support. 
+ """ + + if op not in (edge_sqrt_ops + aten_sqrt_ops): + return super().call_operator(op, args, kwargs, meta) + + pow_op = get_sqrt_decomposition(op) + + return super().call_operator(pow_op, (args[0], 0.5), {}, meta) diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py index 02510600d82..a5f66829da5 100644 --- a/backends/arm/_passes/insert_table_ops.py +++ b/backends/arm/_passes/insert_table_ops.py @@ -56,6 +56,7 @@ class TableOps: # Targets that must be treated explicitly special_table_ops: Set[EdgeOpOverload] = { exir_ops.edge.aten.pow.Tensor_Scalar, + exir_ops.edge.aten.gelu.default, } def __init__(self, exported_program: ExportedProgram): @@ -76,6 +77,19 @@ def __getitem__(self, node: Node): # Exponent is a constant. Embed it into a lambda. exp = cast(int, node.args[1]) return lambda x: torch.pow(x, exp).flatten() + case exir_ops.edge.aten.gelu.default: + # If kwargs not present it is default "none" + approximate = cast( + str, + ( + node.kwargs["approximate"] + if "approximate" in node.kwargs + else "none" + ), + ) + return lambda x: torch.nn.functional.gelu( + x, approximate=approximate + ).flatten() case _: # Op must be handled if it's inside self.special_ops raise AssertionError("Unhandled table operation") diff --git a/backends/arm/_passes/match_arg_ranks_pass.py b/backends/arm/_passes/match_arg_ranks_pass.py index 2cfc9b2b86a..3554fc0954c 100644 --- a/backends/arm/_passes/match_arg_ranks_pass.py +++ b/backends/arm/_passes/match_arg_ranks_pass.py @@ -48,6 +48,9 @@ def __init__(self, exported_program): exir_ops.edge.aten.bitwise_right_shift.Tensor, exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.lt.Tensor, exir_ops.edge.aten.pow.Tensor_Tensor, exir_ops.edge.aten.where.self, ] diff --git a/backends/arm/_passes/replace_scalar_with_tensor_pass.py b/backends/arm/_passes/replace_scalar_with_tensor_pass.py index 97e89132979..fed72e664f5 100644 --- a/backends/arm/_passes/replace_scalar_with_tensor_pass.py +++ b/backends/arm/_passes/replace_scalar_with_tensor_pass.py @@ -26,6 +26,9 @@ exir_ops.edge.aten.__rshift__.Scalar: exir_ops.edge.aten.bitwise_right_shift.Tensor, exir_ops.edge.aten.__lshift__.Scalar: exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.eq.Scalar: exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.gt.Scalar: exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.ge.Scalar: exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.lt.Scalar: exir_ops.edge.aten.lt.Tensor, torch.ops.aten.add.Scalar: torch.ops.aten.add.Tensor, torch.ops.aten.sub.Scalar: torch.ops.aten.sub.Tensor, torch.ops.aten.mul.Scalar: torch.ops.aten.mul.Tensor, @@ -33,6 +36,9 @@ torch.ops.aten.__rshift__.Scalar: torch.ops.aten.bitwise_right_shift.Tensor, torch.ops.aten.__lshift__.Scalar: torch.ops.aten.bitwise_left_shift.Tensor, torch.ops.aten.eq.Scalar: torch.ops.aten.eq.Tensor, + torch.ops.aten.gt.Scalar: torch.ops.aten.gt.Tensor, + torch.ops.aten.ge.Scalar: torch.ops.aten.ge.Tensor, + torch.ops.aten.lt.Scalar: torch.ops.aten.lt.Tensor, } diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index e6a885b43fa..05b101bef7d 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -11,8 +11,6 @@ # JIT compiler flows. 
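
The new Arm passes above (DecomposeSqrtPass, DecomposeSiluPass, DecomposeGeluPass) each rewrite an op into TOSA-friendly primitives, with the GELU pass reusing the pre-calculated constants spelled out in its docstring. A small standalone check of the underlying math, handy when reviewing those constants; this is an illustrative sketch using plain torch, not code from the patch:

```python
import torch

x = torch.randn(4, 8)

# sqrt(x) -> pow(x, 0.5)  (DecomposeSqrtPass)
x_pos = x.abs() + 1e-3
assert torch.allclose(torch.sqrt(x_pos), torch.pow(x_pos, 0.5))

# silu(x) -> x * sigmoid(x)  (DecomposeSiluPass)
assert torch.allclose(torch.nn.functional.silu(x), x * torch.sigmoid(x))

# gelu(x, approximate="none") -> x * 0.5 * (1 + erf(x * (1/sqrt(2))))
SQRT1_2 = 0.70710678118654752440
exact = x * (0.5 * (1.0 + torch.erf(x * SQRT1_2)))
assert torch.allclose(torch.nn.functional.gelu(x), exact, atol=1e-6)

# gelu(x, approximate="tanh"), with sqrt(2/pi) built as sqrt(2) * (2/sqrt(pi)) * 0.5,
# mirroring the FULL_SQRT2 / FULL_2_SQRTPI / FULL_0_5 constants in the pass.
SQRT2 = 1.41421356237309504880
TWO_SQRTPI = 1.12837916709551257390
CUBE_COEFF = 0.044715
sqrt_2_pi = SQRT2 * TWO_SQRTPI * 0.5
inner = (x + CUBE_COEFF * x * x * x) * sqrt_2_pi
tanh_form = x * (1.0 + torch.tanh(inner)) * 0.5
assert torch.allclose(torch.nn.functional.gelu(x, approximate="tanh"), tanh_form, atol=1e-6)
```

The same two formulas (with the `approximate` kwarg defaulting to "none") are what the InsertTableOps special case above evaluates when building the lookup table for the quantized path.
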
# -import logging - from typing import List, Optional from executorch.backends.arm.tosa_specification import TosaSpecification @@ -20,10 +18,6 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec -logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) - - class ArmCompileSpecBuilder: def __init__(self): self.compile_spec: List[CompileSpec] = [] diff --git a/backends/arm/operator_support/convolution_support.py b/backends/arm/operator_support/convolution_support.py index 9e13babe23a..75899eb7425 100644 --- a/backends/arm/operator_support/convolution_support.py +++ b/backends/arm/operator_support/convolution_support.py @@ -22,6 +22,8 @@ class ConvolutionSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/ethos_u55_support.py b/backends/arm/operator_support/ethos_u55_support.py index 69fda636423..7276e8efffe 100644 --- a/backends/arm/operator_support/ethos_u55_support.py +++ b/backends/arm/operator_support/ethos_u55_support.py @@ -134,9 +134,12 @@ class EthosU55NotSupported(OperatorSupportBase): exir_ops.edge.aten.eq.Tensor, exir_ops.edge.aten.eq.Scalar, exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.ge.Scalar, exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.gt.Scalar, exir_ops.edge.aten.le.Tensor, exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.lt.Scalar, exir_ops.edge.aten.flip.default, # REVERSE exir_ops.edge.aten.grid_sampler_2d, # GATHER exir_ops.edge.aten.scatter.src, diff --git a/backends/arm/operator_support/minmax_support.py b/backends/arm/operator_support/minmax_support.py index bdff368a5ce..86b949082eb 100644 --- a/backends/arm/operator_support/minmax_support.py +++ b/backends/arm/operator_support/minmax_support.py @@ -22,6 +22,7 @@ class MinMaxSupported(SupportedTOSAOperatorCheck): # TODO : "MLETORCH-718 : Quantization of indices in arm_quantizer" tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/pool_2d_support.py b/backends/arm/operator_support/pool_2d_support.py index 8291ede8ad9..750fab2730d 100644 --- a/backends/arm/operator_support/pool_2d_support.py +++ b/backends/arm/operator_support/pool_2d_support.py @@ -41,6 +41,8 @@ class AvgPool2dSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): @@ -94,6 +96,8 @@ class MaxPool2dSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/reduce_sum_support.py b/backends/arm/operator_support/reduce_sum_support.py index 
37a71d7264c..a50bcbceab7 100644 --- a/backends/arm/operator_support/reduce_sum_support.py +++ b/backends/arm/operator_support/reduce_sum_support.py @@ -21,6 +21,8 @@ class SumSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/right_shift_support.py b/backends/arm/operator_support/right_shift_support.py index 6c61347ba68..49976b2346f 100644 --- a/backends/arm/operator_support/right_shift_support.py +++ b/backends/arm/operator_support/right_shift_support.py @@ -17,7 +17,6 @@ from executorch.exir.dialects._ops import ops as exir_ops logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) @register_tosa_support_check @@ -30,6 +29,8 @@ class RightShiftSupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): diff --git a/backends/arm/operator_support/slice_copy_support.py b/backends/arm/operator_support/slice_copy_support.py index 1f5ace91cde..ea18c408149 100644 --- a/backends/arm/operator_support/slice_copy_support.py +++ b/backends/arm/operator_support/slice_copy_support.py @@ -16,7 +16,6 @@ from executorch.exir.dialects._ops import ops as exir_ops logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) @register_tosa_support_check @@ -26,6 +25,8 @@ class SliceCopySupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: # type: ignore[override, misc] diff --git a/backends/arm/operator_support/to_copy_support.py b/backends/arm/operator_support/to_copy_support.py index 7926b3dc053..aa0be8cfcd0 100644 --- a/backends/arm/operator_support/to_copy_support.py +++ b/backends/arm/operator_support/to_copy_support.py @@ -30,6 +30,8 @@ class ToCopySupported(SupportedTOSAOperatorCheck): tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] SupportedTypeDict = dict[torch.dtype, list[torch.dtype]] diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 09230e44257..952cfb17cf0 100644 --- a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -66,6 +66,8 @@ def is_node_tosa_supported( _tosa_spec_support: dict[TosaSpecification, list[Type[SupportedTOSAOperatorCheck]]] = { TosaSpecification.create_from_string("TOSA-0.80+BI"): [], TosaSpecification.create_from_string("TOSA-0.80+MI"): [], + TosaSpecification.create_from_string("TOSA-1.0+INT"): [], + TosaSpecification.create_from_string("TOSA-1.0+FP"): [], } @@ -112,6 +114,7 
@@ def tosa_support_factory( # Negative checks: Remove nodes from partitioning negative_checks: list[OperatorSupportBase] = [ CheckInt64Inputs(exported_program, reporter), + CheckFloat64Inputs(exported_program, reporter), *[ reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}") for check in (additional_checks if additional_checks else []) @@ -175,9 +178,12 @@ def is_node_supported( exir_ops.edge.aten.full.default, exir_ops.edge.aten.full_like.default, exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.ge.Scalar, exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.gt.Scalar, exir_ops.edge.aten.le.Tensor, exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.lt.Scalar, exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten.add.Scalar, exir_ops.edge.aten.sub.Scalar, @@ -194,6 +200,7 @@ def is_node_supported( exir_ops.edge.aten.reciprocal.default, exir_ops.edge.aten.relu.default, exir_ops.edge.aten.leaky_relu.default, + exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.rsqrt.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.select_copy.int, @@ -221,6 +228,8 @@ def is_node_supported( exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.__lshift__.Scalar, torch.ops.aten.scalar_tensor.default, + exir_ops.edge.aten.gelu.default, + exir_ops.edge.aten.alias_copy.default, ] return supported @@ -256,6 +265,7 @@ def is_node_supported( exir_ops.edge.aten.var.correction, exir_ops.edge.aten.var.dim, exir_ops.edge.aten.add.Scalar, + exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.sub.Scalar, exir_ops.edge.aten.mul.Scalar, exir_ops.edge.aten.div.Scalar, @@ -356,6 +366,7 @@ def is_node_supported( exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.upsample_nearest2d.vec, + exir_ops.edge.aten.gelu.default, ): return True elif node.target in ( @@ -439,3 +450,26 @@ def is_node_supported( ) return False return True + + +class CheckFloat64Inputs(OperatorSupportBase): + + def __init__( + self, exported_program: ExportedProgram, reporter: WhyNoPartitionReporter + ): + self.reporter = reporter + super().__init__() + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + + for input_node in node.all_input_nodes: + tensor = get_first_fake_tensor(input_node) + if tensor.dtype == torch.float64: + self.reporter.report_reject( + node, + f"Had float64 input {input_node.name} that couldn't be handled.", + ) + return False + return True diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index b62e8940ed2..da050c5994e 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -21,9 +21,7 @@ op_eq, op_erf, op_exp, - op_full, op_ge, - op_get_item, op_gt, op_le, op_log, @@ -52,5 +50,6 @@ op_view, op_where, ops_binary, + ops_identity, ops_unary, ) diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py index f2c7ce9f9ce..5056c5f7f54 100644 --- a/backends/arm/operators/node_visitor.py +++ b/backends/arm/operators/node_visitor.py @@ -5,10 +5,10 @@ # pyre-unsafe -from typing import Dict, List +from typing import Any, Dict, List -import serializer.tosa_serializer as ts # type: ignore import torch + from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification from torch.export import ExportedProgram @@ -24,11 +24,18 @@ class NodeVisitor: # a specific TOSA version. 
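For context (not part of this patch): a minimal sketch, assuming torch >= 2.1 with torch.export available, of what the new CheckFloat64Inputs negative check keys on. Placeholder nodes of an exported graph carry a fake tensor in node.meta["val"]; the check walks a node's inputs (via get_first_fake_tensor) and rejects the node as soon as one of them is float64. The module M below is a placeholder for this sketch.

import torch

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

# Export with a float64 input; the resulting placeholder carries a float64
# fake tensor, which is the dtype the negative check inspects.
ep = torch.export.export(M(), (torch.ones(2, dtype=torch.float64),))
placeholder = next(n for n in ep.graph_module.graph.nodes if n.op == "placeholder")
assert placeholder.meta["val"].dtype == torch.float64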
# When all node_visitors has been refactored to target a specific # version, this list should be removed. - tosa_specs = [ + tosa_specs_1_00 = [ + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), + ] + + tosa_specs_0_80 = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), ] + tosa_specs = tosa_specs_0_80 + tosa_specs_1_00 + def __init__(self, exported_program: ExportedProgram, tosa_spec: TosaSpecification): self._exported_program = exported_program self.tosa_spec = tosa_spec @@ -36,7 +43,7 @@ def __init__(self, exported_program: ExportedProgram, tosa_spec: TosaSpecificati def define_node( self, node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, inputs: List[TosaArg], output: TosaArg, ) -> None: @@ -47,6 +54,8 @@ def define_node( _node_visitor_dicts: Dict[TosaSpecification, Dict] = { TosaSpecification.create_from_string("TOSA-0.80+BI"): {}, TosaSpecification.create_from_string("TOSA-0.80+MI"): {}, + TosaSpecification.create_from_string("TOSA-1.0+INT"): {}, + TosaSpecification.create_from_string("TOSA-1.0+FP"): {}, } diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py index 886a96fd520..648edde04f4 100644 --- a/backends/arm/operators/op_abs.py +++ b/backends/arm/operators/op_abs.py @@ -9,15 +9,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -70,7 +68,7 @@ def define_node( # Do the INT32 Abs tosa_graph.addOperator( - TosaOp.Op().ABS, + ts.TosaOp.Op().ABS, [ rescaled_inputs[0].name, ], @@ -126,7 +124,7 @@ def define_node( # MI lowering tosa_graph.addOperator( - TosaOp.Op().ABS, + ts.TosaOp.Op().ABS, [inputs[0].name], [output.name], None, diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index 1be4a218232..904a2405047 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -10,14 +10,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -82,7 +81,7 @@ def define_node( # Do the INT32 Add tosa_graph.addOperator( - TosaOp.Op().ADD, + ts.TosaOp.Op().ADD, [input1.name, input2.name], [add_output.name], None, @@ -135,7 +134,7 @@ def define_node( # MI lowering tosa_graph.addOperator( - TosaOp.Op().ADD, + ts.TosaOp.Op().ADD, [input1.name, input2.name], [output.name], None, diff --git a/backends/arm/operators/op_amax.py b/backends/arm/operators/op_amax.py index 7347648c454..059f6c1e553 100644 --- a/backends/arm/operators/op_amax.py +++ b/backends/arm/operators/op_amax.py @@ -4,14 +4,13 @@ # 
LICENSE file in the root directory of this source tree. from typing import List -import serializer.tosa_serializer as ts +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -48,5 +47,5 @@ def define_node( attr.AxisAttribute(input.dim_order.index(dim)) tosa_graph.addOperator( - TosaOp.Op().REDUCE_MAX, [input.name], [output.name], attr + ts.TosaOp.Op().REDUCE_MAX, [input.name], [output.name], attr ) diff --git a/backends/arm/operators/op_amin.py b/backends/arm/operators/op_amin.py index 37625cfcc52..85e43b76c4c 100644 --- a/backends/arm/operators/op_amin.py +++ b/backends/arm/operators/op_amin.py @@ -4,14 +4,13 @@ # LICENSE file in the root directory of this source tree. from typing import List -import serializer.tosa_serializer as ts +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -48,5 +47,5 @@ def define_node( attr.AxisAttribute(input.dim_order.index(dim)) tosa_graph.addOperator( - TosaOp.Op().REDUCE_MIN, [input.name], [output.name], attr + ts.TosaOp.Op().REDUCE_MIN, [input.name], [output.name], attr ) diff --git a/backends/arm/operators/op_any.py b/backends/arm/operators/op_any.py index ffb2e8a3c5d..b65ebb2ac5d 100644 --- a/backends/arm/operators/op_any.py +++ b/backends/arm/operators/op_any.py @@ -6,14 +6,13 @@ # pyre-unsafe from typing import cast, List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( # type: ignore NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg # type: ignore -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -49,5 +48,5 @@ def define_node( attr.AxisAttribute(inputs[0].dim_order.index(dim)) tosa_graph.addOperator( - TosaOp.Op().REDUCE_ANY, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().REDUCE_ANY, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py index 772f8353565..bdd3425fda5 100644 --- a/backends/arm/operators/op_avg_pool2d.py +++ b/backends/arm/operators/op_avg_pool2d.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py index af02fc30dd8..6dc0ec8002d 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_bmm.py @@ -7,9 +7,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from 
executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, @@ -20,7 +21,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_quant_utils import build_rescale -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -64,7 +64,7 @@ def define_node( attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp) tosa_graph.addOperator( - TosaOp.Op().MATMUL, + ts.TosaOp.Op().MATMUL, [inputs[0].name, inputs[1].name], [bmm_output_name], attr, diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py index f786395cc39..6b1710301b1 100644 --- a/backends/arm/operators/op_cat.py +++ b/backends/arm/operators/op_cat.py @@ -7,13 +7,12 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -42,5 +41,8 @@ def define_node( attr.AxisAttribute(dim) tosa_graph.addOperator( - TosaOp.Op().CONCAT, [tensor.name for tensor in tensors], [output.name], attr + ts.TosaOp.Op().CONCAT, + [tensor.name for tensor in tensors], + [output.name], + attr, ) diff --git a/backends/arm/operators/op_clamp.py b/backends/arm/operators/op_clamp.py index 7c4ad8682fa..b18ed640b5f 100644 --- a/backends/arm/operators/op_clamp.py +++ b/backends/arm/operators/op_clamp.py @@ -8,9 +8,9 @@ from typing import Any, List, Tuple -import serializer.tosa_serializer as ts # type: ignore - import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -18,7 +18,6 @@ from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -51,7 +50,7 @@ def _create_clamp_node( min_fp32, max_fp32, ) - tosa_graph.addOperator(TosaOp.Op().CLAMP, [input_name], [output_name], attr) + tosa_graph.addOperator(ts.TosaOp.Op().CLAMP, [input_name], [output_name], attr) def _get_min_max_arguments( self, node: Node, dtype_min: int | float, dtype_max: int | float @@ -64,7 +63,8 @@ def cast_type(value: Any) -> int | float: # Attempt to cast to float return float(value) - assert 2 <= len(node.args) <= 3 + if len(node.args) != 2 and len(node.args) != 3: + raise ValueError(f"Expected len(node.args) to be 2 or 3, got {node.args}") min_arg = dtype_min max_arg = dtype_max @@ -85,7 +85,10 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert len(node.all_input_nodes) == 1 + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) min_int8, max_int8 = self._get_min_max_arguments( node, @@ -123,7 +126,10 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert len(node.all_input_nodes) == 1 + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) if inputs[0].dtype == ts.DType.INT8: # Call the inherited define_node for handling integers diff --git a/backends/arm/operators/op_constant_pad_nd.py b/backends/arm/operators/op_constant_pad_nd.py index 73f6d2751c5..b2c31df96ab 
100644 --- a/backends/arm/operators/op_constant_pad_nd.py +++ b/backends/arm/operators/op_constant_pad_nd.py @@ -7,9 +7,10 @@ from typing import List -import serializer.tosa_serializer as ts import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, ) @@ -18,7 +19,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -71,4 +71,6 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.PadAttribute(tosa_graph.builder, output_pad, pad_const_qs, pad_const_fp) - tosa_graph.addOperator(TosaOp.Op().PAD, [inputs[0].name], [output.name], attr) + tosa_graph.addOperator( + ts.TosaOp.Op().PAD, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 2fe00b6758f..90475af1476 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py index 02fc89099e0..7f87fb5a81d 100644 --- a/backends/arm/operators/op_eq.py +++ b/backends/arm/operators/op_eq.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "EQ must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator EQ but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -51,7 +52,7 @@ def define_node( # Do the equal comparison tosa_graph.addOperator( - TosaOp.Op().EQUAL, + ts.TosaOp.Op().EQUAL, [input_nodes[0].name, input_nodes[1].name], output.name, None, diff --git a/backends/arm/operators/op_erf.py b/backends/arm/operators/op_erf.py index d0dc2af572f..01243716129 100644 --- a/backends/arm/operators/op_erf.py +++ b/backends/arm/operators/op_erf.py @@ -5,15 +5,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch.fx +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp @register_node_visitor diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py index 4b8232ef6e7..ca067b3b8be 100644 --- a/backends/arm/operators/op_exp.py +++ 
b/backends/arm/operators/op_exp.py @@ -6,15 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -36,7 +34,14 @@ def define_node( output: TosaArg, ) -> None: - assert len(node.all_input_nodes) == 1 - assert inputs[0].dtype == output.dtype == ts.DType.FP32 - - tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name]) + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) + if inputs[0].dtype != ts.DType.FP32 or output.dtype != ts.DType.FP32: + raise ValueError( + f"Input and output for {self.target} need to be FP32, got input dtype: " + f"{inputs[0].dtype} and output dtype: {output.dtype}" + ) + + tosa_graph.addOperator(ts.TosaOp.Op().EXP, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_full.py b/backends/arm/operators/op_full.py deleted file mode 100644 index f06b9873e63..00000000000 --- a/backends/arm/operators/op_full.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe -from typing import List - -import numpy as np - -import serializer.tosa_serializer as ts # type: ignore -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import tosa_shape -from torch.fx import Node - - -@register_node_visitor -class FullVisitor(NodeVisitor): - target = "aten.full.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - - shape = tosa_shape(inputs[0].special, output.dim_order) - - value = inputs[1].number - - if output.dtype == ts.DType.INT8: - fill_dtype = np.int8 - else: - fill_dtype = np.float32 # type: ignore[assignment] - data = np.full(shape, value, dtype=fill_dtype) - - tosa_graph.addConst(shape, output.dtype, data, node.name + "full-const") - tosa_graph.addOperator( - ts.TosaOp.Op.IDENTITY, [node.name + "full-const"], [output.name] - ) diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py index e4de12f3327..b2193a2e7ed 100644 --- a/backends/arm/operators/op_ge.py +++ b/backends/arm/operators/op_ge.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "GE must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise 
TypeError( + "All inputs need to have the same data type for operator GE but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER_EQUAL, + ts.TosaOp.Op().GREATER_EQUAL, [input_nodes[0].name, input_nodes[1].name], [output.name], None, diff --git a/backends/arm/operators/op_get_item.py b/backends/arm/operators/op_get_item.py deleted file mode 100644 index 577a8c8d2ea..00000000000 --- a/backends/arm/operators/op_get_item.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2023-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe -from typing import List - -import serializer.tosa_serializer as ts # type: ignore -import torch -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class GetItemVisitor(NodeVisitor): - target = "getitem" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - item_name = inputs[0].name - ## Simply add an identityOp - tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name]) diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py index 65cf8197bdc..06f29e4505c 100644 --- a/backends/arm/operators/op_gt.py +++ b/backends/arm/operators/op_gt.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "GT must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator GT but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER, + ts.TosaOp.Op().GREATER, [input_nodes[0].name, input_nodes[1].name], [output.name], None, diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py index 8fea2b92088..fadf4848359 100644 --- a/backends/arm/operators/op_le.py +++ b/backends/arm/operators/op_le.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "LE 
must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator LE but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER_EQUAL, + ts.TosaOp.Op().GREATER_EQUAL, [input_nodes[1].name, input_nodes[0].name], [output.name], None, diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py index d8a136e37f8..34911075065 100644 --- a/backends/arm/operators/op_log.py +++ b/backends/arm/operators/op_log.py @@ -6,15 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -38,4 +36,4 @@ def define_node( assert len(node.all_input_nodes) == 1 assert inputs[0].dtype == output.dtype == ts.DType.FP32 - tosa_graph.addOperator(TosaOp.Op().LOG, [inputs[0].name], [output.name]) + tosa_graph.addOperator(ts.TosaOp.Op().LOG, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py index da93ab41799..a261cd2db9f 100644 --- a/backends/arm/operators/op_lt.py +++ b/backends/arm/operators/op_lt.py @@ -9,13 +9,12 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -34,9 +33,11 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert ( - inputs[0].dtype == inputs[1].dtype - ), "LT must have the same dtypes as input" + if inputs[0].dtype != inputs[1].dtype: + raise TypeError( + "All inputs need to have the same data type for operator LT but got " + f"{inputs[0].dtype=}, {inputs[1].dtype=}" + ) input_nodes = inputs # Handle quantization @@ -50,7 +51,7 @@ def define_node( input_nodes = rescaled_inputs tosa_graph.addOperator( - TosaOp.Op().GREATER, + ts.TosaOp.Op().GREATER, [input_nodes[1].name, input_nodes[0].name], [output.name], None, diff --git a/backends/arm/operators/op_max_pool2d.py b/backends/arm/operators/op_max_pool2d.py index 9dd627a3e4f..fcf2636977d 100644 --- a/backends/arm/operators/op_max_pool2d.py +++ b/backends/arm/operators/op_max_pool2d.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, get_output_qparams, @@ -18,7 +19,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -75,7 +75,7 @@ def define_node( ) tosa_graph.addOperator( - TosaOp.Op().MAX_POOL2D, + ts.TosaOp.Op().MAX_POOL2D, [input_tensor.name], [output.name], attr, diff 
--git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py index 4eb7e47fac8..ee52e5276cd 100644 --- a/backends/arm/operators/op_maximum.py +++ b/backends/arm/operators/op_maximum.py @@ -8,7 +8,7 @@ from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, @@ -19,8 +19,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -38,20 +36,27 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == inputs[1].dtype + if inputs[0].dtype != inputs[1].dtype and inputs[0].dtype != output.dtype: + raise TypeError( + f"Data type of inputs and output must be the same. Got input 0 dtype: " + f"{inputs[0].dtype}, input 1 dtype: {inputs[1].dtype} and output " + f"dtype: {output.dtype}" + ) scale_back = 1.0 max_output = output if inputs[0].dtype == ts.DType.INT8: input_qparams = get_input_qparams(node) - assert ( - len(input_qparams) == 2 - ), f"Both inputs needs to have quantization information for {node}" - # insert RESCALEs to int32 - assert ( - input_qparams[0] == input_qparams[1] - ), "Both inputs must have same quantization for MAX" + if len(input_qparams) != 2: + raise ValueError( + f"Both inputs need to have quantization information for {node}" + ) + if input_qparams[0] != input_qparams[1]: + raise ValueError( + "Both inputs must have the same quantization parameters for MAX" + ) + # insert RESCALEs to int32 operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( tosa_graph, inputs, node ) @@ -62,7 +67,7 @@ def define_node( operand_inputs = inputs tosa_graph.addOperator( - TosaOp.Op().MAXIMUM, + ts.TosaOp.Op().MAXIMUM, [ operand_inputs[0].name, operand_inputs[1].name, diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py index 1b8c1960411..88cb8d376fe 100644 --- a/backends/arm/operators/op_minimum.py +++ b/backends/arm/operators/op_minimum.py @@ -9,7 +9,7 @@ import executorch.backends.arm.tosa_quant_utils as tqutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, @@ -20,8 +20,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -39,20 +37,27 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == inputs[1].dtype + if inputs[0].dtype != inputs[1].dtype and inputs[0].dtype != output.dtype: + raise TypeError( + f"Data type of inputs and output must be the same. 
Got input 0 dtype: " + f"{inputs[0].dtype}, input 1 dtype: {inputs[1].dtype} and output " + f"dtype: {output.dtype}" + ) scale_back = 1.0 min_output = output if inputs[0].dtype == ts.DType.INT8: input_qparams = get_input_qparams(node) - assert ( - len(input_qparams) == 2 - ), f"Both inputs needs to have quantization information for {node}" - # insert RESCALEs to int32 - assert ( - input_qparams[0] == input_qparams[1] - ), "Both inputs must have same quantization for MIN" + if len(input_qparams) != 2: + raise ValueError( + f"Both inputs need to have quantization information for {node}" + ) + if input_qparams[0] != input_qparams[1]: + raise ValueError( + "Both inputs must have the same quantization parameters for MIN" + ) + # insert RESCALEs to int32 operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( tosa_graph, inputs, node ) @@ -63,7 +68,7 @@ def define_node( operand_inputs = inputs tosa_graph.addOperator( - TosaOp.Op().MINIMUM, + ts.TosaOp.Op().MINIMUM, [ operand_inputs[0].name, operand_inputs[1].name, diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py index 2f6c7e7130c..dcceb36b0ab 100644 --- a/backends/arm/operators/op_mul.py +++ b/backends/arm/operators/op_mul.py @@ -9,10 +9,10 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils - -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, ) @@ -24,7 +24,6 @@ from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.arm.tosa_utils import reshape_for_broadcast -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -87,7 +86,7 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.MulAttribute(shift=0) tosa_graph.addOperator( - TosaOp.Op().MUL, + ts.TosaOp.Op().MUL, [input1.name, input2.name], [mul_output.name], attr, @@ -119,5 +118,5 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.MulAttribute(shift=0) tosa_graph.addOperator( - TosaOp.Op().MUL, [input1.name, input2.name], [output.name], attr + ts.TosaOp.Op().MUL, [input1.name, input2.name], [output.name], attr ) diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index e659918baf2..c92a008a281 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -7,14 +7,14 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp def permutation_vector_to_matrix(permutation_vector: list[int]) -> torch.Tensor: @@ -117,5 +117,5 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.TransposeAttribute(permutation_vector) tosa_graph.addOperator( - TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_pow.py b/backends/arm/operators/op_pow.py index 0f251a8aa6d..d3b92feff12 100644 --- a/backends/arm/operators/op_pow.py +++ b/backends/arm/operators/op_pow.py @@ -7,14 +7,13 @@ from 
typing import List -import serializer.tosa_serializer as ts +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -47,7 +46,7 @@ def define_node( ) tosa_graph.addOperator( - TosaOp.Op().POW, + ts.TosaOp.Op().POW, [ inputs[0].name, inputs[1].name, diff --git a/backends/arm/operators/op_reciprocal.py b/backends/arm/operators/op_reciprocal.py index 5410e1dd99a..c75fb99977e 100644 --- a/backends/arm/operators/op_reciprocal.py +++ b/backends/arm/operators/op_reciprocal.py @@ -6,15 +6,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -34,5 +34,16 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == output.dtype == ts.DType.FP32 - tosa_graph.addOperator(TosaOp.Op().RECIPROCAL, [inputs[0].name], [output.name]) + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) + if inputs[0].dtype != ts.DType.FP32 or output.dtype != ts.DType.FP32: + raise ValueError( + f"Input and output for {self.target} need to be FP32, got " + f"{inputs[0].dtype=} and {output.dtype=}" + ) + + tosa_graph.addOperator( + ts.TosaOp.Op().RECIPROCAL, [inputs[0].name], [output.name] + ) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index b97d7023ef0..142ccb1d25a 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -5,15 +5,14 @@ # pyre-unsafe -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -35,4 +34,6 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.TileAttribute(tosa_shape(multiples, output.dim_order)) - tosa_graph.addOperator(TosaOp.Op().TILE, [inputs[0].name], [output.name], attr) + tosa_graph.addOperator( + ts.TosaOp.Op().TILE, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_rescale.py index 098fbeccce1..c59015dcc14 100644 --- a/backends/arm/operators/op_rescale.py +++ b/backends/arm/operators/op_rescale.py @@ -8,10 +8,10 @@ from typing import cast, List import executorch.backends.arm.tosa_quant_utils as tosa_quant_utils -import serializer.tosa_serializer as ts # type: ignore import torch +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore -import tosa.Op as TosaOp # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, 
register_node_visitor, diff --git a/backends/arm/operators/op_rshift_tensor.py b/backends/arm/operators/op_rshift_tensor.py index 8ea0343faaa..125f5493a29 100644 --- a/backends/arm/operators/op_rshift_tensor.py +++ b/backends/arm/operators/op_rshift_tensor.py @@ -7,15 +7,15 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import Tosa_0_80 -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -39,7 +39,7 @@ def define_node( attr.ArithmeticRightShiftAttribute(round=round) tosa_graph.addOperator( - TosaOp.Op().ARITHMETIC_RIGHT_SHIFT, + ts.TosaOp.Op().ARITHMETIC_RIGHT_SHIFT, [inputs[0].name, inputs[1].name], [output.name], attr, diff --git a/backends/arm/operators/op_rsqrt.py b/backends/arm/operators/op_rsqrt.py index 0fbb203b081..e3937f8c44a 100644 --- a/backends/arm/operators/op_rsqrt.py +++ b/backends/arm/operators/op_rsqrt.py @@ -6,15 +6,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -34,5 +34,14 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert inputs[0].dtype == output.dtype == ts.DType.FP32 - tosa_graph.addOperator(TosaOp.Op().RSQRT, [inputs[0].name], [output.name]) + if len(node.all_input_nodes) != 1: + raise ValueError( + f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}" + ) + if inputs[0].dtype != ts.DType.FP32 or output.dtype != ts.DType.FP32: + raise ValueError( + f"Input and output for {self.target} need to be FP32, got " + f"{inputs[0].dtype=} and {output.dtype=}" + ) + + tosa_graph.addOperator(ts.TosaOp.Op().RSQRT, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_sigmoid.py b/backends/arm/operators/op_sigmoid.py index abf60bf747f..9a002036fee 100644 --- a/backends/arm/operators/op_sigmoid.py +++ b/backends/arm/operators/op_sigmoid.py @@ -6,15 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification - -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -46,4 +44,4 @@ def define_node( f"{inputs[0].dtype} and output_dtype: {output.dtype}" ) - tosa_graph.addOperator(TosaOp.Op().SIGMOID, [inputs[0].name], [output.name]) + tosa_graph.addOperator(ts.TosaOp.Op().SIGMOID, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py index a3ce80c5b24..27ae977a5bc 100644 --- a/backends/arm/operators/op_slice.py +++ b/backends/arm/operators/op_slice.py @@ -7,13 +7,12 @@ from typing import List -import serializer.tosa_serializer as ts # 
type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -24,6 +23,18 @@ class SliceVisitor(NodeVisitor): def __init__(self, *args): super().__init__(*args) + def _fixup_start(self, start, shape, dim): + if start.number < 0: + return start.number % shape[dim] + else: + return start.number + + def _fixup_end(self, end, shape, dim): + if end.number < 0: + return end.number % shape[dim] + else: + return min(end.number, shape[dim]) + def define_node( self, node: Node, @@ -43,20 +54,24 @@ def define_node( # Translate and check parameters in Pytorch dim order. shape = input_node.shape dim = dim.number - if end.number < 0: - end_index = end.number % shape[dim] - else: - end_index = min(end.number, shape[dim]) - size = end_index - start.number + + start_index = self._fixup_start(start, shape, dim) + end_index = self._fixup_end(end, shape, dim) + size = end_index - start_index + assert size > 0 assert size <= shape[dim] # Convert aten args to Tosa's start and size attributes and in TOSA dim order. attr = ts.TosaSerializerAttribute() - start_attr = [start.number if i == dim else 0 for i in input_node.dim_order] + + start_attr = [ + self._fixup_start(start, shape, dim) if i == dim else 0 + for i in input_node.dim_order + ] size_attr = [size if i == dim else shape[i] for i in input_node.dim_order] attr.SliceAttribute(start_attr, size_attr) tosa_graph.addOperator( - TosaOp.Op().SLICE, [input_node.name], [output.name], attr + ts.TosaOp.Op().SLICE, [input_node.name], [output.name], attr ) diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 6cd422095ab..ef9ed31c88d 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -10,14 +10,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -41,9 +40,19 @@ def define_node( ) -> None: # Specification (0.80) states that input and output types # should all be the same - assert inputs[0].dtype == inputs[1].dtype == output.dtype + if inputs[0].dtype != inputs[1].dtype or inputs[0].dtype != output.dtype: + raise TypeError( + f"All IO needs to have the same data type, got input 1: " + f"{inputs[0].dtype}, input 2: {inputs[1].dtype} and output: " + f"{output.dtype}" + ) + # Handle int8 (quantized) and int32 - assert inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32] + supported_dtypes = [ts.DType.INT8, ts.DType.INT32] + if inputs[0].dtype not in supported_dtypes: + raise TypeError( + f'IO data type needs to be {supported_dtypes}, got "{inputs[0].dtype}"' + ) if inputs[0].dtype == ts.DType.INT8: rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( @@ -63,7 +72,7 @@ def define_node( # Do the INT32 Sub tosa_graph.addOperator( - TosaOp.Op().SUB, + ts.TosaOp.Op().SUB, [ rescaled_inputs[0].name, rescaled_inputs[1].name, @@ -98,19 +107,31 @@ def 
define_node( ) -> None: # Specification (0.80) states that input and output types # should all be the same - assert inputs[0].dtype == inputs[1].dtype == output.dtype + if inputs[0].dtype != inputs[1].dtype or inputs[0].dtype != output.dtype: + raise TypeError( + f"All IO needs to have the same data type, got input 1: " + f"{inputs[0].dtype}, input 2: {inputs[1].dtype} and output: " + f"{output.dtype}" + ) if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]: # Call the inherited define_node for handling integers super().define_node(node, tosa_graph, inputs, output) else: # FP32 Sub lowering - assert inputs[0].dtype == ts.DType.FP32 - assert output.dtype == ts.DType.FP32 + if ( + inputs[0].dtype != ts.DType.FP32 + or inputs[1].dtype != ts.DType.FP32 + or output.dtype != ts.DType.FP32 + ): + raise TypeError( + f"All IO needs to have data type fp32. Got: {inputs[0].dtype}, " + f"input 2: {inputs[1].dtype} and output: {output.dtype}" + ) # MI lowering tosa_graph.addOperator( - TosaOp.Op().SUB, + ts.TosaOp.Op().SUB, [inputs[0].name, inputs[1].name], [output.name], None, diff --git a/backends/arm/operators/op_sum.py b/backends/arm/operators/op_sum.py index b5b388b3352..135566e48ac 100644 --- a/backends/arm/operators/op_sum.py +++ b/backends/arm/operators/op_sum.py @@ -10,14 +10,13 @@ import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -69,7 +68,7 @@ def define_node( ) tosa_graph.addOperator( - TosaOp.Op().REDUCE_SUM, [prev_node.name], [next_node.name], attr + ts.TosaOp.Op().REDUCE_SUM, [prev_node.name], [next_node.name], attr ) prev_node = next_node @@ -120,7 +119,7 @@ def define_node( ).name tosa_graph.addOperator( - TosaOp.Op().REDUCE_SUM, [input_name], [output_name], attr + ts.TosaOp.Op().REDUCE_SUM, [input_name], [output_name], attr ) input_name = output_name diff --git a/backends/arm/operators/op_table.py b/backends/arm/operators/op_table.py index 40214b265f0..6a2053bea0d 100644 --- a/backends/arm/operators/op_table.py +++ b/backends/arm/operators/op_table.py @@ -8,15 +8,14 @@ from typing import List import numpy as np - -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -49,5 +48,5 @@ def define_node( table_attr.TableAttribute(np.array(table)) tosa_graph.addOperator( - TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ts.TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr ) diff --git a/backends/arm/operators/op_tanh.py b/backends/arm/operators/op_tanh.py index 89dd15c97d6..51cf1ee786b 100644 --- a/backends/arm/operators/op_tanh.py +++ b/backends/arm/operators/op_tanh.py @@ -6,14 +6,13 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from 
executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node @@ -44,4 +43,4 @@ def define_node( f"{inputs[0].dtype} and output_dtype: {output.dtype}" ) - tosa_graph.addOperator(TosaOp.Op().TANH, [inputs[0].name], [output.name]) + tosa_graph.addOperator(ts.TosaOp.Op().TANH, [inputs[0].name], [output.name]) diff --git a/backends/arm/operators/op_to_copy.py b/backends/arm/operators/op_to_copy.py index feaec3a41e9..90485b71d50 100644 --- a/backends/arm/operators/op_to_copy.py +++ b/backends/arm/operators/op_to_copy.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch -import tosa.Op as TosaOp # type: ignore + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, diff --git a/backends/arm/operators/op_to_dim_order_copy.py b/backends/arm/operators/op_to_dim_order_copy.py index 397979a439d..f144beba29f 100644 --- a/backends/arm/operators/op_to_dim_order_copy.py +++ b/backends/arm/operators/op_to_dim_order_copy.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch -import tosa.Op as TosaOp # type: ignore + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_transpose.py index 54a79297dd6..b909aef2ac9 100644 --- a/backends/arm/operators/op_transpose.py +++ b/backends/arm/operators/op_transpose.py @@ -7,14 +7,14 @@ from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp @register_node_visitor @@ -39,5 +39,5 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.TransposeAttribute(perms) tosa_graph.addOperator( - TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().TRANSPOSE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_upsample_nearest2d.py b/backends/arm/operators/op_upsample_nearest2d.py index 38e4087d38d..23d24b78339 100644 --- a/backends/arm/operators/op_upsample_nearest2d.py +++ b/backends/arm/operators/op_upsample_nearest2d.py @@ -6,17 +6,17 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import get_resize_parameters, tosa_shape -from serializer.tosa_serializer import TosaOp -from tosa.ResizeMode import ResizeMode # type: ignore +from tosa_tools.v0_80.tosa.ResizeMode import ResizeMode # type: ignore @register_node_visitor @@ -65,5 +65,5 @@ 
def in_int16_range(x): ) tosa_graph.addOperator( - TosaOp.Op().RESIZE, [inputs[0].name], [output.name], attr + ts.TosaOp.Op().RESIZE, [inputs[0].name], [output.name], attr ) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 119e32fa58f..e063b8e39ec 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -6,9 +6,10 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch -import tosa.Op as TosaOp # type: ignore + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, diff --git a/backends/arm/operators/op_where.py b/backends/arm/operators/op_where.py index c8b35e831d4..ba2469e74e1 100644 --- a/backends/arm/operators/op_where.py +++ b/backends/arm/operators/op_where.py @@ -5,7 +5,8 @@ from typing import List, Sequence -import serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore +import tosa_tools.v0_80.tosa.Op as TosaOp # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -13,7 +14,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification -from serializer.tosa_serializer import TosaOp from torch.fx import Node diff --git a/backends/arm/operators/ops_binary.py b/backends/arm/operators/ops_binary.py index 307710e38e9..a17da41f767 100644 --- a/backends/arm/operators/ops_binary.py +++ b/backends/arm/operators/ops_binary.py @@ -7,16 +7,16 @@ from typing import List -import serializer.tosa_serializer as ts import torch import torch.fx +import tosa_tools.v0_80.serializer.tosa_serializer as ts + from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp def binary_operator_factory(bw_target: str, tosa_op): @@ -46,12 +46,12 @@ def define_node( register_node_visitor(BinaryOperator) -binary_operator_factory("aten.bitwise_and.Tensor", TosaOp.Op().BITWISE_AND) -binary_operator_factory("aten.bitwise_xor.Tensor", TosaOp.Op().BITWISE_XOR) -binary_operator_factory("aten.bitwise_or.Tensor", TosaOp.Op().BITWISE_OR) -binary_operator_factory("aten.logical_and.default", TosaOp.Op().LOGICAL_AND) -binary_operator_factory("aten.logical_xor.default", TosaOp.Op().LOGICAL_XOR) -binary_operator_factory("aten.logical_or.default", TosaOp.Op().LOGICAL_OR) +binary_operator_factory("aten.bitwise_and.Tensor", ts.TosaOp.Op().BITWISE_AND) +binary_operator_factory("aten.bitwise_xor.Tensor", ts.TosaOp.Op().BITWISE_XOR) +binary_operator_factory("aten.bitwise_or.Tensor", ts.TosaOp.Op().BITWISE_OR) +binary_operator_factory("aten.logical_and.default", ts.TosaOp.Op().LOGICAL_AND) +binary_operator_factory("aten.logical_xor.default", ts.TosaOp.Op().LOGICAL_XOR) +binary_operator_factory("aten.logical_or.default", ts.TosaOp.Op().LOGICAL_OR) binary_operator_factory( - "aten.bitwise_left_shift.Tensor", TosaOp.Op().LOGICAL_LEFT_SHIFT + "aten.bitwise_left_shift.Tensor", ts.TosaOp.Op().LOGICAL_LEFT_SHIFT ) diff --git a/backends/arm/operators/ops_identity.py b/backends/arm/operators/ops_identity.py new file mode 100644 index 00000000000..0c6527cf336 --- /dev/null +++ b/backends/arm/operators/ops_identity.py @@ -0,0 +1,47 @@ +# Copyright 2025 Arm 
Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import List + +import torch +import torch.fx + +import tosa_tools.v0_80.serializer.tosa_serializer as ts + +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + + +def identity_operator_factory(identity_target: str): + """ + Creates and registers NodeVisitors for operators that map directly + to a TOSA IDENTITY op. + """ + + class IdentityOperatorVisitor(NodeVisitor): + target = identity_target + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + # Simply add an identityOp + tosa_graph.addOperator( + ts.TosaOp.Op().IDENTITY, [inputs[0].name], [output.name] + ) + + register_node_visitor(IdentityOperatorVisitor) + + +identity_operator_factory("getitem") +identity_operator_factory("aten.alias_copy.default") diff --git a/backends/arm/operators/ops_unary.py b/backends/arm/operators/ops_unary.py index 0a7d45ffe98..3f713e086e6 100644 --- a/backends/arm/operators/ops_unary.py +++ b/backends/arm/operators/ops_unary.py @@ -6,15 +6,15 @@ # pyre-unsafe from typing import List -import serializer.tosa_serializer as ts # type: ignore import torch.fx + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp def unary_operator_factory(unary_target: str, tosa_op): @@ -53,6 +53,6 @@ def define_node( register_node_visitor(UnaryOperator) -unary_operator_factory("aten.ceil.default", TosaOp.Op().CEIL) -unary_operator_factory("aten.floor.default", TosaOp.Op().FLOOR) -unary_operator_factory("aten.logical_not.default", TosaOp.Op().LOGICAL_NOT) +unary_operator_factory("aten.ceil.default", ts.TosaOp.Op().CEIL) +unary_operator_factory("aten.floor.default", ts.TosaOp.Op().FLOOR) +unary_operator_factory("aten.logical_not.default", ts.TosaOp.Op().LOGICAL_NOT) diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index f9b77e28493..6692b75c892 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -5,22 +5,33 @@ # # pyre-unsafe -from typing import cast, Dict +from typing import Any, cast, Dict import numpy as np -import serializer.tosa_serializer as ts # type: ignore import torch import torch.fx from executorch.backends.arm.operators.node_visitor import NodeVisitor from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.backends.arm.tosa_specification import ( + Tosa_0_80, + Tosa_1_00, + TosaSpecification, +) from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape +from torch._export.utils import ( + get_buffer, + get_lifted_tensor_constant, + get_param, + is_buffer, + is_lifted_tensor_constant, + is_param, +) from torch.export.exported_program import ExportedProgram def process_call_function( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, node_visitors: Dict[str, NodeVisitor], tosa_spec: TosaSpecification, ): @@ -55,7 +66,7 @@ def process_call_function( def process_inputs( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: 
Any, tosa_spec: TosaSpecification, ): """Serialize an input node""" @@ -73,6 +84,14 @@ def process_inputs( f"Failed processing input placeholder: {node.name}. " "Is the original torch function supported?" ) from e + + if isinstance(tosa_spec, Tosa_0_80): + import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore + elif isinstance(tosa_spec, Tosa_1_00): + import serializer.tosa_serializer as ts + else: + raise ValueError(f"Unsupported TOSA spec: {tosa_spec}") + input_shape = tosa_arg.shape input_dim_order = tosa_arg.dim_order tensor = ts.TosaSerializerTensor( @@ -87,7 +106,7 @@ def process_inputs( def process_inputs_to_parameters( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, tosa_spec: TosaSpecification, ): @@ -99,8 +118,7 @@ def process_inputs_to_parameters( f"Failed processing parameter placeholder: {node.name}. " "Is the original torch function supported?" ) from e - parameter_name = edge_program.graph_signature.inputs_to_parameters[tosa_arg.name] - parameter_data = edge_program.state_dict[parameter_name] + parameter_data = get_param(edge_program, node) assert isinstance(parameter_data, torch.Tensor), "Expect Attr to be tensor" parameter_values = parameter_data.detach().numpy() @@ -117,7 +135,7 @@ def process_inputs_to_parameters( def process_inputs_to_buffers( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, ): """Serialize quantized weights""" @@ -128,8 +146,7 @@ def process_inputs_to_buffers( f"Failed processing buffer placeholder: {node.name}. " "Is the original torch function supported?" ) from e - buffer_name = edge_program.graph_signature.inputs_to_buffers[node.name] - buffer_data = edge_program.state_dict[buffer_name] + buffer_data = get_buffer(edge_program, node) assert isinstance(buffer_data, torch.Tensor), "Expect Attr to be tensor" buffer_values = buffer_data.detach().numpy() @@ -146,7 +163,7 @@ def process_inputs_to_buffers( def process_inputs_to_lifted_tensor_constants( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, ): try: @@ -156,11 +173,8 @@ def process_inputs_to_lifted_tensor_constants( f"Failed processing lifted tensor constant placeholder: {node.name}. " "Is the original torch function supported?" 
) from e - tensor_name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[ - tosa_arg.name - ] - tensor = edge_program.tensor_constants[tensor_name] - tensor_data = tensor.detach().numpy() + tensor = get_lifted_tensor_constant(edge_program, node) + tensor_data = tensor.detach().numpy() # type: ignore[union-attr] tosa_graph.addConst( tensor_data.shape, tosa_arg.dtype, tensor_data, name=tosa_arg.name @@ -169,7 +183,7 @@ def process_inputs_to_lifted_tensor_constants( def process_placeholder( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, edge_program: ExportedProgram, tosa_spec: TosaSpecification, ): @@ -179,11 +193,11 @@ def process_placeholder( if node.name in edge_program.graph_signature.user_inputs: process_inputs(node, tosa_graph, tosa_spec) - elif node.name in edge_program.graph_signature.inputs_to_parameters: + elif is_param(edge_program, node): process_inputs_to_parameters(node, tosa_graph, edge_program, tosa_spec) - elif node.name in edge_program.graph_signature.inputs_to_buffers: + elif is_buffer(edge_program, node): process_inputs_to_buffers(node, tosa_graph, edge_program) - elif node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants: + elif is_lifted_tensor_constant(edge_program, node): process_inputs_to_lifted_tensor_constants(node, tosa_graph, edge_program) elif node.name in edge_program.graph_signature.inputs_to_lifted_custom_objs: raise NotImplementedError( @@ -195,7 +209,7 @@ def process_placeholder( def process_output( node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, + tosa_graph: Any, ): for output in cast(tuple[torch.fx.Node, ...], node.args[0]): tosa_graph.addOutputTensor( diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index e76ed5fb415..ee08f8e9eec 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -286,10 +286,10 @@ def _annotate_all_static_patterns( quantization_config: Optional[QuantizationConfig], filter_fn: Optional[Callable[[Node], bool]] = None, ) -> GraphModule: - """Loops over all STATIC_OPS and runs the corresponding registred annotator. + """Loops over all STATIC_OPS and runs the corresponding registered annotator. Args: model: The model to annotate statically. - quantization_config: Specifices the QuantizationSpecs for the model's + quantization_config: Specifies the QuantizationSpecs for the model's input activations, output activations, weights and biases. filter_fn: An optional filter function that takes a node and returns whether the node should be annotated. 
Returns: diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index b0f9e90b10f..5398101fd9a 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -178,6 +178,7 @@ def _match_pattern( torch.ops.aten.hardswish_.default, torch.ops.aten.full_like.default, torch.ops.aten.pow.Tensor_Scalar, + torch.ops.aten.gelu.default, ] _one_to_one_shared_input_qspec = [ @@ -243,6 +244,11 @@ def _match_pattern( operator.getitem, ] +_one_to_one_shared_input_or_input_act_qspec = [ + torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.alias_copy.default, +] + def get_quant_properties( # noqa: C901 node: Node, gm: torch.fx.GraphModule, quantization_config @@ -331,7 +337,7 @@ def any_or_hardtanh_min_zero(n: Node): _QuantProperty(2, shared_qspec), # type: ignore[arg-type] ] quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type] - elif node.target == torch.ops.aten.adaptive_avg_pool2d.default: + elif node.target in _one_to_one_shared_input_or_input_act_qspec: input_qspec = ( SharedQuantizationSpec(node.args[0]) # type: ignore[arg-type] if arm_quantizer_utils.is_output_annotated(node.args[0]) # type: ignore diff --git a/backends/arm/scripts/install_reference_model.sh b/backends/arm/scripts/install_reference_model.sh new file mode 100755 index 00000000000..0141b195a0d --- /dev/null +++ b/backends/arm/scripts/install_reference_model.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +# Installation script to manage the transition to TOSA 1.0 + +# TOSA reference model +tosa_reference_model_url="https://git.gitlab.arm.com/tosa/tosa-reference-model.git" +tosa_reference_model_0_80_branch="v0.80" +tosa_reference_model_0_80_rev="70ed0b40fa831387e36abdb4f7fb9670a3464f5a" +tosa_serialization_lib_0_80_rev="v0.80.1" +tosa_reference_model_1_0_rev="f9b4ceb850964be03a39e965ad7a0546dc6c57ae" + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +source ${script_dir}/utils.sh + + +function setup_tosa_reference_model() { + local work_dir="$1" + + if [[ -z "$work_dir" ]]; then + echo "Error: work_dir parameter is required." + return 1 + fi + + mkdir -p "$work_dir" + pushd "$work_dir" || exit 1 + + # Install a patched version of the TOSA reference model v0.80.1 to make it co-exist with 1.0 during the transition period + if [[ ! -d "reference_model" ]]; then + git clone --recurse-submodules --branch ${tosa_reference_model_0_80_branch} "$tosa_reference_model_url" reference_model + fi + + patches_dir=${script_dir}/../third-party/reference_model/patches/v0.80 + patch_repo reference_model ${tosa_reference_model_0_80_rev} ${patches_dir} + patch_repo reference_model/thirdparty/serialization_lib ${tosa_serialization_lib_0_80_rev} ${patches_dir} + + pushd reference_model + rm -rf build + # reference_model's flatbuffers version clashes with Vela's. + # Go with Vela's since it is newer. + # Vela's flatbuffers requirement is expected to loosen; remove this workaround then. MLETORCH-565 + CMAKE_POLICY_VERSION_MINIMUM=3.5 pip install . 
--no-dependencies flatbuffers + popd + + # Install the 1.0 branch from upstream + CMAKE_POLICY_VERSION_MINIMUM=3.5 BUILD_PYBIND=1 pip install "tosa-tools@git+${tosa_reference_model_url}@${tosa_reference_model_1_0_rev}" ml_dtypes==0.5.1 --no-dependencies flatbuffers +} + +setup_tosa_reference_model $1 diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py new file mode 100644 index 00000000000..8aabf7c2c59 --- /dev/null +++ b/backends/arm/scripts/parse_test_names.py @@ -0,0 +1,102 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from executorch.exir.dialects.edge.spec.utils import SAMPLE_INPUT + +# Add edge ops which we lower but which are not included in exir/dialects/edge/edge.yaml here. +CUSTOM_EDGE_OPS = ["linspace.default", "eye.default"] +ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS + +# Add all targets and TOSA profiles we support here. +TARGETS = {"tosa_BI", "tosa_MI", "u55_BI", "u85_BI"} + + +def get_edge_ops(): + """ + Returns a set of edge_ops with names of the form used in unittests: + 1. Names are in lowercase. + 2. Overload is ignored if it is 'default', otherwise it is appended with an underscore. + 3. Overly verbose names are shortened by removing certain prefixes/suffixes. + + Examples: + abs.default -> abs + split_copy.Tensor -> split_tensor + """ + edge_ops = set() + for edge_name in ALL_EDGE_OPS: + op, overload = edge_name.split(".") + + # Normalize names + op = op.lower() + op = op.removeprefix("_") + op = op.removesuffix("_copy") + op = op.removesuffix("_with_indices") + op = op.removesuffix("_no_training") + overload = overload.lower() + + if overload == "default": + edge_ops.add(op) + else: + edge_ops.add(f"{op}_{overload}") + + return edge_ops + + +def parse_test_name(test_name: str, edge_ops: set[str]) -> tuple[str, str, bool]: + """ + Parses a test name of the form + test_OP_TARGET__ + where OP must match a string in edge_ops and TARGET must match one string in TARGETS. + The "not_delegated" suffix indicates that the test verifies that the op is not delegated. + + Examples of valid names: "test_mm_u55_BI_not_delegated" or "test_add_scalar_tosa_MI_two_inputs". + + Returns a tuple (OP, TARGET, IS_DELEGATED) if valid. 
+ """ + test_name = test_name.removeprefix("test_") + is_delegated = "not_delegated" not in test_name + assert ( + "reject" not in test_name + ), f"Use 'not_delegated' instead of 'reject' in {test_name}" + + op = "None" + target = "None" + for potential_target in TARGETS: + index = test_name.find(potential_target) + if index != -1: + op = test_name[: index - 1] + target = potential_target + break + # Special case for convolution + op = op.removesuffix("_1d") + op = op.removesuffix("_2d") + + assert target != "None", f"{test_name} does not contain one of {TARGETS}" + assert ( + op in edge_ops + ), f"Parsed invalid OP from {test_name}, {op} does not exist in edge.yaml or CUSTOM_EDGE_OPS" + + return op, target, is_delegated + + +if __name__ == "__main__": + """Parses a list of test names given on the command line.""" + import sys + + sys.tracebacklimit = 0 # Do not print stack trace + + edge_ops = get_edge_ops() + exit_code = 0 + + for test_name in sys.argv[1:]: + try: + assert test_name[:5] == "test_", f"Unexpected input: {test_name}" + parse_test_name(test_name, edge_ops) + except AssertionError as e: + print(e) + exit_code = 1 + else: + print(f"{test_name} OK") + + sys.exit(exit_code) diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push index ac0584c6f73..4eeb0f50d71 100755 --- a/backends/arm/scripts/pre-push +++ b/backends/arm/scripts/pre-push @@ -166,6 +166,44 @@ for COMMIT in ${COMMITS}; do fi fi + # Op test checks + op_test_files=$(echo $commit_files | grep -oE 'backends/arm/test/ops/\S+') + if [ "$op_test_files" ]; then + + # TODO: These checks can be removed when all unittests are refactored. + if grep -icq "SkipIfNoCorstone" $op_test_files; then + echo -e "${ERROR} @SkipIfNoCorstone300/320 is deprecated;"\ + "please use XfailIfNoCorstone300/320 instead." >&2 + FAILED=1 + fi + + if grep -icq "conftest.expectedFailureOnFVP" $op_test_files; then + echo -e "${ERROR} @conftest.expectedFailureOnFVP is deprecated;"\ + "please use XfailIfCorstone300/320 instead." >&2 + FAILED=1 + fi + + if grep -icq "unittest.TestCase" $op_test_files; then + echo -e "${ERROR} Use of the Unittest test framework is deprecated;"\ + "please use Pytest instead." >&2 + FAILED=1 + fi + + if grep -icq "on_fvp(" $op_test_files; then + echo -e "${ERROR} All unittests should run on FVP if relevant;"\ + "the on_fvp suffix can be omitted." >&2 + FAILED=1 + fi + + # Check that the tested op and target are parsed correctly from the test name + test_names=$(grep -h "def test_" $op_test_files | cut -d"(" -f1 | cut -d" " -f2) + python ./backends/arm/scripts/parse_test_names.py $test_names + if [ $? -ne 0 ]; then + echo -e "${ERROR} Failed op test name check." 
>&2 + FAILED=1 + fi + fi + echo "" # Newline to visually separate commit processing done diff --git a/backends/arm/scripts/utils.sh b/backends/arm/scripts/utils.sh index e3ed04ffa22..8b4c8d4f96f 100644 --- a/backends/arm/scripts/utils.sh +++ b/backends/arm/scripts/utils.sh @@ -46,7 +46,7 @@ function patch_repo() { local patch_dir="${3}/$name" echo -e "[${FUNCNAME[0]}] Patching ${name} repo_dir:${repo_dir} base_rev:${base_rev} patch_dir:${patch_dir}" - cd $repo_dir + pushd $repo_dir git fetch git reset --hard ${base_rev} @@ -54,4 +54,5 @@ function patch_repo() { git am -3 ${patch_dir}/*.patch echo -e "[${FUNCNAME[0]}] Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${repo_dir} dir.\n" + popd } diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py index e5d7783fea3..12220acbae9 100644 --- a/backends/arm/test/conftest.py +++ b/backends/arm/test/conftest.py @@ -15,7 +15,7 @@ import pytest try: - import tosa_reference_model + import tosa_tools.v0_80.tosa_reference_model as tosa_reference_model except ImportError: logging.warning("tosa_reference_model not found, can't run reference model tests") tosa_reference_model = None diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 945c940a20b..60bf89b6e17 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import os import shutil import tempfile @@ -15,9 +14,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - class Linear(torch.nn.Module): def __init__( @@ -205,7 +201,6 @@ def test_collate_tosa_BI_tests(self): def test_dump_tosa_ops(caplog): - caplog.set_level(logging.INFO) model = Linear(20, 30) ( ArmTester( @@ -222,7 +217,6 @@ def test_dump_tosa_ops(caplog): def test_fail_dump_tosa_ops(caplog): - caplog.set_level(logging.INFO) class Add(torch.nn.Module): def forward(self, x): diff --git a/backends/arm/test/misc/test_non_persistent_buffers.py b/backends/arm/test/misc/test_non_persistent_buffers.py new file mode 100644 index 00000000000..1b9456ae470 --- /dev/null +++ b/backends/arm/test/misc/test_non_persistent_buffers.py @@ -0,0 +1,49 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +from executorch.backends.arm.test.common import parametrize +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + + +class NonPersistentBuffer(nn.Module): + """ + Minimal module that registers a non-persistent input buffer. + """ + + def __init__(self): + super().__init__() + self.register_buffer("test_buff", torch.rand(2, 2, 2, 2), persistent=False) + + def forward(self, x): + return x - self.test_buff + + +test_input = {"input": (torch.ones(2, 2, 2, 2),)} + +input_t = tuple[torch.Tensor] + + +@parametrize("test_data", test_input) +def test_non_persistent_buffer_MI(test_data: input_t): + """ + Test validates Arm backend handling of non-persistent buffers + and ensures that there are no asserts or errors when they are used. 
+ """ + TosaPipelineMI[input_t](NonPersistentBuffer(), test_data, "").run() + + +@parametrize("test_data", test_input) +def test_non_persistent_buffer_BI(test_data: input_t): + """ + Test validates Arm backend handling of non-persistent buffers + and ensures that there are no asserts or errors when they are used. + """ + TosaPipelineBI[input_t](NonPersistentBuffer(), test_data, "").run() diff --git a/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py index f69d9d34462..49efbbb4a9c 100644 --- a/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py +++ b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py @@ -117,7 +117,12 @@ def test_softplus_tosa_BI(test_data: input_t1): # Since GELU will not be quantized by TosaQuantizer, the Dropout's input will not be quantized either. # If so, the Dropout should not be partitioned by TosaPartitioner for TOSA BI profile. This test tests that the # partitioner indeed does not partition the Dropout (clone) for TOSA BI. -@common.parametrize("test_data", test_data) +@common.parametrize( + "test_data", + test_data, + {"3d_rand": "MLETORCH-909: Partition test to not rely on unsupported ops"}, + strict=False, +) def test_linear_residaul_tosa_MI(test_data: input_t1): pipeline = TosaPipelineMI[input_t1]( LinearResidualModule(), diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py index b293555e66b..fb0d5eb75d3 100644 --- a/backends/arm/test/models/test_conformer.py +++ b/backends/arm/test/models/test_conformer.py @@ -3,7 +3,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest import torch @@ -14,10 +13,6 @@ from torchaudio.models import Conformer -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - def get_test_inputs(dim, lengths, num_examples): return (torch.rand(num_examples, int(lengths.max()), dim), lengths) diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py index 89196674c48..644ad69222c 100644 --- a/backends/arm/test/models/test_llama.py +++ b/backends/arm/test/models/test_llama.py @@ -28,7 +28,6 @@ sys.path.append(project_dir) logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) class TestLlama(unittest.TestCase): @@ -79,30 +78,12 @@ def prepare_model(self): llama_model, llama_inputs, llama_meta = get_llama_model(args) - # TODO: Remove workaround since attention mask should not be persistent, - # it only works if input shape is always the same - freqs_c = "freqs_cos" - freqs_s = "freqs_sin" - for i in range(llama_model.n_layers): - val = llama_model.layers[i].attention.get_buffer("mask") - llama_model.layers[i].attention.register_buffer( - "mask", val, persistent=True - ) - val = llama_model.layers[i].attention.rope.get_buffer(freqs_c) - llama_model.layers[i].attention.rope.register_buffer( - freqs_c, val, persistent=True - ) - val = llama_model.layers[i].attention.rope.get_buffer(freqs_s) - llama_model.layers[i].attention.rope.register_buffer( - freqs_s, val, persistent=True - ) - return llama_model, llama_inputs, llama_meta def test_llama_tosa_MI(self): llama_model, llama_inputs, llama_meta = self.prepare_model() - if llama_model is None and llama_inputs is None and llama_meta is None: + if llama_model is None or llama_inputs is None: pytest.skip("Missing model and/or input files") with torch.no_grad(): @@ 
-123,3 +104,29 @@ def test_llama_tosa_MI(self): rtol=1.1, # TODO: MLETORCH-825 decrease tolerance ) ) + + @pytest.mark.xfail(reason="KeyError: scalar_tensor_1 (MLETORCH-907)") + def test_llama_tosa_BI(self): + llama_model, llama_inputs, llama_meta = self.prepare_model() + + if llama_model is None or llama_inputs is None: + pytest.skip("Missing model and/or input files") + + with torch.no_grad(): + ( + ArmTester( + llama_model, + example_inputs=llama_inputs, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), + constant_methods=llama_meta, + ) + .quantize() + .export() + .to_edge_transform_and_lower() + .to_executorch() + .run_method_and_compare_outputs( + inputs=llama_inputs, + atol=4.3, + rtol=1.1, # TODO: Tolerance needs to be updated after MLETORCH-907 + ) + ) diff --git a/backends/arm/test/models/test_mobilenet_v3_arm.py b/backends/arm/test/models/test_mobilenet_v3_arm.py index 7e0afe4a54f..f6f8f8f3e0c 100644 --- a/backends/arm/test/models/test_mobilenet_v3_arm.py +++ b/backends/arm/test/models/test_mobilenet_v3_arm.py @@ -46,7 +46,7 @@ def test_mv3_tosa_BI(): aten_op=[], exir_op=[], use_to_edge_transform_and_lower=True, - atol=0.3, + atol=0.5, qtol=1, ) pipeline.run() @@ -63,7 +63,7 @@ def test_mv3_u55_BI(): exir_ops=[], run_on_fvp=True, use_to_edge_transform_and_lower=True, - atol=0.3, + atol=0.5, qtol=1, ) pipeline.run() @@ -80,7 +80,7 @@ def test_mv3_u85_BI(): exir_ops=[], run_on_fvp=True, use_to_edge_transform_and_lower=True, - atol=0.3, + atol=0.5, qtol=1, ) pipeline.run() diff --git a/backends/arm/test/models/test_torch_functions.py b/backends/arm/test/models/test_torch_functions.py index 19e2395adfe..5cd4bd3aaed 100644 --- a/backends/arm/test/models/test_torch_functions.py +++ b/backends/arm/test/models/test_torch_functions.py @@ -101,6 +101,7 @@ def forward(self, *args): "Requires dynamic output shape.", "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", + "norm": "An error occurred when running the 'KeepDimsFalseToSqueezePass' pass after the following passes:", }, ) def test_torch_fns_MI(test_data): @@ -129,6 +130,7 @@ def test_torch_fns_MI(test_data): "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", "t": "MLETORCH-855: Issue with Quantization folding.", + "norm": "An error occurred when running the 'KeepDimsFalseToSqueezePass' pass after the following passes:", }, strict=False, ) diff --git a/backends/arm/test/models/test_w2l_arm.py b/backends/arm/test/models/test_w2l_arm.py index fb491ca2250..8cd2ff22b75 100644 --- a/backends/arm/test/models/test_w2l_arm.py +++ b/backends/arm/test/models/test_w2l_arm.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -19,10 +18,6 @@ from torchaudio import models -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - def get_test_inputs(batch_size, num_features, input_frames): return (torch.randn(batch_size, num_features, input_frames),) diff --git a/backends/arm/test/ops/test_alias_copy.py b/backends/arm/test/ops/test_alias_copy.py new file mode 100644 index 00000000000..66fa92bc445 --- /dev/null +++ b/backends/arm/test/ops/test_alias_copy.py @@ -0,0 +1,83 @@ +# Copyright 2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] + + +class AliasCopy(torch.nn.Module): + """ + Tests proper handling of alias_copy when used directly. + + alias_copy can also appear from PyTorch/ExecuTorch optimizations + such as `x.transpose(0, 0)`. This is optimized to an alias_copy but + not before dq/q operators are added. + """ + + aten_op = "torch.ops.aten.alias_copy.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_alias_copy_default" + + test_data: dict[input_t1] = { + "1d_ramp": (torch.arange(-16, 16, 0.2),), + "2d_ones": (torch.ones(5, 5),), + "3d_rand": (torch.rand(3, 5, 5),), + "4d_zeros": (torch.zeros(1, 10, 10, 10),), + } + + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor): + return torch.alias_copy(x) + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_tosa_MI(test_data: input_t1): + TosaPipelineMI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_tosa_BI(test_data: input_t1): + TosaPipelineBI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_u55_BI(test_data: input_t1): + EthosU55PipelineBI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() + + +@common.parametrize("test_data", AliasCopy.test_data) +def test_alias_copy_u85_BI(test_data: input_t1): + EthosU85PipelineBI[input_t1]( + AliasCopy(), + test_data, + AliasCopy.aten_op, + AliasCopy.exir_op, + ).run() diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py index 360429d3d6c..980ab28df64 100644 --- a/backends/arm/test/ops/test_batch_norm.py +++ b/backends/arm/test/ops/test_batch_norm.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -15,8 +14,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, [num_features, affine, track_running_stats, weight, bias, running_mean, running_var,] ) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index f6e13a2222e..0fb3c2675e9 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -18,8 +17,6 @@ from parameterized import parameterized from torch.nn.parameter import Parameter -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) """ This file contain unit tests where conv are combined with other ops. 
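As context for the AliasCopy tests above: their docstring notes that alias_copy can appear indirectly, for example from `x.transpose(0, 0)`, and not only from explicit torch.alias_copy calls. A minimal sketch for observing this outside the test suite is given below; it is not part of the patch, the module name TransposeNoOp is illustrative, and whether the exported graph actually contains aten.alias_copy (as opposed to aten.alias, or nothing at all after optimization) can vary with the PyTorch version.

import torch


class TransposeNoOp(torch.nn.Module):
    # transpose(0, 0) swaps a dimension with itself, i.e. a no-op view;
    # per the AliasCopy docstring this kind of pattern can surface as alias_copy.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x.transpose(0, 0)


# Export and print the graph to see how the aliasing is represented.
# Depending on the torch version this may show aten.alias_copy, aten.alias,
# or the op may be folded away entirely.
ep = torch.export.export(TransposeNoOp(), (torch.rand(2, 3),))
print(ep.graph)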
diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 062dbfacaef..d200a753ce5 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Optional, Tuple, Union @@ -17,8 +16,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, input, other, rounding_mode) See torch.div() for info diff --git a/backends/arm/test/ops/test_eq.py b/backends/arm/test/ops/test_eq.py index 7cbba632696..e3bcf877ffe 100644 --- a/backends/arm/test/ops/test_eq.py +++ b/backends/arm/test/ops/test_eq.py @@ -96,8 +96,16 @@ def test_eq_scalar_tosa_MI(test_module): pipeline.run() -@common.parametrize("test_module", test_data_tensor | test_data_scalar) -def test_eq_tosa_BI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_eq_tensor_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), Equal.aten_op_Tensor, Equal.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_eq_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( test_module, test_module.get_inputs(), Equal.aten_op_Tensor, Equal.exir_op ) @@ -133,15 +141,34 @@ def test_eq_scalar_u55_BI(test_module): @common.parametrize( "test_module", - test_data_tensor | test_data_scalar, + test_data_tensor, xfails={ "eq_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, + strict=False, +) +@common.XfailIfNoCorstone320 +def test_eq_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + Equal.aten_op_Tensor, + Equal.exir_op, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_scalar, + xfails={ "eq_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", }, strict=False, ) @common.XfailIfNoCorstone320 -def test_eq_u85_BI(test_module): +def test_eq_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index a6193f6ea08..7bcd2c923a4 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -5,7 +5,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -16,13 +15,14 @@ TosaPipelineMI, ) -aten_op = "torch.ops.aten.ge.Tensor" -exir_op = "executorch_exir_dialects_edge__ops_aten_ge_Tensor" - input_t = Tuple[torch.Tensor] class GreaterEqual(torch.nn.Module): + aten_op_tensor = "torch.ops.aten.ge.Tensor" + aten_op_scalar = "torch.ops.aten.ge.Scalar" + exir_op = "executorch_exir_dialects_edge__ops_aten_ge_Tensor" + def __init__(self, input, other): super().__init__() self.input_ = input @@ -31,7 +31,7 @@ def __init__(self, input, other): def forward( self, input_: torch.Tensor, - other_: torch.Tensor, + other_: torch.Tensor | int | float, ): return input_ >= other_ @@ -39,98 +39,143 @@ def get_inputs(self): return (self.input_, self.other_) -op_ge_rank1_ones = GreaterEqual( +op_ge_tensor_rank1_ones = GreaterEqual( torch.ones(5), torch.ones(5), ) -op_ge_rank2_rand = GreaterEqual( 
+op_ge_tensor_rank2_rand = GreaterEqual( torch.rand(4, 5), torch.rand(1, 5), ) -op_ge_rank3_randn = GreaterEqual( +op_ge_tensor_rank3_randn = GreaterEqual( torch.randn(10, 5, 2), torch.randn(10, 5, 2), ) -op_ge_rank4_randn = GreaterEqual( +op_ge_tensor_rank4_randn = GreaterEqual( torch.randn(3, 2, 2, 2), torch.randn(3, 2, 2, 2), ) -test_data_common = { - "ge_rank1_ones": op_ge_rank1_ones, - "ge_rank2_rand": op_ge_rank2_rand, - "ge_rank3_randn": op_ge_rank3_randn, - "ge_rank4_randn": op_ge_rank4_randn, +op_ge_scalar_rank1_ones = GreaterEqual(torch.ones(5), 1.0) +op_ge_scalar_rank2_rand = GreaterEqual(torch.rand(4, 5), 0.2) +op_ge_scalar_rank3_randn = GreaterEqual(torch.randn(10, 5, 2), -0.1) +op_ge_scalar_rank4_randn = GreaterEqual(torch.randn(3, 2, 2, 2), 0.3) + +test_data_tensor = { + "ge_tensor_rank1_ones": op_ge_tensor_rank1_ones, + "ge_tensor_rank2_rand": op_ge_tensor_rank2_rand, + "ge_tensor_rank3_randn": op_ge_tensor_rank3_randn, + "ge_tensor_rank4_randn": op_ge_tensor_rank4_randn, +} + +test_data_scalar = { + "ge_scalar_rank1_ones": op_ge_scalar_rank1_ones, + "ge_scalar_rank2_rand": op_ge_scalar_rank2_rand, + "ge_scalar_rank3_randn": op_ge_scalar_rank3_randn, + "ge_scalar_rank4_randn": op_ge_scalar_rank4_randn, } -@common.parametrize("test_module", test_data_common) -def test_ge_tosa_MI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_ge_tensor_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_ge_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_scalar, + GreaterEqual.exir_op, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_ge_tosa_BI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_ge_tensor_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_ge_u55_BI(test_module): - # GREATER_EQUAL is not supported on U55. - pipeline = OpNotSupportedPipeline[input_t]( +@common.parametrize("test_module", test_data_scalar) +def test_ge_scalar_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", - {exir_op: 1}, + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_ge_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( +@common.parametrize("test_module", test_data_tensor) +@common.XfailIfNoCorstone300 +def test_ge_tensor_u55_BI(test_module): + # GREATER_EQUAL is not supported on U55. 
+ pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - use_to_edge_transform_and_lower=True, + "TOSA-0.80+BI+u55", + {GreaterEqual.exir_op: 1}, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -@pytest.mark.skip(reason="The same as test_ge_u55_BI") -def test_ge_u55_BI_on_fvp(test_module): +@common.parametrize("test_module", test_data_scalar) +@common.XfailIfNoCorstone300 +def test_ge_scalar_u55_BI(test_module): # GREATER_EQUAL is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), "TOSA-0.80+BI+u55", - {exir_op: 1}, + {GreaterEqual.exir_op: 1}, + n_expected_delegates=1, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_tensor, + xfails={"ge_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, +) +@common.XfailIfNoCorstone320 +def test_ge_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize( "test_module", - test_data_common, - xfails={"ge_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, + test_data_scalar, + xfails={"ge_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, ) -@common.SkipIfNoCorstone320 -def test_ge_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_ge_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, + GreaterEqual.aten_op_tensor, + GreaterEqual.exir_op, run_on_fvp=True, - use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_gelu.py b/backends/arm/test/ops/test_gelu.py new file mode 100644 index 00000000000..fb1253fdb0c --- /dev/null +++ b/backends/arm/test/ops/test_gelu.py @@ -0,0 +1,125 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] + + +class Gelu(torch.nn.Module): + aten_op = "torch.ops.aten.gelu.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_gelu_default" + + test_data: dict[str, Tuple[str, input_t1]] = { + "zeros_none": ( + "none", + torch.zeros(1, 10, 10, 10), + ), + "ones_none": ( + "none", + torch.ones(10, 10, 10), + ), + "rand_none": ( + "none", + (torch.rand(10, 10) - 0.5), + ), + "randn_pos_none": ( + "none", + (torch.randn(1, 4, 4, 4) + 10), + ), + "randn_neg_none": ( + "none", + (torch.randn(1, 4, 4, 4) - 10), + ), + "ramp_none": ( + "none", + torch.arange(-16, 16, 0.2), + ), + "zeros_tanh": ( + "tanh", + torch.zeros(1, 10, 10, 10), + ), + "ones_tanh": ( + "tanh", + torch.ones(10, 10, 10), + ), + "rand_tanh": ( + "tanh", + (torch.rand(10, 10) - 0.5), + ), + "randn_pos_tanh": ( + "tanh", + (torch.randn(1, 4, 4, 4) + 10), + ), + "randn_neg_tanh": ( + "tanh", + (torch.randn(1, 4, 4, 4) - 10), + ), + "ramp_tanh": ( + "tanh", + torch.arange(-16, 16, 0.2), + ), + } + + def __init__(self, approximate: str = "none"): + super().__init__() + self.gelu = torch.nn.GELU(approximate) + + def forward(self, x: torch.Tensor): + return self.gelu(x) + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_tosa_MI(test_data: input_t1): + approximate = test_data[0] + TosaPipelineMI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + use_to_edge_transform_and_lower=False, + ).run() + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_tosa_BI(test_data: input_t1): + approximate = test_data[0] + TosaPipelineBI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + ).run() + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_u55_BI(test_data: input_t1): + approximate = test_data[0] + EthosU55PipelineBI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + ).run() + + +@common.parametrize("test_data", Gelu.test_data) +def test_gelu_u85_BI(test_data: input_t1): + approximate = test_data[0] + EthosU85PipelineBI[input_t1]( + Gelu(approximate), + (test_data[1],), + Gelu.aten_op, + Gelu.exir_op, + ).run() diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 2095f781bdb..15515958c85 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -5,7 +5,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -16,13 +15,15 @@ TosaPipelineMI, ) -aten_op = "torch.ops.aten.gt.Tensor" -exir_op = "executorch_exir_dialects_edge__ops_aten_gt_Tensor" input_t = Tuple[torch.Tensor] class Greater(torch.nn.Module): + aten_op_tensor = "torch.ops.aten.gt.Tensor" + aten_op_scalar = "torch.ops.aten.gt.Scalar" + exir_op = "executorch_exir_dialects_edge__ops_aten_gt_Tensor" + def __init__(self, input, other): super().__init__() self.input_ = input @@ -31,7 +32,7 @@ def __init__(self, input, other): def forward( self, input_: torch.Tensor, - other_: torch.Tensor, + other_: torch.Tensor | int | float, ): return input_ > other_ @@ -39,98 +40,135 @@ def get_inputs(self): return (self.input_, self.other_) -op_gt_rank1_ones = Greater( +op_gt_tensor_rank1_ones = Greater( torch.ones(5), torch.ones(5), ) -op_gt_rank2_rand = Greater( 
+op_gt_tensor_rank2_rand = Greater( torch.rand(4, 5), torch.rand(1, 5), ) -op_gt_rank3_randn = Greater( +op_gt_tensor_rank3_randn = Greater( torch.randn(10, 5, 2), torch.randn(10, 5, 2), ) -op_gt_rank4_randn = Greater( +op_gt_tensor_rank4_randn = Greater( torch.randn(3, 2, 2, 2), torch.randn(3, 2, 2, 2), ) -test_data_common = { - "gt_rank1_ones": op_gt_rank1_ones, - "gt_rank2_rand": op_gt_rank2_rand, - "gt_rank3_randn": op_gt_rank3_randn, - "gt_rank4_randn": op_gt_rank4_randn, +op_gt_scalar_rank1_ones = Greater(torch.ones(5), 1.0) +op_gt_scalar_rank2_rand = Greater(torch.rand(4, 5), 0.2) +op_gt_scalar_rank3_randn = Greater(torch.randn(10, 5, 2), -0.1) +op_gt_scalar_rank4_randn = Greater(torch.randn(3, 2, 2, 2), 0.3) + +test_data_tensor = { + "gt_tensor_rank1_ones": op_gt_tensor_rank1_ones, + "gt_tensor_rank2_rand": op_gt_tensor_rank2_rand, + "gt_tensor_rank3_randn": op_gt_tensor_rank3_randn, + "gt_tensor_rank4_randn": op_gt_tensor_rank4_randn, +} + +test_data_scalar = { + "gt_scalar_rank1_ones": op_gt_scalar_rank1_ones, + "gt_scalar_rank2_rand": op_gt_scalar_rank2_rand, + "gt_scalar_rank3_randn": op_gt_scalar_rank3_randn, + "gt_scalar_rank4_randn": op_gt_scalar_rank4_randn, } -@common.parametrize("test_module", test_data_common) -def test_gt_tosa_MI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_gt_tensor_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_gt_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), Greater.aten_op_scalar, Greater.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_tensor) +def test_gt_tensor_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_gt_tosa_BI(test_module): +@common.parametrize("test_module", test_data_scalar) +def test_gt_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), Greater.aten_op_tensor, Greater.exir_op ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_gt_u55_BI(test_module): - # GREATER is not supported on U55. +@common.parametrize("test_module", test_data_tensor) +@common.XfailIfNoCorstone300 +def test_gt_tensor_u55_BI(test_module): + # Greater is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), "TOSA-0.80+BI+u55", - {exir_op: 1}, + {Greater.exir_op: 1}, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_gt_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( +@common.parametrize("test_module", test_data_scalar) +@common.XfailIfNoCorstone300 +def test_gt_scalar_u55_BI(test_module): + # Greater is not supported on U55. 
+ pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - use_to_edge_transform_and_lower=True, + "TOSA-0.80+BI+u55", + {Greater.exir_op: 1}, + n_expected_delegates=1, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -@pytest.mark.skip(reason="The same as test_gt_u55_BI") -def test_gt_u55_BI_on_fvp(test_module): - # GREATER is not supported on U55. - pipeline = OpNotSupportedPipeline[input_t]( +@common.parametrize( + "test_module", + test_data_tensor, + xfails={ + "gt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, +) +@common.XfailIfNoCorstone320 +def test_gt_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", - {exir_op: 1}, + Greater.aten_op_tensor, + Greater.exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize( "test_module", - test_data_common, - xfails={"gt_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, + test_data_scalar, + xfails={ + "gt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, ) -@common.SkipIfNoCorstone320 -def test_gt_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_gt_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, + Greater.aten_op_tensor, + Greater.exir_op, run_on_fvp=True, - use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 33bf9932b5a..9a289909bae 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple @@ -19,9 +18,6 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - test_data_suite_rank1 = [ # (test_name, test_data, out_features, has_bias) diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index cae119cd7a8..f5664b7895d 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -5,7 +5,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -16,13 +15,15 @@ TosaPipelineMI, ) -aten_op = "torch.ops.aten.lt.Tensor" -exir_op = "executorch_exir_dialects_edge__ops_aten_lt_Tensor" input_t = Tuple[torch.Tensor] class LessThan(torch.nn.Module): + aten_op_tensor = "torch.ops.aten.lt.Tensor" + aten_op_scalar = "torch.ops.aten.lt.Scalar" + exir_op = "executorch_exir_dialects_edge__ops_aten_lt_Tensor" + def __init__(self, input, other): super().__init__() self.input_ = input @@ -31,7 +32,7 @@ def __init__(self, input, other): def forward( self, input_: torch.Tensor, - other_: torch.Tensor, + other_: torch.Tensor | int | float, ): return input_ < other_ @@ -39,98 +40,135 @@ def get_inputs(self): return (self.input_, self.other_) -op_lt_rank1_ones = LessThan( +op_lt_tensor_rank1_ones = LessThan( torch.ones(5), torch.ones(5), ) -op_lt_rank2_rand = LessThan( +op_lt_tensor_rank2_rand = LessThan( torch.rand(4, 5), torch.rand(1, 5), ) -op_lt_rank3_randn = LessThan( +op_lt_tensor_rank3_randn = LessThan( torch.randn(10, 5, 2), torch.randn(10, 5, 2), ) -op_lt_rank4_randn = LessThan( +op_lt_tensor_rank4_randn = LessThan( torch.randn(3, 2, 2, 2), torch.randn(3, 2, 2, 2), ) -test_data_common = { - "lt_rank1_ones": op_lt_rank1_ones, - "lt_rank2_rand": op_lt_rank2_rand, - "lt_rank3_randn": op_lt_rank3_randn, - "lt_rank4_randn": op_lt_rank4_randn, +op_lt_scalar_rank1_ones = LessThan(torch.ones(5), 1.0) +op_lt_scalar_rank2_rand = LessThan(torch.rand(4, 5), 0.2) +op_lt_scalar_rank3_randn = LessThan(torch.randn(10, 5, 2), -0.1) +op_lt_scalar_rank4_randn = LessThan(torch.randn(3, 2, 2, 2), 0.3) + +test_data_tensor = { + "lt_tensor_rank1_ones": op_lt_tensor_rank1_ones, + "lt_tensor_rank2_rand": op_lt_tensor_rank2_rand, + "lt_tensor_rank3_randn": op_lt_tensor_rank3_randn, + "lt_tensor_rank4_randn": op_lt_tensor_rank4_randn, +} + +test_data_scalar = { + "lt_scalar_rank1_ones": op_lt_scalar_rank1_ones, + "lt_scalar_rank2_rand": op_lt_scalar_rank2_rand, + "lt_scalar_rank3_randn": op_lt_scalar_rank3_randn, + "lt_scalar_rank4_randn": op_lt_scalar_rank4_randn, } -@common.parametrize("test_module", test_data_common) -def test_lt_tosa_MI(test_module): +@common.parametrize("test_module", test_data_tensor) +def test_lt_tensor_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_scalar) +def test_lt_scalar_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), LessThan.aten_op_scalar, LessThan.exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_tensor) +def test_lt_tensor_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op ) pipeline.run() 
-@common.parametrize("test_module", test_data_common) -def test_lt_tosa_BI(test_module): +@common.parametrize("test_module", test_data_scalar) +def test_lt_scalar_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( - test_module, test_module.get_inputs(), aten_op, exir_op + test_module, test_module.get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_lt_u55_BI(test_module): - # GREATER is not supported on U55. LT uses the GREATER Tosa operator. +@common.parametrize("test_module", test_data_tensor) +@common.XfailIfNoCorstone300 +def test_lt_tensor_u55_BI(test_module): + # LessThan is not supported on U55. pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), "TOSA-0.80+BI+u55", - {exir_op: 1}, + {LessThan.exir_op: 1}, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -def test_lt_u85_BI(test_module): - pipeline = EthosU85PipelineBI[input_t]( +@common.parametrize("test_module", test_data_scalar) +@common.XfailIfNoCorstone300 +def test_lt_scalar_u55_BI(test_module): + # LessThan is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, - run_on_fvp=False, - use_to_edge_transform_and_lower=True, + "TOSA-0.80+BI+u55", + {LessThan.exir_op: 1}, + n_expected_delegates=1, ) pipeline.run() -@common.parametrize("test_module", test_data_common) -@pytest.mark.skip(reason="The same as test_lt_u55_BI") -def test_lt_u55_BI_on_fvp(test_module): - # GREATER is not supported on U55. LT uses the GREATER Tosa operator. - pipeline = OpNotSupportedPipeline[input_t]( +@common.parametrize( + "test_module", + test_data_tensor, + xfails={ + "lt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, +) +@common.XfailIfNoCorstone320 +def test_lt_tensor_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - "TOSA-0.80+BI+u55", - {exir_op: 1}, + LessThan.aten_op_tensor, + LessThan.exir_op, + run_on_fvp=True, ) pipeline.run() @common.parametrize( "test_module", - test_data_common, - xfails={"lt_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, + test_data_scalar, + xfails={ + "lt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", + }, ) -@common.SkipIfNoCorstone320 -def test_lt_u85_BI_on_fvp(test_module): +@common.XfailIfNoCorstone320 +def test_lt_scalar_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), - aten_op, - exir_op, + LessThan.aten_op_tensor, + LessThan.exir_op, run_on_fvp=True, - use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index a31c12be3a0..2f3426f2dda 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple @@ -26,8 +25,6 @@ from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, [kernel_size, stride, padding]) diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index 6b906067f7b..a4503280db9 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -6,6 +6,7 @@ from typing import Callable +import pytest import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -53,6 +54,7 @@ def test_mm_tosa_u55(test_data_generator: Callable[[], tuple]): @parameterized.expand(MM.test_data_generators) +@pytest.mark.flaky # Investigate flakiness (MLETORCH-870) def test_mm_tosa_u85(test_data_generator: Callable[[], tuple]): test_data = test_data_generator() EthosU85PipelineBI[test_t](MM(), test_data, MM.aten_op, MM.exir_op).run() @@ -67,6 +69,7 @@ def test_mm_tosa_u55_on_fvp(test_data_generator: Callable[[], tuple]): @parameterized.expand(MM.test_data_generators) @common.SkipIfNoCorstone320 +@pytest.mark.flaky # Investigate flakiness (MLETORCH-870) def test_mm_tosa_u85_on_fvp(test_data_generator: Callable[[], tuple]): test_data = test_data_generator() EthosU85PipelineBI[test_t]( diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index a5c6c86c52b..43b4abd2039 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -1,24 +1,22 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple +import pytest + import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - test_data_suite = [ # (test_name, test_data) @@ -67,7 +65,7 @@ def forward(self, x, y): def _test_sigmoid_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -81,11 +79,13 @@ def _test_sigmoid_tosa_MI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -100,9 +100,11 @@ def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tup .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_sigmoid_tosa_ethos_BI_pipeline( self, compile_spec: list[CompileSpec], @@ -141,6 +143,7 @@ def _test_sigmoid_tosa_u85_BI_pipeline( ) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_sigmoid_tosa_MI( self, test_name: str, @@ -149,26 +152,33 @@ def test_sigmoid_tosa_MI( self._test_sigmoid_tosa_MI_pipeline(self.Sigmoid(), (test_data,)) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_sigmoid_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_sigmoid_tosa_BI_pipeline(self.Sigmoid(), (test_data,)) + @pytest.mark.tosa_ref_model def test_add_sigmoid_tosa_MI(self): self._test_sigmoid_tosa_MI_pipeline(self.AddSigmoid(), (test_data_suite[0][1],)) + @pytest.mark.tosa_ref_model def test_add_sigmoid_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline(self.AddSigmoid(), (test_data_suite[5][1],)) + @pytest.mark.tosa_ref_model def test_sigmoid_add_tosa_MI(self): self._test_sigmoid_tosa_MI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],)) + @pytest.mark.tosa_ref_model def test_sigmoid_add_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline(self.SigmoidAdd(), (test_data_suite[0][1],)) + @pytest.mark.tosa_ref_model def test_sigmoid_add_sigmoid_tosa_MI(self): self._test_sigmoid_tosa_MI_pipeline( self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) ) + @pytest.mark.tosa_ref_model def test_sigmoid_add_sigmoid_tosa_BI(self): self._test_sigmoid_tosa_BI_pipeline( self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py index c3907887ac9..240000e6973 100644 --- a/backends/arm/test/ops/test_sigmoid_16bit.py +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -81,7 +81,7 @@ def forward(self, x): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) 
+@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op @@ -97,7 +97,7 @@ def test_sigmoid_tosa_BI(test_data): "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787" }, ) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( SigmoidAddSigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op @@ -110,6 +110,7 @@ def test_sigmoid_add_sigmoid_tosa_BI(test_data): "test_data", test_data_suite, ) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), (test_data(),), "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 1} @@ -122,6 +123,7 @@ def test_sigmoid_tosa_u55(test_data): "test_data", test_data_suite, ) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -135,7 +137,7 @@ def test_sigmoid_add_sigmoid_tosa_u55(test_data): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_tosa_u85(test_data): pipeline = EthosU85PipelineBI( @@ -152,7 +154,7 @@ def test_sigmoid_tosa_u85(test_data): "ramp": "AssertionError: Output 0 does not match reference output.", }, ) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_add_sigmoid_tosa_u85(test_data): pipeline = EthosU85PipelineBI( diff --git a/backends/arm/test/ops/test_sigmoid_32bit.py b/backends/arm/test/ops/test_sigmoid_32bit.py index 5388eae83c3..14808eedaf9 100644 --- a/backends/arm/test/ops/test_sigmoid_32bit.py +++ b/backends/arm/test/ops/test_sigmoid_32bit.py @@ -97,7 +97,7 @@ def forward(self, x): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( Sigmoid(), @@ -110,7 +110,7 @@ def test_sigmoid_tosa_BI(test_data): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_BI(test_data): pipeline = TosaPipelineBI( SigmoidAddSigmoid(), @@ -123,6 +123,7 @@ def test_sigmoid_add_sigmoid_tosa_BI(test_data): @common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), (test_data(),), "TOSA-0.80+BI+u55", {Sigmoid.exir_op: 1} @@ -132,6 +133,7 @@ def test_sigmoid_tosa_u55(test_data): @common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 def test_sigmoid_add_sigmoid_tosa_u55(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -145,7 +147,7 @@ def test_sigmoid_add_sigmoid_tosa_u55(test_data): @common.parametrize("test_data", test_data_suite) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_tosa_u85(test_data): pipeline = 
EthosU85PipelineBI( @@ -162,7 +164,7 @@ def test_sigmoid_tosa_u85(test_data): "ramp": "AssertionError: Output 0 does not match reference output.", }, ) -@pytest.mark.flaky(reruns=5) +@pytest.mark.flaky(reruns=32) # Flaky due to Vela bug: MLBEDSW-10642 @common.XfailIfNoCorstone320 def test_sigmoid_add_sigmoid_tosa_u85(test_data): pipeline = EthosU85PipelineBI( diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py new file mode 100644 index 00000000000..51748b02450 --- /dev/null +++ b/backends/arm/test/ops/test_silu.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Optional, Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +input_t = Tuple[torch.Tensor] + + +class Silu(torch.nn.Module): + def forward( + self, + _input: torch.Tensor, + _inplace: Optional[bool] = False, + ): + return torch.nn.SiLU(inplace=_inplace)(_input) + + test_data: list[input_t] = { + "op_silu_rank1_ones": (torch.ones(5),), + "op_silu_rank1_negative_ones": (torch.ones(5) * (-1),), + "op_silu_rank1_rand": (torch.rand(5) * 5,), + "op_silu_rank4_ones": (torch.ones(1, 10, 25, 20),), + "op_silu_rank4_negative_ones": ((-1) * torch.ones(1, 10, 25, 20),), + "op_silu_rank4_large_rand": (200 * torch.rand(1, 10, 25, 20),), + "op_silu_rank4_negative_large_rand": ((-200) * torch.rand(1, 10, 25, 20),), + "op_silu_rank4_large_randn": (200 * torch.randn(1, 10, 25, 20) + 1,), + } + + aten_op_MI = "torch.ops.aten.silu.default" + aten_op_inplace_MI = "torch.ops.aten.silu_.default" + aten_op_BI = ["torch.ops.aten.sigmoid.default", "torch.ops.aten.mul.Tensor"] + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_MI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = TosaPipelineMI[input_t](Silu(), silu_data, Silu.aten_op_MI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_MI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = TosaPipelineMI[input_t](Silu(), silu_data, Silu.aten_op_inplace_MI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_BI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = TosaPipelineBI[input_t](Silu(), silu_data, Silu.aten_op_BI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +def test_silu_tosa_BI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = TosaPipelineBI[input_t](Silu(), silu_data, Silu.aten_op_BI) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +@common.XfailIfNoCorstone300 +def test_silu_u55_BI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = EthosU55PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +@common.XfailIfNoCorstone300 +def test_silu_u55_BI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = EthosU55PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) 
+@common.XfailIfNoCorstone320 +def test_silu_u85_BI(test_data: input_t): + silu_data = (test_data[0], False) + pipeline = EthosU85PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Silu.test_data) +@common.XfailIfNoCorstone320 +def test_silu_u85_BI_inplace(test_data: input_t): + silu_data = (test_data[0], True) + pipeline = EthosU85PipelineBI[input_t]( + Silu(), silu_data, Silu.aten_op_BI, run_on_fvp=True + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index 7cb82e3a828..91ef51cc2a2 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -7,35 +7,35 @@ import unittest from typing import Tuple +import pytest + import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized +test_data_suite = [ + (torch.ones(10), [(3, -3)]), + (torch.ones(10), [(-8, 3)]), + (torch.ones(10, 10), [(1, 3), (3, None)]), + (torch.ones(10, 10, 10), [(0, 7), (0, None), (0, 8)]), + (torch.ones((1, 12, 10, 10)), [(None, None), (None, 5), (3, 5), (4, 10)]), +] + class TestSimpleSlice(unittest.TestCase): class Slice(torch.nn.Module): - - sizes = [(10), (10, 10), (10, 10, 10), ((1, 12, 10, 10))] - test_tensors = [(torch.ones(n),) for n in sizes] - - def forward(self, x: torch.Tensor): - if x.dim() == 1: - return x[3:-3] - elif x.dim() == 2: - return x[1:3, 3:] - elif x.dim() == 3: - return x[0:7, 0:, 0:8] - elif x.dim() == 4: - return x[:, :5, 3:5, 4:10] + def forward(self, x: torch.Tensor, s: list[tuple[int, int]]): + slices = [slice(*i) for i in s] + return x[slices] def _test_slice_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: torch.Tensor ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -48,14 +48,16 @@ def _test_slice_tosa_MI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_slice_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -68,9 +70,11 @@ def _test_slice_tosa_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) + def _test_slice_ethos_BI_pipeline( self, compile_spec: list[CompileSpec], @@ -106,22 +110,29 @@ def _test_slice_u85_BI_pipeline( common.get_u85_compile_spec(), module, test_data ) - @parameterized.expand(Slice.test_tensors) - def test_slice_tosa_MI(self, tensor): - self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor,)) + @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model + def test_slice_tosa_MI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): + self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors[:2]) - def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), 
(test_tensor,)) + @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model + def test_slice_nchw_tosa_BI( + self, tensor: torch.Tensor, slices: list[tuple[int, int]] + ): + self._test_slice_tosa_BI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors[2:]) - def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,)) + @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model + def test_slice_nhwc_tosa_BI( + self, tensor: torch.Tensor, slices: list[tuple[int, int]] + ): + self._test_slice_tosa_BI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors) - def test_slice_u55_BI(self, test_tensor: torch.Tensor): - self._test_slice_u55_BI_pipeline(self.Slice(), (test_tensor,)) + @parameterized.expand(test_data_suite) + def test_slice_u55_BI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): + self._test_slice_u55_BI_pipeline(self.Slice(), (tensor, slices)) - @parameterized.expand(Slice.test_tensors) - def test_slice_u85_BI(self, test_tensor: torch.Tensor): - self._test_slice_u85_BI_pipeline(self.Slice(), (test_tensor,)) + @parameterized.expand(test_data_suite) + def test_slice_u85_BI(self, tensor: torch.Tensor, slices: list[tuple[int, int]]): + self._test_slice_u85_BI_pipeline(self.Slice(), (tensor, slices)) diff --git a/backends/arm/test/ops/test_sqrt.py b/backends/arm/test/ops/test_sqrt.py new file mode 100644 index 00000000000..53a1e79c0a8 --- /dev/null +++ b/backends/arm/test/ops/test_sqrt.py @@ -0,0 +1,78 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
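+# Tests torch.sqrt on the TOSA MI/BI and Ethos-U55/U85 pipelines; sqrt is expected to lower via aten.pow (pow.Tensor_Scalar after quantization).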
+ + +from typing import Dict, Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +class Sqrt(torch.nn.Module): + input_t = Tuple[torch.Tensor] + aten_op_MI = "torch.ops.aten.sqrt.default" + exir_op_MI = "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Tensor" + + aten_op_BI = "torch.ops.aten.pow.Tensor_Scalar" + exir_op_BI = "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar" + + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sqrt(x) + + test_data: Dict[str, input_t] = { + "sqrt_tensor_rank1_ones": (torch.ones(10),), + "sqrt_tensor_rank2_random": (torch.rand(5, 10),), + "sqrt_tensor_rank3_ones": (torch.ones(2, 3, 4),), + "sqrt_tensor_rank4_random": (torch.rand(1, 3, 8, 8),), + "sqrt_tensor_rank4_multibatch": (torch.rand(2, 3, 4, 4),), + } + + +fvp_xfails = { + "sqrt_tensor_rank4_multibatch": "MLETORCH-517 : Multiple batches not supported", +} + + +@common.parametrize("test_data", Sqrt.test_data) +def test_sqrt_tosa_MI(test_data: Sqrt.input_t): + pipeline = TosaPipelineMI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_MI, Sqrt.exir_op_MI + ) + pipeline.run() + + +@common.parametrize("test_data", Sqrt.test_data) +def test_sqrt_tosa_BI(test_data: Sqrt.input_t): + pipeline = TosaPipelineBI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI + ) + pipeline.run() + + +@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.XfailIfNoCorstone300 +def test_sqrt_u55_BI(test_data: Sqrt.input_t): + pipeline = EthosU55PipelineBI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.XfailIfNoCorstone320 +def test_sqrt_u85_BI(test_data: Sqrt.input_t): + pipeline = EthosU85PipelineBI[Sqrt.input_t]( + Sqrt(), test_data, Sqrt.aten_op_BI, Sqrt.exir_op_BI, run_on_fvp=True + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_tanh.py b/backends/arm/test/ops/test_tanh.py index 060d7933ea5..8d13620dc4a 100644 --- a/backends/arm/test/ops/test_tanh.py +++ b/backends/arm/test/ops/test_tanh.py @@ -9,9 +9,11 @@ from typing import Tuple +import pytest + import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -40,7 +42,7 @@ def forward(self, x): def _test_tanh_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -54,11 +56,13 @@ def _test_tanh_tosa_MI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_tanh_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_tanh_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -73,9 +77,11 @@ def _test_tanh_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple) .check_not(["executorch_exir_dialects_edge__ops_aten_tanh_default"]) 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) + def _test_tanh_tosa_ethos_BI_pipeline( self, compile_spec: list[CompileSpec], @@ -114,6 +120,7 @@ def _test_tanh_tosa_u85_BI_pipeline( ) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_tanh_tosa_MI( self, test_name: str, @@ -122,6 +129,7 @@ def test_tanh_tosa_MI( self._test_tanh_tosa_MI_pipeline(self.Tanh(), (test_data,)) @parameterized.expand(test_data_suite) + @pytest.mark.tosa_ref_model def test_tanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): self._test_tanh_tosa_BI_pipeline(self.Tanh(), (test_data,)) diff --git a/backends/arm/test/ops/test_where.py b/backends/arm/test/ops/test_where.py index bf127460f3e..dd4f3326f8e 100644 --- a/backends/arm/test/ops/test_where.py +++ b/backends/arm/test/ops/test_where.py @@ -173,14 +173,9 @@ def test_where_u55_BI(test_module): get_symmetric_quantization_config() ) - # If condition is tensor_condition then there will be one full_like op which will be - # delegated. - if test_module.condition == tensor_condition: - num_delegates = 1 - num_exir = 0 - else: - num_delegates = 0 - num_exir = 0 + # There will be one full_like op which will be delegated. + num_delegates = 1 + num_exir = 0 pipeline = OpNotSupportedPipeline[input_t]( test_module, @@ -223,14 +218,9 @@ def test_where_u55_BI_on_fvp(test_module): get_symmetric_quantization_config() ) - # If condition is tensor_condition then there will be one full_like op which will be - # delegated. - if test_module.condition == tensor_condition: - num_delegates = 1 - num_exir = 0 - else: - num_delegates = 0 - num_exir = 0 + # There will be one full_like op which will be delegated. + num_delegates = 1 + num_exir = 0 pipeline = OpNotSupportedPipeline[input_t]( test_module, @@ -249,18 +239,7 @@ def test_where_u55_BI_on_fvp(test_module): pipeline.run() -@common.parametrize( - "test_module", - test_modules_BI, - xfails={ - "two_dim_scalar_cond": "E [executorch:method.cpp:601] Missing operator: " - "[2] aten::gt.Scalar_out", - "three_dim_scalar_cond": "E [executorch:method.cpp:601] Missing operator: " - "[2] aten::gt.Scalar_out", - "float32_scalar_cond": "E [executorch:method.cpp:601] Missing operator: " - "[2] aten::gt.Scalar_out", - }, -) +@common.parametrize("test_module", test_modules_BI) @common.XfailIfNoCorstone320 def test_where_u85_BI_on_fvp(test_module): compile_spec = common.get_u85_compile_spec() diff --git a/backends/arm/test/passes/test_convert_expand_copy_to_repeat.py b/backends/arm/test/passes/test_convert_expand_copy_to_repeat.py new file mode 100644 index 00000000000..5d83bc82f22 --- /dev/null +++ b/backends/arm/test/passes/test_convert_expand_copy_to_repeat.py @@ -0,0 +1,51 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
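+# Verifies that ConvertExpandCopyToRepeatPass rewrites aten.expand_copy into aten.repeat.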
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( + ConvertExpandCopyToRepeatPass, +) + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Expand(torch.nn.Module): + """ + Basic expand model using torch.Tensor.expand function + """ + + def __init__(self): + super(Expand, self).__init__() + + def forward(self, x): + return x.expand(3, 4) + + def get_inputs(self) -> input_t: + return (torch.rand(3, 1),) + + +def test_expand_to_repeat_tosa_BI(): + module = Expand() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+BI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_expand_copy_default": 1, + }, + ops_not_before_pass=["executorch_exir_dialects_edge__ops_aten_repeat_default"], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_repeat_default": 1, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten_expand_copy_default" + ], + pass_list=[ConvertExpandCopyToRepeatPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_convert_split_to_slice.py b/backends/arm/test/passes/test_convert_split_to_slice.py new file mode 100644 index 00000000000..d4fdffe3b01 --- /dev/null +++ b/backends/arm/test/passes/test_convert_split_to_slice.py @@ -0,0 +1,67 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from executorch.backends.arm._passes.convert_split_to_slice import ( + ConvertSplitToSlicePass, +) + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Split(torch.nn.Module): + """ + Basic split model using torch.split function + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return torch.split(x, 2) + + +class SplitTensor(torch.nn.Module): + """ + Basic split model using torch.Tensor.split function + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return x.split(2) + + +modules = {"split_basic": Split(), "split_tensor": SplitTensor()} + + +@common.parametrize("module", modules) +def test_split_to_slice_tosa_BI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+BI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor" + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 5, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default" + ], + pass_list=[ConvertSplitToSlicePass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_div_pass.py b/backends/arm/test/passes/test_decompose_div_pass.py new file mode 100644 index 00000000000..71d586c0029 --- /dev/null +++ b/backends/arm/test/passes/test_decompose_div_pass.py @@ -0,0 +1,65 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
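+# Verifies that DecomposeDivPass rewrites aten.div into reciprocal and mul edge ops.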
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Div(torch.nn.Module): + """ + Basic div model using torch.div + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return torch.div(x, 2) + + +class DivTensor(torch.nn.Module): + """ + Basic div model using torch.Tensor.div + """ + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + def forward(self, x): + return x.div(2) + + +modules = {"div_basic": Div(), "div_tensor": DivTensor()} + + +@common.parametrize("module", modules) +def test_decompose_div_tosa_MI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_div_Tensor": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_reciprocal_default", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_reciprocal_default": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_div_Tensor"], + pass_list=[DecomposeDivPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_layernorm_pass.py b/backends/arm/test/passes/test_decompose_layernorm_pass.py new file mode 100644 index 00000000000..40e49e15bc5 --- /dev/null +++ b/backends/arm/test/passes/test_decompose_layernorm_pass.py @@ -0,0 +1,69 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
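+# Verifies that DecomposeLayerNormPass expands native_layer_norm into primitive edge ops (add, view_copy, mul, full, rsqrt, var, sub, mean).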
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_layernorm_pass import ( + DecomposeLayerNormPass, +) + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class LayerNorm(torch.nn.Module): + """ + Basic layer_norm model using torch.nn.layer_norm layer + """ + + def __init__(self): + super(LayerNorm, self).__init__() + self.layer_norm = torch.nn.LayerNorm(10) + + def forward(self, x): + x = self.layer_norm(x) + return x + + def get_inputs(self) -> input_t: + return (torch.rand(10),) + + +def test_decompose_layernorm_tosa_MI(): + module = LayerNorm() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_native_layer_norm_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_add_Tensor", + "executorch_exir_dialects_edge__ops_aten_view_copy_default", + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_full_default", + "executorch_exir_dialects_edge__ops_aten_rsqrt_default", + "executorch_exir_dialects_edge__ops_aten_var_correction", + "executorch_exir_dialects_edge__ops_aten_sub_Tensor", + "executorch_exir_dialects_edge__ops_aten_mean_dim", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_rsqrt_default": 1, + "executorch_exir_dialects_edge__ops_aten_var_correction": 1, + "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten_expand_copy_default" + ], + pass_list=[DecomposeLayerNormPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_meandim_pass.py b/backends/arm/test/passes/test_decompose_meandim_pass.py new file mode 100644 index 00000000000..6ba9ceff3a7 --- /dev/null +++ b/backends/arm/test/passes/test_decompose_meandim_pass.py @@ -0,0 +1,73 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
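+# Verifies that DecomposeMeanDimPass rewrites aten.mean.dim (keepdim=True) into sum, full and mul edge ops.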
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor]  # Input x + + +class MeanDim(torch.nn.Module): + """ + Basic mean model using the torch.mean function with keepdim=True (keepdim=False does not currently work with this pass) + """ + + def __init__(self): + super(MeanDim, self).__init__() + + def forward(self, x): + return torch.mean(x, 1, True) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +class MeanDimTensor(torch.nn.Module): + """ + Basic mean model using the torch.Tensor.mean function with keepdim=True (keepdim=False does not currently work with this pass) + """ + + def __init__(self): + super(MeanDimTensor, self).__init__() + + def forward(self, x): + return x.mean(1, True) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +modules = {"meandim_basic": MeanDim(), "meandim_tensor": MeanDimTensor()} + + +@common.parametrize("module", modules) +def test_decompose_meandim_tosa_MI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_full_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_mean_dim"], + pass_list=[DecomposeMeanDimPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_softmax_pass.py b/backends/arm/test/passes/test_decompose_softmax_pass.py new file mode 100644 index 00000000000..efb911f03aa --- /dev/null +++ b/backends/arm/test/passes/test_decompose_softmax_pass.py @@ -0,0 +1,103 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
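+# Verifies that DecomposeSoftmaxPass expands softmax/log_softmax into exp, sum, reciprocal and mul edge ops.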
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_softmax_pass import DecomposeSoftmaxPass + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Softmax(torch.nn.Module): + """ + Basic torch.nn.softmax layer model + """ + + def __init__(self): + super(Softmax, self).__init__() + self.softmax = torch.nn.Softmax(dim=1) + + def forward(self, x): + x = self.softmax(x) + return x + + def get_inputs(self) -> input_t: + return (torch.rand(2, 3),) + + +class SoftmaxLog(torch.nn.Module): + """ + Basic torch.nn.log_softmax layer model + """ + + def __init__(self): + super(SoftmaxLog, self).__init__() + self.softmax = torch.nn.LogSoftmax(dim=1) + + def forward(self, x): + x = self.softmax(x) + return x + + def get_inputs(self) -> input_t: + return (torch.rand(2, 3),) + + +def test_softmax_basic_tosa_MI(): + module = Softmax() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten__softmax_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_reciprocal_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + "executorch_exir_dialects_edge__ops_aten_exp_default", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_exp_default": 1, + "executorch_exir_dialects_edge__ops_aten_reciprocal_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten__softmax_default"], + pass_list=[DecomposeSoftmaxPass], + ) + pipeline.run() + + +def test_softmax_log_tosa_MI(): + module = SoftmaxLog() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_reciprocal_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + "executorch_exir_dialects_edge__ops_aten_exp_default", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_exp_default": 1, + "executorch_exir_dialects_edge__ops_aten_reciprocal_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=[ + "executorch_exir_dialects_edge__ops_aten__log_softmax_default" + ], + pass_list=[DecomposeSoftmaxPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_decompose_var_pass.py b/backends/arm/test/passes/test_decompose_var_pass.py new file mode 100644 index 00000000000..fe793dba14b --- /dev/null +++ b/backends/arm/test/passes/test_decompose_var_pass.py @@ -0,0 +1,84 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
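+# Verifies that DecomposeVarPass expands var/var.correction into mean, sub, mul, full and sum edge ops.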
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class VarDim(torch.nn.Module): + """ + Basic variance model using torch.Tensor.var function. + """ + + def __init__(self, keepdim): + super(VarDim, self).__init__() + self.keepdim = keepdim + + def forward(self, x): + return x.var(dim=-1, keepdim=self.keepdim) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +class VarCorrection(torch.nn.Module): + """ + Basic variance model using torch.var function. + """ + + def __init__(self, keepdim): + super(VarCorrection, self).__init__() + self.keepdim = keepdim + + def forward(self, x): + return torch.var(x, -1, keepdim=self.keepdim) + + def get_inputs(self) -> input_t: + return (torch.rand(4, 4),) + + +modules = { + "vardim_keepdim": VarDim(True), + "vardim_no_keepdim": VarDim(False), + "varcorrection_keepdim": VarCorrection(True), + "varcorrection_no_keepdim": VarCorrection(False), +} + + +@common.parametrize("module", modules) +def test_decompose_var_tosa_MI(module): + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+MI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_var_correction": 1, + }, + ops_not_before_pass=[ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_full_default", + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList", + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_sub_Tensor", + ], + ops_after_pass={ + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_sum_dim_IntList": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_var_correction"], + pass_list=[DecomposeVarPass], + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_remove_clone_pass.py b/backends/arm/test/passes/test_remove_clone_pass.py new file mode 100755 index 00000000000..e586edd323d --- /dev/null +++ b/backends/arm/test/passes/test_remove_clone_pass.py @@ -0,0 +1,43 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
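+# Verifies that RemoveClonePass removes aten.clone ops from the graph.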
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass + +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + +input_t = Tuple[torch.Tensor] # Input x + + +class Clone(torch.nn.Module): + """ + Basic remove layer model to test RemoveClonePass + """ + + def __init__(self): + super(Clone, self).__init__() + + def forward(self, x): + return torch.clone(x) + + def get_inputs(self) -> input_t: + return (torch.rand(3, 1),) + + +def test_remove_clone_tosa_BI(): + module = Clone() + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + tosa_version="TOSA-0.80+BI", + ops_before_pass={ + "executorch_exir_dialects_edge__ops_aten_clone_default": 1, + }, + ops_not_after_pass=["executorch_exir_dialects_edge__ops_aten_clone_default"], + pass_list=[RemoveClonePass], + ) + pipeline.run() diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 28bbee052f9..4481a9c7cc2 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -13,31 +13,26 @@ from pathlib import Path -from typing import cast, Dict, List, Literal, Optional, Tuple +from typing import Any, cast, Dict, List, Literal, Optional, Tuple import numpy as np import torch -logger = logging.getLogger(__name__) -try: - import tosa_reference_model -except ImportError: - tosa_reference_model = None from executorch.backends.arm.arm_backend import get_tosa_spec, is_tosa - from executorch.backends.arm.test.conftest import is_option_enabled -from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.backends.arm.tosa_specification import ( + Tosa_0_80, + Tosa_1_00, + TosaSpecification, +) from executorch.exir import ExecutorchProgramManager, ExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule -from packaging.version import Version from torch.fx.node import Node from torch.overrides import TorchFunctionMode -from tosa import TosaGraph logger = logging.getLogger(__name__) -logger.setLevel(logging.CRITICAL) # Copied from PyTorch. # From torch/testing/_internal/common_utils.py:torch_to_numpy_dtype_dict @@ -568,7 +563,7 @@ def arm_executor_runner_exists(target_board): def run_tosa_graph( - graph: TosaGraph, + graph: Any, tosa_version: TosaSpecification, inputs: list[torch.Tensor], ) -> list[torch.Tensor]: @@ -576,25 +571,38 @@ def run_tosa_graph( inputs_np = [input.numpy() for input in inputs] transpose_data_format(inputs_np, to="NHWC") - tosa_release = tosa_version.version - - if tosa_release > Version("0.80"): - logger.warning("The reference model is only tested for TOSA v0.80") - - # tosa_profile: 0 = Base Inference, 1 = Main Inference, 2 = Main Training. - tosa_profile = 1 if tosa_version.support_float() else 0 - debug_mode = "ALL" if logger.level <= logging.DEBUG else None - outputs_np, status = tosa_reference_model.run( - graph, - inputs_np, - verbosity=_tosa_refmodel_loglevel(logger.level), - tosa_profile=tosa_profile, - initialize_variable_tensor_from_numpy=1, # True - debug_mode=debug_mode, - ) + if isinstance(tosa_version, Tosa_0_80): + import tosa_tools.v0_80.tosa_reference_model as reference_model + + # tosa_profile: 0 = Base Inference, 1 = Main Inference, 2 = Main Training. 
+ tosa_profile = 1 if tosa_version.support_float() else 0 + debug_mode = "ALL" if logger.level <= logging.DEBUG else None + outputs_np, status = reference_model.run( + graph, + inputs_np, + verbosity=_tosa_refmodel_loglevel(logger.level), + tosa_profile=tosa_profile, + initialize_variable_tensor_from_numpy=True, + debug_mode=debug_mode, + ) + elif isinstance(tosa_version, Tosa_1_00): + import tosa_reference_model as reference_model + + debug_mode = "ALL" if logger.level <= logging.DEBUG else None + outputs_np, status = reference_model.run( + graph, + inputs_np, + verbosity=_tosa_refmodel_loglevel(logger.level), + initialize_variable_tensor_from_numpy=True, + debug_mode=debug_mode, + ) + else: + raise ValueError( + f"Unknown TOSA specification: {tosa_version}. No reference model available to run for this specification version" + ) assert ( - status == tosa_reference_model.GraphStatus.TOSA_VALID + status == reference_model.GraphStatus.TOSA_VALID ), "Non-valid TOSA given to reference model." transpose_data_format(outputs_np, to="NCHW") diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index e97b46cb977..9f5bb778e78 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -12,7 +12,12 @@ def define_arm_tests(): test_files.remove("passes/test_ioquantization_pass.py") # Operators - test_files += native.glob(["ops/test_linear.py"]) + test_files += [ + "ops/test_linear.py", + "ops/test_slice.py", + "ops/test_sigmoid.py", + "ops/test_tanh.py", + ] TESTS = {} diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index b995341a586..cc140cc9db5 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -12,10 +12,19 @@ script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) et_root_dir=$(cd ${script_dir}/../../.. && pwd) cd "${et_root_dir}" pwd +setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh +_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools." TEST_SUITE=$1 +# Source the tools +# This should be prepared by setup.sh +[[ -f ${setup_path_script} ]] \ + || { echo "Missing ${setup_path_script}."
${_setup_msg}"; exit 1; } + +source ${setup_path_script} + help() { echo "Usage:" echo " $0 " @@ -66,7 +75,6 @@ test_pytest() { # Test ops and other things ./examples/models/llama3_2_vision/install_requirements.sh cd "${et_root_dir}" - source examples/arm/ethos-u-scratch/setup_path.sh backends/arm/scripts/build_quantized_ops_aot_lib.sh # Run arm baremetal pytest tests without FVP @@ -78,7 +86,6 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using echo "${TEST_SUITE_NAME}: Run pytest with fvp" ./examples/models/llama3_2_vision/install_requirements.sh - source examples/arm/ethos-u-scratch/setup_path.sh # Prepare Corstone-3x0 FVP for pytest examples/arm/run.sh --model_name=add --build_only @@ -92,8 +99,6 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using test_run_ethosu_fvp() { # End to End model tests using run.sh echo "${TEST_SUITE_NAME}: Test ethos-u delegate examples with run.sh" - source examples/arm/ethos-u-scratch/setup_path.sh - # TOSA quantized echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA --model_name=add @@ -114,8 +119,6 @@ test_run_ethosu_fvp() { # End to End model tests using run.sh test_models_ethosu_fvp() { # End to End model tests using model_test.py echo "${TEST_SUITE_NAME}: Test ethos-u delegate models with test_model.py" - source examples/arm/ethos-u-scratch/setup_path.sh - # Build common libs once python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --build_libs diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 7b74603cfb2..6346a53edef 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -14,10 +14,10 @@ import executorch.backends.xnnpack.test.tester.tester as tester -import serializer.tosa_serializer as ts # type: ignore[import-untyped] - import torch.fx import torch.utils._pytree as pytree + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore[import-untyped] from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager from executorch.backends.arm.arm_backend import ( diff --git a/backends/arm/third-party/reference_model/patches/v0.80/reference_model/0001-Move-tosa-tools-to-be-namespaced-into-tosa-tools.v0_.patch b/backends/arm/third-party/reference_model/patches/v0.80/reference_model/0001-Move-tosa-tools-to-be-namespaced-into-tosa-tools.v0_.patch new file mode 100644 index 00000000000..512c105bda2 --- /dev/null +++ b/backends/arm/third-party/reference_model/patches/v0.80/reference_model/0001-Move-tosa-tools-to-be-namespaced-into-tosa-tools.v0_.patch @@ -0,0 +1,154 @@ +From 20c2059723d5c6952cecfb7fcde92601639ef825 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Per=20=C3=85strand?= +Date: Wed, 5 Feb 2025 12:31:47 +0100 +Subject: [PATCH 1/2] Move tosa-tools to be namespaced into tosa-tools.v0_80 + +--- + CMakeLists.txt | 4 ++- + pyproject.toml | 3 ++- + setup.cfg | 70 +++++++++++++++++++++++++------------------------- + setup.py | 3 ++- + 4 files changed, 42 insertions(+), 38 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 68e8d8a..34becd0 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1,4 +1,6 @@ +-cmake_minimum_required (VERSION 3.4) ++cmake_minimum_required (VERSION 3.19) ++ ++cmake_policy(SET CMP0077 NEW) + + set(CMAKE_INSTALL_PREFIX ".") + project(tosa_tools LANGUAGES CXX) +diff --git a/pyproject.toml b/pyproject.toml +index 7565f93..60448e7 100644 +--- 
a/pyproject.toml ++++ b/pyproject.toml +@@ -6,7 +6,8 @@ requires = [ + "setuptools>=42", + "wheel", + "setuptools_scm[toml]>=6.0", +- "cmake" ++ "cmake", ++ "ninja", + ] + build-backend = "setuptools.build_meta" + +diff --git a/setup.cfg b/setup.cfg +index 82ec9b8..c1bd1a8 100644 +--- a/setup.cfg ++++ b/setup.cfg +@@ -2,7 +2,7 @@ + # SPDX-License-Identifier: Apache-2.0 + + [metadata] +-name = tosa-tools ++name = tosa-tools-v0.80 + # version = done by setuptools_scm in pyproject.toml + author = Arm Limited + #author_email = +@@ -25,44 +25,44 @@ install_requires = + python_requires = >=3.6 + include_package_data = True + packages = +- runner +- generator +- checker +- frameworks +- tests +- conformance +- xunit +- json2fbbin +- json2numpy +- schemavalidation +- convert2conformance +- tosa +- serializer +- tosa_reference_model ++ tosa_tools.v0_80.verif.runner ++ tosa_tools.v0_80.verif.generator ++ tosa_tools.v0_80.verif.checker ++ tosa_tools.v0_80.verif.frameworks ++ tosa_tools.v0_80.verif.tests ++ tosa_tools.v0_80.verif.conformance ++ tosa_tools.v0_80.xunit ++ tosa_tools.v0_80.json2fbbin ++ tosa_tools.v0_80.json2numpy ++ tosa_tools.v0_80.schemavalidation ++ tosa_tools.v0_80.convert2conformance ++ tosa_tools.v0_80.tosa ++ tosa_tools.v0_80.serializer ++ tosa_tools.v0_80.tosa_reference_model + package_dir = +- = verif +- xunit = scripts/xunit +- json2fbbin = scripts/json2fbbin +- json2numpy = scripts/json2numpy +- convert2conformance = scripts/convert2conformance +- tosa = thirdparty/serialization_lib/python/tosa +- serializer = thirdparty/serialization_lib/python/serializer +- tosa_reference_model = py_package +- schemavalidation = scripts/schemavalidation ++ tosa_tools.v0_80.verif = verif ++ tosa_tools.v0_80.xunit = scripts/xunit ++ tosa_tools.v0_80.json2fbbin = scripts/json2fbbin ++ tosa_tools.v0_80.json2numpy = scripts/json2numpy ++ tosa_tools.v0_80.convert2conformance = scripts/convert2conformance ++ tosa_tools.v0_80.tosa = thirdparty/serialization_lib/python/tosa ++ tosa_tools.v0_80.serializer = thirdparty/serialization_lib/python/serializer ++ tosa_tools.v0_80.tosa_reference_model = py_package ++ tosa_tools.v0_80.schemavalidation = scripts/schemavalidation + + [options.entry_points] + console_scripts = +- tosa_verif_run_ref = runner.tosa_verif_run_tests:main +- tosa_verif_run_tests = runner.tosa_verif_run_tests:main +- tosa_verif_build_tests = generator.tosa_verif_build_tests:main +- tosa_json2numpy = json2numpy.json2numpy:main +- tosa_json2fbbin = json2fbbin.json2fbbin:main +- tosa_verif_result_check = checker.tosa_result_checker:main +- tosa_convert2conformance = convert2conformance.convert2conformance:main +- tosa_verif_framework_generator = frameworks.tosa_verif_framework_generator:main +- tosa_verif_framework_compiler_runner = frameworks.tosa_verif_framework_compiler_runner:main +- tosa_verif_conformance_generator = conformance.tosa_verif_conformance_generator:main +- tosa_schemavalidation = schemavalidation.schemavalidation:main ++ tosa_verif_run_ref = tosa_tools.v0_80.verif.runner.tosa_verif_run_tests:main ++ tosa_verif_run_tests = tosa_tools.v0_80.verif.runner.tosa_verif_run_tests:main ++ tosa_verif_build_tests = tosa_tools.v0_80.verif.generator.tosa_verif_build_tests:main ++ tosa_json2numpy = tosa_tools.v0_80.verif.json2numpy.json2numpy:main ++ tosa_json2fbbin = tosa_tools.v0_80.verif.json2fbbin.json2fbbin:main ++ tosa_verif_result_check = tosa_tools.v0_80.verif.checker.tosa_result_checker:main ++ tosa_convert2conformance = 
tosa_tools.v0_80.verif.convert2conformance.convert2conformance:main ++ tosa_verif_framework_generator = tosa_tools.v0_80.verif.frameworks.tosa_verif_framework_generator:main ++ tosa_verif_framework_compiler_runner = tosa_tools.v0_80.verif.frameworks.tosa_verif_framework_compiler_runner:main ++ tosa_verif_conformance_generator = tosa_tools.v0_80.verif.conformance.tosa_verif_conformance_generator:main ++ tosa_schemavalidation = tosa_tools.v0_80.verif.schemavalidation.schemavalidation:main + + [options.package_data] + schemavalidation= +diff --git a/setup.py b/setup.py +index 8c6b4cd..95896ad 100644 +--- a/setup.py ++++ b/setup.py +@@ -20,7 +20,7 @@ class CMakeBuild(build_py): + root_dir = Path(__file__).parent + build_dir = root_dir / "build" + build_dir.mkdir(exist_ok=True) +- package_dir = root_dir / "py_package" ++ package_dir = root_dir / "build/lib/tosa_tools/v0_80/tosa_reference_model/" + + cmake_cmd = [ + "cmake", +@@ -90,6 +90,7 @@ class CMakeBuild(build_py): + # Python will know which one to import + copied_so = False + so_dir = build_dir / "reference_model" ++ package_dir.mkdir(parents=True, exist_ok=True) + print(f"copying .so files from '{so_dir}' to '{package_dir}'") + for so_file in so_dir.glob("tosa_reference_model.*.so"): + shutil.copy(so_file, package_dir) +-- +2.39.5 (Apple Git-154) + diff --git a/backends/arm/third-party/reference_model/patches/v0.80/serialization_lib/0001-Make-TOSA-serializer-lib-to-be-self-contained.patch b/backends/arm/third-party/reference_model/patches/v0.80/serialization_lib/0001-Make-TOSA-serializer-lib-to-be-self-contained.patch new file mode 100644 index 00000000000..cc9cbc4edad --- /dev/null +++ b/backends/arm/third-party/reference_model/patches/v0.80/serialization_lib/0001-Make-TOSA-serializer-lib-to-be-self-contained.patch @@ -0,0 +1,283 @@ +From b3c8c3f779a7e051826f317598fb831fa9cfe923 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Per=20=C3=85strand?= +Date: Wed, 5 Feb 2025 12:30:09 +0100 +Subject: [PATCH] Make TOSA serializer lib to be self contained + +--- + CMakeLists.txt | 4 ++ + python/serializer/tosa_serializer.py | 57 ++++++++++++++-------------- + 2 files changed, 32 insertions(+), 29 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index ac34b75..5e191aa 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -19,6 +19,8 @@ + cmake_minimum_required(VERSION 3.13.4) + project(TosaSerialization) + ++cmake_policy(SET CMP0077 NEW) ++ + set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard to conform to") + set(CMAKE_CXX_STANDARD_REQUIRED YES) + +@@ -27,6 +29,8 @@ set(CMAKE_VERBOSE_MAKEFILE ON) + option(BUILD_TESTS "Build test applications" ON) + option(FLATBUFFERS_ROOT "Location where the flatbuffers 'include' and 'lib' folders to be found" Off) + ++message(STATUS "FLATBUFFERS_ROOT set to: ${FLATBUFFERS_ROOT}") ++ + include_directories(${PROJECT_SOURCE_DIR}/third_party/half/include) + + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +diff --git a/python/serializer/tosa_serializer.py b/python/serializer/tosa_serializer.py +index 7bc75f0..d191997 100644 +--- a/python/serializer/tosa_serializer.py ++++ b/python/serializer/tosa_serializer.py +@@ -14,12 +14,11 @@ + + import os + import struct +-import serializer.tosa_serializer as ts + import json + import flatbuffers + import numpy as np + from enum import IntEnum, unique +-from tosa import ( ++from ..tosa import ( + TosaGraph, + TosaRegion, + TosaBasicBlock, +@@ -27,8 +26,8 @@ from tosa import ( + TosaOperator, + Version, + ) +-import tosa.DType as TosaDType +-import tosa.Op 
as TosaOp ++from ..tosa import DType as TosaDType ++from ..tosa import Op as TosaOp + + # Keep version number in sync with the version default value with schema/tosa.fbs + TOSA_VERSION_MAJOR = 0 +@@ -159,7 +158,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + output_zp, + accum_dtype, + ): +- from tosa import PoolAttribute as a, Attribute ++ from ..tosa import PoolAttribute as a, Attribute + + self.utype = Attribute.Attribute().PoolAttribute + +@@ -172,7 +171,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + self.ints.append((a.AddAccumDtype, accum_dtype)) + + def ConvAttribute(self, pad, stride, dilation, input_zp, weight_zp, local_bound): +- from tosa import ConvAttribute as a, Attribute ++ from ..tosa import ConvAttribute as a, Attribute + + self.utype = Attribute.Attribute().ConvAttribute + self.optFcns = (a.Start, a.End) +@@ -187,7 +186,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + def TransposeConvAttribute( + self, outpad, stride, output_shape, input_zp, weight_zp, local_bound + ): +- from tosa import TransposeConvAttribute as a, Attribute ++ from ..tosa import TransposeConvAttribute as a, Attribute + + self.utype = Attribute.Attribute().TransposeConvAttribute + self.optFcns = (a.Start, a.End) +@@ -200,7 +199,7 @@ class TosaSerializerAttribute(TosaSerializerUnion): + self.bools.append((a.AddLocalBound, local_bound)) + + def PadAttribute(self, serializer_builder, padding, pad_const_int, pad_const_fp): +- from tosa import PadAttribute as a, Attribute ++ from ..tosa import PadAttribute as a, Attribute + + self.utype = Attribute.Attribute().PadAttribute + self.optFcns = (a.Start, a.End) +@@ -210,14 +209,14 @@ class TosaSerializerAttribute(TosaSerializerUnion): + + # pad_const_fp attribute serialized as uint8 vector + pad_const_float_as_bytes = struct.pack(" tuple[list[TosaSerializerTensor], float]: +) -> tuple[list[ts.TosaSerializerTensor], float]: """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. The scales are adjusted using the smallest scale of all 'nodes'. 
@@ -61,7 +61,7 @@ def insert_rescale_ops_to_int32( min_scale = min([qarg.scale for qarg in qargs]) scales = [qarg.scale / min_scale for qarg in qargs] - rescaled_nodes: list[TosaSerializerTensor] = [] + rescaled_nodes: list[ts.TosaSerializerTensor] = [] for tensor, qarg, scale in zip(tensors, qargs, scales): rescaled_nodes.append( build_rescale_to_int32( @@ -198,9 +198,9 @@ def compute_multiplier_and_shift( def build_rescale( - tosa_fb: TosaSerializer, + tosa_fb: ts.TosaSerializer, scale: list[float], - input_node: TosaSerializerTensor, + input_node: ts.TosaSerializerTensor, output_name: str, output_type: ts.DType, output_shape: List[int], @@ -233,14 +233,14 @@ def build_rescale( def build_rescale_to_int32( - tosa_fb: TosaSerializer, + tosa_fb: ts.TosaSerializer, input_arg: executorch.backends.arm.tosa_mapping.TosaArg, input_zp: int, rescale_scale: list[float], is_scale32: bool = True, is_double_round: bool = False, per_channel: bool = False, -) -> TosaSerializerTensor: +) -> ts.TosaSerializerTensor: multipliers, shifts = compute_multiplier_and_shift(rescale_scale) attr_rescale = ts.TosaSerializerAttribute() attr_rescale.RescaleAttribute( @@ -266,7 +266,7 @@ def build_rescale_to_int32( def build_rescale_from_int32( - tosa_fb: TosaSerializer, + tosa_fb: ts.TosaSerializer, input_name: str, output_name: str, output_zp: int, @@ -300,8 +300,8 @@ def build_rescale_from_int32( def build_rescale_conv_output( - tosa_fb: TosaSerializer, - op: TosaSerializerTensor, + tosa_fb: ts.TosaSerializer, + op: ts.TosaSerializerTensor, output_name: str, output_type: ts.DType, input_scale: list[float], diff --git a/backends/arm/tosa_specification.py b/backends/arm/tosa_specification.py index 94c307d440c..640361e059c 100644 --- a/backends/arm/tosa_specification.py +++ b/backends/arm/tosa_specification.py @@ -142,7 +142,7 @@ class Tosa_1_00(TosaSpecification): available_profiles = ["INT", "FP"] valid_extensions = { - "INT": ["int16", "int4", "var", "cf"], + "INT": ["int16", "int4", "var", "cf", "u55"], "FP": ["bf16", "fp8e4m3", "fp8e5m2", "fft", "var", "cf"], } diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index 5fa603ea683..4d0f33003bc 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -9,14 +9,15 @@ import os from typing import Any, Optional, Tuple -import serializer.tosa_serializer as ts # type: ignore import torch + +import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore from executorch.backends.arm.tosa_mapping import TosaArg from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.print_program import inspect_node -from serializer.tosa_serializer import TosaOp from torch.fx import Node +from tosa_tools.v0_80.serializer.tosa_serializer import TosaOp logger = logging.getLogger(__name__) diff --git a/backends/arm/util/arm_model_evaluator.py b/backends/arm/util/arm_model_evaluator.py index c371d794376..5fbc6f7e894 100644 --- a/backends/arm/util/arm_model_evaluator.py +++ b/backends/arm/util/arm_model_evaluator.py @@ -24,6 +24,7 @@ # Logger for outputting progress for longer running evaluation logger = logging.getLogger(__name__) +# Explicitly set logging level: MLETORCH-893 logger.setLevel(logging.INFO) diff --git a/backends/cadence/README.md b/backends/cadence/README.md index 998ac55ddf0..3cefb71d945 100644 --- a/backends/cadence/README.md +++ b/backends/cadence/README.md @@ -6,7 +6,7 @@ ## Tutorial -Please follow the [tutorial](https://pytorch.org/executorch/main/build-run-xtensa.html) for more information on how to run 
models on Cadence/Xtensa DSPs. +Please follow the [tutorial](https://pytorch.org/executorch/main/backends-cadence) for more information on how to run models on Cadence/Xtensa DSPs. ## Directory Structure diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 4d51d1fa34c..32a4427278b 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -31,11 +31,11 @@ EdgeProgramManager, ExecutorchBackendConfig, ExecutorchProgramManager, - to_edge, ) from executorch.exir.pass_base import PassResult from executorch.exir.passes import ToOutVarPass from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass +from executorch.exir.program._program import to_edge_with_preserved_ops from torch._inductor.decomposition import remove_decompositions from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -80,6 +80,7 @@ def convert_pt2( torch.ops.aten.layer_norm.default, torch.ops.aten.linear.default, torch.ops.aten.matmul.default, + torch.ops.aten.rms_norm.default, ] # Remove decompositions for the ops we want to keep # pyre-fixme[6]: For 1st argument expected `Dict[typing.Callable[..., typing.Any @@ -201,9 +202,9 @@ def lower_ep_to_edge( """ Lower an ExportedProgram to an EdgeProgramManager (in edge IR). """ - # Call to_edge to convert the graph to edge IR. + # Call to_edge_with_preserved_ops to convert the graph to edge IR. # Note: dim_order is skipped (https://github.com/pytorch/executorch/issues/3704) - edge_prog_manager = to_edge( + edge_prog_manager = to_edge_with_preserved_ops( expo_program, compile_config=EdgeCompileConfig( _skip_dim_order=True, @@ -216,9 +217,11 @@ def lower_ep_to_edge( torch.ops.aten.linalg_vector_norm.default, torch.ops.aten.unfold.default, torch.ops.aten.angle.default, + torch.ops.aten.rms_norm.default, ], ), constant_methods=constant_methods, + preserve_ops=(torch.ops.aten.rms_norm.default,), ) if dump_graphs: diff --git a/backends/cadence/aot/memory_planning.py b/backends/cadence/aot/memory_planning.py index cfe1b9ab9d8..3c6c518f16a 100644 --- a/backends/cadence/aot/memory_planning.py +++ b/backends/cadence/aot/memory_planning.py @@ -12,7 +12,7 @@ import math import typing from functools import partial -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from executorch.backends.cadence.aot.memory_constraints import ( @@ -73,11 +73,11 @@ def collect_specs_from_graph_module( # the fastest memory available # flake8: noqa 'position_based_greedy_with_hierarchy' is too complex (13) def position_based_greedy_with_hierarchy( - graph_module: torch.fx.GraphModule, alignment: int, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, graph_signature: ExportGraphSignature, - alloc_graph_input: bool, - alloc_graph_output: bool, + extra_padding: int = 0, *, memory_config: MemoryConfig, mem_constraints: MemConstraints, @@ -119,9 +119,7 @@ def memory_available(spec: TensorSpec) -> bool: # Iterate over all the specs in sorted order for spec in sorted( - collect_specs_from_graph_module( - graph_module, graph_signature, alloc_graph_input, alloc_graph_output - ), + specs, key=lambda spec: spec.allocated_memory, reverse=True, ): @@ -167,11 +165,11 @@ def memory_available(spec: TensorSpec) -> bool: # Greedy tensor placement with the heuristics from arxiv.org/pdf/2001.03288.pdf def greedy_by_size_for_offset_calculation_with_hierarchy( - graph_module: torch.fx.GraphModule, alignment: int, + specs: Set[TensorSpec], + 
graph_module: torch.fx.GraphModule, graph_signature: ExportGraphSignature, - alloc_graph_input: bool, - alloc_graph_output: bool, + extra_padding: int = 0, *, memory_config: MemoryConfig, mem_constraints: MemConstraints, @@ -199,9 +197,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( # Iterate over all the specs in sorted order for spec in sorted( - collect_specs_from_graph_module( - graph_module, graph_signature, alloc_graph_input, alloc_graph_output - ), + specs, key=lambda spec: spec.allocated_memory, reverse=True, ): diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index dec6feb1b8d..aca4965083d 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -139,7 +139,6 @@ "int in_zero_point, bool channel_last=False) -> (Tensor out)" ) lib.define("linalg_vector_norm(Tensor X) -> (Tensor Y)") -lib.define("rms_norm(Tensor X, float eps, Tensor W) -> (Tensor Y)") lib.define( "transposed_im2row(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, " "int[2] output_padding, Tensor in_zero_point, bool channel_last=False) -> (Tensor out)" @@ -211,9 +210,6 @@ "fully_connected.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define("linalg_vector_norm.out(Tensor X, *, Tensor(a!) out) -> Tensor(a!)") -lib.define( - "rms_norm.out(Tensor X, float eps, Tensor W, *, Tensor(a!) out) -> Tensor(a!)" -) lib.define( "quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, " "Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)" @@ -293,6 +289,15 @@ "attention_mask.out(Tensor input, Tensor start, Tensor stop, *, Tensor(a!) out) -> Tensor(a!)" ) +# Custom ops in aten namespace. RMSNorm is usually decomposed, so having +# an out-variant is non-standard + +lib_aten = Library("aten", "FRAGMENT") + +lib_aten.define( + "rms_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)" +) + @register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( @@ -619,15 +624,6 @@ def linalg_vector_norm_meta( return X.new_empty([], dtype=X.dtype) -@register_fake("cadence::rms_norm") -def rms_norm_meta( - X: torch.Tensor, - eps: float, - weight: torch.Tensor, -) -> torch.Tensor: - return X.new_empty(X.shape, dtype=X.dtype) - - @register_fake("cadence::requantize") def requantize_meta( input: torch.Tensor, diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py index 6b34021a20a..ca5ed017046 100644 --- a/backends/cadence/aot/pass_utils.py +++ b/backends/cadence/aot/pass_utils.py @@ -35,8 +35,8 @@ class CadencePassAttribute: ALL_CADENCE_PASSES: dict[ExportPass, CadencePassAttribute] = {} -def get_cadence_pass_attribute(p: ExportPass) -> CadencePassAttribute: - return ALL_CADENCE_PASSES[p] +def get_cadence_pass_attribute(p: ExportPass) -> Optional[CadencePassAttribute]: + return ALL_CADENCE_PASSES.get(p, None) # A decorator that registers a pass. 
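For reference, the rms_norm op preserved in compiler.py above and re-registered here as an aten-namespace out-variant normalizes by the root mean square over the normalized dimensions. A small, hedged sanity check (illustrative only, assuming a PyTorch version that ships torch.nn.functional.rms_norm):

```python
import torch

# RMSNorm: x * rsqrt(mean(x^2) + eps) * weight, computed over normalized_shape.
x = torch.randn(2, 8)
weight = torch.ones(8)
eps = 1e-6

manual = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) * weight
reference = torch.nn.functional.rms_norm(x, [8], weight, eps)
assert torch.allclose(manual, reference, atol=1e-6)
```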
@@ -61,7 +61,8 @@ def create_cadence_pass_filter( def _filter(p: ExportPass) -> bool: pass_attribute = get_cadence_pass_attribute(p) return ( - pass_attribute.opt_level is not None + pass_attribute is not None + and pass_attribute.opt_level is not None and pass_attribute.opt_level <= opt_level and (not pass_attribute.debug_pass or debug) ) diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index 62727985452..761b2bf8d31 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -43,7 +43,7 @@ from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer -act_qspec_asym8u = QuantizationSpec( +act_qspec_asym8s = QuantizationSpec( dtype=torch.int8, quant_min=-128, quant_max=127, @@ -52,7 +52,7 @@ observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), ) -wgt_qspec_asym8u = QuantizationSpec( +wgt_qspec_asym8s = QuantizationSpec( dtype=torch.int8, quant_min=-128, quant_max=127, @@ -61,7 +61,7 @@ observer_or_fake_quant_ctr=MinMaxObserver, ) -wgt_qspec_asym8s = QuantizationSpec( +wgt_qspec_sym8s = QuantizationSpec( dtype=torch.int8, quant_min=-128, quant_max=127, @@ -72,17 +72,17 @@ bias_qspec: Optional[QuantizationSpec] = None -qconfig_A8uW8u = QuantizationConfig( - act_qspec_asym8u, - act_qspec_asym8u, - wgt_qspec_asym8u, +qconfig_A8W8 = QuantizationConfig( + act_qspec_asym8s, + act_qspec_asym8s, + wgt_qspec_asym8s, None, ) -qconfig_A8uW8s = QuantizationConfig( - act_qspec_asym8u, - act_qspec_asym8u, - wgt_qspec_asym8s, +qconfig_A8W8sym = QuantizationConfig( + act_qspec_asym8s, + act_qspec_asym8s, + wgt_qspec_sym8s, None, ) @@ -189,15 +189,14 @@ def get_supported_operators(cls) -> List[OperatorConfig]: def get_cadence_default_quantizers() -> List[Quantizer]: return [ - CadenceAtenQuantizer(AddmmPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(BmmPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8uW8s), - CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s), - CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(MatmulPattern(), qconfig_A8uW8u), - CadenceAtenQuantizer(ReluPattern0(), qconfig_A8uW8u), - CadenceAtenQuantizer(ReluPattern1(), qconfig_A8uW8u), + CadenceAtenQuantizer(AddmmPattern(), qconfig_A8W8), + CadenceAtenQuantizer(BmmPattern(), qconfig_A8W8), + CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8W8sym), + CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8W8sym), + CadenceAtenQuantizer(LinearPattern(), qconfig_A8W8), + CadenceAtenQuantizer(MatmulPattern(), qconfig_A8W8), + CadenceAtenQuantizer(ReluPattern0(), qconfig_A8W8), + CadenceAtenQuantizer(ReluPattern1(), qconfig_A8W8), ] @@ -236,14 +235,26 @@ def __init__( super().__init__([]) +class CadenceWithLayerNormQuantizer(CadenceQuantizer): + """ + Quantizer including layer norm + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = get_cadence_default_quantizers() + quantizers.append(CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8W8)) + super().__init__(quantizers) + + class CadenceWakeWordQuantizer(CadenceQuantizer): """ - Quantizer for WakeWord, including add + Quantizer for WakeWord, including add and cat """ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: if quantizers is None: quantizers = get_cadence_default_quantizers() - quantizers.append(CadenceAtenQuantizer(AddPattern(), 
qconfig_A8uW8u)) - quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8uW8u)) + quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8)) + quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8)) super().__init__(quantizers) diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index 5a4922ae069..867e4ec79c6 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -1806,30 +1806,6 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, tuple(new_args), kwargs, meta) -@register_cadence_pass(CadencePassAttribute(opt_level=0)) -class ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass(ExportPass): - """ - Replace the aten.linalg_vector_norm op with a custom op. - aten.linalg_vector_norm is not supported by Jarvis, so we - need to replace it with native_batch_norm at all optimization levels. - """ - - def call_operator(self, op, args, kwargs, meta): - if op != exir_ops.edge.aten.linalg_vector_norm.default: - return super().call_operator(op, args, kwargs, meta) - - assert ( - len(args) == 1 - ), "aten.linalg_vector_norm should have 1 argument (a tensor), we do not support any custom variants" - - return super().call_operator( - exir_ops.edge.cadence.linalg_vector_norm.default, - args, - kwargs, - meta, - ) - - @register_cadence_pass(CadencePassAttribute(opt_level=1)) class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): """ @@ -2110,6 +2086,102 @@ def call_operator( return super().call_operator(op, args, kwargs, meta) +@register_cadence_pass(CadencePassAttribute(opt_level=2)) +class ReplaceGeluWithApproximateGeluPass(ExportPass): + """ + Replace the gelu op with an approximate gelu op. The approximate gelu op + is more efficient on DSP backends. + """ + + def call_operator( + self, + op, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op not in { + exir_ops.edge.aten.gelu.default, + }: + return super().call_operator(op, args, kwargs, meta) + + # compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi)) + # as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))) + + # Get 0.5 * x + half = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (args[0], 0.5), + {}, + meta, + ) + + scaled = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (args[0], 0.044715), + {}, + meta, + ) + + # Get x^2 (note that we use mul.Tensor twice instead of pow.Tensor because + # it is much more efficient on DSP backends) + scaled_square = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (scaled, args[0]), + {}, + meta, + ) + + # Get x^3 + scaled_cubed = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (scaled_square, args[0]), + {}, + meta, + ) + + # Get x + 0.044715 * x^3 + inner_sum = super().call_operator( + exir_ops.edge.aten.add.Tensor, + (scaled_cubed, args[0]), + {}, + meta, + ) + + # Get 0.7978845608028654 * ( x + 0.044715 * x^3) + scaled_sum = super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (inner_sum, 0.7978845608028654), + {}, + meta, + ) + + # Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)) + tanh = super().call_operator( + exir_ops.edge.aten.tanh.default, + (scaled_sum,), + {}, + meta, + ) + + # Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3)) + # TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.) 
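```python
# Illustrative aside, not part of the pass being added here: a quick numerical
# check that the tanh-based formula assembled above tracks PyTorch's exact GELU.
# torch.nn.functional.gelu(x, approximate="tanh") implements the same formula.
import torch

x = torch.linspace(-4.0, 4.0, steps=101)
approx = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * x**3)))
exact = torch.nn.functional.gelu(x)
assert (approx - exact).abs().max() < 1e-2  # the approximation error stays small
```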
+ outer_sum = super().call_operator( + exir_ops.edge.aten.add.Tensor, + (tanh, 1.0), + {}, + meta, + ) + + # Retunr the final result + return super().call_operator( + exir_ops.edge.aten.mul.Tensor, + (half, outer_sum), + {}, + meta, + ) + + # This class encapsulates all the functions that replace/switch one op in the # graph with another. class CadenceReplaceOpsInGraph: @@ -2147,6 +2219,6 @@ class CadenceReplaceOpsInGraph: ReplacePT2DequantWithCadenceDequantPass, ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass, ReplaceAtenAvgPoolWithJarvisAvgPoolPass, - ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass, ReplaceWhereWithFullArgsWithWhereScalar, + # ReplaceGeluWithApproximateGeluPass, ] diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index e40c26c0f4e..886550772b5 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -23,12 +23,12 @@ MakeSliceAndCatDimOutermostPass, ReplaceAddMMWithLinearPass, ReplaceAtenConvolutionWithJarvisConvolutionPass, - ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass, ReplaceConstantPadNdWithSlicePass, ReplaceConvolutionOptionalArgsWithConcreteArgsPass, ReplaceConvWithIm2RowAndLinear, ReplaceEmptyTensorsWithFullPass, ReplaceFunctionallyEquivalentOpTargets, + ReplaceGeluWithApproximateGeluPass, ReplaceIm2RowWithViewPass, ReplaceLinearWithFullyConnectedOpPass, ReplaceMMWithAddMMPass, @@ -1188,36 +1188,6 @@ def forward(self, x): count_node(graph_after_passes, exir_ops.edge.aten.transpose_copy.int), 0 ) - def test_replace_aten_linalg_vector_norm_with_cadence_linalg_vector_norm(self): - class LinalgVectorNorm(torch.nn.Module): - def forward(self, x: torch.Tensor): - return torch.linalg.vector_norm(x) - - x = torch.randn(32) - - graph_module = ( - export_to_edge(LinalgVectorNorm(), (x,)).exported_program().graph_module - ) - - p = ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass() - graph_after_passes = cast(PassResult, p(graph_module)).graph_module - - # Assert that aten.linalg_vector_norm op was replaced by a - # cadence.linalg_vector_norm op - self.assertEqual( - count_node( - graph_after_passes, - exir_ops.edge.aten.linalg_vector_norm.default, - ), - 0, - ) - self.assertEqual( - count_node( - graph_after_passes, exir_ops.edge.cadence.linalg_vector_norm.default - ), - 1, - ) - def test_replace_aten_where_with_cadence_where_Scalar(self): class WhereScalarModel(torch.nn.Module): def forward(self, cond: torch.Tensor): @@ -1301,6 +1271,41 @@ def forward(self, cond: torch.Tensor): 1, ) + def test_replace_aten_gelu_with_approximate_gelu(self): + class Gelu(torch.nn.Module): + def forward(self, input): + return torch.nn.functional.gelu(input) + + inputs = torch.randn(2, 1, 64) + + graph_module = export_to_edge(Gelu(), (inputs,)).exported_program().graph_module + + p = ReplaceGeluWithApproximateGeluPass() + graph_after_passes = cast(PassResult, p(graph_module)).graph_module + + # Assert that aten.gelu op was decomposed + self.assertEqual( + count_node( + graph_after_passes, + exir_ops.edge.aten.gelu.default, + ), + 0, + ) + + # The decomposition should have one tanh, 2 add and 6 mul + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.tanh.default), + 1, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.add.Tensor), + 2, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.mul.Tensor), + 6, + ) + class 
TestReplaceIm2rowWithViewPass(unittest.TestCase): def test_no_replacement_for_conv(self): diff --git a/backends/cadence/fusion_g3/operators/targets.bzl b/backends/cadence/fusion_g3/operators/targets.bzl index fffeee0d7b3..b878226fcb1 100644 --- a/backends/cadence/fusion_g3/operators/targets.bzl +++ b/backends/cadence/fusion_g3/operators/targets.bzl @@ -40,6 +40,7 @@ OPERATORS = [ "rsqrt", "sigmoid", "sqrt", + "hardtanh", "tanh", "transpose_copy", "where", diff --git a/backends/cadence/runtime/et_pal.cpp b/backends/cadence/runtime/et_pal.cpp index fdf058f05b3..7973e3acc5b 100644 --- a/backends/cadence/runtime/et_pal.cpp +++ b/backends/cadence/runtime/et_pal.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#if defined(XTENSA) +#if defined(__XTENSA__) #include #include diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py index 52b64dc1581..f38c1cc4154 100644 --- a/backends/cadence/utils/facto_util.py +++ b/backends/cadence/utils/facto_util.py @@ -99,6 +99,7 @@ def apply_scalar_contraints(op_name: str) -> list[ScalarDtype]: match op_name: case "add.Scalar" | "sub.Scalar" | "mul.Scalar" | "div.Scalar": return [ScalarDtype.int] + case _: return [ScalarDtype.float, ScalarDtype.int] @@ -122,6 +123,11 @@ def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, s cp.Size.Le(lambda deps, r, d: 2**2), ] ) + if in_spec.name == "max_val": # hardtanh + spec.inspec[index].deps = [0, 1] + spec.inspec[index].constraints.extend( + [cp.Value.Ge(lambda deps, _: deps[1])] + ) else: spec.inspec[index].constraints.extend( [ diff --git a/backends/example/README.md b/backends/example/README.md index e1780722904..2e5ddd1f7bb 100644 --- a/backends/example/README.md +++ b/backends/example/README.md @@ -17,16 +17,16 @@ In the following diagram, we show how to quantize a mobile net v2 model and lowe We can define patterns based on the operators supported by the backend, which will be used by the quantizer and delegate. -![](./diagrams/quantize_delegate.png) +![](diagrams/quantize_delegate.png) ### Partitioner and Backend The way partitioner and backend is, partitioner will tag the nodes to lower to the backend and backend will will receive all tagged nodes and preprocess them as a delegate. -![](./diagrams/delegate.png) +![](diagrams/delegate.png) ### Memory format permute Some operators may have better performance in the memory format other than contiguous. One way to do that is to insert `to_dim_op` to describe memory format permutation and merge if there two opposite one next to each other. -![](./diagrams/memory_permute.png) +![](diagrams/memory_permute.png) diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md index ec4c392eb46..0a756a7bf1a 100644 --- a/backends/mediatek/README.md +++ b/backends/mediatek/README.md @@ -43,7 +43,7 @@ Download [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/publ Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) guide for detailed instructions on setting up the ExecuTorch environment. +1. **Setup ExecuTorch Environment**: Refer to the [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup) guide for detailed instructions on setting up the ExecuTorch environment. 2. **Setup MediaTek Backend Environment** - Install the dependent libs. 
Ensure that you are inside backends/mediatek/ directory diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py new file mode 100644 index 00000000000..eff7f513cb9 --- /dev/null +++ b/backends/nxp/quantizer/neutron_quantizer.py @@ -0,0 +1,205 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Tuple, Union + +import torch + +from executorch.backends.nxp.quantizer.patterns import ( + AddmmPattern, + AvgPoolPattern, + Conv1dPattern, + Conv2dPattern, + LinearPattern, + MaxPoolPattern, + PadPattern, + PermutePattern, + QuantizationPattern, + ReluInPlacePattern, + ReluPattern, + ReshapePattern, + SoftMaxPattern, +) +from executorch.backends.nxp.quantizer.utils import ( + find_sequential_partitions_aten, + is_annotated, + no_outside_users, +) +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import ( + OperatorConfig, + QuantizationAnnotation, + QuantizationConfig, + QuantizationSpec, +) +from torch import fx +from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver +from torch.ao.quantization.quantizer import DerivedQuantizationSpec, Quantizer +from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer + + +class NeutronAtenQuantizer(Quantizer): + def __init__( + self, pattern: QuantizationPattern, quantization_config: QuantizationConfig + ) -> None: + super().__init__() + self.pattern = pattern + self.quantization_config = quantization_config + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + fused_partitions = find_sequential_partitions_aten( + model, + self.pattern.partition_types(), + ) + + input_act_qspec = self.quantization_config.input_activation + weight_qspec = self.quantization_config.weight + bias_qspec = self.quantization_config.bias + output_act_qspec = self.quantization_config.output_activation + + for fused_partition in fused_partitions: + if not no_outside_users(fused_partition): + continue + + anchors = self.pattern.get_anchors(model, fused_partition) + if not anchors or anchors.empty: + continue + if is_annotated( + [ + x[0] + for x in anchors.inputs + + anchors.weights + + anchors.biases + + anchors.output + ] + ): + continue + + for output, *custom_spec in anchors.output: + # pyre-ignore[16]: no attribute + output.meta["quantization_annotation"] = QuantizationAnnotation( + # pyre-ignore[6]: incompatible parameter type + output_qspec=(custom_spec[0] if custom_spec else output_act_qspec), + _annotated=True, + ) + + def annotate_inputs( + inputs: Union[ + List[Tuple[fx.Node, int]], + List[Tuple[fx.Node, int, DerivedQuantizationSpec],], + ], + spec: Optional[QuantizationSpec], + ) -> None: + for node, idx, *custom_spec in inputs: + # pyre-ignore[16]: no attribute + annotation = node.meta.get( + "quantization_annotation", + QuantizationAnnotation(_annotated=True), + ) + arg = ( + # pyre-ignore[16]: no attribute + node.args[idx] + if isinstance(idx, int) + # pyre-ignore[16]: no attribute + else node.args[idx[0]][idx[1]] + ) + annotation.input_qspec_map[arg] = ( + custom_spec[0] if custom_spec else spec + ) + # pyre-ignore[16]: no attribute + node.meta["quantization_annotation"] = annotation + + def annotate_weights_or_biases( + weights_or_biases: List[Tuple[fx.Node, int]], + spec: Optional[QuantizationSpec], + ) -> None: + for node, idx, 
*custom_spec in weights_or_biases: + annotation = node.meta.get( + "quantization_annotation", + QuantizationAnnotation(_annotated=True), + ) + annotation.input_qspec_map[node.args[idx]] = ( + custom_spec[0] if custom_spec else spec + ) + node.meta["quantization_annotation"] = annotation + + # pyre-ignore[6]: incompatible parameter type + annotate_inputs(anchors.inputs, input_act_qspec) + annotate_weights_or_biases(anchors.weights, weight_qspec) + # pyre-ignore[6]: incompatible parameter type + annotate_weights_or_biases(anchors.biases, bias_qspec) + return model + + def validate(self, model: fx.GraphModule) -> None: + pass + + @classmethod + def get_supported_operators(cls) -> List[OperatorConfig]: + return [] + + +# Quantization Specification used by Neutron NPU +act_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), +) + +wgt_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-127, + quant_max=127, + qscheme=torch.per_tensor_symmetric, + is_dynamic=False, + observer_or_fake_quant_ctr=MinMaxObserver, + ch_axis=0, +) + +wgt_fc_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-127, + quant_max=127, + qscheme=torch.per_tensor_symmetric, + is_dynamic=False, + observer_or_fake_quant_ctr=MinMaxObserver, +) + +# Is set by the *PatternQuantizer directly. +bias_qspec = None + + +class NeutronQuantizer(ComposableQuantizer): + def __init__(self): + static_qconfig = QuantizationConfig( + act_qspec, + act_qspec, + wgt_qspec, + None, + ) + static_fc_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_fc_qspec, None) + super().__init__( + [ + NeutronAtenQuantizer(AddmmPattern(), static_fc_qconfig), + NeutronAtenQuantizer(Conv1dPattern(), static_qconfig), + NeutronAtenQuantizer(Conv2dPattern(), static_qconfig), + NeutronAtenQuantizer(LinearPattern(), static_fc_qconfig), + NeutronAtenQuantizer(MaxPoolPattern(), static_qconfig), + NeutronAtenQuantizer(SoftMaxPattern(), static_qconfig), + NeutronAtenQuantizer(ReshapePattern(), static_qconfig), + NeutronAtenQuantizer(PermutePattern(), static_qconfig), + NeutronAtenQuantizer(PadPattern(), static_qconfig), + NeutronAtenQuantizer(ReluPattern(), static_qconfig), + NeutronAtenQuantizer(ReluInPlacePattern(), static_qconfig), + NeutronAtenQuantizer(AvgPoolPattern(), static_qconfig), + ] + ) + + def transform_for_annotation( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + return model diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py new file mode 100644 index 00000000000..6797447c50c --- /dev/null +++ b/backends/nxp/quantizer/patterns.py @@ -0,0 +1,342 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2025 NXP +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
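The NeutronQuantizer defined above is a plain ComposableQuantizer over per-pattern NeutronAtenQuantizer instances, so narrower variants can be assembled the same way. A hedged sketch (not part of this patch; the single-pattern composition below is only an illustration of the structure):

```python
from executorch.backends.nxp.quantizer.neutron_quantizer import (
    NeutronAtenQuantizer,
    act_qspec,
    wgt_fc_qspec,
)
from executorch.backends.nxp.quantizer.patterns import LinearPattern
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
    QuantizationConfig,
)
from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer

# Annotate only aten.linear patterns, reusing the activation and fully-connected
# weight specs defined alongside NeutronQuantizer.
linear_only_quantizer = ComposableQuantizer(
    [
        NeutronAtenQuantizer(
            LinearPattern(),
            QuantizationConfig(act_qspec, act_qspec, wgt_fc_qspec, None),
        )
    ]
)
```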
+ +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import List, Optional, Tuple, Type, Union + +import torch + +from executorch.backends.nxp.quantizer.utils import get_bias_qparams +from torch import fx +from torch._ops import OpOverload +from torch.ao.quantization.quantizer import ( + DerivedQuantizationSpec, + FixedQParamsQuantizationSpec, + SharedQuantizationSpec, +) + + +@dataclass +class PartitionAnchors: + """ + All fields except output are lists of (node, args_index) pair, where node is from + the given partition and node.args[args_index] is an input to the partition. Assumes + a single output. + + Quantizer uses inputs, weights and biases for quantization annotation. The others + field contains tensor inputs that aren't quantized, and the literals fields contains + is used for other types of input values as well as handling default parameters. + """ + + # Inputs can share quantization parameters + inputs: List[ + Union[ + Tuple[fx.Node, Union[int, Tuple[int, int]]], + Tuple[ + fx.Node, + Union[int, Tuple[int, int]], + SharedQuantizationSpec, + ], + ] + ] = field(default_factory=list) + weights: List[Tuple[fx.Node, int]] = field(default_factory=list) + biases: List[ + Union[Tuple[fx.Node, int], Tuple[fx.Node, int, DerivedQuantizationSpec]] + ] = field(default_factory=list) + others: List[Tuple[fx.Node, int]] = field(default_factory=list) + literals: List[Tuple[fx.Node, int]] = field(default_factory=list) + output: List[Union[Tuple[fx.Node], Tuple[fx.Node, SharedQuantizationSpec]]] = field( + default_factory=list + ) + empty: bool = False + + +class QuantizationPattern(ABC): + @abstractmethod + def partition_types(self) -> list[OpOverload]: + """ + List of types to be passed to find_sequential_partitions_aten. + """ + pass + + @abstractmethod + def get_anchors( + self, gm: torch.fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Optional[PartitionAnchors]: + pass + + +class SharedSpecPattern(QuantizationPattern): + """ + Quantization pattern for shared quantization. + + The quantization is derived from the previous node quantization and the input and output shares the same + quantization parameters (scale and zero-point). + """ + + def partition_types(self) -> List[Type[torch.nn.Module]]: + pass + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors | None: + node = fused_partition[0].nodes[-1] + assert len(fused_partition[0].input_nodes) == 1 + prev_node = fused_partition[0].input_nodes[0] + + # Previous node was not quantized => we are not able to share q-params + if "quantization_annotation" not in prev_node.meta: + return None + + qspec = SharedQuantizationSpec(prev_node) + + return PartitionAnchors( + inputs=[(node, 0)], + weights=[], + biases=[], + output=[ + (node, qspec), + ], + ) + + +class AddmmPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.addmm.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
+ addmm_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (addmm_node.args[1], addmm_node), + (addmm_node.args[2], addmm_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + return PartitionAnchors( + inputs=[(addmm_node, 1)], + weights=[(addmm_node, 2)], + biases=[(addmm_node, 0, bias_qspec)], + output=[(addmm_node,)], + ) + + +class AvgPoolPattern(SharedSpecPattern): + """ + Quantizer for AvgPool2D operator. + """ + + def partition_types(self): + return [torch.ops.aten.avg_pool2d.default] + + +class Conv1dPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + conv1d_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv1d_node.args[0], conv1d_node), + (conv1d_node.args[1], conv1d_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None: + bias = [(conv1d_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(conv1d_node, 0)], + weights=[(conv1d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv1d_node,)], + ) + + +class Conv2dPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv2d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + conv2d_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv2d_node.args[0], conv2d_node), + (conv2d_node.args[1], conv2d_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None: + bias = [(conv2d_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(conv2d_node, 0)], + weights=[(conv2d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv2d_node,)], + ) + + +class LinearPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.linear.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
+ linear_node = fused_partition[0].nodes[-1] + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (linear_node.args[0], linear_node), + (linear_node.args[1], linear_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(linear_node.args) > 2: + bias = [(linear_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(linear_node, 0)], + weights=[(linear_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(linear_node,)], + ) + + +class MaxPoolPattern(SharedSpecPattern): + """ + Quantizer for MaxPool2D operator. + """ + + def partition_types(self): + return [torch.ops.aten.max_pool2d.default] + + +class PadPattern(SharedSpecPattern): + """ + Quantizer for Pad operator. + """ + + def partition_types(self): + return [torch.ops.aten.pad.default] + + +class PermutePattern(SharedSpecPattern): + """ + Quantizer for Permute operator. + """ + + def partition_types(self): + return [torch.ops.aten.permute.default] + + +class ReluPattern(SharedSpecPattern): + """ + Quantizer for Relu operator. Shared quantization spec is selected, as ReLU usually follows computation layer. + """ + + def partition_types(self): + return [torch.ops.aten.relu.default] + + +class ReluInPlacePattern(SharedSpecPattern): + """ + Quantizer for Relu operator with param inplace=True. Shared quantization spec is selected, as ReLU usually + follows computation layer. + """ + + def partition_types(self): + return [torch.ops.aten.relu_.default] + + +class ReshapePattern(SharedSpecPattern): + """ + Quantizer for Reshape operator. + """ + + def partition_types(self): + return [torch.ops.aten.reshape.default] + + +class SoftMaxPattern(QuantizationPattern): + """ + Quantizer for Softmax operator. + + The quantization of Softmax output is fixed to scale 1/256, zero point -128, dtype int8. + """ + + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.softmax.int] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + node = fused_partition[0].nodes[-1] + assert len(fused_partition[0].input_nodes) == 1 + + qspec = FixedQParamsQuantizationSpec( + dtype=torch.int8, + scale=1.0 / 256.0, + zero_point=-128, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + ) + + return PartitionAnchors( + inputs=[(node, 0)], + weights=[], + biases=[], + output=[ + (node, qspec), + ], + ) diff --git a/backends/nxp/quantizer/utils.py b/backends/nxp/quantizer/utils.py new file mode 100644 index 00000000000..1effcdff25a --- /dev/null +++ b/backends/nxp/quantizer/utils.py @@ -0,0 +1,151 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-2025 NXP +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
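Not part of the patch, but two numeric conventions used by the patterns above are easy to sanity-check; the concrete values below are invented for illustration:

```python
import torch

# 1) Bias quantization: the conv/linear patterns derive int32 bias qparams from
#    the activation and weight observers via get_bias_qparams (added in the
#    utils module below): bias_scale = act_scale * weight_scale, zero_point = 0.
act_scale, weight_scale = 0.02, 0.005   # invented example values
bias_scale = act_scale * weight_scale   # 1e-4

# 2) Softmax output: SoftMaxPattern pins scale = 1/256 and zero_point = -128,
#    so int8 codes [-128, 127] dequantize to [0, 255/256], covering softmax's
#    output range [0, 1).
codes = torch.tensor([-128, 0, 127])
print((codes + 128) / 256.0)            # approximately [0.0, 0.5, 0.996]
```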
+ +# pyre-unsafe + +import itertools +from collections import OrderedDict +from typing import Any, Dict, List, Tuple, Type + +import torch +from torch import fx +from torch._ops import OpOverload +from torch.ao.quantization import ObserverOrFakeQuantize +from torch.fx.passes.utils.source_matcher_utils import ( + check_subgraphs_connected, + SourcePartition, +) + + +def is_annotated(nodes: List[fx.Node]) -> bool: + annotated = False + for node in nodes: + annotated = annotated or ( + "quantization_annotation" in node.meta + and node.meta["quantization_annotation"]._annotated + ) + return annotated + + +def no_outside_users(fused_partition) -> bool: + """ + Checks if each partition other than the last does not have any outside users. + """ + for source_partition in fused_partition[:-1]: + if len(source_partition.output_nodes) != 1: + return False + if len(source_partition.output_nodes[0].users) != 1: + return False + return True + + +def get_bias_qparams( + obs_or_fqs: List[ObserverOrFakeQuantize], +) -> Tuple[torch.Tensor, torch.Tensor]: + act_scale, _ = obs_or_fqs[0].calculate_qparams() + weight_scale, _ = obs_or_fqs[1].calculate_qparams() + bias_scale = act_scale * weight_scale + bias_zero_point = torch.zeros_like(bias_scale, dtype=torch.int32) + return bias_scale, bias_zero_point + + +def get_aten_node_target_partitions( + graph: torch.fx.Graph, + wanted_original_aten_op: List[OpOverload], +): + """ + Args: + graph: The graph we want to partition + wanted_original_aten_op: List of original_aten ops (OpOverload) + + Returns: + Dictionary mapping aten ops that were given to a list of SourcePartitions + that correspond to the list of nodes that were decomposed from the given + aten ops. + """ + modules: Dict[Type, Dict[str, List[torch.fx.Node]]] = {} + + for node in graph.nodes: + # The metadata source_fn should contain a tuple of a unique name for the + # source, and the source function if the node is decomposed from a + # function, or the type of module if the node is decomposed from a leaf + # module + # TODO(matthiascremon): look into ways to avoid using source_fn_stack + if (source_fn_st := node.meta.get("source_fn_stack")) is None: + continue + + source_fn = source_fn_st[-1] + if node.target not in wanted_original_aten_op: + continue + + diff_modules = modules.setdefault(source_fn[1], {}) + partition = diff_modules.setdefault(node.name, []) + partition.append(node) + + def make_partition( + nodes: List[torch.fx.Node], module_type: Type + ) -> SourcePartition: + input_nodes = set() + output_nodes = set() + params = set() + for node in nodes: + for arg in node.args: + if isinstance(arg, torch.fx.Node) and arg not in nodes: + input_nodes.add(arg) + + if node.op == "get_attr": + params.add(node) + + for user in node.users.keys(): + if user not in nodes: + output_nodes.add(node) + + return SourcePartition( + nodes, + module_type, + list(input_nodes), + list(output_nodes), + list(params), # type: ignore[arg-type] + ) + + ret: Dict[Type[Any], List[SourcePartition]] = {} + + for k, v in modules.items(): + ret[k] = [make_partition(partition, k) for partition in v.values()] + + return ret + + +def _partitions_sequential(partitions: Tuple[SourcePartition]) -> bool: + prev_partition = None + for partition in partitions: + if prev_partition is not None and not check_subgraphs_connected( + prev_partition, partition + ): + return False + prev_partition = partition + return True + + +def find_sequential_partitions_aten( + gm: torch.fx.GraphModule, + partition_types: List[Any], +): + typed_partitions: 
OrderedDict[Any, List[SourcePartition]] = OrderedDict() + for partition_type in partition_types: + partitions = get_aten_node_target_partitions(gm.graph, [partition_type]) + typed_partitions[partition_type] = list( + itertools.chain.from_iterable(partitions.values()) + ) + + typed_partitions_list = list(typed_partitions.values()) + fusion_candidates = itertools.product(*typed_partitions_list) + fused_partitions = [] + for candidate in fusion_candidates: + if _partitions_sequential(candidate): + fused_partitions.append(candidate) + return fused_partitions diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py new file mode 100644 index 00000000000..741e64a28a1 --- /dev/null +++ b/backends/nxp/tests/models.py @@ -0,0 +1,238 @@ +# Copyright 2024 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Collection, Union + +import torch + + +class Conv2dModule(torch.nn.Module): + def __init__( + self, + bias: bool = True, + dilation: Union[int, tuple[int, int]] = 1, + in_channels: int = 4, + kernel_size: Union[int, tuple[int, int]] = 3, + out_channels: int = 8, + padding: Union[str, int, Collection[int]] = 0, + stride: Union[int, tuple[int, int]] = 2, + ): + super().__init__() + + self.conv = torch.nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv(x) + + +class Conv2dAndMaxPool2DModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.conv = torch.nn.Conv2d( + in_channels=8, out_channels=32, kernel_size=5, bias=True + ) + self.maxpool = torch.nn.MaxPool2d(kernel_size=2, stride=2) + + def forward(self, x): + x = self.conv(x) + return self.maxpool(x) + + +class Conv2dConstantPadNDModule(torch.nn.Module): + def __init__(self, paddings: Collection[int], constant: float | int | None = None): + super().__init__() + self.pad = ConstantPadNDModule(paddings, constant) + self.conv = Conv2dModule() + + def forward(self, x): + x = self.conv(x) + return self.pad(x) + + +class SoftmaxModule(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + + self.softmax = torch.nn.Softmax(dim=dim) + + def forward(self, x): + return self.softmax(x) + + +class SoftmaxConvModule(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + + self.conv = Conv2dModule() + self.softmax = SoftmaxModule(dim=dim) + + def forward(self, x): + x = self.conv(x) + return self.softmax(x) + + +class LinearModule(torch.nn.Module): + def __init__(self, bias: bool): + super().__init__() + self.linear = torch.nn.Linear(32, 16, bias=bias) + + def forward(self, x): + return self.linear(x) + + +class LinearSoftmaxModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.linear = torch.nn.Linear(12, 10) + self.softmax = torch.nn.Softmax(1) + + def forward(self, x): + x = self.linear(x) + x = self.softmax(x) + + return x + + +class ConvFCSoftmaxModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.conv = torch.nn.Conv2d(4, 64, 2, bias=False) + self.fc = torch.nn.Linear(1024, 10) + self.softmax = torch.nn.Softmax(1) + + def forward(self, x): + x = self.conv(x) + x = torch.reshape(x, (-1, 1024)) + x = self.fc(x) + x = self.softmax(x) + + return x + + +class ConstantPadNDModule(torch.nn.Module): + def __init__(self, paddings: Collection[int], constant: float | int | None = 
None): + super().__init__() + self.paddings = paddings + self.constant = constant + + def forward(self, x): + if self.constant is None: + return torch.nn.functional.pad(x, tuple(self.paddings), "constant") + else: + return torch.nn.functional.pad( + x, tuple(self.paddings), "constant", self.constant + ) + + +class ConstantPadNDConvModule(torch.nn.Module): + def __init__(self, paddings: Collection[int], constant: float | int | None = None): + super().__init__() + self.pad = ConstantPadNDModule(paddings, constant) + self.conv = Conv2dModule() + + def forward(self, x): + x = self.pad(x) + return self.conv(x) + + +class MaxPool2dModule(torch.nn.Module): + def __init__(self, padding=0): + super().__init__() + + self.max_pool2d = torch.nn.MaxPool2d( + kernel_size=3, stride=2, padding=padding, dilation=1 + ) + + def forward(self, x): + return self.max_pool2d(x) + + +class MaxPool2dConvModule(torch.nn.Module): + def __init__(self, padding=0): + super().__init__() + + self.conv = Conv2dModule() + self.max_pool2d = torch.nn.MaxPool2d( + kernel_size=3, stride=2, padding=padding, dilation=1 + ) + + def forward(self, x): + x = self.conv(x) + return self.max_pool2d(x) + + +class AvgPool2dModule(torch.nn.Module): + def __init__(self, count_include_pad, padding=0): + super().__init__() + + self.avg_pool = torch.nn.AvgPool2d( + kernel_size=3, + stride=2, + padding=padding, + count_include_pad=count_include_pad, + ) + + def forward(self, x): + return self.avg_pool(x) + + +class AvgPool2dConvModule(torch.nn.Module): + def __init__(self, count_include_pad, padding=0): + super().__init__() + + self.conv = Conv2dModule() + self.avg_pool = torch.nn.AvgPool2d( + kernel_size=3, + stride=1, + padding=padding, + count_include_pad=count_include_pad, + ) + + def forward(self, x): + x = self.conv(x) + return self.avg_pool(x) + + +class ReLUModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + +class Conv2dReLUModule(torch.nn.Module): + def __init__(self): + super().__init__() + + self.conv = torch.nn.Conv2d(4, 64, 2, bias=False) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.conv(x) + return self.relu(x) + + +class Conv2dPermuteModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(4, 64, 2, bias=False) + + def forward(self, x): + x = self.conv(x) + return torch.permute(x, [0, 2, 1, 3]) diff --git a/backends/nxp/tests/test_quantizer.py b/backends/nxp/tests/test_quantizer.py new file mode 100644 index 00000000000..868a94059b5 --- /dev/null +++ b/backends/nxp/tests/test_quantizer.py @@ -0,0 +1,273 @@ +# Copyright 2024 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Tests for NeutronQuantizer. 
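The tests below all follow the same PT2E round trip, condensed here into one sketch (the model and input shape are taken from the test modules above; the qparams printout mirrors what the tests assert on):

```python
import torch
from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
from executorch.backends.nxp.tests import models
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

model = models.Conv2dModule().eval()
example_input = (torch.ones(1, 4, 32, 32),)

# Export, annotate with NeutronQuantizer, calibrate, then convert to a
# quantize/dequantize (QDQ) graph.
exported = torch.export.export_for_training(model, example_input, strict=True).module()
prepared = prepare_pt2e(exported, NeutronQuantizer())
prepared(*example_input)   # calibration run
converted = convert_pt2e(prepared)
converted(*example_input)  # quantized dry run

# The per-tensor qparams asserted in the tests live as literal node arguments.
for node in converted.graph.nodes:
    if node.target is torch.ops.quantized_decomposed.quantize_per_tensor.default:
        _, scale, zero_point, qmin, qmax, dtype = node.args
        print(node.name, scale, zero_point, dtype)
```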
+ +import executorch.backends.nxp.tests.models as models +import torch +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + + +def _get_target_name(node): + return node._pretty_print_target(node.target) + + +def test_quantizer_conv2d(): + model = models.Conv2dModule() + model.eval() + + example_input = (torch.ones(1, 4, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 11 + assert nodes[7].name == "conv2d" + # [0]: Input, [1] : weights, [2]: bias + assert ( + _get_target_name(nodes[7].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[1]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[2]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[8]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[8].args[0].name == "conv2d" + + +def test_quantizer_linear(): + model = models.LinearModule(bias=True) + model.eval() + + example_input = (torch.ones(10, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 11 + assert nodes[7].name == "linear" + # [0]: Input, [1] : weights, [2]: bias + assert ( + _get_target_name(nodes[7].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[1]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[7].args[2]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[8]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[8].args[0].name == "linear" + + +def test_quantizer_maxpool2d(): + model = models.Conv2dAndMaxPool2DModule() + model.eval() + + example_input = (torch.ones(1, 8, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 14 + # Check if QDQ pattern: + assert nodes[10].name == "max_pool2d" + assert ( + _get_target_name(nodes[10].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[11]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[11].args[0].name == "max_pool2d" + + # Check if input and output quantization is same + input_quant = nodes[10].args[0].args[1:] + output_quant = nodes[11].args[1:] + assert input_quant == output_quant + + +def test_quantizer_softmax(): + model = models.SoftmaxModule(dim=0) + model.eval() + + example_input = 
(torch.ones(1, 10),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 7 + # Check if QDQ pattern: + assert nodes[3].name == "softmax" + assert ( + _get_target_name(nodes[3].args[0]) + == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + ) + assert ( + _get_target_name(nodes[4]) + == "torch.ops.quantized_decomposed.quantize_per_tensor.default" + ) + assert nodes[4].args[0].name == "softmax" + + # Check output quantization + scale, zp, _, _, dtype = nodes[4].args[1:] + assert scale == 1.0 / 256.0 + assert zp == -128 + assert dtype == torch.int8 + + +def test_quantizer_single_maxpool2d(): + model = models.MaxPool2dModule() + model.eval() + + example_input = (torch.ones(1, 4, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 3 + assert nodes[1].name == "max_pool2d" + assert "quantization_annotation" not in nodes[1].meta + + +def test_quantizer_conv2d_relu(): + model = models.Conv2dReLUModule() + model.eval() + + example_input = (torch.ones(1, 4, 32, 32),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 12 + assert nodes[7].name == "dequantize_per_tensor_default_2" + assert nodes[8].name == "relu" + assert nodes[9].name == "quantize_per_tensor_default_3" + + +def test_quantizer_conv2d_avg_pool2d(): + model = models.AvgPool2dConvModule(count_include_pad=False) + model.eval() + + example_input = (torch.ones(1, 4, 16, 16),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 14 + assert nodes[9].name == "dequantize_per_tensor_default_3" + assert nodes[10].name == "avg_pool2d" + assert nodes[11].name == "quantize_per_tensor_default_4" + + +def test_quantizer_conv2d_permute(): + model = models.Conv2dPermuteModule() + model.eval() + + example_input = (torch.ones(1, 4, 16, 16),) + quantizer = NeutronQuantizer() + graph_module = torch.export.export_for_training( + model, example_input, strict=True + ).module() + + # noinspection PyTypeChecker + m = prepare_pt2e(graph_module, quantizer) + m(*example_input) + m = convert_pt2e(m) + + # Dry run + m(*example_input) + + nodes = list(m.graph.nodes) + assert len(nodes) == 12 + assert nodes[7].name == "dequantize_per_tensor_default_2" + assert nodes[8].name == "permute" + assert nodes[9].name == "quantize_per_tensor_default_3" diff --git a/backends/openvino/README.md b/backends/openvino/README.md index 95a5f4c364e..8adc19f828a 100644 --- a/backends/openvino/README.md +++ 
b/backends/openvino/README.md @@ -40,7 +40,9 @@ executorch ### Prerequisites -Before you begin, ensure you have openvino installed and configured on your system: +Before you begin, ensure you have openvino installed and configured on your system. + +### Build OpenVINO from Source ```bash git clone https://github.com/openvinotoolkit/openvino.git @@ -56,14 +58,26 @@ cmake --install build --prefix cd source setupvars.sh ``` -Note: The OpenVINO backend is not yet supported with the current OpenVINO release packages. It is recommended to build from source. The instructions for using OpenVINO release packages will be added soon. + +### Use OpenVINO from Release Packages + +1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform. + +2. Extract the release package from the archive and set the environment variables. + + ```bash + tar -zxf openvino_toolkit_.tgz + cd openvino_toolkit_ + source setupvars.sh + ``` + For more information about OpenVINO build, refer to the [OpenVINO Build Instructions](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/build_linux.md). ### Setup Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/stable/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. +1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. 2. **Setup OpenVINO Backend Environment** - Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory @@ -78,7 +92,7 @@ Follow the steps below to setup your build environment: ```bash ./openvino_build.sh ``` - **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `export_and_infer_openvino.py` script inside `executorch/examples/openvino` folder. + **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder. ```bash ./openvino_build.sh --enable_python diff --git a/backends/qualcomm/README.md b/backends/qualcomm/README.md index 85019add313..c3d51e7c116 100644 --- a/backends/qualcomm/README.md +++ b/backends/qualcomm/README.md @@ -6,14 +6,14 @@ we reserve the right to modify interfaces and implementations. This backend is implemented on the top of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk). 
-Please follow [tutorial](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md) to setup environment, build, and run executorch models by this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). +Please follow [tutorial](../../docs/source/backends-qualcomm.md) to set up the environment, build, and run ExecuTorch models with this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). -A website version of the tutorial is [here](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html). +A website version of the tutorial is [here](https://pytorch.org/executorch/main/backends-qualcomm). ## Delegate Options Please check `generate_qnn_executorch_compiler_spec()` in -[utils.py](./utils/utils.py) for supported SoC and inference type. +[utils.py](utils/utils.py) for supported SoC and inference type. ### Supported Chipset - Snapdragon 8 Gen 1 diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py index 9c884d7ab93..81b86992dee 100644 --- a/backends/qualcomm/_passes/__init__.py +++ b/backends/qualcomm/_passes/__init__.py @@ -9,7 +9,10 @@ from .annotate_unbind import AnnotateUnbind from .convert_bmm_to_matmul import ConvertBmmToMatmul from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d +from .convert_square_to_pow import ConvertSquareToPow +from .convert_upsample_bicubic2d import ConvertUpsampleBicubicWithBilinear from .decompose_any import DecomposeAny +from .decompose_cdist import DecomposeCDist from .decompose_einsum import DecomposeEinsum from .decompose_expm1 import DecomposeExpM1 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm @@ -26,6 +29,7 @@ from .recompose_pixel_unshuffle import RecomposePixelUnshuffle from .recompose_rms_norm import RecomposeRmsNorm from .reduce_dynamic_range import ReduceDynamicRange +from .remove_0d_tensor import Remove0DTensor from .remove_redundancy import RemoveRedundancy from .replace_arange_args import ReplaceArangeArgs from .replace_index_put_input import ReplaceIndexPutInput @@ -39,7 +43,10 @@ AnnotateUnbind, ConvertBmmToMatmul, ConvertConv1dToConv2d, + ConvertSquareToPow, + ConvertUpsampleBicubicWithBilinear, DecomposeAny, + DecomposeCDist, DecomposeEinsum, DecomposeExpM1, DecomposeLinalgVectorNorm, @@ -56,6 +63,7 @@ RecomposePixelUnshuffle, RecomposeRmsNorm, ReduceDynamicRange, + Remove0DTensor, RemoveRedundancy, ReplaceArangeArgs, ReplaceIndexPutInput, diff --git a/backends/qualcomm/_passes/annotate_stack.py b/backends/qualcomm/_passes/annotate_stack.py index c42804af2f2..5fbfde058b2 100644 --- a/backends/qualcomm/_passes/annotate_stack.py +++ b/backends/qualcomm/_passes/annotate_stack.py @@ -17,14 +17,16 @@ class AnnotateStack(ExportPass): generated after quantization process.
""" - decomp_ops = [torch.ops.aten.unbind.int] + decomp_ops = [torch.ops.aten.stack.default] def __init__(self, edge_program: torch.export.ExportedProgram): super(AnnotateStack, self).__init__() self.edge_program = edge_program def _annotate_stack(self, graph_module: torch.fx.GraphModule): - partitions = get_source_partitions(graph_module.graph, [torch.stack, "stack"]) + partitions = get_source_partitions( + graph_module.graph, [torch.stack, torch.ops.aten.stack.default, "stack"] + ) for _, src_partitions in partitions.items(): for src_partition in src_partitions: output = src_partition.output_nodes[0] diff --git a/backends/qualcomm/_passes/annotate_unbind.py b/backends/qualcomm/_passes/annotate_unbind.py index 0efa1638bc4..426285e872b 100644 --- a/backends/qualcomm/_passes/annotate_unbind.py +++ b/backends/qualcomm/_passes/annotate_unbind.py @@ -24,7 +24,9 @@ def __init__(self, edge_program: torch.export.ExportedProgram): self.edge_program = edge_program def _annotate_unbind(self, graph_module: torch.fx.GraphModule): - partitions = get_source_partitions(graph_module.graph, [torch.unbind, "unbind"]) + partitions = get_source_partitions( + graph_module.graph, [torch.unbind, torch.ops.aten.unbind.int, "unbind"] + ) for _, src_partitions in partitions.items(): for src_partition in src_partitions: if src_partition.input_nodes[0].target in dq_ops: diff --git a/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py b/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py index 947b631dbbf..72dc29c2880 100644 --- a/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py +++ b/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn from executorch.backends.qualcomm.builders.utils import get_parameter, set_parameter +from executorch.backends.qualcomm.utils.constants import QCOM_REQUANTIZE from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -43,6 +44,7 @@ def call(self, graph_module: torch.fx.GraphModule): unsqueeze_node.meta = copy_meta( input_node.meta, lambda m: {**m, "val": m["val"].unsqueeze(2)} ) + with graph_module.graph.inserting_after(unsqueeze_node): filter_node = node.args[1] @@ -92,6 +94,14 @@ def call(self, graph_module: torch.fx.GraphModule): ), ) squeeze_node.meta = copy_meta(node.meta) + + if QCOM_REQUANTIZE in input_node.meta: + input_node.meta.pop(QCOM_REQUANTIZE) + if QCOM_REQUANTIZE in node.meta: + squeeze_node.meta[QCOM_REQUANTIZE] = node.meta[ + QCOM_REQUANTIZE + ] + conv2d_node.meta.pop(QCOM_REQUANTIZE, None) for user in node.users.copy(): user.replace_input_with(node, squeeze_node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/convert_square_to_pow.py b/backends/qualcomm/_passes/convert_square_to_pow.py new file mode 100644 index 00000000000..51a74ac5f10 --- /dev/null +++ b/backends/qualcomm/_passes/convert_square_to_pow.py @@ -0,0 +1,38 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import copy_meta + + +class ConvertSquareToPow(ExportPass): + """ + Convert square to pow with a scalar value of 2. + This allows LiftConstantScalarOperands to lift the scalar into a scalar. + Otherwise, the square op will be converted to pow.tensor_scalar after to_edge. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target == torch.ops.aten.square.default: + input_node = node.args[0] + with graph_module.graph.inserting_after(input_node): + pow_op = torch.ops.aten.pow.Tensor_Scalar + pow_node = graph.create_node( + "call_function", pow_op, (input_node, 2) + ) + pow_node.meta = copy_meta(node.meta) + for user in node.users.copy(): + user.replace_input_with(node, pow_node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/convert_upsample_bicubic2d.py b/backends/qualcomm/_passes/convert_upsample_bicubic2d.py new file mode 100644 index 00000000000..367e9155c77 --- /dev/null +++ b/backends/qualcomm/_passes/convert_upsample_bicubic2d.py @@ -0,0 +1,27 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + + +class ConvertUpsampleBicubicWithBilinear(ExportPass): + """ + Qnn does not support bicubic interpolation, so we need to convert it to bilinear. + This pass will convert bicubic interpolation to bilinear interpolation. + """ + + bicubic_op_targets = { + exir_ops.edge.aten.upsample_bicubic2d.vec, + } + upsample_bilinear_op = exir_ops.edge.aten.upsample_bilinear2d.default + + def __init__(self): + super(ConvertUpsampleBicubicWithBilinear, self).__init__() + + def call_operator(self, op, args, kwargs, meta): + if op not in self.bicubic_op_targets: + return super().call_operator(op, args, kwargs, meta) + return super().call_operator(self.upsample_bilinear_op, args[:-1], kwargs, meta) diff --git a/backends/qualcomm/_passes/decompose_cdist.py b/backends/qualcomm/_passes/decompose_cdist.py new file mode 100644 index 00000000000..d18a0295ffb --- /dev/null +++ b/backends/qualcomm/_passes/decompose_cdist.py @@ -0,0 +1,81 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + + +class CDist(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + # Step 1: Compute differences + diff = x.unsqueeze(-2) - y.unsqueeze(-3) + + # Step 2: Square differences + sq_diff = diff**2 + + # Step 3: Sum of squares + sum_sq_diff = sq_diff.sum(dim=-1) + + # Step 4: Square root + distances = torch.sqrt(sum_sq_diff) + + return distances + + +class DecomposeCDist(ExportPass): + """ + Decompose for math equivalent op. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + model = CDist() + if torch.ops.aten.cdist.default == node.target: + if len(node.args) > 2: + assert ( + node.args[2] == 2 + ), "Currently only p=2 is supported for CDist Decomposition" + decomposed_module = torch.export.export( + model, + (node.args[0].meta["val"], node.args[1].meta["val"]), + strict=True, + ).module() + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": node.args[0], "y": node.args[1]} + + for decomposed_node in decomposed_module.graph.nodes: + # no need to copy existent 'output' + if decomposed_node.op == "output": + for user in node.users.copy(): + # remap + user.replace_input_with( + node, + remap[decomposed_node.args[0][0]], + ) + # no need to copy existent placeholders + elif decomposed_node.op == "placeholder": + # replace node map from string to graph node + remap[decomposed_node] = remap.pop(decomposed_node.name) + else: + remap[decomposed_node] = graph.node_copy( + decomposed_node, + arg_transform=lambda x, remap=remap: remap[x], + ) + + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_einsum.py b/backends/qualcomm/_passes/decompose_einsum.py index cbf8cbf1249..046c1598311 100644 --- a/backends/qualcomm/_passes/decompose_einsum.py +++ b/backends/qualcomm/_passes/decompose_einsum.py @@ -8,6 +8,8 @@ from executorch.exir.pass_base import ExportPass, PassResult from torch.fx.experimental.proxy_tensor import make_fx +from .utils import copy_nn_module_stack + class DecomposeEinsum(ExportPass): """ @@ -36,6 +38,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: remap[f"arg1_{i+1}"] = arg for decomposed_node in decomposed_module.graph.nodes: + copy_nn_module_stack(node, decomposed_node) # This is the arg[0] equation string, which is not required anymore after decomposition if "arg0" in decomposed_node.name: continue diff --git a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py index 7d70f5c9342..993f088da12 100644 --- a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py +++ b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py @@ -8,6 +8,8 @@ from executorch.exir import to_edge from executorch.exir.pass_base import ExportPass, PassResult +from .utils import copy_nn_module_stack + class LinalgVectorNorm(torch.nn.Module): def __init__(self, exp, dim, keepdim): @@ -62,6 +64,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: remap = {"x": node.args[0]} for decomposed_node in decomposed_module.graph.nodes: + copy_nn_module_stack(node, decomposed_node) # no need to copy existent 'output' if decomposed_node.op == "output": for user in node.users.copy(): diff --git a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py index 277fc9c6ce8..829b3757e06 100644 --- a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py +++ b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py @@ -22,12 +22,16 @@ def __init__(self): exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten.div.Tensor, + # Support if the rank of input 
tensor: {input_dims} is less than the rank of output tensor: {output_dims}. + exir_ops.edge.aten.expand_copy.default, ] def traverse_broadcast_node(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: if node.target in self.broadcast_op_targets: for arg in node.args: + if not isinstance(arg, torch.fx.Node): + continue input_rank = len(arg.meta["val"].shape) output_rank = len(node.meta["val"].shape) if input_rank != output_rank: diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py index 17960a6029b..19c5417f8f8 100644 --- a/backends/qualcomm/_passes/layout_transform.py +++ b/backends/qualcomm/_passes/layout_transform.py @@ -47,6 +47,7 @@ class LayoutTransform(ExportPass): layout_agnostic_ops = { exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.amax.default, exir_ops.edge.aten.bitwise_or.Tensor, exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.bitwise_and.Tensor, @@ -54,6 +55,7 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.clamp.default, exir_ops.edge.aten.constant_pad_nd.default, + exir_ops.edge.aten.cumsum.default, exir_ops.edge.aten.div.Tensor, exir_ops.edge.aten.elu.default, exir_ops.edge.aten.eq.Tensor, diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py index 93abfe621bc..9b3a308813e 100644 --- a/backends/qualcomm/_passes/lift_constant_scalar_operands.py +++ b/backends/qualcomm/_passes/lift_constant_scalar_operands.py @@ -53,7 +53,13 @@ class TensorOpInfo: } -SKIP_LIFT_OPS = {aten.full_like.default, aten.arange.start_step} +SKIP_LIFT_OPS = { + aten.full_like.default, + aten.arange.start_step, + aten.arange.default, + aten.scalar_tensor.default, + aten.elu.default, +} class LiftConstantScalarOperands(ExportPass): diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py index ab2c86102df..c98f27db120 100644 --- a/backends/qualcomm/_passes/qnn_pass_manager.py +++ b/backends/qualcomm/_passes/qnn_pass_manager.py @@ -14,7 +14,10 @@ AnnotateUnbind, ConvertBmmToMatmul, ConvertConv1dToConv2d, + ConvertSquareToPow, + ConvertUpsampleBicubicWithBilinear, DecomposeAny, + DecomposeCDist, DecomposeEinsum, DecomposeExpM1, DecomposeLinalgVectorNorm, @@ -31,6 +34,7 @@ RecomposePixelUnshuffle, RecomposeRmsNorm, ReduceDynamicRange, + Remove0DTensor, RemoveRedundancy, ReplaceArangeArgs, ReplaceIndexPutInput, @@ -70,10 +74,11 @@ def get_capture_program_passes(): # If a pass is activated, it will be executed by default. 
default_passes_and_setting = [ (AnnotateQuantAttrs, True), - (AnnotateStack, False), + (AnnotateStack, True), (AnnotateUnbind, True), (ConvertBmmToMatmul, True), (ConvertConv1dToConv2d, True), + (ConvertUpsampleBicubicWithBilinear, False), (DecomposeAny, True), (ExpandBroadcastTensorShape, False), (FixedLinearKeepDim, True), @@ -82,6 +87,7 @@ def get_capture_program_passes(): (LayoutTransform, True), (RecomposePixelUnshuffle, True), (RecomposeRmsNorm, False), + (Remove0DTensor, True), (RemoveRedundancy, True), (ReplaceIndexPutInput, True), (TagQuantIO, False), @@ -174,10 +180,27 @@ def transform_for_to_edge_pipeline( return exported_program + # Before quantizer + def transform_for_annotation_pipeline(self, graph_module: GraphModule): + self.add_pass(ReduceDynamicRange()) + self.add_pass(RecomposePixelUnshuffle(quantization_capture=True)) + self.add_pass(ReplaceArangeArgs()) + self.add_pass(DecomposeCDist()) + self.add_pass(DecomposeScaledDotProductAttention()) + self.add_pass(DecomposeSilu()) + self.add_pass(DecomposeEinsum()) + self.add_pass(DecomposeExpM1()) + self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) + self.add_pass(ReplaceInfValues()) + self.add_pass(LiftConstantScalarOperands()) + return self._transform(graph_module) + def transform_for_export_pipeline(self, exported_program: ExportedProgram): + self.add_pass(DecomposeCDist()) self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) self.add_pass(DecomposeExpM1()) + self.add_pass(ConvertSquareToPow()) self.add_pass(LiftConstantScalarOperands()) self._transform(exported_program.graph_module) ep = lift_constant_tensor_pass(exported_program) @@ -189,16 +212,3 @@ def transform_for_preprocess_pipeline(self, exported_program: ExportedProgram): self.add_pass(LayoutTransform(exported_program, insert_permute=True)) self.add_pass(FuseConsecutiveTranspose()) return self._transform(exported_program.graph_module) - - def transform_for_annotation_pipeline(self, graph_module: GraphModule): - self.add_pass(ReduceDynamicRange()) - self.add_pass(RecomposePixelUnshuffle(quantization_capture=True)) - self.add_pass(ReplaceArangeArgs()) - self.add_pass(DecomposeScaledDotProductAttention()) - self.add_pass(DecomposeSilu()) - self.add_pass(DecomposeEinsum()) - self.add_pass(DecomposeExpM1()) - self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) - self.add_pass(ReplaceInfValues()) - self.add_pass(LiftConstantScalarOperands()) - return self._transform(graph_module) diff --git a/backends/qualcomm/_passes/recompose_pixel_unshuffle.py b/backends/qualcomm/_passes/recompose_pixel_unshuffle.py index 7aac4fb823e..81214facc3a 100644 --- a/backends/qualcomm/_passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/_passes/recompose_pixel_unshuffle.py @@ -45,13 +45,11 @@ def call(self, graph_module: torch.fx.GraphModule): continue view_node = premute_node.args[0] - if any( - [ - view_node.op != "call_function", - view_node.target != self.view_target, - len(view_node.args[1]) != 6, - len(premute_node.args[1]) != 6, - ] + if ( + view_node.op != "call_function" + or view_node.target != self.view_target + or len(view_node.args[1]) != 6 + or len(premute_node.args[1]) != 6 ): continue diff --git a/backends/qualcomm/_passes/remove_0d_tensor.py b/backends/qualcomm/_passes/remove_0d_tensor.py new file mode 100644 index 00000000000..1e1d711c2b8 --- /dev/null +++ b/backends/qualcomm/_passes/remove_0d_tensor.py @@ -0,0 +1,36 @@ +# Copyright (c) Qualcomm Innovation Center, 
Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class Remove0DTensor(ExportPass): + """ + QNN does not allow 0D tensor, we remove the node that will output an 0D tensor. + Before adding operations to the list of nodes to be removed, please ensure that it will not change the logic. + """ + + remove_ops = { + exir_ops.edge.aten.select.int, + exir_ops.edge.aten.select_copy.int, + } + + def __init__(self, quantization_capture=False) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target in self.remove_ops and len(node.meta["val"].shape) == 0: + for user_n in list(node.users.keys()): + user_n.replace_input_with(node, node.args[0]) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index d538fe0d34f..46d9e0cde76 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -78,6 +78,7 @@ def get_passes_dependency_for_capture_program(): AnnotateUnbind, ConvertBmmToMatmul, ConvertConv1dToConv2d, + ConvertUpsampleBicubicWithBilinear, DecomposeAny, DecomposeLinalgVectorNorm, ExpandBroadcastTensorShape, @@ -96,18 +97,20 @@ def get_passes_dependency_for_capture_program(): AnnotateQuantAttrs: [ RecomposePixelUnshuffle, ConvertBmmToMatmul, + ConvertUpsampleBicubicWithBilinear, RemoveRedundancy, ], AnnotateStack: [RemoveRedundancy], AnnotateUnbind: [RemoveRedundancy], ConvertBmmToMatmul: [RecomposePixelUnshuffle], ConvertConv1dToConv2d: [FoldQDQ], + ConvertUpsampleBicubicWithBilinear: [RemoveRedundancy], DecomposeAny: [RemoveRedundancy], DecomposeLinalgVectorNorm: [RemoveRedundancy], - ExpandBroadcastTensorShape: [RemoveRedundancy], + ExpandBroadcastTensorShape: [FoldQDQ], FixedLinearKeepDim: [FoldQDQ], FoldQDQ: [AnnotateQuantAttrs, AnnotateStack, AnnotateUnbind], - I64toI32: [RemoveRedundancy], + I64toI32: [ConvertUpsampleBicubicWithBilinear, RemoveRedundancy], LayoutTransform: [ AnnotateQuantAttrs, ConvertConv1dToConv2d, @@ -121,6 +124,14 @@ def get_passes_dependency_for_capture_program(): } +def copy_nn_module_stack(src, target): + """ + Copy meta["nn_module_stack"] from src node to target node if existing. 
+ """ + if value := src.meta.get("nn_module_stack"): + target.meta["nn_module_stack"] = value + + def is_float_tensor(node: torch.fx.Node) -> bool: if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor): return False diff --git a/backends/qualcomm/aot/ir/targets.bzl b/backends/qualcomm/aot/ir/targets.bzl index 5fdcd14485c..b6ca0879dbe 100644 --- a/backends/qualcomm/aot/ir/targets.bzl +++ b/backends/qualcomm/aot/ir/targets.bzl @@ -4,7 +4,7 @@ load( ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm:targets.bzl", "generate_schema_header") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") QCIR_NAME = "qcir" INPUT_QCIR = QCIR_NAME + ".fbs" @@ -56,7 +56,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", "//executorch/backends/qualcomm/aot/wrappers:wrappers", diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl index f29c02aa593..f2eb654a10c 100644 --- a/backends/qualcomm/aot/python/targets.bzl +++ b/backends/qualcomm/aot/python/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") PYTHON_MODULE_NAME = "PyQnnManagerAdaptor" @@ -34,7 +34,7 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/pybind11:pybind11", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], external_deps = [ "libtorch_python", @@ -67,7 +67,7 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/pybind11:pybind11", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], external_deps = [ "libtorch_python", @@ -94,6 +94,6 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", "fbsource//third-party/pybind11:pybind11", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], ) diff --git a/backends/qualcomm/aot/wrappers/targets.bzl b/backends/qualcomm/aot/wrappers/targets.bzl index 24ceeb723eb..0c5d5b1c3e9 100644 --- a/backends/qualcomm/aot/wrappers/targets.bzl +++ b/backends/qualcomm/aot/wrappers/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") 
+load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -23,7 +23,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", ], diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md index 3a97e8d6d6a..783a53dd645 100644 --- a/backends/qualcomm/builders/README.md +++ b/backends/qualcomm/builders/README.md @@ -8,6 +8,7 @@ Thank you for contributing to Qualcomm AI Engine Direct delegate for ExecuTorch. * [Check Operator Spec](#check-operator-spec) * [Implementation](#implementation) * [Quantizer Annotation](#quantizer-annotation) +* [Operator Support Status](#operator-support-status) * [Issues](#issues) * [Pull Requests](#pull-requests) @@ -246,7 +247,7 @@ Now, we can start to fill in function body step by step: nodes_to_wrappers, ) ``` - The logic should be similar and straightforward. Please carefully set arguments `tensor_type` + The logic should be similar and straightforward. Please carefully set arguments `tensor_type` according to tensors' property. 3. Define parameters: @@ -355,6 +356,128 @@ Now, we can start to fill in function body step by step: ### Quantizer Annotation The operator now should be functional for Qualcomm backends. For operator to work in fixed-precision, we should also make `QnnQuantizer` to correctly insert observers for recording calibrated encodings. Please read more on the [Quantization Annotation Tutorial](../quantizer//README.md). 
+## Operator Support Status +Please help update following table if you are contributing new operators: + +| Operators | HTP - 77/116 Enabled | +|-----------|---------| +| Argmax | ✗ | +| Argmin | ✓ | +| BatchNorm | ✓ | +| BatchToSpace | ✗ | +| Cast | ✓ | +| ChannelShuffle | ✗ | +| Concat | ✓ | +| Conv2d | ✓ | +| Conv3d | ✗ | +| Convert | ✓ | +| CreateSparse | ✗ | +| CumulativeSum | ✓ | +| DepthToSpace | ✓ | +| DepthWiseConv2d | ✓ | +| Dequantize | ✓ | +| DetectionOutput | ✗ | +| ElementWiseAbs | ✓ | +| ElementWiseAdd | ✓ | +| ElementWiseAnd | ✓ | +| ElementWiseAsin | ✗ | +| ElementWiseAtan | ✗ | +| ElementWiseBinary | ✗ | +| ElementWiseCeil | ✓ | +| ElementWiseCos | ✓ | +| ElementWiseDivide | ✓ | +| ElementWiseEqual | ✓ | +| ElementWiseExp | ✓ | +| ElementWiseFloor | ✗ | +| ElementWiseFloorDiv | ✗ | +| ElementWiseGreater | ✓ | +| ElementWiseGreaterEqual | ✓ | +| ElementWiseLess | ✓ | +| ElementWiseLessEqual | ✓ | +| ElementWiseLog | ✓ | +| ElementWiseMaximum | ✓ | +| ElementWiseMinimum | ✓ | +| ElementWiseMultiply | ✓ | +| ElementWiseNeg | ✓ | +| ElementWiseNeuron | ✓ | +| ElementWiseNot | ✓ | +| ElementWiseNotEqual | ✓ | +| ElementWiseOr | ✓ | +| ElementWisePower | ✓ | +| ElementWiseRound | ✗ | +| ElementWiseRsqrt | ✓ | +| ElementWiseSelect | ✓ | +| ElementWiseSign | ✗ | +| ElementWiseSin | ✓ | +| ElementWiseSquaredDifference | ✗ | +| ElementWiseSquareRoot | ✓ | +| ElementWiseSubtract | ✓ | +| ElementWiseUnary | ✗ | +| ElementWiseXor | ✗ | +| Elu | ✓ | +| ExpandDims | ✓ | +| ExtractGlimpse | ✗ | +| ExtractPatches | ✗ | +| FullyConnected | ✓ | +| Gather | ✓ | +| GatherElements | ✗ | +| GatherNd | ✓ | +| Gelu | ✓ | +| GetSparseIndices | ✗ | +| GetSparseValues | ✗ | +| GridSample | ✗ | +| GroupNorm | ✓ | +| HardSwish | ✓ | +| InstanceNorm | ✓ | +| L2Norm | ✗ | +| LayerNorm | ✓ | +| LogSoftmax | ✓ | +| Lrn | ✗ | +| Lstm | ✗ | +| MatMul | ✓ | +| MultiClassNms | ✗ | +| NonMaxSuppression | ✗ | +| Nonzero | ✗ | +| OneHot | ✗ | +| Pack | ✓ | +| Pad | ✓ | +| PoolAvg2d | ✓ | +| PoolAvg3d | ✗ | +| PoolMax2d | ✓ | +| Prelu | ✓ | +| Quantize | ✓ | +| ReduceMax | ✓ | +| ReduceMean | ✓ | +| ReduceMin | ✗ | +| ReduceSum | ✓ | +| Relu | ✓ | +| Relu1 | ✗ | +| Relu6 | ✗ | +| ReluMinMax | ✓ | +| Reshape | ✓ | +| Resize | ✗ | +| ResizeBilinear | ✓ | +| ResizeNearestNeighbor | ✓ | +| RoiAlign | ✗ | +| RmsNorm | ✓ | +| ScatterElements | ✗ | +| ScatterNd | ✓ | +| Sigmoid | ✓ | +| Softmax | ✓ | +| SpaceToBatch | ✗ | +| SpaceToDepth | ✓ | +| SparseToDense | ✗ | +| Split | ✓ | +| Squeeze | ✓ | +| StridedSlice | ✓ | +| Tanh | ✓ | +| Tile | ✓ | +| TopK | ✓ | +| TransPose | ✓ | +| TransPoseConv2d | ✓ | +| TransPoseConv3d | ✗ | +| Unpack | ✓ | + ## Issues Please refer to the [issue section](../README.md#issues) for more information. 
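The ✓ rows that this patch adds builders for (CumulativeSum and ReduceMax) are reached from ordinary ATen calls. The minimal modules below are modeled on the `CumSum` and `AMax` test models added to `backends/qualcomm/tests/models.py` later in this patch; they are only meant to show which PyTorch ops exercise the new `op_cum_sum.py` and `op_amax.py` node visitors, not to prescribe a particular model structure.

```python
import torch


class CumSum(torch.nn.Module):
    # x.cumsum maps to aten.cumsum.default and is lowered through the new
    # op_cum_sum.py builder (QNN CumulativeSum).
    def forward(self, x):
        return x.cumsum(dim=0)


class AMax(torch.nn.Module):
    # torch.amax maps to aten.amax.default and is lowered through the new
    # op_amax.py builder (QNN ReduceMax).
    def __init__(self, dim=(1,), keepdim=True):
        super().__init__()
        self.dim = dim
        self.keepdim = keepdim

    def forward(self, x):
        return torch.amax(x, dim=self.dim, keepdim=self.keepdim)
```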
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index cc85333f26b..705d5d163cd 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -9,6 +9,7 @@ op_abs, op_adaptive_avg_pool2d, op_add, + op_amax, op_and, op_arange, op_argmin, @@ -20,6 +21,7 @@ op_clamp, op_conv2d, op_cos, + op_cum_sum, op_depth_to_space, op_dequantize, op_div, @@ -95,6 +97,7 @@ op_abs, op_adaptive_avg_pool2d, op_add, + op_amax, op_and, op_arange, op_argmin, @@ -106,6 +109,7 @@ op_clamp, op_conv2d, op_cos, + op_cum_sum, op_depth_to_space, op_dequantize, op_div, diff --git a/backends/qualcomm/builders/op_amax.py b/backends/qualcomm/builders/op_amax.py new file mode 100644 index 00000000000..099004a4bcf --- /dev/null +++ b/backends/qualcomm/builders/op_amax.py @@ -0,0 +1,84 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import cast, Dict, List + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np + +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpReduceMax, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class AMax(NodeVisitor): + target = ["aten.amax.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + # mean dims and keep dims + mean_dims = cast(List[int], node.args[1]) + mean_dims = [ + mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims + ] + if QCOM_AXIS_ORDER in node.meta: + mean_dims = [ + node.meta[QCOM_AXIS_ORDER].index(mean_dim) for mean_dim in mean_dims + ] + mean_dims_shape = [len(mean_dims)] + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + reduce_max_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReduceMax.op_name, + ) + reduce_max_op.AddInputTensors([input_tensor_wrapper]) + reduce_max_op.AddOutputTensors([output_tensor_wrapper]) + reduce_max_op.AddTensorParam( + OpReduceMax.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(mean_dims_shape), + mean_dims_shape, + np.array(mean_dims, dtype=np.uint32), + True, + ) + if len(node.args) > 2: + keep_dims = cast(bool, node.args[2]) + reduce_max_op.AddScalarParam( + OpReduceMax.param_keep_dims, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {QCOM_DATA: keep_dims}, + ) + + return reduce_max_op diff --git a/backends/qualcomm/builders/op_cos.py b/backends/qualcomm/builders/op_cos.py index 3858a947d93..589bf3ef88e 100644 --- a/backends/qualcomm/builders/op_cos.py +++ b/backends/qualcomm/builders/op_cos.py @@ -3,7 +3,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- from typing import Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper diff --git a/backends/qualcomm/builders/op_cum_sum.py b/backends/qualcomm/builders/op_cum_sum.py new file mode 100644 index 00000000000..f62485bc519 --- /dev/null +++ b/backends/qualcomm/builders/op_cum_sum.py @@ -0,0 +1,84 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import cast, Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpCumulativeSum, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class CumulativeSum(NodeVisitor): + target = ["aten.cumsum.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def get_param(self, node, input_tensor): + dim = node.args[1] + + if dim < 0: + dim = dim % len(input_tensor.shape) + if QCOM_AXIS_ORDER in node.meta: + dim = node.meta[QCOM_AXIS_ORDER].index(dim) + + return cast(np.uint32, dim) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + dim = self.get_param(node, input_tensor) + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + + cumsum_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpCumulativeSum.op_name, + ) + cumsum_op.AddInputTensors([input_tensor_wrapper]) + cumsum_op.AddOutputTensors([output_tensor_wrapper]) + cumsum_op.AddScalarParam( + OpCumulativeSum.param_axis, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + {QCOM_DATA: dim}, + ) + cumsum_op.AddScalarParam( + OpCumulativeSum.param_exclusive, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {QCOM_DATA: False}, + ) + cumsum_op.AddScalarParam( + OpCumulativeSum.param_reverse, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8, + {QCOM_DATA: False}, + ) + + return cumsum_op diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py index d224e34feb5..aa7f9becd98 100644 --- a/backends/qualcomm/builders/op_rms_norm.py +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -81,8 +81,9 @@ def define_node( {}, # kwargs ) if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + quant_attrs = quant_attrs.copy() + quant_attrs[QCOM_ZERO_POINT] = 0 bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - bias_node.meta[QCOM_QUANT_ATTRS][QCOM_ZERO_POINT] = 0 bias_tensor_wrapper = self.define_tensor( bias_node, node, diff --git a/backends/qualcomm/builders/op_sin.py b/backends/qualcomm/builders/op_sin.py index 89fce6bee9c..8828685ac9e 100644 --- a/backends/qualcomm/builders/op_sin.py +++ b/backends/qualcomm/builders/op_sin.py @@ -3,7 +3,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- from typing import Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper diff --git a/backends/qualcomm/builders/op_sqrt.py b/backends/qualcomm/builders/op_sqrt.py index 030e6c3e10a..5505e92ee67 100644 --- a/backends/qualcomm/builders/op_sqrt.py +++ b/backends/qualcomm/builders/op_sqrt.py @@ -10,7 +10,7 @@ import torch from .node_visitor import NodeVisitor, register_node_visitor -from .qnn_constants import OpElementWiseSqrt, QNN_OP_PACKAGE_NAME_QTI_AISW +from .qnn_constants import OpElementWiseSquareRoot, QNN_OP_PACKAGE_NAME_QTI_AISW @register_node_visitor @@ -51,7 +51,7 @@ def define_node( sqrt_op = PyQnnWrapper.PyQnnOpWrapper( node.name, QNN_OP_PACKAGE_NAME_QTI_AISW, - OpElementWiseSqrt.op_name, + OpElementWiseSquareRoot.op_name, ) sqrt_op.AddInputTensors(sqrt_input_tensors) sqrt_op.AddOutputTensors(sqrt_output_tensors) diff --git a/backends/qualcomm/builders/op_stack.py b/backends/qualcomm/builders/op_stack.py index 616d0ee0ccc..fdef148ad4d 100644 --- a/backends/qualcomm/builders/op_stack.py +++ b/backends/qualcomm/builders/op_stack.py @@ -51,7 +51,7 @@ def define_node( dim = 0 if len(node.args) == 1 else cast(int, node.args[1]) if dim < 0: - dim = dim % len(input_tensor.shape) + dim = dim % len(output_tensor.shape) if QCOM_AXIS_ORDER in node.meta: dim = node.meta[QCOM_AXIS_ORDER].index(dim) stack_op = PyQnnWrapper.PyQnnOpWrapper( diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 9613c755c7c..06e398f7c05 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -14,6 +14,13 @@ # instead of replicating them here. +@dataclass(init=False, frozen=True) +class OpArgmin: + op_name: str = "Argmin" + param_axis: str = "axis" + param_keep_dims: str = "keep_dims" + + @dataclass(init=False, frozen=True) class OpBatchnorm: op_name: str = "Batchnorm" @@ -50,6 +57,14 @@ class OpConvert: op_name: str = "Convert" +@dataclass(init=False, frozen=True) +class OpCumulativeSum: + op_name = "CumulativeSum" + param_axis = "axis" + param_exclusive = "exclusive" + param_reverse = "reverse" + + @dataclass(init=False, frozen=True) class OpDepthToSpace: op_name: str = "DepthToSpace" @@ -204,7 +219,7 @@ class OpElementWiseSelect: @dataclass(init=False, frozen=True) -class OpElementWiseSqrt: +class OpElementWiseSquareRoot: op_name = "ElementWiseSquareRoot" @@ -350,16 +365,16 @@ class OpQuantize: @dataclass(init=False, frozen=True) -class OpReduceMean: - op_name: str = "ReduceMean" +class OpReduceMax: + op_name: str = "ReduceMax" param_axes: str = "axes" param_keep_dims: str = "keep_dims" @dataclass(init=False, frozen=True) -class OpArgmin: - op_name: str = "Argmin" - param_axis: str = "axis" +class OpReduceMean: + op_name: str = "ReduceMean" + param_axes: str = "axes" param_keep_dims: str = "keep_dims" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index b427c59ce07..6326f4d1210 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -13,6 +13,7 @@ exir_ops.edge.aten.clone.default, exir_ops.edge.aten.slice_scatter.default, exir_ops.edge.aten.copy.default, + exir_ops.edge.aten.upsample_bicubic2d.vec, exir_ops.edge.quantized_decomposed.embedding_4bit.dtype, ] diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index 7b5e72d461d..d9eb188614c 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ 
b/backends/qualcomm/partition/qnn_partitioner.py @@ -34,7 +34,7 @@ not_supported_operator, to_be_implemented_operator, ) -from .utils import generate_qnn_executorch_option, get_skip_decomp_table +from .utils import filter_fn, generate_qnn_executorch_option, get_skip_decomp_table class QnnOperatorSupport(OperatorSupportBase): @@ -181,5 +181,4 @@ def ops_to_not_decompose( self, ep: ExportedProgram ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: do_not_decompose = get_skip_decomp_table() - - return do_not_decompose, None + return (do_not_decompose, filter_fn) diff --git a/backends/qualcomm/partition/utils.py b/backends/qualcomm/partition/utils.py index 1e2b17b2a69..816d1ac1d9b 100644 --- a/backends/qualcomm/partition/utils.py +++ b/backends/qualcomm/partition/utils.py @@ -24,6 +24,21 @@ def generate_qnn_executorch_option( return qnn_compile_spec_buffer +# Logic to determine whether to skip decompose and has higher priority than get_skip_decomp_table() +def filter_fn(node: torch.fx.Node) -> bool: + # QNN does not support int32/int64 IO for the following OPs. + potential_i32_i64_io_ops = [ + torch.ops.aten.stack.default, + torch.ops.aten.unbind.int, + ] + if node.target in potential_i32_i64_io_ops and node.meta["val"].dtype in [ + torch.int32, + torch.int64, + ]: + return False + return True + + def get_skip_decomp_table() -> List[torch._ops.OperatorBase]: do_not_decompose = [ torch.ops.aten.adaptive_avg_pool2d.default, @@ -39,8 +54,9 @@ def get_skip_decomp_table() -> List[torch._ops.OperatorBase]: torch.ops.aten.rms_norm.default, torch.ops.aten._safe_softmax.default, torch.ops.aten.stack.default, + torch.ops.aten.upsample_bicubic2d.vec, # This request is ignored because it is in a blocklist. Refer to exir/program/_program.py - # torch.ops.aten.unbind.int, + torch.ops.aten.unbind.int, torch.ops.pt2e_quant.quantize_affine.default, torch.ops.pt2e_quant.dequantize_affine.default, ] diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index 93af5e86c97..469a801feeb 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -97,6 +97,7 @@ def annotate_in_out_obs_sharing_op( QUANT_ANNOTATION_KEY not in input_act.meta or not input_act.meta[QUANT_ANNOTATION_KEY]._annotated or input_act.meta[QUANT_ANNOTATION_KEY].output_qspec is None + or not _is_float_tensor(input_act) ): return @@ -132,9 +133,10 @@ def annotate_single_in_single_out( return input_qspec_map = {} - input_act = node.args[0] - assert isinstance(input_act, Node) - input_qspec_map[input_act] = quantization_config.input_activation + if _is_float_tensor(node.args[0]): + input_act = node.args[0] + assert isinstance(input_act, Node) + input_qspec_map[input_act] = quantization_config.input_activation if _is_float_tensor(node): node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( @@ -177,11 +179,18 @@ def annotate_binary(node: Node, quantization_config: QuantizationConfig) -> None ) -@register_annotator([torch.ops.aten.add, torch.ops.aten.add.Tensor]) +@register_annotator( + [torch.ops.aten.add, torch.ops.aten.add.Tensor, torch.ops.aten.add_.Tensor] +) def annotate_add(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) +@register_annotator([torch.ops.aten.amax.default]) +def annotate_amax(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + @register_annotator([torch.ops.aten.argmin.default]) def 
annotate_argmin(node: Node, quantization_config: QuantizationConfig) -> None: if _is_annotated([node]): @@ -928,6 +937,11 @@ def annotate_bmm(node: Node, quantization_config: QuantizationConfig) -> None: node.meta["source_fn_stack"] = [(node, torch.bmm)] +@register_annotator([torch.ops.aten.cdist.default]) +def annotate_cdist(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + @register_annotator( [ torch.ops.aten.conv2d.default, @@ -936,7 +950,7 @@ def annotate_bmm(node: Node, quantization_config: QuantizationConfig) -> None: torch.ops.aten.conv_transpose1d.default, ] ) -def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: +def annotate_conv(node: Node, quantization_config: QuantizationConfig) -> None: if _is_annotated([node]): return @@ -971,6 +985,11 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None ) +@register_annotator([torch.ops.aten.cumsum.default]) +def annotate_cumsum(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_single_in_single_out(node, quantization_config) + + @register_annotator([torch.ops.aten.linear.default]) def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] @@ -1108,15 +1127,17 @@ def annotate_cat(node: Node, quantization_config: QuantizationConfig) -> None: input_qspec_map = {} assert isinstance(first_input_node, Node) assert isinstance(node, Node) - input_qspec_map[first_input_node] = quantization_config.input_activation - share_qparams_with_input_act0_qspec = SharedQuantizationSpec( - (first_input_node, node) - ) + if _is_float_tensor(first_input_node): + input_qspec_map[first_input_node] = quantization_config.input_activation + share_qparams_with_input_act0_qspec = SharedQuantizationSpec( + (first_input_node, node) + ) for input_node in input_nodes[1:]: if input_node not in input_qspec_map: assert isinstance(input_node, Node) - input_qspec_map[input_node] = share_qparams_with_input_act0_qspec + if _is_float_tensor(input_node): + input_qspec_map[input_node] = share_qparams_with_input_act0_qspec node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( input_qspec_map=input_qspec_map, @@ -1130,7 +1151,6 @@ def annotate_unbind(node: Node, quantization_config: QuantizationConfig) -> None # Seems like unbind.int can be either float or int. Only quant when input is float. if _is_annotated([node]) or not _is_float_tensor(node.args[0]): return - input_qspec_map = {} input_act = node.args[0] assert isinstance(input_act, Node) diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index 33237f3bebe..bda91609f1c 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -6,7 +6,10 @@ from typing import Sequence import torch -from executorch.backends.qualcomm.quantizer.annotators import QUANT_ANNOTATION_KEY +from executorch.backends.qualcomm.quantizer.annotators import ( + _is_float_tensor, + QUANT_ANNOTATION_KEY, +) from executorch.backends.qualcomm.quantizer.quantizer import ( get_16a8w_qnn_ptq_config, get_8a8w_qnn_ptq_config, @@ -23,6 +26,38 @@ from torch.fx import Node +def annotate_mimi_decoder(gm: torch.fx.GraphModule): + """ + The 1st transpose conv in mimi decoder is really sensitive to scale/offset in 16a8w, which causes execution failure. + Annotate 1st transpose conv as 8a8w to prevent execution failure. 
+ """ + quantization_config_8a8w = get_8a8w_qnn_ptq_config() + for node in gm.graph.nodes: + if not _is_float_tensor(node): + continue + elif node.target == torch.ops.aten.conv_transpose1d.default: + input_qspec_map = {} + input_act = node.args[0] + assert isinstance(input_act, Node) + input_spec = quantization_config_8a8w.input_activation + input_qspec_map[input_act] = input_spec + + weight = node.args[1] + assert isinstance(weight, Node) + input_qspec_map[weight] = quantization_config_8a8w.weight + + if len(node.args) > 2 and isinstance(node.args[2], Node): + bias = node.args[2] + input_qspec_map[bias] = quantization_config_8a8w.bias + + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config_8a8w.output_activation, + _annotated=True, + ) + break + + def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None: def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: input_qspec_map = {} diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 3620841aff9..8e65607dd84 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -3,9 +3,10 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from dataclasses import dataclass from enum import IntEnum, unique from functools import partial -from typing import Callable, Dict, Optional, Sequence, Set, Tuple +from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple import torch from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager @@ -58,7 +59,7 @@ class QuantDtype(IntEnum): use_8a8w = 4 -quant_config_dict = { +QUANT_CONFIG_DICT = { # PTQ (QuantDtype.use_16a16w, False): ( get_16a16w_qnn_ptq_config, @@ -123,6 +124,59 @@ class QuantDtype(IntEnum): } +@dataclass +class ModuleQConfig: + quant_dtype: QuantDtype = QuantDtype.use_8a8w + is_qat: bool = False + is_conv_per_channel: bool = False + is_linear_per_channel: bool = False + act_observer: Optional[ + torch.ao.quantization.observer.UniformQuantizationObserverBase + ] = None + + def __post_init__(self): + if (self.quant_dtype, self.is_qat) not in QUANT_CONFIG_DICT: + raise RuntimeError( + f"the quant config, (quant_dtype: {self.quant_dtype}, is_qat: {self.is_qat}) is not support" + ) + ( + quant_config_func, + per_channel_quant_config_func, + per_block_quant_config_func, + ) = QUANT_CONFIG_DICT[(self.quant_dtype, self.is_qat)] + self.quant_config = ( + quant_config_func(act_observer=self.act_observer) + if self.act_observer + else quant_config_func() + ) + self.per_channel_quant_config = ( + per_channel_quant_config_func(act_observer=self.act_observer) + if self.act_observer + else per_channel_quant_config_func() + ) + self.use_per_channel_weight_quant_ops = set() + if self.is_conv_per_channel: + self.use_per_channel_weight_quant_ops.update( + { + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.conv_transpose2d.input, + } + ) + if self.is_linear_per_channel: + self.use_per_channel_weight_quant_ops.update( + { + torch.ops.aten.linear.default, + } + ) + if per_block_quant_config_func: + self.per_block_quant_config = ( + per_block_quant_config_func(act_observer=self.act_observer) + if self.act_observer + else per_block_quant_config_func() + ) + + class QnnQuantizer(Quantizer): SUPPORTED_OPS: Set = set(OP_ANNOTATOR.keys()) @@ -130,14 +184,11 @@ def 
__init__(self): super().__init__() self.quant_ops: Set[OpOverload] = self.SUPPORTED_OPS.copy() - self.is_qat = False - self.quant_dtype = QuantDtype.use_8a8w - self.quant_config: QuantizationConfig = get_8a8w_qnn_ptq_config() - self.per_channel_quant_config = get_ptq_per_channel_quant_config() - self.per_block_quant_config = get_ptq_per_block_quant_config() + self.default_quant_config = ModuleQConfig() + self.submodule_qconfig_list: List[ + Tuple[Callable[[torch.fx.Node], bool], ModuleQConfig] + ] = [] self.block_size_map = {} - self.use_per_channel_weight_quant_ops: Set[OpOverload] = set() - self.use_per_block_weight_quant_ops: Set[OpOverload] = set() self.custom_quant_annotations: Sequence[Callable] = [] self.discard_nodes: Set[str] = set() @@ -155,41 +206,38 @@ def _annotate_custom_annotation(self, gm: GraphModule) -> None: for annotation_func in self.custom_quant_annotations: annotation_func(gm) - def _get_quant_config(self, op: torch.fx.Node) -> Optional[QuantizationConfig]: + def _get_submodule_qconfig(self, node: torch.fx.Node): + for func, qconfig in self.submodule_qconfig_list: + if func(node): + return qconfig + return self.default_quant_config + + def _get_quant_config(self, node: torch.fx.Node) -> Optional[QuantizationConfig]: """ - Priority: - 1. is one of use_per_block_weight_quant_ops - 2. is one of use_per_channel_weight_quant_ops - 3. quant config + How to pick: + 1. is one of per_block_quant_config + 2. Pick specific submodule config if given. + 3. Pick one if op belongs to use_per_channel_weight_quant_ops + 4. If not 3, pick normal quant config """ - target = op.target - if isinstance(target, str): + op = node.target + if isinstance(op, str): return - if target in self.use_per_block_weight_quant_ops: - if block_size := self.block_size_map.get(op.name): - self.per_block_quant_config.block_size = block_size - return self.per_block_quant_config + if block_size := self.block_size_map.get(node.name): + config = self.default_quant_config.per_block_quant_config + config.block_size = block_size + return config - if target in self.use_per_channel_weight_quant_ops: - return self.per_channel_quant_config + config = self._get_submodule_qconfig(node) - if target in self.quant_ops: - return self.quant_config + if op in config.use_per_channel_weight_quant_ops: + return config.per_channel_quant_config - print(f"No quant config is implemented for op, {op}") - - def _update_per_block_weight_quant_ops(self, ops: Set[OpOverload], enable: bool): - if enable: - self.use_per_block_weight_quant_ops.update(ops) - else: - self.use_per_block_weight_quant_ops.difference_update(ops) + if op in self.quant_ops: + return config.quant_config - def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: bool): - if enable: - self.use_per_channel_weight_quant_ops.update(ops) - else: - self.use_per_channel_weight_quant_ops.difference_update(ops) + print(f"No quant config is implemented for op, {op}") def add_custom_quant_annotations( self, custom_quant_annotations: Sequence[Callable] @@ -212,55 +260,74 @@ def annotate(self, model: GraphModule) -> GraphModule: def get_supported_ops(self) -> Set[OpOverload]: return self.SUPPORTED_OPS - def set_quant_config( - self, quant_dtype: QuantDtype, is_qat=False, act_observer=None + def set_default_quant_config( + self, + quant_dtype: QuantDtype, + is_qat=False, + is_conv_per_channel=False, + is_linear_per_channel=False, + act_observer=None, ) -> None: - self.quant_dtype = quant_dtype - self.is_qat = is_qat - if (quant_dtype, is_qat) not in 
quant_config_dict: - raise RuntimeError( - f"the quant config, (quant_dtype: {quant_dtype}, is_qat: {is_qat}) is not support" - ) - - quant_config_fuc, per_channel_quant_config_fuc, per_block_quant_config_fuc = ( - quant_config_dict[(quant_dtype, is_qat)] - ) - self.quant_config = ( - quant_config_fuc(act_observer=act_observer) - if act_observer - else quant_config_fuc() + self.default_quant_config = ModuleQConfig( + quant_dtype, + is_qat, + is_conv_per_channel, + is_linear_per_channel, + act_observer, ) - self.per_channel_quant_config = ( - per_channel_quant_config_fuc(act_observer=act_observer) - if act_observer - else per_channel_quant_config_fuc() - ) - if per_block_quant_config_fuc is not None: - self.per_block_quant_config = ( - per_block_quant_config_fuc(act_observer=act_observer) - if act_observer - else per_block_quant_config_fuc() - ) def set_block_size_map(self, block_size_map: Dict[str, Tuple]) -> None: self.block_size_map = block_size_map - def set_per_block_conv_quant(self, enable: bool) -> None: - conv_ops = {torch.ops.aten.conv2d.default} - self._update_per_block_weight_quant_ops(conv_ops, enable) - - def set_per_channel_conv_quant(self, enable: bool) -> None: - conv_ops = {torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default} - self._update_per_channel_weight_quant_ops(conv_ops, enable) - - def set_per_channel_linear_quant(self, enable: bool) -> None: - linear_ops = { - torch.ops.aten.linear.default, - } - self._update_per_channel_weight_quant_ops(linear_ops, enable) + def set_submodule_qconfig_list( + self, submodule_qconfig_list: List[Tuple[Callable, ModuleQConfig]] + ) -> None: + """ + Set specific quant config from a callback function. + If a node fits more than one callback, only apply the first one. + """ + self.submodule_qconfig_list = submodule_qconfig_list def transform_for_annotation(self, model: GraphModule) -> GraphModule: return QnnPassManager().transform_for_annotation_pipeline(model) def validate(self, model: GraphModule) -> None: pass + + +def get_submodule_type_predicate(module_type_str): + """ + An example of nn_module_stack + { + 'L__self__': ('', 'executorch.backends.qualcomm.tests.models.SubModules'), + 'L__self___add': ('add', 'executorch.backends.qualcomm.tests.models.Add') + } + """ + + def predicate(node): + if nn_module_stack := node.meta.get("nn_module_stack"): + for _, type_name in nn_module_stack.values(): + if module_type_str in type_name: + return True + return False + + return predicate + + +def get_submodule_name_predicate(module_name_str): + """ + An example of nn_module_stack + { + 'L__self__': ('', 'executorch.backends.qualcomm.tests.models.SubModules'), + 'L__self___add': ('add', 'executorch.backends.qualcomm.tests.models.Add') + } + """ + + def predicate(node): + if nn_module_stack := node.meta.get("nn_module_stack"): + for name in nn_module_stack.keys(): + if module_name_str in name: + return True + return False + + return predicate diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index 7abe4b35076..fd580867db5 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -84,6 +84,22 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( "ProfileData failed to get events: %d", QNN_GET_ERROR_CODE(error)); return error; } + + auto get_unit = [](QnnProfile_EventUnit_t unit) { + switch (unit) { + case QNN_PROFILE_EVENTUNIT_MICROSEC: + return " (us)"; + case QNN_PROFILE_EVENTUNIT_BYTES: + return " (bytes)"; + 
case QNN_PROFILE_EVENTUNIT_COUNT: + return " (count)"; + case QNN_PROFILE_EVENTUNIT_BACKEND: + // cycle unit is default appeared + case QNN_PROFILE_EVENTUNIT_CYCLES: + default: + return ""; + } + }; QnnProfile_EventData_t event_data; for (std::uint32_t i = 0; i < num_events; ++i) { error = @@ -96,6 +112,16 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( QNN_GET_ERROR_CODE(error)); return error; } + // add events for other important metrics, e.g. RPC execution time + std::string identifier = + std::string(event_data.identifier) + get_unit(event_data.unit); + executorch::runtime::event_tracer_log_profiling_delegate( + event_tracer, + identifier.c_str(), + /*delegate_debug_id=*/ + static_cast(-1), + 0, + event_data.value); // Check an event's sub events only if it relates to graph execution time // (and its sub events are the individual op executions): if (backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) { @@ -109,6 +135,7 @@ Qnn_ErrorHandle_t QnnProfile::ProfileData( QNN_GET_ERROR_CODE(error)); return error; } + QnnProfile_EventData_t sub_event_data; for (std::uint32_t j = 0; j < num_sub_events; ++j) { error = qnn_interface.qnn_profile_get_event_data( diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index a56accd7813..b9fb2cc54fd 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -24,7 +24,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", ], exported_deps = [ @@ -60,11 +60,11 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], resources = ({ - "qnn_lib": "fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs".format(get_qnn_library_verision()), + "qnn_lib": "fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs".format(get_qnn_library_version()), } if include_aot_qnn_lib else { }), deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ":logging", "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", diff --git a/backends/qualcomm/setup.md b/backends/qualcomm/setup.md index 37d8e04c210..a7adb6d006d 100644 --- a/backends/qualcomm/setup.md +++ b/backends/qualcomm/setup.md @@ -1,6 +1,6 @@ # Setting up QNN Backend -Please refer to [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). +Please refer to [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](../../docs/source/backends-qualcomm.md). 
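For reference, a minimal sketch of how the refactored quantizer API above can be exercised, mirroring the `test_qnn_backend_submodules` case added later in this diff. The `QnnQuantizer` class name and import path are assumptions (they are not shown in this hunk); the `ModuleQConfig` and predicate usage follows the test code. Per the new `_get_quant_config`, a node resolves to the per-block config when its name has an entry in the block-size map, otherwise to the first matching submodule callback, then to the per-channel weight config if its op is listed in `use_per_channel_weight_quant_ops`, and finally to the default config.

```python
# Hedged sketch (not part of this diff): driving the refactored quantizer API.
# QnnQuantizer and its import path are assumptions; the ModuleQConfig/predicate
# usage mirrors test_qnn_backend_submodules added later in this PR.
from executorch.backends.qualcomm.quantizer.quantizer import (  # assumed module path
    ModuleQConfig,
    QnnQuantizer,
    QuantDtype,
    get_submodule_type_predicate,
)

quantizer = QnnQuantizer()

# Default config for the whole graph: 8a8w PTQ with per-channel conv weights.
quantizer.set_default_quant_config(
    QuantDtype.use_8a8w,
    is_qat=False,
    is_conv_per_channel=True,
    is_linear_per_channel=False,
)

# Nodes that originate from an `Add` submodule get 16a16w instead; when several
# callbacks match a node, only the first one in the list is applied.
quantizer.set_submodule_qconfig_list(
    [(get_submodule_type_predicate("Add"), ModuleQConfig(QuantDtype.use_16a16w))]
)

# Per-block weight quantization is now driven purely by the block-size map.
quantizer.set_block_size_map({"conv2d": (1, 128, 1, 1)})
```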
That is a tutorial for building and running Qualcomm AI Engine Direct backend, including compiling a model on a x64 host and running the inference diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index fbbfa0f1925..9a44ee8b773 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") # Construct the input and output file names. All input and output files rely on scalar_type file. SCHEMA_NAME = "qc_compiler_spec" @@ -84,7 +84,7 @@ def define_common_targets(): define_static_target = True, visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", "//executorch/backends/qualcomm/runtime:runtime_android_build", diff --git a/backends/qualcomm/tests/TARGETS b/backends/qualcomm/tests/TARGETS index b6a9664dcbf..8078ca611f8 100644 --- a/backends/qualcomm/tests/TARGETS +++ b/backends/qualcomm/tests/TARGETS @@ -1,6 +1,6 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") python_library( name = "models", @@ -17,7 +17,7 @@ python_library( "utils.py", ], # env = { - # "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + # "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), # }, deps = [ ":models", diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 0857a597d88..adf6e256f54 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -72,6 +72,16 @@ def forward(self, x): return torch.any(x, dim=self.dim, keepdim=self.keepdim) +class AMax(torch.nn.Module): + def __init__(self, dim=None, keepdim=False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward(self, x): + return torch.amax(x, dim=self.dim, keepdim=self.keepdim) + + class Arange(torch.nn.Module): def __init__(self, start, end, step, dtype): super().__init__() @@ -180,6 +190,14 @@ def forward(self, x, y): return torch.cat((y, y, x, x), axis=2) +class CDist(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.cdist(x, y, p=2) + + class Ceil(torch.nn.Module): def __init__(self): super().__init__() @@ -558,6 +576,14 @@ def forward(self, x): return torch.cos(x) +class CumSum(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.cumsum(dim=0) + + class Div(torch.nn.Module): def __init__(self): super().__init__() @@ -1410,6 +1436,15 @@ def forward(self, x): return x / torch.sqrt(torch.tensor([64.0])) +class SquaredReLU(torch.nn.Module): + def __init__(self, inplace=False): + super().__init__() + self.relu = 
torch.nn.ReLU(inplace=inplace) + + def forward(self, x): + return torch.square(self.relu(x)) + + class Squeeze(torch.nn.Module): def __init__(self): super().__init__() @@ -1450,6 +1485,18 @@ def forward(self, x): return 10 - x +class SimpleSubModules(torch.nn.Module): + def __init__(self): + super().__init__() + self.add = Add() + self.sub = Sub() + + def forward(self, a, b, c, d): + lhs = self.add(a, b) + rhs = self.sub(c, d) + return torch.mul(lhs, rhs) + + class SumIntList(torch.nn.Module): def __init__(self): super().__init__() @@ -1558,3 +1605,14 @@ def forward(self, x): return torch.nn.functional.softmax( torch.where(x >= 0, 0.1, float("-inf")), dim=-1 ) + + +# Mimi Decoder has 0D tensor which QNN cannot handle. +class ZeroDimTensor(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + input1 = torch.zeros(1) + selected_element = torch.select(input1, 0, 0) + return torch.add(x, selected_element) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 795459a9f77..7d097fd45bf 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -24,6 +24,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, + ModuleQConfig, QuantDtype, TestQNN, validate_context_binary, @@ -68,7 +69,11 @@ from collections import defaultdict from typing import List -from executorch.backends.qualcomm._passes import FoldQDQ, TagQuantIO +from executorch.backends.qualcomm._passes import ( + ExpandBroadcastTensorShape, + FoldQDQ, + TagQuantIO, +) from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors from executorch.backends.qualcomm.debugger.utils import DrawGraph from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model @@ -113,6 +118,13 @@ def test_qnn_backend_adaptive_avg_pool2d(self): sample_input = (torch.randn(1, 512, 7, 7),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_amax(self): + modules = [AMax(dim=1, keepdim=False), AMax(dim=1, keepdim=True)] # noqa: F405 + sample_input = (torch.randn(4, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_any(self): modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)] # noqa: F405 sample_input = (torch.randn(3, 3, 3) > 0,) @@ -164,6 +176,14 @@ def test_qnn_backend_cat(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cdist(self): + module = CDist() # noqa: F405 + sample_input = ( + torch.randn(1, 125, 256), + torch.randn(1, 2048, 256), + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_chunk_single(self): module = Chunk() # noqa: F405 sample_input = (torch.randn(1, 1, 4, 3),) @@ -225,6 +245,11 @@ def test_qnn_backend_cos(self): sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cumsum(self): + module = CumSum() # noqa: F405 + sample_input = (torch.randn(4),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_einsum_outer_product(self): module = EinsumOuterProduct() # noqa: F405 x = torch.randn(5) @@ -422,10 +447,20 @@ def test_qnn_backend_equal(self): def test_qnn_backend_expand(self): modules = [ExpandAs(), ExpandCopy()] # noqa: F405 - sample_input = (torch.randn([3, 1]),) - for i, module in enumerate(modules): - with 
self.subTest(i=i): - self.lower_module_and_test_output(module, sample_input) + sample_inputs = [ + (torch.randn([3, 1]),), + (torch.randn([4]),), + ] + passes_job = get_capture_program_passes() + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True + index = 0 + for module in modules: + for sample_input in sample_inputs: + with self.subTest(i=index): + self.lower_module_and_test_output( + module, sample_input, passes_job=passes_job + ) + index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -808,6 +843,11 @@ def test_qnn_backend_softmax(self): sample_input = (torch.randn([1, 4, 8, 8]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_squared_relu(self): + module = SquaredReLU() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_squeeze(self): module = Squeeze() # noqa: F405 sample_input = (torch.randn([1, 3, 3]),) @@ -843,14 +883,14 @@ def test_qnn_backend_where(self): Where(), # noqa: F405 WhereConstant(torch.randn(3, 2), torch.randn(3, 2)), # noqa: F405 WhereConstantOther(), # noqa: F405 - # WhereConstantAll(), # noqa: F405 TODO: constant dtype does not propogate when doing const i64->32, causing where to fail since where does not support int64 output + WhereConstantAll(), # noqa: F405 WhereConstantInf(), # noqa: F405 ] sample_inputs = [ (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)), (torch.randn(3, 2),), (torch.randn(3, 2),), - # (torch.randn(3, 2),), + (torch.randn(3, 2),), (torch.randn(30, 20),), ] for i, module in enumerate(modules): @@ -979,6 +1019,11 @@ def test_qnn_backend_view_permute_matmul(self): sample_input = (torch.randn([1, 8, 512]), torch.randn([1, 2, 8, 256])) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_zero_dim_tensor(self): + module = ZeroDimTensor() # noqa: F405 + sample_input = (torch.randn(1, 256, 125),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_example_models(self): # TODO Fix MobileBertModelExample and TorchVisionViTModel instances = [ @@ -1111,6 +1156,14 @@ def test_qnn_backend_adaptive_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_amax(self): + modules = [AMax(dim=1, keepdim=False), AMax(dim=1, keepdim=True)] # noqa: F405 + sample_input = (torch.randn(4, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_any(self): modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)] # noqa: F405 sample_input = (torch.randn(3, 3, 3) > 0,) @@ -1164,6 +1217,15 @@ def test_qnn_backend_cat(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cdist(self): + module = CDist() # noqa: F405 + sample_input = ( + torch.randn(1, 125, 256), + torch.randn(1, 2048, 256), + ) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_chunk_single(self): module = Chunk() # noqa: F405 sample_input = (torch.randn(1, 1, 4, 3),) @@ -1237,7 +1299,6 @@ def test_qnn_backend_conv2d_block(self): module = self.get_qdq_module( module, sample_input, - is_conv_per_block=True, quant_dtype=QuantDtype.use_16a4w_block, block_size_map={"conv2d": (1, 
128, 1, 1)}, ) @@ -1282,6 +1343,12 @@ def test_qnn_backend_cos(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cumsum(self): + module = CumSum() # noqa: F405 + sample_input = (torch.randn(4),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_einsum_outer_product(self): module = EinsumOuterProduct() # noqa: F405 x = torch.randn(5) @@ -1326,8 +1393,8 @@ def test_qnn_backend_element_wise_add(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_element_wise_and(self): @@ -1367,8 +1434,8 @@ def test_qnn_backend_element_wise_div(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_element_wise_mul(self): @@ -1395,8 +1462,8 @@ def test_qnn_backend_element_wise_mul(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_element_wise_or(self): @@ -1455,8 +1522,8 @@ def test_qnn_backend_element_wise_sub(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + gm = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(gm, sample_input) index += 1 def test_qnn_backend_elu(self): @@ -1491,11 +1558,21 @@ def test_qnn_backend_equal(self): def test_qnn_backend_expand(self): modules = [ExpandAs(), ExpandCopy()] # noqa: F405 - sample_input = (torch.randn([3, 1]),) - for i, module in enumerate(modules): - with self.subTest(i=i): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + sample_inputs = [ + (torch.randn([3, 1]),), + (torch.randn([4]),), + ] + passes_job = get_capture_program_passes() + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True + index = 0 + for module in modules: + for sample_input in sample_inputs: + with self.subTest(i=index): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, sample_input, passes_job=passes_job + ) + index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -1929,6 +2006,12 @@ def test_qnn_backend_softmax(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_squared_relu(self): + module = SquaredReLU() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + 
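The extra delegate events emitted by `QnnProfile::ProfileData` earlier in this diff (one per QNN profile entry, with a unit suffix such as " (us)" or " (bytes)") are what bump `expected_profile_events` from 24/25 to 34/35 below. A minimal sketch of reading them back, assuming an ETDump/ETRecord pair on disk (paths are placeholders) and following the `Inspector` construction used in `backends/qualcomm/tests/utils.py` later in this diff:

```python
# Hedged sketch: inspecting the additional QNN delegate profile events.
# The etdump/etrecord paths are placeholders.
from executorch.devtools import Inspector
from executorch.devtools.inspector._inspector_utils import TimeScale

inspector = Inspector(
    etdump_path="etdump.etdp",
    etrecord="etrecord.bin",
    # QNN reports raw backend values for these events, so the tests keep both
    # scales in cycles rather than converting to wall-clock time.
    source_time_scale=TimeScale.CYCLES,
    target_time_scale=TimeScale.CYCLES,
)

df = inspector.to_dataframe()
print(len(df.index))  # corresponds to the bumped expected_profile_events counts
print(df.head())      # event identifiers now carry a unit suffix, e.g. " (us)"
```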
def test_qnn_backend_squeeze(self): module = Squeeze() # noqa: F405 sample_input = (torch.randn([1, 3, 3]),) @@ -1980,14 +2063,14 @@ def test_qnn_backend_where(self): Where(), # noqa: F405 WhereConstant(torch.randn(3, 2), torch.randn(3, 2)), # noqa: F405 WhereConstantOther(), # noqa: F405 - # WhereConstantAll(), # noqa: F405, TODO: constant dtype does not propogate when doing const i64->32, causing where to fail since where does not support int64 output + WhereConstantAll(), # noqa: F405 WhereConstantInf(), # noqa: F405 ] sample_inputs = [ (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)), (torch.randn(3, 2),), (torch.randn(3, 2),), - # (torch.randn(3, 2),), + (torch.randn(3, 2),), (torch.randn(30, 20),), ] for i, module in enumerate(modules): @@ -2122,6 +2205,32 @@ def test_qnn_backend_simple_model(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_submodules(self): + module = SimpleSubModules() # noqa: F405 + sample_input = ( + torch.rand(1, 3, 8, 8), + torch.rand(1, 3, 8, 8), + torch.rand(1, 3, 8, 8), + torch.rand(1, 3, 8, 8), + ) + + from executorch.backends.qualcomm.quantizer.quantizer import ( + get_submodule_type_predicate, + ) + + submodule_qconfig_list = [ + ( + get_submodule_type_predicate("Add"), + ModuleQConfig(QuantDtype.use_16a16w), + ) # noqa: F405 + ] + module = self.get_qdq_module( + module, + sample_input, + submodule_qconfig_list=submodule_qconfig_list, + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_topk_and_index(self): module = TopKandIndex() # noqa: F405 sample_input = (torch.randn(3, 10),) @@ -2135,6 +2244,12 @@ def test_qnn_backend_view_permute_matmul(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_zero_dim_tensor(self): + module = ZeroDimTensor() # noqa: F405 + sample_input = (torch.randn(1, 256, 125),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_example_models(self): instances = [ { @@ -2390,7 +2505,7 @@ def test_qnn_backend_profile_op(self): module, sample_input, expected_partitions=1, - expected_profile_events=24, + expected_profile_events=34, ) def test_qnn_backend_shared_buffer(self): @@ -3005,7 +3120,7 @@ def test_qnn_backend_profile_op(self): module, sample_input, expected_partitions=1, - expected_profile_events=25, + expected_profile_events=35, ) def test_qnn_backend_shared_buffer(self): @@ -3496,7 +3611,6 @@ def test_conv_former(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 80) - @unittest.skip("bicubic resize is not supported") def test_dino_v2(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -3532,6 +3646,46 @@ def test_dino_v2(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 85) + def test_efficientSAM(self): + if not self.required_envs( + [self.image_dataset, self.pretrained_weight, self.oss_repo] + ): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--oss_repo", + self.oss_repo, + "--pretrained_weight", + self.pretrained_weight, + "--ip", + self.ip, 
+ "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["MIoU"], 0.55) + def test_esrgan(self): if not self.required_envs(): self.skipTest("missing required envs") diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 41c56c08a85..71d3b9e7ec2 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -9,14 +9,14 @@ import subprocess import tempfile import unittest -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, OrderedDict, Tuple import numpy as np import torch from executorch import exir from executorch.backends.qualcomm.qnn_preprocess import QnnBackend -from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.quantizer.quantizer import ModuleQConfig, QuantDtype from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.constants import ( QCOM_DTYPE, @@ -30,6 +30,7 @@ to_edge_transform_and_lower_to_qnn, ) from executorch.devtools import generate_etrecord, Inspector +from executorch.devtools.inspector._inspector_utils import TimeScale from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, @@ -290,7 +291,12 @@ def post_process(): outputs.append(output) def validate_profile(): - inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) + inspector = Inspector( + etdump_path=etdump_path, + etrecord=etrecord_path, + source_time_scale=TimeScale.CYCLES, + target_time_scale=TimeScale.CYCLES, + ) self.assertTrue( len(inspector.to_dataframe().index) == expected_profile_events ) @@ -435,6 +441,7 @@ def lower_module_and_test_output( expected_profile_events: int = -1, expected_intermediate_events: int = -1, assert_output_equal: bool = True, + passes_job: Optional[OrderedDict] = None, skip_node_id_set: set = None, skip_node_op_set: set = None, dynamic_shapes: Dict = None, @@ -444,6 +451,7 @@ def lower_module_and_test_output( sample_inputs, self.compiler_specs, dynamic_shapes=dynamic_shapes, + passes_job=passes_job, skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, ) @@ -497,7 +505,6 @@ def get_qdq_module( self, module: torch.nn.Module, inputs: Tuple[torch.Tensor], - is_conv_per_block: Optional[bool] = False, is_conv_per_channel: Optional[bool] = True, is_linear_per_channel: Optional[bool] = False, custom_quant_annotations: Tuple[Callable] = (), @@ -505,6 +512,7 @@ def get_qdq_module( dynamic_shapes: Dict = None, bypass_check: bool = False, block_size_map: Dict[str, Tuple] = None, + submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None, ) -> torch.fx.GraphModule: m = torch.export.export( module, inputs, dynamic_shapes=dynamic_shapes, strict=True @@ -513,9 +521,9 @@ def get_qdq_module( quantizer = make_quantizer( quant_dtype=quant_dtype, custom_annotations=custom_quant_annotations, - per_block_conv=is_conv_per_block, per_channel_conv=is_conv_per_channel, per_channel_linear=is_linear_per_channel, + submodule_qconfig_list=submodule_qconfig_list, ) if block_size_map is not None: quantizer.set_block_size_map(block_size_map) @@ -543,6 +551,7 @@ def get_prepared_qat_module( is_linear_per_channel: 
Optional[bool] = False, custom_quant_annotations: Tuple[Callable] = (), quant_dtype: QuantDtype = QuantDtype.use_8a8w, + submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None, ) -> torch.fx.GraphModule: m = torch.export.export_for_training(module, inputs, strict=True).module() @@ -551,12 +560,12 @@ def get_prepared_qat_module( custom_annotations=custom_quant_annotations, per_channel_conv=is_conv_per_channel, per_channel_linear=is_linear_per_channel, + is_qat=True, + submodule_qconfig_list=submodule_qconfig_list, ) - if quant_dtype == QuantDtype.use_8a8w: - quantizer.set_quant_config(quant_dtype, is_qat=True) - else: - raise RuntimeError("Shuld not be here") + submodule_qconfig_list = submodule_qconfig_list or [] + quantizer.set_submodule_qconfig_list(submodule_qconfig_list) prepared = prepare_qat_pt2e(m, quantizer) return torch.ao.quantization.move_exported_model_to_train(prepared) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index f7b966ee8ea..e0ebc5beebe 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -16,7 +16,7 @@ import torch -from executorch.backends.qualcomm._passes import AnnotateStack +from executorch.backends.qualcomm._passes import AnnotateStack, AnnotateUnbind from executorch.backends.qualcomm._passes.qnn_pass_manager import QnnPassManager from executorch.backends.qualcomm.builders.node_visitor import ( @@ -304,11 +304,12 @@ def get_decomp_table(passes_job) -> Dict[torch._ops.OperatorBase, Callable]: skip_decompositions = get_skip_decomp_table() # If we want to annotate the decomposed ops, then we should decompose the operation. - if passes_job and passes_job.get(AnnotateStack, False): + if passes_job: skip_decompositions = [ skip_decomp_op for skip_decomp_op in skip_decompositions - if skip_decomp_op not in AnnotateStack.decomp_ops + if skip_decomp_op + not in AnnotateStack.decomp_ops + AnnotateUnbind.decomp_ops ] remove_decompositions(source_decompositions, skip_decompositions) diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index 329dab96df2..73e9d986c3d 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -62,6 +62,9 @@ def call( # Copy node from decompose graph module for decomposed_node in decomposed_module.graph.nodes: + node.meta["nn_module_stack"] = decomposed_node.meta.get( + "nn_module_stack" + ) if decomposed_node.op == "placeholder": continue diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md index 2cfff6a6eb6..3ae80950645 100644 --- a/backends/vulkan/README.md +++ b/backends/vulkan/README.md @@ -133,7 +133,7 @@ will be executed on the GPU. ::::{note} -The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/supported_ops.py) +The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/op_registry.py#L194) Vulkan partitioner code can be inspected to examine which ops are currently implemented in the Vulkan delegate. 
:::: diff --git a/backends/vulkan/_passes/int4_weight_only_quantizer.py b/backends/vulkan/_passes/int4_weight_only_quantizer.py index 409cbb4b755..d0b73b8af0e 100644 --- a/backends/vulkan/_passes/int4_weight_only_quantizer.py +++ b/backends/vulkan/_passes/int4_weight_only_quantizer.py @@ -118,9 +118,6 @@ def _vk_replace_linear_int4( # Use custom vulkan linear layer as default linear_class: Type[torch.nn.Module] = VkWeightOnlyInt4Linear, copy_weights: bool = False, - # Serves the same purpose as `tensor_dim_limit` in - # executorch.backends.vulkan.partitioner.VulkanSupportedOperators - feature_limit: int = 16384, ): for name, child in module.named_children(): if isinstance(child, torch.nn.Linear) and ( @@ -131,8 +128,6 @@ def _vk_replace_linear_int4( if ( _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) or padding_allowed - ) and ( - child.out_features < feature_limit and child.in_features < feature_limit ): new_linear = linear_class( child.in_features, @@ -175,7 +170,6 @@ def __init__( inner_k_tiles: Optional[int] = 8, device: torch.device = torch.device("cpu"), # noqa precision: torch.dtype = torch.float32, - feature_limit: int = 16384, ) -> None: super().__init__() assert inner_k_tiles in [2, 4, 8] @@ -186,9 +180,6 @@ def __init__( self.padding_allowed: bool = padding_allowed self.device: torch.device = device self.precision: torch.dtype = precision - # Serves the same purpose as `tensor_dim_limit` in - # executorch.backends.vulkan.partitioner.VulkanSupportedOperators - self.feature_limit = feature_limit @torch.no_grad() def _create_quantized_state_dict( @@ -197,10 +188,7 @@ def _create_quantized_state_dict( cur_state_dict = model.state_dict() for fqn, mod in model.named_modules(): # Add additional check to make sure features do not exceed feature limit - if isinstance(mod, torch.nn.Linear) and ( - mod.out_features < self.feature_limit - and mod.in_features < self.feature_limit - ): + if isinstance(mod, torch.nn.Linear): out_features = mod.out_features in_features = mod.in_features logging.info(f"linear: {fqn}, in={in_features}, out={out_features}") diff --git a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py index a0160efa90f..b4337829d7f 100644 --- a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py +++ b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py @@ -27,6 +27,19 @@ class SqueezeUnsqueezeInputs(ExportPass): exir_ops.edge.aten.gelu.default, } + def should_squeeze(self, op, shape: List[int]) -> bool: # pyre-ignore + if len(shape) == 3: + return shape[1] == 1 and shape[0] > 1 + if len(shape) == 4: + # No need to squeeze if all dims are 1 except the width dim + if all(dim == 1 for dim in shape[:-1]): + return False + # Otherwise, check for squeezable dim + return 1 in shape[:-1] + + # Prefer not to introduce additional orchestration ops by default + return False + def call_operator( self, op, # pyre-ignore @@ -34,18 +47,18 @@ def call_operator( kwargs: Dict[str, Argument], meta: NodeMetadata, ) -> ProxyValue: - def _squeezable(shape: List[int]) -> bool: - return len(shape) > 2 and 1 in shape - if op not in self._squeezable_ops: return super().call_operator(op, args, kwargs, meta) - # pyre-ignore[16]: `None` has no attribute `node` input_shape = args[0].node.meta["val"].shape output_shape = meta["val"].shape - if not _squeezable(input_shape): + + if not self.should_squeeze(op, input_shape): return super().call_operator(op, args, kwargs, meta) + def _squeezable(shape: List[int]) -> bool: + return len(shape) 
> 2 and 1 in shape + # squeeze input tensor squeeze_shape = list(input_shape) while _squeezable(squeeze_shape): diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 67285738b4c..adbffaa76fd 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -45,16 +45,20 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) function(gen_vulkan_shader_lib_cpp shaders_path) set(VULKAN_SHADERGEN_ENV "") - set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/${ARGV1}) + set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/vulkan_compute_shaders) - execute_process( + add_custom_command( + COMMENT "Generating Vulkan Compute Shaders" + OUTPUT ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp COMMAND "${PYTHON_EXECUTABLE}" ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} - --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH}/shader_cache/ - --env ${VULKAN_GEN_ARG_ENV} - RESULT_VARIABLE error_code + --glslc-path=${GLSLC_PATH} + --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH}/shader_cache/ --env + ${VULKAN_GEN_ARG_ENV} + DEPENDS ${shaders_path}/* + ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py ) set(generated_spv_cpp @@ -86,13 +90,6 @@ macro(vulkan_shader_library shaders_path library_name) set(VULKAN_SHADERGEN_ENV "") set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/${library_name}) - # execute_process( COMMAND "${PYTHON_EXECUTABLE}" - # ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path - # ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} - # --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH} --env - # ${VULKAN_GEN_ARG_ENV} RESULT_VARIABLE error_code ) set(ENV{PYTHONPATH} - # ${PYTHONPATH}) - set(generated_spv_cpp ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp) add_library(${library_name} STATIC ${generated_spv_cpp}) diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index 7eab1c21f89..1f36b76ec6f 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -1,6 +1,6 @@ # Building and Running ExecuTorch with the Vulkan Backend -The [ExecuTorch Vulkan Delegate](./native-delegates-executorch-vulkan-delegate.md) +The [ExecuTorch Vulkan Delegate](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) is a native GPU delegate for ExecuTorch. @@ -12,8 +12,8 @@ is a native GPU delegate for ExecuTorch. ::: :::{grid-item-card} Prerequisites: :class-card: card-prerequisites -* Follow [**Setting up ExecuTorch**](./getting-started-setup.md) -* It is also recommended that you read through [**ExecuTorch Vulkan Delegate**](./native-delegates-executorch-vulkan-delegate.md) and follow the example in that page +* Follow [**Setting up ExecuTorch**](../../../docs/source/getting-started-setup.rst) +* It is also recommended that you read through [**ExecuTorch Vulkan Delegate**](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) and follow the example in that page ::: :::: @@ -59,7 +59,7 @@ partially lower the Llama model to Vulkan. 
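For illustration, the squeeze heuristic introduced in `SqueezeUnsqueezeInputs.should_squeeze` above can be restated as a standalone check; the shapes below are chosen only to make the rule concrete and are not taken from this PR:

```python
# Hedged sketch: the should_squeeze rule from the hunk above, restated as a
# standalone function (the original is a method that also receives the op).
from typing import List


def should_squeeze(shape: List[int]) -> bool:
    if len(shape) == 3:
        # Squeeze e.g. [B, 1, W] down to [B, W] only when B > 1.
        return shape[1] == 1 and shape[0] > 1
    if len(shape) == 4:
        # [1, 1, 1, W] is already effectively 1-D; squeezing adds no value.
        if all(dim == 1 for dim in shape[:-1]):
            return False
        # Otherwise squeeze when some non-width dim is 1, e.g. [1, 8, 1, 32].
        return 1 in shape[:-1]
    # Prefer not to introduce additional orchestration ops for other ranks.
    return False


assert should_squeeze([4, 1, 32]) is True      # squeezable middle dim, batch > 1
assert should_squeeze([1, 1, 32]) is False     # batch of 1: nothing gained
assert should_squeeze([1, 1, 1, 32]) is False  # all leading dims already 1
assert should_squeeze([1, 8, 1, 32]) is True   # squeezable non-width dim present
```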
# The files will usually be downloaded to ~/.llama python -m examples.models.llama.export_llama \ --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ - --model "llama3_2" \ + --model "llama3_2" \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index b33430a6bca..026f1db9273 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -277,6 +277,7 @@ def register_binary_op(features: OpFeatures): exir_ops.edge.aten.rsqrt.default, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.round.default, + exir_ops.edge.aten.leaky_relu.default, ] ) def register_unary_op(features: OpFeatures): @@ -392,6 +393,7 @@ def register_int8_mm_op(features: OpFeatures): @update_features(exir_ops.edge.et_vk.linear_weight_int4.default) def register_int4_mm_op(features: OpFeatures): + features.buffer_impl = True features.texture_impl = TextureImplFeatures( uses_axis_map=False, valid_packed_dims={PackedDim.WIDTH}, @@ -400,6 +402,7 @@ def register_int4_mm_op(features: OpFeatures): features.optimal_storage = VkStorageType.TEXTURE_3D features.optimal_layout = VkMemoryLayout.TENSOR_WIDTH_PACKED features.handles_own_prepacking = True + features.skip_limits_check = {1} return features diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 4cbd1290401..62b53f9a76c 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -260,6 +260,26 @@ vkapi::VulkanImage allocate_image( return vkapi::VulkanImage(); } + // TODO(ssjia): change to always check that the image extents do not exceed + // physical limits. Adding the check now based on `maxImageDimension3D` will + // cause some existing models to break. Anecdotally, on Adreno and + // SwiftShader devices, using 3D textures that exceed `maxImageDimension3D` + // appears to be ok. So we need to figure out if is it undefined behaviour + // or if there's a better way to figure out what the limit is. For now, only + // check during debug build so that we can detect when exceeding physical + // limits could be a potential cause for model outputs to be wrong. In the + // meantime, the threshold for using texture storage can be configured at + // export time. +#ifdef VULKAN_DEBUG + uint32_t max_extent = storage_type == utils::kTexture3D + ? 
adapter_ptr->max_texture3d_dim() + : adapter_ptr->max_texture2d_dim(); + + VK_CHECK_COND( + image_extents[0] <= max_extent && image_extents[1] <= max_extent && + image_extents[2] <= max_extent); +#endif + VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); return adapter_ptr->vma().create_image( @@ -291,6 +311,8 @@ vkapi::VulkanBuffer allocate_buffer( return vkapi::VulkanBuffer(); } + VK_CHECK_COND(numel <= context_ptr->adapter_ptr()->max_buffer_numel()); + return adapter_ptr->vma().create_storage_buffer( element_size(dtype) * numel, allocate_memory); } @@ -497,9 +519,7 @@ vTensor::vTensor( VK_CHECK_COND( dim_order_is_valid(dim_order_), "computed dim order is invalid"); - if (storage_type != utils::kBuffer) { - set_logical_limits(storage_.image_extents_); - } + set_logical_limits(storage_.image_extents_); } // NOLINTNEXTLINE diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index a29f7d14964..e52780b6a4d 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -125,6 +125,8 @@ def buffer_gvec_type(dtype: str, n: int) -> str: if dtype == "float": return f"vec{n}" + if dtype == "uint": + return f"uvec{n}" elif dtype == "half": return f"f16vec{n}" elif dtype == "int": diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl index 1c2ffe7afe4..c0ed9204227 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl @@ -47,7 +47,17 @@ void main() { // Compute the start and end of the input indices to load. Padding is assumed // to be constant 0 padding, so reads from the padding region are skipped. - const ivec2 start = max(ivec2(0), ipos); + ivec2 start = ipos; + if (start.x < 0) { + // number of "steps" to get to >= zero is div_up(-start, dilation) + int num_steps = ((-ipos.x) + dilation.x - 1) / dilation.x; + start.x = ipos.x + num_steps * dilation.x; + } + if (start.y < 0) { + // number of "steps" to get to >= zero is div_up(-start, dilation) + int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y; + start.y = ipos.y + num_steps * dilation.y; + } const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy)); // Compute the start of the kernel based on how far we are skipping ahead when // reading the input. Note that these are "canonical" indices. diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl index d6c94661ace..c3e53cbfc3b 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl @@ -43,106 +43,275 @@ ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); const lowp int out_packed_dim = unhash_packed_dim(out_layout); -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); +#define MAX_WORKGROUP_SIZE 64 + +// Shared memory factor increases shared memory allocation by a scale that should either be 1 or a power of 2. +// +// Increasing factor allows more data to be stored in shared memory and increase thread utilization during reduction. +// Why? Because when performing reduction, the number of active threads becomes half in each iteration. +// Increasing scaling factor increases the thread occupancy and hence utilize the GPU better. +// eg. 
+// If local thread size in x dimension is 32, and SHARED_MEMORY_FACTOR is 1, 32 elements will be loaded into shared memory. +// First iteration of reduce will have 16 threads sum up 32 elements. +// Second iteration will have 8 threads sum up 16 elements from previous iteration and so on. +// So thread utilization starts at 50%. +// +// By contrast if local thread size in x dimension is 32, and SHARED_MEMORY_FACTOR is 2, 64 elements will be loaded into shared memory. +// First iteration of reduce will have 32 threads sum up 64 elements. +// Second iteration will have 32 threads sum up 16 elements from previous iteration and so on. +// Thus thread utilization starts at 100%. +#define SHARED_MEMORY_FACTOR 2 + +#define offset_pos_index(index) ((index) + ((index) >> 2)) + +shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)]; + +// Function to reduce input data in workgroup's x dimension +// +// The implementation resembles reduction as depicted below +// | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | 2 | 3 | 2 | 7 | 0 | 11 | 0 | 2 | current_stride -> 1 +// | / | / | / | / | / | / | / | / +// | / | / | / | / | / | / | / | / +// | / | / | / | / | / | / | / | / +// | 11 | 1 | 9 | 1 | 2 | 2 | 8 | 5 | 5 | 3 | 9 | 7 | 11 | 11 | 2 | 2 | current_stride -> 2 +// | / | / | / | / +// | / | / | / | / +// | / | / | / | / +// | 20 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |14 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 4 +// | / | / +// | / | / +// | / | / +// | / | / +// | / | / +// | 30 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 8 +// | / +// | / +// | / +// | / +// | / +// | / +// | / +// | / +// | / +// | 57 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride = -> 16 +// +// Threads access shared index in following pattern +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 1 +// Shared Index | 0 | 2 | 4 | 6 | 8 | 10 | 12 | 14 | X | X | X | X | X | X | X | X | index *= 1 +// +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 2 +// Shared Index | 0 | 4 | 8 | 12 | X | X | X | X | X | X | X | X | X | X | X | X | index *= 2 +// +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 4 +// Shared Index | 0 | 8 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 4 +// +// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 8 +// Shared Index | 0 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 8 + +void reduce_input(const int width_stride, const int shared_idx_offset) { + // wait for all shared memory writes to finish + memoryBarrierShared(); + barrier(); + + // loop log(width_stride) times + for (int current_stride = 1, index = int(gl_LocalInvocationID.x << 1); current_stride < width_stride; current_stride *= 2, index <<= 1) { + // if the index at this thread is within the width stride + if (index < width_stride) { + const int local_shared_idx = shared_idx_offset + index; + // add the value at current stride to this thread's value + shared_input[offset_pos_index(local_shared_idx)] += shared_input[offset_pos_index(local_shared_idx + current_stride)]; + } - if (any(greaterThanEqual(lpos, out_limits))) { - return; + memoryBarrierShared(); + barrier(); } +} +void reduce_non_packed_dim() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); const int width = int(sizes.x); + ivec3 in_pos = lpos_to_pos(lpos, 
in_axis_map); - if (in_packed_dim != W_DIM) { - VEC4_T mean = VEC4_T(0); - VEC4_T delta = VEC4_T(0); - VEC4_T delta2 = VEC4_T(0); - VEC4_T M2 = VEC4_T(0); - - // Use Welford's online algorithm to compute mean and variance in one pass - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - for (int w = 0; w < width; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - delta = v - mean; - mean += delta / (w + 1); - delta2 = v - mean; - M2 += delta * delta2; + // width batch read stride + const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; + + // local memory starting offset for this thread + const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); + + // local memory index for this thread + const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); + + VEC4_T mean = VEC4_T(0); + VEC4_T var = VEC4_T(0); + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset < width; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + + VEC4_T in_val = VEC4_T(0); + if (all(lessThan(in_pos, out_limits))) { + in_val = load_texel(t_in, in_pos); + } + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val; } - VEC4_T var = M2 / width; - VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5)); - VEC4_T offset = -rstd * mean; - - for (int w = 0; w < width; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - // broadcasting - VEC4_T weight = load_texel(t_weight, ivec3(w, 0, 0)).xxxx; - VEC4_T bias = load_texel(t_bias, ivec3(w, 0, 0)).xxxx; - VEC4_T outtex = (v * rstd + offset) * weight + bias; - write_texel_lpos(t_out, ivec3(w, lpos.y, lpos.z), outtex, out_axis_map); + reduce_input(width_stride, shared_idx_offset); + mean += shared_input[offset_pos_index(shared_idx_offset)]; + } + + mean /= width; + + memoryBarrierShared(); + barrier(); + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset < width; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + + VEC4_T in_val = mean; + if (all(lessThan(in_pos, out_limits))) { + in_val = load_texel(t_in, in_pos); + } + + const VEC4_T delta = in_val - mean; + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta * delta; } + reduce_input(width_stride, shared_idx_offset); + var += shared_input[offset_pos_index(shared_idx_offset)]; + } + + var /= width; + + VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5)); + VEC4_T offset = -rstd * mean; + + VEC4_T v = load_texel(t_in, lpos); + VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)).xxxx; + VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)).xxxx; + VEC4_T outtex = (v * rstd + offset) * weight + bias; + + if (all(lessThan(lpos, out_limits))) { + write_texel_lpos(t_out, lpos, outtex, out_axis_map); + } + + if (gl_GlobalInvocationID.x == 0) { write_texel(t_mean, lpos, mean); write_texel(t_rstd, lpos, rstd); - } else { - const int packed_width = divup4(width); - - T mean = T(0); - T delta = T(0); - T delta2 = T(0); - T M2 = T(0); - // Use Welford's online algorithm to 
compute mean and variance in one pass - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - T width_counter = T(1); - - const bool has_unaligned_width = (width & 0x3) != 0; - const int fully_packed_4_comp_count = packed_width - mix(0, 1, has_unaligned_width); - - // iterate through texels that are fully packed ie. has 4 components - for (int w = 0; w < fully_packed_4_comp_count; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - for (int i=0; i<4; i++) { - delta = v[i] - mean; - mean += delta / width_counter; - delta2 = v[i] - mean; - M2 += delta * delta2; - width_counter++; + } +} + +void reduce_packed_dim() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const int width = int(sizes.x); + ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); + + // width batch read stride + const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; + + // local memory starting offset for this thread + const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); + + // local memory index for this thread + const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); + + const int last_packed_width_index = divup4(width) - 1; + T mean = T(0); + T var = T(0); + const int remain = width & 3; + + const int in_pos_x_limit = out_limits[in_axis_map.x]; + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + in_pos[in_axis_map.x] = in_pos_x; + + VEC4_T in_val = VEC4_T(0); + if (in_pos_x < in_pos_x_limit) { + in_val = load_texel(t_in, in_pos); } - } - // handle last texel if its not 4 aligned - if (has_unaligned_width) { - in_pos[in_axis_map.x] = fully_packed_4_comp_count; - const int remaining_width = width & 0x3; - - VEC4_T v = load_texel(t_in, in_pos); - for (int i=0; i 2); + in_val.z = mix(in_val.z, T(0), remain_inv > 1); + in_val.w = mix(in_val.w, T(0), remain_inv > 0); } + + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = in_val; } - T var = M2 / (width_counter - 1); - T rstd = inversesqrt(var + epsilon); - T offset = -rstd * mean; - - for (int w = 0; w < packed_width; ++w) { - in_pos[in_axis_map.x] = w; - VEC4_T v = load_texel(t_in, in_pos); - VEC4_T weight = load_texel(t_weight, ivec3(w, 0, 0)); - VEC4_T bias = load_texel(t_bias, ivec3(w, 0, 0)); - VEC4_T outtex = (v * rstd + offset) * weight + bias; - write_texel_lpos(t_out, ivec3(w, lpos.y, lpos.z), outtex, out_axis_map); + reduce_input(width_stride, shared_idx_offset); + const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)]; + mean += val.x + val.y + val.z + val.w; + } + + mean /= width; + + memoryBarrierShared(); + barrier(); + + // Loop over the width in stride increments + for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) { + // Read input in shared memory + for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { + const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); + in_pos[in_axis_map.x] = in_pos_x; + + VEC4_T in_val = VEC4_T(mean); + if (in_pos_x < in_pos_x_limit) { + in_val = load_texel(t_in, in_pos); + } + + if (in_pos_x == last_packed_width_index && remain 
!= 0) { + const int remain_inv = 4 - remain; + in_val.y = mix(in_val.y, mean.x, remain_inv > 2); + in_val.z = mix(in_val.z, mean.x, remain_inv > 1); + in_val.w = mix(in_val.w, mean.x, remain_inv > 0); + } + + const VEC4_T delta = in_val - mean; + const VEC4_T delta2 = delta * delta; + shared_input[offset_pos_index(shared_idx + si * gl_WorkGroupSize.x)] = delta2; } + reduce_input(width_stride, shared_idx_offset); + const VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)]; + var += val.x + val.y + val.z + val.w; + } + + var /= width; + + T rstd = pow(var + epsilon, T(-0.5)); + T offset = -rstd * mean; + + VEC4_T v = load_texel(t_in, lpos); + VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)); + VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)); + VEC4_T outtex = (v * rstd + offset) * weight + bias; + + if (all(lessThan(lpos, out_limits))) { + write_texel_lpos(t_out, lpos, outtex, out_axis_map); + } + + if (gl_GlobalInvocationID.x == 0) { write_texel(t_mean, lpos, VEC4_T(mean)); write_texel(t_rstd, lpos, VEC4_T(rstd)); } } + +void main() { + // if packed dimension width + if (in_packed_dim != W_DIM) { + reduce_non_packed_dim(); + } else { + reduce_packed_dim(); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl new file mode 100644 index 00000000000..0079526c248 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl @@ -0,0 +1,163 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if not NO_INT8_BUFFERS: + ${define_required_extensions("uint8")} +$if STORAGE == "buffer": + ${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_qmat2", "uint8", STORAGE, is_scalar_array=False)} +$if NO_INT8_BUFFERS: + ${layout_declare_tensor(B, "r", "nchw_4x2", "uint", "buffer")} +$else: + ${layout_declare_tensor(B, "r", "nchw_4x2", "uint8", "buffer")} + +layout(push_constant) uniform restrict Block { + ivec4 qmat2_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +$if NO_INT8_BUFFERS: + #define BUF_T uint +$else: + #define BUF_T uint8_t + +$if STORAGE == "buffer": + #define UVEC4_T u8vec4 +$else: + #define UVEC4_T uvec4 + +uint get_first(const BUF_T packed) { + return (packed & 0xF0) >> 4; +} + +uint get_second(const BUF_T packed) { + return packed & 0x0F; +} + +uint combine(const uint first, const uint second) { + return (first << 4 | second); +} + +$if NO_INT8_BUFFERS: + uint extract_comp(const uint packed4, const uint idx) { + return (packed4 >> (idx * 8)) & 0xFF; + } + +/* + * This shader packs the weight tensor into a texture. + * + * The original tensor has a (W, H) shape of (K / 2, N) and each scalar element + * is a uint8_t, which contains 2 packed 4 bit uint values. + * + * The transform performed by this shader is to first transpose the tensor, so + * the shape of the packed tensor becomes (N / 2, K). Then, the 4 bit integers + * are re-packed in groups of 8. For each 4 uint8_t values, the "left" 4-bits + * of each value contain the 0, 1, 2, 3 4-bit values, and the "right" 4-bits of + * each value contain the 4, 5, 6, 7 4-bit values. 
+ * + * As a concrete example, consider the following weight tensor. The | demarks + * the packing boundary, so 1| 2 represents a single uint8_t value with 1 in the + * leftmost 4 bits and 2 in the rightmost 4 bits. + * + * 1| 2, 3| 4, 5| 6, 7| 8, + * 9|10, 11|12, 13|14, 15|16, + * 17|18, 19|20, 21|22, 23|24, + * 25|26, 27|28, 29|30, 31|32, + * 33|34, 35|36, 37|38, 39|40, + * 41|42, 43|44, 45|46, 47|48, + * 49|50, 51|52, 53|54, 55|56, + * 57|58, 59|60, 61|62, 63|64, + * + * After packing, the packed tensor would contain + * + * 1|33, 9|41, 17|49, 25|57, + * 2|34, 10|42, 18|50, 26|58, + * 3|35, 11|43, 19|51, 27|59, + * 4|36, 12|44, 20|52, 28|60, + * 5|37, 13|45, 21|53, 29|61, + * 6|38, 14|46, 22|54, 30|62, + * 7|39, 15|47, 23|55, 31|63, + * 8|40, 16|48, 24|56, 32|64, + * + * The purpose of interleaving is to make it easier to extract the unpacked + * values in order using the u8vec4 vectorized type. With the packing in place, + * The 4-bit values can be extracted via + * + * u8vec4 packed; + * u8vec4 vals_0123 = (packed & 0xF0) >> 4; + * u8vec4 vals_4567 = (packed | 0x0F); + */ +void main() { + // Each thread writes 2 output texels along the height axis + ivec2 packed_pos = ivec2( + gl_GlobalInvocationID.x, + gl_GlobalInvocationID.y << 1); + + // The packed tensor is width packed + if ((packed_pos.x << 2) >= qmat2_sizes.x || packed_pos.y >= qmat2_sizes.y) { + return; + } + + int out_col = packed_pos.x << 3; + int out_row = packed_pos.y; + + int in_col = out_row; + int in_int8_col = in_col >> 1; + int in_row = out_col; + + int in_numrows = qmat2_sizes.x << 1; + int in_numcols = qmat2_sizes.y; + int in_num_int8_cols = qmat2_sizes.y >> 1; + + uint in_vals[8][2]; + for (int r = 0; r < 8; ++r) { + if (in_row + r < in_numrows) { + uint scalar_idx = (in_row + r) * in_num_int8_cols + in_int8_col; + $if NO_INT8_BUFFERS: + BUF_T in_val_packed_texel = nchw_4x2[scalar_idx >> 2]; + const uint packed_idx = scalar_idx % 4; + uint in_val_packed = extract_comp(in_val_packed_texel, packed_idx); + $else: + BUF_T in_val_packed = nchw_4x2[scalar_idx]; + + in_vals[r][0] = get_first(in_val_packed); + in_vals[r][1] = get_second(in_val_packed); + } else { + in_vals[r][0] = uint(0); + in_vals[r][1] = uint(0); + } + } + + UVEC4_T out_tex_1 = UVEC4_T( + combine(in_vals[0][0], in_vals[4][0]), + combine(in_vals[1][0], in_vals[5][0]), + combine(in_vals[2][0], in_vals[6][0]), + combine(in_vals[3][0], in_vals[7][0])); + + UVEC4_T out_tex_2 = UVEC4_T( + combine(in_vals[0][1], in_vals[4][1]), + combine(in_vals[1][1], in_vals[5][1]), + combine(in_vals[2][1], in_vals[6][1]), + combine(in_vals[3][1], in_vals[7][1])); + + $if STORAGE == "buffer": + int stride = qmat2_sizes.x >> 2; + t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1; + t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2; + $else: + imageStore(t_qmat2, packed_pos.xy, out_tex_1); + imageStore(t_qmat2, ivec2(packed_pos.x, packed_pos.y + 1), out_tex_2); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml new file mode 100644 index 00000000000..145f4301f14 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +pack_int4_linear_weight_transposed_interleaved: + parameter_names_with_default_values: + STORAGE: texture2d + NO_INT8_BUFFERS: false + shader_variants: + - NAME: pack_int4_linear_weight_transposed_interleaved_texture2d + - NAME: pack_int4_linear_weight_transposed_interleaved_buffer + STORAGE: buffer + - NAME: pack_int4_linear_weight_transposed_interleaved_nobitw8buffer_texture2d + NO_INT8_BUFFERS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl index 8a8703becd9..716c42e8ede 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl @@ -31,6 +31,8 @@ layout(push_constant) uniform PRECISION restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; +#extension GL_EXT_control_flow_attributes : require + void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); @@ -54,11 +56,16 @@ void main() { in_bchw_pos[out_ndims[2]] = pos.y; in_bchw_pos[out_ndims[3]] = pos.x; - for (int j = 0; j < 4; ++j) { + const int in_packed_dim_size = in_sizes[3 - out_ndims[in_packed_dim_bchw_index]]; + + [[unroll]] for (int j = 0, bchw_index = in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]; j < 4; ++j, ++bchw_index) { // terminate the loop if trying to access input texture out of bounds - if (any(greaterThanEqual(in_bchw_pos.wzyx, in_sizes.xyzw))) { + if (bchw_index >= in_packed_dim_size) { break; } + // go to position in the input, that is mapped to the packed dim in the output + in_bchw_pos[out_ndims[in_packed_dim_bchw_index]] = bchw_index; + ivec3 fetch_pos; fetch_pos.xy = in_bchw_pos.wz; @@ -74,9 +81,6 @@ void main() { // fetch input texel VEC4_T inval = VEC4_T(load_texel(t_in, fetch_pos)); outval[j] = inval[in_packed_dim_lane_index]; - - // go to next position in the input, that is mapped to the packed dim in the output - in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]++; } pos[packed_dim] = int(gl_GlobalInvocationID[packed_dim]); diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl deleted file mode 100644 index b702a110a65..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -#define FOUR 4 - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define FLOAT_T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions([DTYPE, "uint8", "uint16"])} -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "ret", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "x", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "weights", "uint8", "buffer")} -${layout_declare_tensor(B, "r", "qparams", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "ret_limits")} -${layout_declare_ubo(B, "ivec4", "x_sizes")} -${layout_declare_ubo(B, "ivec4", "weights_strides")} -${layout_declare_ubo(B, "ivec4", "qparams_strides")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 1; - -/* - * This shader computes a linear operator between a floating point input matrix - * x and a weights matrix that is quantized to 4 bits. - * - * The (W, H, C) shape of each tensor is: - * - x: (K, M) - * - weights: (K / 2, N) - * - The weights tensor has a data type of `uint8`. Each element in the tensor - * contains 2 4-bit values packed into a uint8. - * - qparams: (2, N, number_of_groups) - * - This tensor contains the scales and zeros quantization parameters for the - * weights tensor. The weight tensor is quantized group-wise, which means - * that every `group_size` elements along the K dimension of the weights - * tensor has independent quantization parameters. Along the width dim, the - * first value contains the scale for the group and the second value - * contains the zero point for the group. - * - * Note that this shader assumes that all tensors are width packed. - */ -void main() { - // output positions being calculated are (n, m), (n + 1, m), ... - // This means multiplying the m-th row of x with the n-th, (n+1)-th, ... rows - // of the weights tensor. 
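For reference while reading this shader (and its replacements later in the patch), the group-wise int4 linear computation it performs can be written as a scalar Python sketch. This is illustrative only and not code from the patch; it assumes the dequantization convention used above, i.e. the unsigned 4-bit value is centered by subtracting 8 before the scale and zero point are applied.

```python
def q4_linear_reference(x, w_q, scales, zeros, group_size):
    # x: M x K floats; w_q: N x K unpacked 4-bit ints (0..15);
    # scales, zeros: (K // group_size) x N floats.
    M, K, N = len(x), len(x[0]), len(w_q)
    out = [[0.0] * N for _ in range(M)]
    for m in range(M):
        for n in range(N):
            acc = 0.0
            for k in range(K):
                g = k // group_size
                # center the unsigned nibble around 0, then dequantize
                w = (w_q[n][k] - 8) * scales[g][n] + zeros[g][n]
                acc += x[m][k] * w
            out[m][n] = acc
    return out
```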
- const u16vec3 ret_pos = u16vec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(ret_pos, ret_limits))) { - return; - } - - // Since ret is width packed, need to multiply by 4 - const uint16_t n = uint16_t(ret_pos.x * 4); - - // K is guaranteed to be a multiple of group size - const uint16_t num_blocks = uint16_t(x_sizes.x / group_size); - - uint16_t k_texel_i = uint16_t(0); - vec4 sums = vec4(0.0); - for (uint16_t block_idx = uint16_t(0); block_idx < num_blocks; block_idx++) { - vec4 scales; - vec4 zeros; - - [[unroll]] for (int comp = 0; comp < 4; comp++) { - const vec4 scale_and_zero = load_texel( - qparams, u16vec3(0, n + comp, block_idx)); - scales[comp] = scale_and_zero.x; - zeros[comp] = scale_and_zero.y; - } - - for (uint16_t i = uint16_t(0); i < group_size; i += uint16_t(4), k_texel_i++) { - const VEC4_T x_texel = load_texel( - x, u16vec3(k_texel_i, ret_pos.y, ret_pos.z)); - - [[unroll]] for (int comp = 0; comp < 4; comp++) { - const int weights_bufi = (n + comp) * weights_strides.y + (k_texel_i * 2); - // Need to read 4 unpacked values, which corresponds to 2 packed values - const uint8_t weights_val_1 = weights[weights_bufi]; - const uint8_t weights_val_2 = weights[weights_bufi + 1]; - - const u8vec4 weights_texel = u8vec4( - (weights_val_1 & 0xF0) >> 4, - weights_val_1 & 0x0F, - (weights_val_2 & 0xF0) >> 4, - weights_val_2 & 0x0F); - - // Note that the unpacked 4-bit values are unsigned, therefore they must - // first be "centered" around 0 by subtracting 8 before applying the - // scale and zero point. - sums[comp] += dot( - x_texel, (vec4(weights_texel) - 8.0) * scales[comp] + zeros[comp]); - } - } - } - write_texel(ret, ret_pos, sums); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml deleted file mode 100644 index 40d95d4a05f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -q_4w_linear: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: q_4w_linear_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.glsl new file mode 100644 index 00000000000..715f84d3a56 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.glsl @@ -0,0 +1,199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} + +#define TILE_ROWS ${TILE_ROWS} + +#define NGROUPS 8 +#define NWORKERS 8 + +${define_required_extensions(DTYPE)} +$if WEIGHT_STORAGE == "buffer": + ${define_required_extensions("uint8")} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, IN_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 mat1_sizes; + ivec4 qmat2_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 64; + +shared VEC4_T partial_sums[NGROUPS][NWORKERS][TILE_ROWS][2]; + +/* + * This shader computes a linear operator between a floating point input matrix + * x and a weights matrix that is quantized to 4 bits. Please refer to the + * q_4w_linear_tiled shader for more details. + * + * This shader implements a co-operative algorithm to compute the output. The + * work group size is {NGROUPS, 1, NWORKERS}, and each group of NWORKERS threads + * co-operates to compute TILE_ROWS * 2 output texels. Therefore, + * NGROUPS * TILE_ROWS * 2 output texels are computed across one work group. + * + * The threads co-operate by each thread computing a partial reduction along the + * K dimension. To illustrate the computation, consider a scalar variant of the + * algorithm that computes the dot product of 2 vectors. Also assume that + * NWORKERS is 8. + * + * Thread 1 in each group will compute: + * (mat1[0] * mat2[0]) + (mat1[8] * mat2[8]) + (mat1[16] * mat2[16]) + ... + * + * Thread 2 in each group will compute: + * (mat1[1] * mat2[1]) + (mat1[9] * mat2[9]) + (mat1[17] * mat2[17]) + ... + * + * Thread 3 in each group will compute: + * (mat1[2] * mat2[2]) + (mat1[10] * mat2[10]) + (mat1[18] * mat2[18]) + ... + * + * The partial accumulations are structured such that memory accesses in each + * loop iteration can be coalesced. + * + * Then, at the end, the first thread in each group will accumulate the partial + * accumulations computed by each thread to obtain the final result. + * + * Note that this shader assumes that all tensors are width packed. + */ +void main() { + const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; + // Each thread writes out 2 texels along the width axis, equivalent to 8 + // scalar elements. Therefore multiply the thread_idx.x by 8. + const uint out_col = gl_GlobalInvocationID.x << 3; + // Similar reasoning to the above, each thread works on 2 texels along the + // width axis so multiply thread_idx.x by 2.
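A scalar Python model of the co-operative reduction described in the comment above may help; this is a simplified sketch with hypothetical names, not the shader itself, showing how NWORKERS threads each accumulate a strided slice of the K dimension before a single thread combines the partial sums.

```python
def coop_dot_product(mat1, mat2, nworkers=8):
    # Each worker accumulates a strided partial sum over K; in the shader the
    # workers run in parallel and the strided access lets loads be coalesced.
    K = len(mat1)
    partial = [0.0] * nworkers
    for worker in range(nworkers):
        for k in range(worker, K, nworkers):
            partial[worker] += mat1[k] * mat2[k]
    # Worker 0 combines the partial sums (after the shared-memory barrier in
    # the shader) to produce the final result.
    return sum(partial)

assert coop_dot_product([1.0] * 16, [2.0] * 16) == 32.0
```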
+ const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; + + const uint gid = gl_LocalInvocationID.x; // group id + const uint wid = gl_LocalInvocationID.z; // worker id + + if (out_col >= out_sizes.x || out_row >= out_sizes.y) { + return; + } + + const int num_blocks = mat1_sizes.x / group_size; + + VEC4_T mat1[TILE_ROWS]; + VEC4_T qmat2[4][2]; + VEC4_T local_sums[TILE_ROWS][2]; + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + local_sums[r][0] = VEC4_T(0); + local_sums[r][1] = VEC4_T(0); + } + + VEC4_T scales[2]; + VEC4_T zeros[2]; + + $if WEIGHT_STORAGE == "buffer": + const int qmat2_stride = qmat2_sizes.x >> 2; + $if PARAMS_STORAGE == "buffer": + const int qparams_y_stride = out_sizes.x >> 2; + const int qparams_z_stride = qparams_y_stride * 2; + + for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { + $if PARAMS_STORAGE == "buffer": + scales[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx]; + zeros[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + qparams_y_stride]; + + scales[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1]; + zeros[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1 + qparams_y_stride]; + $else: + scales[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 0, block_idx), 0); + zeros[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 1, block_idx), 0); + + scales[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 0, block_idx), 0); + zeros[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 1, block_idx), 0); + + for (uint g_idx = 4 * wid; g_idx < group_size; g_idx += (4 * NWORKERS)) { + const uint k = block_idx * group_size + g_idx; + + // Preload B + [[unroll]] for (int r = 0; r < 4; ++r) { + $if WEIGHT_STORAGE == "buffer": + const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; + $else: + const uvec4 packed_weight_tex = texelFetch( + t_qmat2, + ivec2(gl_GlobalInvocationID.x, k + r), + 0); + + qmat2[r][0] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0) * scales[0] + zeros[0]; + qmat2[r][1] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0) * scales[1] + zeros[1]; + } + + // Preload A + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if IN_STORAGE == "buffer": + mat1[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2]; + $else: + mat1[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0); + } + + // Accumulate local output tile + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + local_sums[r][0] += mat1[r].x * qmat2[0][0] + + mat1[r].y * qmat2[1][0] + + mat1[r].z * qmat2[2][0] + + mat1[r].w * qmat2[3][0]; + + local_sums[r][1] += mat1[r].x * qmat2[0][1] + + mat1[r].y * qmat2[1][1] + + mat1[r].z * qmat2[2][1] + + mat1[r].w * qmat2[3][1]; + } + } + } + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + partial_sums[gid][wid][r][0] = local_sums[r][0]; + partial_sums[gid][wid][r][1] = local_sums[r][1]; + } + + memoryBarrierShared(); + barrier(); + + if (wid != 0) { + return; + } + + VEC4_T sums[TILE_ROWS][2]; + + for (int r = 0; r < TILE_ROWS; ++r) { + sums[r][0] = VEC4_T(0); + sums[r][1] = VEC4_T(0); + [[unroll]] for (int worker = 0; worker < NWORKERS; ++ worker) { + sums[r][0] += partial_sums[gid][worker][r][0]; + sums[r][1] += partial_sums[gid][worker][r][1]; + } + } + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if OUT_STORAGE == "buffer": + t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = sums[r][0]; + t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = sums[r][1]; + $else: + imageStore(t_out, 
ivec3(out_col_texel_idx, out_row + r, 0), sums[r][0]); + imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), sums[r][1]); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.yaml new file mode 100644 index 00000000000..504cc4ab3b1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_coop.yaml @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q_4w_linear_coop: + parameter_names_with_default_values: + DTYPE: float + OUT_STORAGE: texture3d + IN_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + PARAMS_STORAGE: buffer + TILE_ROWS: 1 + shader_variants: + - NAME: q_4w_linear_coop_texture3d_texture3d_texture2d_float + - NAME: q_4w_linear_coop_buffer_buffer_texture2d_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + - NAME: q_4w_linear_coop_buffer_buffer_buffer_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.glsl new file mode 100644 index 00000000000..64d0991e489 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.glsl @@ -0,0 +1,161 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} + +#define TILE_ROWS ${TILE_ROWS} + +${define_required_extensions(DTYPE)} +$if WEIGHT_STORAGE == "buffer": + ${define_required_extensions("uint8")} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, IN_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 mat1_sizes; + ivec4 qmat2_sizes; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 64; + +/* + * This shader computes a linear operator between a floating point input matrix + * x and a weights matrix that is quantized to 4 bits. + * + * The (W, H, C) shape of each tensor is: + * - x: (K, M) + * - weights: (N / 2, K) + * - The weights tensor has a data type of `uint8`. Each element in the tensor + * contains 2 4-bit values packed into a uint8. + * - See the pack_int4_linear_weight_transposed_interleave shader to see more + * details on how the weight tensor is stored. + * - qparams: (2, N, number_of_groups) + * - This tensor contains the scales and zeros quantization parameters for the + * weights tensor. The weight tensor is quantized group-wise, which means + * that every `group_size` elements along the K dimension of the weights + * tensor has independent quantization parameters. 
Along the width dim, the + * first value contains the scale for the group and the second value + * contains the zero point for the group. + * + * Each thread computes a tile of TILE_ROWS * 2 texels of the output tensor. + * + * Note that this shader assumes that all tensors are width packed. + */ +void main() { + const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; + // Each thread writes out 2 texels along the width axis, equivalent to 8 + // scalar elements. Therefore multiply the thread_idx.x by 8. + const uint out_col = gl_GlobalInvocationID.x << 3; + // Similar reasoning to the above, each thread works on 2 texels along the + // width axis so multiply thread_idx.x by 2. + const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; + + if (out_col >= out_sizes.x || out_row >= out_sizes.y) { + return; + } + + const int num_blocks = mat1_sizes.x / group_size; + + VEC4_T mat1[TILE_ROWS]; + VEC4_T qmat2[4][2]; + VEC4_T sums[TILE_ROWS][2]; + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + sums[r][0] = VEC4_T(0); + sums[r][1] = VEC4_T(0); + } + + VEC4_T scales[2]; + VEC4_T zeros[2]; + + $if WEIGHT_STORAGE == "buffer": + const int qmat2_stride = qmat2_sizes.x >> 2; + $if PARAMS_STORAGE == "buffer": + const int qparams_y_stride = out_sizes.x >> 2; + const int qparams_z_stride = qparams_y_stride * 2; + + for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { + $if PARAMS_STORAGE == "buffer": + scales[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx]; + zeros[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + qparams_y_stride]; + + scales[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1]; + zeros[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1 + qparams_y_stride]; + $else: + scales[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 0, block_idx), 0); + zeros[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 1, block_idx), 0); + + scales[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 0, block_idx), 0); + zeros[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 1, block_idx), 0); + + for (int g_idx = 0; g_idx < group_size; g_idx += 4) { + const int k = block_idx * group_size + g_idx; + + // Preload B + [[unroll]] for (int r = 0; r < 4; ++r) { + $if WEIGHT_STORAGE == "buffer": + const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; + $else: + const uvec4 packed_weight_tex = texelFetch( + t_qmat2, + ivec2(gl_GlobalInvocationID.x, k + r), + 0); + + qmat2[r][0] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0) * scales[0] + zeros[0]; + qmat2[r][1] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0) * scales[1] + zeros[1]; + } + + // Preload A + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if IN_STORAGE == "buffer": + mat1[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2]; + $else: + mat1[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0); + } + + // Accumulate output tile + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + sums[r][0] += mat1[r].x * qmat2[0][0] + + mat1[r].y * qmat2[1][0] + + mat1[r].z * qmat2[2][0] + + mat1[r].w * qmat2[3][0]; + + sums[r][1] += mat1[r].x * qmat2[0][1] + + mat1[r].y * qmat2[1][1] + + mat1[r].z * qmat2[2][1] + + mat1[r].w * qmat2[3][1]; + } + } + } + + [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { + $if OUT_STORAGE == "buffer": + if (out_row + r < out_sizes.y) { + t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = sums[r][0]; + t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = sums[r][1]; + } + $else: + 
imageStore(t_out, ivec3(out_col_texel_idx, out_row + r, 0), sums[r][0]); + imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), sums[r][1]); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.yaml new file mode 100644 index 00000000000..865a46629df --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear_tiled.yaml @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q_4w_linear_tiled: + parameter_names_with_default_values: + DTYPE: float + OUT_STORAGE: texture3d + IN_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + PARAMS_STORAGE: buffer + TILE_ROWS: 3 + shader_variants: + - NAME: q_4w_linear_tiled_texture3d_texture3d_texture2d_float + - NAME: q_4w_linear_tiled_buffer_buffer_texture2d_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + - NAME: q_4w_linear_tiled_buffer_buffer_buffer_float + OUT_STORAGE: buffer + IN_STORAGE: buffer + WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 228e2e8f870..dfb5f1f2f9c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -29,16 +29,20 @@ ${layout_declare_tensor(2, "r", "t_qmat2", "int8", STORAGE)} ${layout_declare_tensor(3, "r", "t_scales", DTYPE, STORAGE)} $if STORAGE == "buffer": - ${layout_declare_ubo(4, "ivec4", "out_sizes")} - ${layout_declare_ubo(5, "ivec4", "out_strides")} - ${layout_declare_ubo(6, "int", "out_numel")} - ${layout_declare_ubo(7, "ivec4", "mat1_sizes")} - ${layout_declare_ubo(8, "ivec4", "mat1_strides")} - ${layout_declare_ubo(9, "ivec4", "qmat2_strides")} - ${layout_declare_ubo(10, "ivec4", "scales_strides")} + layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 out_strides; + ivec4 mat1_sizes; + ivec4 mat1_strides; + ivec4 qmat2_strides; + ivec4 scales_strides; + int out_numel; + }; $else: - ${layout_declare_ubo(4, "ivec3", "out_limits")} - ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} + layout(push_constant) uniform restrict Block { + ivec3 out_limits; + ivec4 mat1_sizes; + }; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -83,42 +87,40 @@ void main() { #else // USING_TEXTURE -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require - void main() { - const u16vec2 out_pos = u16vec2( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y); + const ivec2 out_pos = ivec2( + gl_GlobalInvocationID.x % out_limits.x, + gl_GlobalInvocationID.x / out_limits.x); - if (out_pos.x >= out_limits.x || out_pos.y >= out_limits.y) { + if (out_pos.y >= out_limits.y) { return; } - const uint16_t qmat2_pos_x = out_pos.x; + const int qmat2_pos_x = out_pos.x; VEC4_T outtex = VEC4_T(0); - const VEC4_T scales = load_texel(t_scales, u16vec3(out_pos.x, 0, 0)); + const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0)); VEC4_T mat1_tex; VEC4_T mat2_tex[4]; for ( - uint16_t i = uint16_t(0), x = uint16_t(0); - i < uint16_t(mat1_sizes.x); - i += uint16_t(4), x++) + int i = 0, x = 0; + i < mat1_sizes.x; + i += 4, x++) { - mat1_tex = load_texel(t_mat1, u16vec3(x, out_pos.y, 0)); + mat1_tex = load_texel(t_mat1, ivec3(x, out_pos.y, 0)); - mat2_tex[0] = load_texel(t_qmat2, u16vec3(out_pos.x, i, 0)); - mat2_tex[1] = 
load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(1), 0)); - mat2_tex[2] = load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(2), 0)); - mat2_tex[3] = load_texel(t_qmat2, u16vec3(out_pos.x, i + uint16_t(3), 0)); + mat2_tex[0] = load_texel(t_qmat2, ivec3(out_pos.x, i, 0)); + mat2_tex[1] = load_texel(t_qmat2, ivec3(out_pos.x, i + 1, 0)); + mat2_tex[2] = load_texel(t_qmat2, ivec3(out_pos.x, i + 2, 0)); + mat2_tex[3] = load_texel(t_qmat2, ivec3(out_pos.x, i + 3, 0)); outtex += mat1_tex.x * mat2_tex[0] + mat1_tex.y * mat2_tex[1] + mat1_tex.z * mat2_tex[2] + mat1_tex.w * mat2_tex[3]; } outtex *= scales; - write_texel(t_out, u16vec3(out_pos, 0), outtex); + write_texel(t_out, ivec3(out_pos, 0), outtex); } #endif diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 18599ed4ba6..060f5028c02 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -262,11 +262,6 @@ void check_conv2d_params(const Kernel2dParams& p, const bool transposed) { "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!"); } } - if ((p.padding[0] > 0 && p.kernel_size[0] > 1 && p.dilation[0] > 1) || - (p.padding[1] > 0 && p.kernel_size[1] > 1 && p.dilation[1] > 1)) { - VK_THROW( - "aten.convolution.default: padding > 0 while dilation, kernel_size > 1 is not supported yet!"); - } } Conv2dMethod get_conv2d_method( diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp index 7aa98e52654..f2e8eff763a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -83,8 +83,19 @@ void add_native_layer_norm_node( std::vector in_sizes = t_input->sizes(); - utils::uvec3 global_size = t_mean->logical_limits(); - utils::uvec3 local_size = adaptive_work_group_size(global_size); + utils::uvec3 global_size = t_out->logical_limits(); + utils::uvec3 local_size; + + // Since the shader uses a shared memory scale factor > 1 when the dispatch + // is larger than the maximum WG size, setting the WG size in the X axis to + // the max WG size allows the best thread utilization. + if (global_size[0] > 64) { + local_size = {64, 1, 1}; + } else { + // If the thread count in the X axis is smaller than or equal to the maximum + // WG size, we can let the function decide the best WG size. + local_size = graph.create_local_wg_size(global_size); + } std::string kernel_name("native_layer_norm"); kernel_name.reserve(kShaderNameReserve); diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp new file mode 100644 index 00000000000..4b33dd9b806 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
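The q_8w_linear texture path above now derives the 2-D output position from a flat 1-D invocation index instead of a 2-D dispatch. A small Python sketch of that mapping, purely illustrative with hypothetical names:

```python
def texel_positions_from_flat_dispatch(numel, limits_x, limits_y):
    # Mirrors out_pos = ivec2(gid % limits.x, gid / limits.x) plus the
    # out_pos.y bounds check performed by the shader.
    positions = []
    for gid in range(numel):
        x, y = gid % limits_x, gid // limits_x
        if y >= limits_y:
            continue
        positions.append((x, y))
    return positions

# Every texel of a 3 x 2 output is covered exactly once.
assert len(texel_positions_from_flat_dispatch(6, 3, 2)) == 6
```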
+ */ + +#include + +#include + +#include +#include + +namespace vkcompute { + +void check_q_4w_linear_args( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros, + const ValueRef out) { + VK_CHECK_COND(graph.val_is_tensor(mat1)); + VK_CHECK_COND(graph.val_is_tref(mat2_data)); + VK_CHECK_COND(graph.val_is_tref(scales_and_zeros)); + + VK_CHECK_COND(graph.dim_of(mat1) <= 3); + VK_CHECK_COND(graph.dim_of(mat2_data) == 2); + VK_CHECK_COND(graph.dim_of(scales_and_zeros) == 3); + + VK_CHECK_COND(graph.size_at(-3, mat1) == 1); + const int K = graph.size_at(-1, mat1); + VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); + + const int group_size_val = graph.extract_scalar(group_size); + VK_CHECK_COND(K % group_size_val == 0); + // Due to the way weight packing works, group size needs to be a multiple of 8 + VK_CHECK_COND(group_size_val % 8 == 0); + + VK_CHECK_COND(graph.has_standard_axis_map(mat1)); + VK_CHECK_COND(graph.has_standard_axis_map(out)); +} + +void resize_q_4w_linear_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); + + const int out_cols = utils::val_at(-2, mat1->sizes()); + const int out_rows = utils::val_at(-1, mat2->sizes()) * 2; + + std::vector new_out_sizes(3); + if (mat1->sizes().size() == 2) { + new_out_sizes.resize(2); + new_out_sizes.at(0) = out_cols; + new_out_sizes.at(1) = out_rows; + } else { + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = out_cols; + new_out_sizes.at(2) = out_rows; + } + + out->virtual_resize(new_out_sizes); +} + +ValueRef prepack_int4_linear_weight_transposed_interleaved( + ComputeGraph& graph, + const ValueRef qmat2_data) { + std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); + const int64_t ndim = graph.dim_of(qmat2_data); + + const int64_t K = qmat2_orig_sizes.at(ndim - 1) * 2; + const int64_t N = qmat2_orig_sizes.at(ndim - 2); + const int64_t N_div2 = N / int64_t(2); + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (N_div2 > max_extent * 4 || K > max_extent) { + storage_type = utils::kBuffer; + } + + std::vector qmat2_sizes{K, N_div2}; + ValueRef qmat2 = graph.add_tensor( + qmat2_sizes, vkcompute::vkapi::kByte, storage_type, utils::kWidthPacked); + + utils::uvec3 global_wg_size; + global_wg_size = graph.logical_limits_of(qmat2); + global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(2)); + + std::string kernel_name = + graph.context()->adapter_ptr()->has_full_int8_buffers_support() + ? 
"pack_int4_linear_weight_transposed_interleaved" + : "pack_int4_linear_weight_transposed_interleaved_nobitw8buffer"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + qmat2_data, + qmat2, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(qmat2)})); + + return qmat2; +} + +void add_q_4w_linear_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros_data, + const ValueRef out) { + check_q_4w_linear_args( + graph, mat1, mat2_data, group_size, scales_and_zeros_data, out); + + const uint32_t group_size_val = graph.extract_scalar(group_size); + + bool use_coop_algorithm = false; + // Apply the coop algorithm for gemv cases, i.e. mat1 is a vector as opposed + // to a matrix. + if (graph.size_at(-2, mat1) == 1) { + use_coop_algorithm = true; + } + + ValueRef mat2 = + prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data); + + ValueRef scales_and_zeros = prepack_standard_hw_transposed( + graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked); + + std::string kernel_name = "q_4w_linear"; + if (use_coop_algorithm) { + kernel_name += "_coop"; + } else { + kernel_name += "_tiled"; + } + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(mat2)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + utils::uvec3 global_wg_size = graph.logical_limits_of(out); + global_wg_size[0] = utils::div_up(global_wg_size[0], uint32_t(2)); + utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); + + if (use_coop_algorithm) { + local_wg_size = {8, 1, 8}; + } else { + global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(3)); + } + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {{mat1, mat2, scales_and_zeros}, vkapi::kRead}}, + // Shader params buffers + {}, + // Specialization Constants + {SV(group_size_val)}, + // Resizing Logic + resize_q_4w_linear_node, + {}, + // Push Constants + {graph.sizes_pc_of(out), + graph.sizes_pc_of(mat1), + graph.sizes_pc_of(mat2)})); +} + +void linear_weight_int4( + ComputeGraph& graph, + const std::vector& args) { + return add_q_4w_linear_node( + graph, + args[0], // mat1 + args[1], // mat2 + args[2], // group_size + args[3], // scales_and_zeros + // There is an unused variable inner_k_tiles which is used to call + // _convert_weight_to_int4pack in the AOT custom op, which is why the 4th + // argument is skipped. 
+ args[5] // out + ); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.linear_weight_int4.default, linear_weight_int4); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp similarity index 56% rename from backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp rename to backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp index f4f5c853ddd..5054b2e5e9c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp @@ -98,47 +98,25 @@ void add_q_8w_linear_node( add_dtype_suffix(kernel_name, graph.dtype_of(out_W_packed)); add_storage_type_suffix(kernel_name, graph.storage_type_of(out_W_packed)); - vkapi::ParamsBindList ubos({}); + std::vector pcs; if (graph.is_buffer_storage(out_W_packed)) { - ubos.append( - {graph.sizes_ubo(out_W_packed), - graph.strides_ubo(out_W_packed), - graph.numel_ubo(out_W_packed), - graph.sizes_ubo(mat1_W_packed), - graph.strides_ubo(mat1), - graph.strides_ubo(q_mat2), - graph.strides_ubo(scales)}); + pcs = { + graph.sizes_pc_of(out_W_packed), + graph.strides_pc_of(out_W_packed), + graph.sizes_pc_of(mat1_W_packed), + graph.strides_pc_of(mat1), + graph.strides_pc_of(q_mat2), + graph.strides_pc_of(scales), + graph.numel_pc_of(out_W_packed)}; } else { - ubos.append( - {graph.logical_limits_ubo(out_W_packed), - graph.sizes_ubo(mat1_W_packed)}); + pcs = { + graph.logical_limits_pc_of(out_W_packed), + graph.sizes_pc_of(mat1_W_packed)}; } - utils::uvec3 global_wg; - if (graph.is_buffer_storage(out)) { - global_wg = {static_cast(graph.numel_of(out_W_packed)), 1, 1}; - } else { - global_wg = graph.logical_limits_of(out_W_packed); - } - - utils::uvec3 local_wg{8, 8, 1}; - int32_t out_W = graph.size_at(-1, out_W_packed); - - if (graph.is_buffer_storage(out_W_packed)) { - local_wg[0] = 64; - local_wg[1] = 1; - local_wg[2] = 1; - } else { - if (out_W % 8 != 0) { - if (out_W % 4 == 0) { - local_wg[0] = 4; - local_wg[1] = 16; - } else { - local_wg[0] = 2; - local_wg[1] = 32; - } - } - } + const utils::uvec3 global_wg = { + static_cast(graph.numel_of(out_W_packed)), 1, 1}; + const utils::uvec3 local_wg{64, 1, 1}; graph.execute_nodes().emplace_back(new DispatchNode( graph, @@ -149,11 +127,13 @@ void add_q_8w_linear_node( {{out_W_packed, vkapi::MemoryAccessType::WRITE}, {{mat1_W_packed, q_mat2, scales}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - ubos, + {}, // Specialization Constants {}, // Resizing Logic - resize_q_8w_linear_node)); + resize_q_8w_linear_node, + {}, + pcs)); if (!graph.is_buffer_storage(out) && graph.packed_dim_of(out) != WHCN::kWidthDim) { viewFn(graph, {out_W_packed, graph.add_none(), out}); @@ -268,157 +248,8 @@ void weight_int8pack_mm( return add_q_8w_linear_node(graph, args[0], args[1], args[2], args[3]); } -void check_q_4w_linear_args( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros, - const ValueRef out) { - VK_CHECK_COND(graph.int16_shader_types_enabled()); - VK_CHECK_COND(graph.int8_buffers_enabled()); - - VK_CHECK_COND(graph.val_is_tensor(mat1)); - VK_CHECK_COND(graph.val_is_tref(mat2_data)); - VK_CHECK_COND(graph.val_is_tref(scales_and_zeros)); - - VK_CHECK_COND(graph.dim_of(mat1) <= 3); - VK_CHECK_COND(graph.dim_of(mat2_data) == 2); - VK_CHECK_COND(graph.dim_of(scales_and_zeros) == 3); - - VK_CHECK_COND(graph.size_at(-3, mat1) == 1); - const int 
K = graph.size_at(-1, mat1); - VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); - - const int group_size_val = graph.extract_scalar(group_size); - VK_CHECK_COND(K % group_size_val == 0); - - VK_CHECK_COND(graph.has_standard_axis_map(mat1)); - VK_CHECK_COND(graph.has_standard_axis_map(out)); -} - -void resize_q_4w_linear_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - vTensorPtr out = graph->get_tensor(args[0].refs[0]); - vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); - vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); - - const int out_cols = utils::val_at(-2, mat1->sizes()); - const int out_rows = utils::val_at(-2, mat2->sizes()); - - std::vector new_out_sizes(3); - if (mat1->sizes().size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1->sizes().at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - out->virtual_resize(new_out_sizes); -} - -void add_q_4w_linear_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros_data, - const ValueRef out) { - check_q_4w_linear_args( - graph, mat1, mat2_data, group_size, scales_and_zeros_data, out); - - utils::StorageType storage_type = graph.storage_type_of(out); - - ValueRef mat2 = prepack_direct_copy_buffer(graph, mat2_data); - - ValueRef scales_and_zeros = prepack_standard( - graph, - scales_and_zeros_data, - graph.storage_type_of(out), - utils::kWidthPacked); - - std::string kernel_name = "q_4w_linear"; - add_storage_type_suffix(kernel_name, storage_type); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const uint32_t group_size_val = graph.extract_scalar(group_size); - - ValueRef mat1_W_packed = mat1; - ValueRef out_W_packed = out; - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - // Create temporary tensors to store the width packed versions of mat1 and out - TmpTensor mat1_tmp( - &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked); - TmpTensor out_tmp( - &graph, graph.sizes_of(out), graph.dtype_of(out), utils::kWidthPacked); - if (storage_type == utils::kTexture3D) { - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(mat1) != WHCN::kWidthDim) { - // Ensure mat1 is width packed - mat1_W_packed = mat1_tmp; - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - // Ensure out is packed correctly - out_W_packed = out_tmp; - } - } - - vkapi::ParamsBindList ubos({}); - ubos.append(graph.logical_limits_ubo(out_W_packed)); - ubos.append(graph.sizes_ubo(mat1_W_packed)); - ubos.append(graph.strides_ubo(mat2)); - ubos.append(graph.strides_ubo(scales_and_zeros)); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out_W_packed); - utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Inputs and Outputs - {{out_W_packed, vkapi::MemoryAccessType::WRITE}, - {{mat1_W_packed, mat2, scales_and_zeros}, - vkapi::MemoryAccessType::READ}}, - // Shader params buffers - ubos, - // Specialization Constants - {SV(group_size_val)}, - // Resizing Logic - resize_q_4w_linear_node, - {})); - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(out) != WHCN::kWidthDim) { - viewFn(graph, {out_W_packed, graph.add_none(), out}); - } -} - -void linear_weight_int4( - 
ComputeGraph& graph, - const std::vector& args) { - return add_q_4w_linear_node( - graph, - args[0], // mat1 - args[1], // mat2 - args[2], // group_size - args[3], // scales_and_zeros - // There is an unused variable inner_k_tiles which is used to call - // _convert_weight_to_int4pack in the AOT custom op, which is why the 4th - // argument is skipped. - args[5] // out - ); -} - REGISTER_OPERATORS { VK_REGISTER_OP(aten._weight_int8pack_mm.default, weight_int8pack_mm); - VK_REGISTER_OP(et_vk.linear_weight_int4.default, linear_weight_int4); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index be0554161d3..d73ed1bc0ce 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -211,6 +211,18 @@ class Adapter final { return physical_device_.min_ubo_alignment; } + inline uint32_t max_texture2d_dim() const { + return physical_device_.properties.limits.maxImageDimension2D; + } + + inline uint32_t max_texture3d_dim() const { + return physical_device_.properties.limits.maxImageDimension3D; + } + + inline uint32_t max_buffer_numel() const { + return physical_device_.properties.limits.maxStorageBufferRange; + } + // Command Buffer Submission void diff --git a/backends/vulkan/test/CMakeLists.txt b/backends/vulkan/test/CMakeLists.txt index 4559077ccf8..592d7fca40e 100644 --- a/backends/vulkan/test/CMakeLists.txt +++ b/backends/vulkan/test/CMakeLists.txt @@ -46,8 +46,7 @@ if(LIB_VULKAN_BACKEND) set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../third-party) set(GTEST_INCLUDE_PATH - ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include set - (PYTORCH_PATH ${EXECUTORCH_ROOT}/third-party/pytorch) + ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include ) set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) diff --git a/backends/vulkan/test/op_tests/CMakeLists.txt b/backends/vulkan/test/op_tests/CMakeLists.txt new file mode 100644 index 00000000000..0c0558b7917 --- /dev/null +++ b/backends/vulkan/test/op_tests/CMakeLists.txt @@ -0,0 +1,166 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON + +cmake_minimum_required(VERSION 3.19) +project(executorch) + +find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) +find_package(GTest CONFIG REQUIRED) + +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) +endif() + +# Include this file to access target_link_options_shared_lib This is required to +# provide access to target_link_options_shared_lib which allows libraries to be +# linked with the --whole-archive flag. This is required for libraries that +# perform dynamic registration via static initialization. 
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + +get_torch_base_path(TORCH_BASE_PATH) +message(STATUS "torch base path: ${TORCH_BASE_PATH}") + +# Only build tests if Vulkan was compiled +find_library(LIB_VULKAN_BACKEND vulkan_backend) +find_library(LIB_TORCH torch ${TORCH_BASE_PATH}/lib) +find_library(LIB_TORCH_CPU torch_cpu ${TORCH_BASE_PATH}/lib) +find_library(LIB_C10 c10 ${TORCH_BASE_PATH}/lib) + +message(STATUS "Vulkan backend lib ${LIB_VULKAN_BACKEND}") +message(STATUS "Torch ${LIB_TORCH}") + +if(NOT PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) +endif() + +# Third party include paths + +set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party) + +set(GTEST_INCLUDE_PATH + ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include +) +set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) +set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) +set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) + +set(COMMON_INCLUDES + ${EXECUTORCH_ROOT}/.. + ${VULKAN_HEADERS_PATH} + ${VOLK_PATH} + ${VMA_PATH} + ${GTEST_INCLUDE_PATH} + ${TORCH_BASE_PATH}/include + ${TORCH_BASE_PATH}/include/torch/csrc/api/include +) + +target_link_options_shared_lib(vulkan_backend) + +function(vulkan_op_test test_name test_src) + set(extra_deps ${ARGN}) + + add_executable(${test_name} ${test_src}) + target_include_directories(${test_name} PRIVATE ${COMMON_INCLUDES}) + target_link_libraries( + ${test_name} + PRIVATE GTest::gtest_main + vulkan_backend + executorch + ${LIB_TORCH} + ${LIB_TORCH_CPU} + ${LIB_C10} + ${extra_deps} + ) + + add_test(${test_name} ${test_name}) +endfunction() + +if(LIB_VULKAN_BACKEND AND LIB_TORCH) + find_library( + CUSTOM_OPS_LIB custom_ops_aot_lib + HINTS ${CMAKE_INSTALL_PREFIX}/executorch/extension/llm/custom_ops + ) + if(CUSTOM_OPS_LIB) + vulkan_op_test( + vulkan_sdpa_test ${CMAKE_CURRENT_SOURCE_DIR}/sdpa_test.cpp + ${CUSTOM_OPS_LIB} + ) + else() + message( + STATUS "Skip building sdpa_test because custom_ops_aot_lib is not found" + ) + endif() + vulkan_op_test( + vulkan_rope_test ${CMAKE_CURRENT_SOURCE_DIR}/rotary_embedding_test.cpp + ) + vulkan_op_test( + vulkan_linear_weight_int4_test + ${CMAKE_CURRENT_SOURCE_DIR}/linear_weight_int4_test.cpp + ) + + # Only build generated op tests if a path to tags.yaml and + # native_functions.yaml is provided. These files are required for codegen. 
+ if(TORCH_OPS_YAML_PATH) + set(GENERATED_VULKAN_TESTS_CPP_PATH ${CMAKE_CURRENT_BINARY_DIR}/vk_gen_cpp) + + # Generated operator correctness tests + + set(generated_test_cpp ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_tests.cpp) + + add_custom_command( + COMMENT "Generating Vulkan operator correctness tests" + OUTPUT ${generated_test_cpp} + COMMAND + ${PYTHON_EXECUTABLE} + ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_correctness_tests.py + -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path + ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path + ${TORCH_OPS_YAML_PATH}/native_functions.yaml + DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py + ) + + vulkan_op_test(vulkan_op_correctness_tests ${generated_test_cpp}) + + # Generated operator benchmarks (only built in google benchmark is + # installed) + find_package(benchmark CONFIG) + + if(benchmark_FOUND) + set(generated_benchmark_cpp + ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_benchmarks.cpp + ) + + add_custom_command( + COMMENT "Generating Vulkan operator benchmarks" + OUTPUT ${generated_benchmark_cpp} + COMMAND + ${PYTHON_EXECUTABLE} + ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_benchmarks.py + -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path + ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path + ${TORCH_OPS_YAML_PATH}/native_functions.yaml + DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py + ) + + vulkan_op_test(vulkan_op_benchmarks ${generated_benchmark_cpp}) + endif() + else() + message( + STATUS + "Skipping generated operator correctness tests and benchmarks. Please specify TORCH_OPS_YAML_PATH to build these tests." + ) + endif() +endif() diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 85008a52ff0..a1b03db27c9 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -226,153 +226,190 @@ def get_max_pool2d_inputs(): @register_test_suite("aten.convolution.default") def get_conv_inputs(): - test_suite = VkTestSuite( + Test = namedtuple( + "ConvTest", [ - ( - (1, 6, 40, 50), - (8, 6, 3, 3), - (8,), - [1, 2], - [2, 3], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 6, 40, 50), - (6, 8, 3, 3), - (8,), - [1, 2], - [2, 3], - [1, 1], - True, - [0, 1], - 1, - ), - ( - (1, 8, 72, 96), - (8, 1, 3, 3), - (8,), - [1, 1], - [1, 1], - [1, 1], - False, - [0, 0], - 8, - ), - ( - (1, 8, 72, 96), - (8, 8, 1, 1), - (8,), - [1, 1], - [1, 1], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 6, 40, 50), - (8, 6, 3, 3), - None, - [1, 2], - [2, 3], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 6, 7), - (6, 1, 3), - (6,), - [1], - [0], - [1], - False, - [0], - 6, - ), - ( - (2, 20, 30), - (10, 4, 6), - (10,), - [5], - [5], - [3], - False, - [0], - 5, - ), - ( - (1, 9, 11), - (9, 1, 3), - None, - [1], - [0], - [1], - False, - [0], - 9, - ), - ( - (5, 15, 30), - (20, 3, 3), - None, - [3], - [5], - [7], - False, - [0], - 5, - ), - ( - (1, 16, 672, 512), - (64, 16, 1, 1), - (64,), - [1, 1], - [0, 0], - [1, 1], - False, - [0, 0], - 1, - ), - ( - (1, 4, 234, 234), - (4, 1, 3, 3), - (4,), - [2, 1], - [1, 1], - [1, 1], - False, - [0, 0], - 4, - ), - ( - (1, 4, 234, 234), - (4, 1, 3, 3), - (4,), - [1, 2], - [1, 1], - [1, 1], - False, - [0, 0], - 4, - ), - ( - (1, 4, 234, 234), - (4, 1, 3, 3), - (4,), - [2, 2], - [1, 1], - [1, 1], - False, - [0, 0], - 4, - ), - ] + "self", + "weight", + "bias", + "stride", + "padding", + "dilation", + "transposed", + "output_padding", + "groups", + ], + ) + Test.__new__.__defaults__ = ( + 
None, + None, + None, + [1, 1], + [0, 0], + [1, 1], + False, + [9, 0], + 1, ) + test_cases = [] + test_cases = [ + Test( + self=(1, 6, 40, 50), + weight=(8, 6, 3, 3), + bias=(8,), + stride=[1, 2], + padding=[2, 3], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 6, 40, 50), + weight=(6, 8, 3, 3), + bias=(8,), + stride=[1, 2], + padding=[2, 3], + dilation=[1, 1], + transposed=True, + output_padding=[0, 1], + groups=1, + ), + Test( + self=(1, 8, 72, 96), + weight=(8, 1, 3, 3), + bias=(8,), + stride=[1, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=8, + ), + Test( + self=(1, 8, 72, 96), + weight=(8, 8, 1, 1), + bias=(8,), + stride=[1, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 6, 40, 50), + weight=(8, 6, 3, 3), + bias=None, + stride=[1, 2], + padding=[2, 3], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 6, 7), + weight=(6, 1, 3), + bias=(6,), + stride=[1], + padding=[0], + dilation=[1], + transposed=False, + output_padding=[0], + groups=6, + ), + Test( + self=(2, 20, 30), + weight=(10, 4, 6), + bias=(10,), + stride=[5], + padding=[5], + dilation=[3], + transposed=False, + output_padding=[0], + groups=5, + ), + Test( + self=(1, 9, 11), + weight=(9, 1, 3), + bias=None, + stride=[1], + padding=[0], + dilation=[1], + transposed=False, + output_padding=[0], + groups=9, + ), + Test( + self=(5, 15, 30), + weight=(20, 3, 3), + bias=None, + stride=[3], + padding=[5], + dilation=[7], + transposed=False, + output_padding=[0], + groups=5, + ), + Test( + self=(1, 16, 672, 512), + weight=(64, 16, 1, 1), + bias=(64,), + stride=[1, 1], + padding=[0, 0], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[2, 1], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[1, 2], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 4, 234, 234), + weight=(4, 1, 3, 3), + bias=(4,), + stride=[2, 2], + padding=[1, 1], + dilation=[1, 1], + transposed=False, + output_padding=[0, 0], + groups=4, + ), + Test( + self=(1, 8, 90, 77), + weight=(1, 8, 3, 3), + bias=(1,), + stride=[1, 1], + padding=[2, 2], + dilation=[2, 2], + transposed=False, + output_padding=[0, 0], + groups=1, + ), + ] + + test_suite = VkTestSuite(test_cases) return test_suite diff --git a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py index 4e51e23940b..8814070abd3 100644 --- a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py +++ b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py @@ -58,6 +58,9 @@ def process_test_suites( def generate_cpp( native_functions_yaml_path: str, tags_path: str, output_dir: str ) -> None: + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_file = os.path.join(output_dir, "op_tests.cpp") cpp_generator = VkCorrectnessTestFileGen(output_file) diff --git a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp index 66a585844cf..e617f5b5249 100644 --- a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp +++ 
b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp @@ -152,13 +152,17 @@ vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { } } -void test_vulkan_linear_int4( +void test_vulkan_linear_int4_impl( const int B, const int M, const int K, const int N, const int group_size = 32, - const int inner_k_tiles = 8) { + const int inner_k_tiles = 8, + const vkcompute::utils::StorageType in_storage = + vkcompute::utils::kTexture3D, + const vkcompute::utils::StorageType out_storage = + vkcompute::utils::kTexture3D) { assert(K % group_size == 0); at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); @@ -169,8 +173,13 @@ void test_vulkan_linear_int4( at::Tensor scales_and_zeros = at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor out_ref = dequantize_and_linear( - x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); + at::Tensor weights_int = unpack_weights_4x2(weights_4x2); + at::Tensor out_ref = linear_weight_int4_reference_impl( + x, + at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size), + group_size, + scales_and_zeros, + inner_k_tiles); // Build Vulkan graph using namespace vkcompute; @@ -188,14 +197,13 @@ void test_vulkan_linear_int4( MAKE_TENSORREF_FOR(weights_4x2); MAKE_TENSORREF_FOR(scales_and_zeros); -#define MAKE_INPUT_FOR(x) \ - IOValueRef r_##x = graph.add_input_tensor( \ - x.sizes().vec(), from_at_scalartype(x.scalar_type())); - - MAKE_INPUT_FOR(x); + IOValueRef r_x = graph.add_input_tensor( + x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), from_at_scalartype(out_ref.scalar_type())); + out_ref.sizes().vec(), + from_at_scalartype(out_ref.scalar_type()), + out_storage); VK_GET_OP_FN("et_vk.linear_weight_int4.default") (graph, @@ -229,6 +237,34 @@ void test_vulkan_linear_int4( ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); } +void test_vulkan_linear_int4( + const int B, + const int M, + const int K, + const int N, + const int group_size = 32, + const int inner_k_tiles = 8) { + test_vulkan_linear_int4_impl( + B, + M, + K, + N, + group_size, + inner_k_tiles, + vkcompute::utils::kBuffer, + vkcompute::utils::kBuffer); + + test_vulkan_linear_int4_impl( + B, + M, + K, + N, + group_size, + inner_k_tiles, + vkcompute::utils::kTexture3D, + vkcompute::utils::kTexture3D); +} + TEST(VulkanInt4LinearTest, test_reference_impl) { test_reference_linear_int4( /*B = */ 1, @@ -237,15 +273,24 @@ TEST(VulkanInt4LinearTest, test_reference_impl) { /*N = */ 32); } -TEST(VulkanInt4LinearTest, test_vulkan_impl) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } +TEST(VulkanInt4LinearTest, test_vulkan_impl_small_m) { test_vulkan_linear_int4( /*B = */ 1, /*M = */ 4, /*K = */ 128, /*N = */ 32); + + test_vulkan_linear_int4( + /*B = */ 1, + /*M = */ 1, + /*K = */ 256, + /*N = */ 256); +} + +TEST(VulkanInt4LinearTest, test_vulkan_impl_gemm) { + test_vulkan_linear_int4( + /*B = */ 1, + /*M = */ 256, + /*K = */ 256, + /*N = */ 256); } diff --git a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py index 983d2c82bd0..65bb959f6d1 100644 --- a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py +++ b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py @@ -228,7 +228,7 @@ def generate_benchmark_fixture(self) -> str: return at::from_blob(values.data(), sizes, at::kFloat).toType(dtype).detach().clone(); 
}} -at::Tensor make_index_tensor(std::vector indices) {{ +at::Tensor make_index_tensor_1d(std::vector indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{static_cast(indices.size())}}; @@ -236,7 +236,7 @@ def generate_benchmark_fixture(self) -> str: return at::from_blob(indices.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector> indices) {{ +at::Tensor make_index_tensor_2d(std::vector> indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), @@ -252,7 +252,7 @@ def generate_benchmark_fixture(self) -> str: return at::from_blob(acc.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector>> indices) {{ +at::Tensor make_index_tensor_3d(std::vector>> indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py index 708da8eab85..b24879f660a 100644 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -229,11 +229,10 @@ def create_aten_fn_call(self) -> str: def create_aten_method_call(self) -> str: # For functions with only Method variant, we fallback to the function - # declared in MethodOperators.h. The method is declared as - # at::_ops::{name}::call(*), and ATEN_FN is a handly macro. + # declared in MethodOperators.h cpp_sig = gen_static_dispatch_backend_call_signature(self.f_sig, self.f) exprs = translate_args(self.f_sig, cpp_sig) - func_call = f"ATEN_FN({self.f_sig.name()})({exprs});" + func_call = f"at::_ops::{self.f_sig.name()}::call({exprs});" return func_call def create_out_src(self, include_declarations: bool = True) -> str: diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py index d7e38969452..e6ce135736b 100644 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py @@ -170,7 +170,13 @@ def create_input_data(self, arg: Argument, data: Any) -> str: # noqa: C901 if cpp_type == AT_TENSOR: if arg.name == "index" or arg.name == "indices": - ret_str += f"make_index_tensor({init_list_str(data)});" + args_str = init_list_str(data) + if args_str[:3] == "{{{": + ret_str += f"make_index_tensor_3d({init_list_str(data)});" + elif args_str[:2] == "{{": + ret_str += f"make_index_tensor_2d({init_list_str(data)});" + else: + ret_str += f"make_index_tensor_1d({init_list_str(data)});" else: ret_str += self.call_data_gen_fn(arg, data) elif cpp_type == OPT_AT_TENSOR: @@ -278,7 +284,7 @@ def generate_suite_cpp(self) -> str: float high = 1.0) {{ if (high == 1.0 && low == 0.0) return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); - + if (dtype == at::kChar) return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); @@ -307,7 +313,7 @@ def generate_suite_cpp(self) -> str: return at::from_blob(values.data(), sizes, at::kFloat).toType(dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector indices) {{ +at::Tensor make_index_tensor_1d(std::vector indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{static_cast(indices.size())}}; @@ -315,7 +321,7 @@ def generate_suite_cpp(self) -> str: return at::from_blob(indices.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector> indices) {{ +at::Tensor make_index_tensor_2d(std::vector> indices) {{ 
at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), @@ -331,7 +337,7 @@ def generate_suite_cpp(self) -> str: return at::from_blob(acc.data(), sizes, dtype).detach().clone(); }} -at::Tensor make_index_tensor(std::vector>> indices) {{ +at::Tensor make_index_tensor_3d(std::vector>> indices) {{ at::ScalarType dtype = at::kInt; std::vector sizes = {{ static_cast(indices.size()), diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 1c1c51bb58a..188311e5f2c 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -47,7 +47,7 @@ ) from executorch.exir.backend.utils import DelegateMappingBuilder -from executorch.exir.memory_planning import greedy, memory_planning_algorithm_suite +from executorch.exir.memory_planning import greedy, MemoryPlanningAlgorithmSuite from executorch.exir.pass_base import ExportPass, PassBase from executorch.exir.passes import MemoryPlanningPass, SpecPropPass @@ -199,8 +199,8 @@ def preprocess( # noqa: C901 # Finally, apply dynamic shape passes and memory planning pass. These passes # must be applied only when the graph structure is finalized. greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False) - mem_planning_suite = partial( - memory_planning_algorithm_suite, algo_list=[greedy_memory_planning] + mem_planning_suite = MemoryPlanningAlgorithmSuite( + algo_list=[greedy_memory_planning] ) program = apply_passes( program, diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 967a852599a..2328f8e4b90 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -131,6 +131,6 @@ create an issue on [github](https://www.github.com/pytorch/executorch/issues). ## See Also -For more information about the XNNPACK Delegate, please check out the following resources: -- [ExecuTorch XNNPACK Delegate](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html) -- [Building and Running ExecuTorch with XNNPACK Backend](https://pytorch.org/executorch/0.2/native-delegates-executorch-xnnpack-delegate.html) +For more information about the XNNPACK Backend, please check out the following resources: +- [XNNPACK Backend](https://pytorch.org/executorch/main/backends-xnnpack) +- [XNNPACK Backend Internals](https://pytorch.org/executorch/main/backend-delegates-xnnpack-reference) diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py index 89a44f303df..768df1f4f04 100644 --- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py @@ -8,6 +8,7 @@ import torch from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack.utils.quant_utils import is_dynamic_qdq from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult @@ -283,6 +284,14 @@ def input_to_nhwc( ] else: # Need to create NHWC node + # Check if input uses dynamic quantization + is_dynamic_input = is_dynamic_qdq(input_node) + + if is_dynamic_input: + # Trace back to original source node + while getattr(input_node, "args", None): + input_node = input_node.args[0] + with graph_module.graph.inserting_after(input_node): input_node_nhwc = self.create_call_function_node( graph_module=graph_module, @@ -290,7 +299,11 @@ def input_to_nhwc( 
args=(input_node,), memory_format=torch.channels_last, ) - self.mark_as_nhwc_node(input_node_nhwc) + + if is_dynamic_input: + # Replace downstream input_nodes with NHWC node + input_node.replace_all_uses_with(input_node_nhwc) + input_node_nhwc.args = (input_node,) self.insert_copy_and_assign_partner_nodes_quantization_sensitive( graph_module=graph_module, diff --git a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py index 82a35236294..bef9ac40c02 100644 --- a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py @@ -78,7 +78,7 @@ def define_node( @register_node_visitor class OpDequantizeAffine(NodeVisitor): - target = "quant.dequantize_affine.default" + target = "torchao.dequantize_affine.default" def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/xnnpack/operators/op_dynamic_quantize_ops.py b/backends/xnnpack/operators/op_dynamic_quantize_ops.py index 9369f025216..6c6d31d82a4 100644 --- a/backends/xnnpack/operators/op_dynamic_quantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_quantize_ops.py @@ -127,7 +127,7 @@ def define_node( @register_node_visitor class OpQuantizeAffine(NodeVisitor): - target = "quant.quantize_affine.default" + target = "torchao.quantize_affine.default" def define_node( self, diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py index face7342d8f..19df74e77ac 100644 --- a/backends/xnnpack/operators/op_skip_ops.py +++ b/backends/xnnpack/operators/op_skip_ops.py @@ -85,7 +85,7 @@ class OpChooseQparamsAffine(OpSkipOps): do nothing if node is choose_qparams_affine.default """ - target = "quant.choose_qparams_affine.default" + target = "torchao.choose_qparams_affine.default" @register_node_visitor diff --git a/backends/xnnpack/operators/op_slice_copy.py b/backends/xnnpack/operators/op_slice_copy.py index 40d8e5f04eb..d9056afa832 100644 --- a/backends/xnnpack/operators/op_slice_copy.py +++ b/backends/xnnpack/operators/op_slice_copy.py @@ -69,7 +69,9 @@ def define_node( output_shape = [output_shape[i] for i in PERM_NCHW_TO_NHWC] dim_of_slice = PERM_NHWC_TO_NCHW[dim_of_slice] - slice_begin_index = cast(int, node.args[2]) + slice_begin_index = 0 + if len(node.args) > 2 and node.args[2]: + slice_begin_index = cast(int, node.args[2]) if slice_begin_index < 0: slice_begin_index = input_shape[dim_of_slice] + slice_begin_index diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index e695b151560..fbee1d192cf 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -141,12 +141,27 @@ def quantize_tensor(self, tensor: torch.Tensor) -> torch.Tensor: tensor, self.scale, self.zp, self.qmin, self.qmax, self.dtype ) + # Temporary helper until non-batch dimensions can be inferred + # Detects if a node feeds into a conv op by checking all downstream users + @staticmethod + def _feeds_into_conv(node: torch.fx.Node) -> bool: + users_list = [node] + + while users_list: + current_user = users_list.pop() + if "convolution" in str(current_user.target): + return True + users_list.extend(current_user.users) + + return False + @classmethod def _from_dynamic_input_node(cls, quant_node: torch.fx.Node) -> QuantParams: q_input = quant_node.args[0] # fp32 input assert isinstance(q_input, torch.fx.Node) # TODO - materialize this from the quant_node scale count and val shape - num_nonbatch_dims = 1 + 
# Set non-batch dims to 3 if node feeds into conv (only 2D is supported), otherwise set to 1 for linear + num_nonbatch_dims = 3 if cls._feeds_into_conv(quant_node) else 1 return cls( per_channel=False, # True is not valid diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index 8712c2709ac..67bccbc52d1 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -9,6 +9,7 @@ from typing import cast, List, Optional, Tuple import torch +from executorch.backends.transforms import get_shape from executorch.backends.xnnpack.operators.quant_params import QuantParams from executorch.backends.xnnpack.partition.config.xnnpack_config import ( ConfigPrecisionType, @@ -27,6 +28,7 @@ ) from executorch.backends.xnnpack.utils.utils import ( get_input_node, + is_depthwise_conv, is_getitem, is_node, is_param_node, @@ -359,12 +361,23 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False # Only support 1D + 2D Conv kernel_node = get_input_node(node, 1) + kernel_shape = get_shape(kernel_node) weight_quant_params = QuantParams.from_weights(kernel_node, ep) - - is_transpose = node.args[6] groups = cast(int, node.args[8]) + is_transpose = node.args[6] + + # XNNPACK does not support dynamic quantization convs that are not 2D or are depthwise + if self._detect_precision(node) == ConfigPrecisionType.DYNAMIC_QUANT and ( + len(conv_stride) != 2 + or is_depthwise_conv(kernel_shape, groups, is_transpose) + ): + why( + node, + "XNNPACK only supports standard 2D convolutions for dynamic quantization", + ) + return False - # XNNPack does not support non-zero output padding in transposed + # XNNPACK does not support non-zero output padding in transposed # convolutions. 
if is_transpose and any( out_pad != 0 for out_pad in cast(List[int], node.args[7]) @@ -394,6 +407,7 @@ def supported_precision_types(self): return [ ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT, + ConfigPrecisionType.DYNAMIC_QUANT, ] diff --git a/backends/xnnpack/partition/config/quant_affine_configs.py b/backends/xnnpack/partition/config/quant_affine_configs.py index d9e789104b6..046402800a3 100644 --- a/backends/xnnpack/partition/config/quant_affine_configs.py +++ b/backends/xnnpack/partition/config/quant_affine_configs.py @@ -36,7 +36,7 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: try: import torchao.quantization.quant_primitives # noqa - return torch.ops.quant.quantize_affine.default + return torch.ops.torchao.quantize_affine.default except: return None @@ -48,7 +48,7 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: try: import torchao.quantization.quant_primitives # noqa - return torch.ops.quant.dequantize_affine.default + return torch.ops.torchao.dequantize_affine.default except: return None @@ -60,6 +60,6 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: try: import torchao.quantization.quant_primitives # noqa - return torch.ops.quant.choose_qparams_affine.default + return torch.ops.torchao.choose_qparams_affine.default except: return None diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer.py b/backends/xnnpack/quantizer/xnnpack_quantizer.py index 0ddee53a41a..fdabd0383e6 100644 --- a/backends/xnnpack/quantizer/xnnpack_quantizer.py +++ b/backends/xnnpack/quantizer/xnnpack_quantizer.py @@ -265,6 +265,7 @@ class XNNPACKQuantizer(Quantizer): DYNAMIC_OPS = [ "linear", + "conv", ] def __init__(self) -> None: diff --git a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py index ce459806c6e..4b961bef81d 100644 --- a/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py +++ b/backends/xnnpack/quantizer/xnnpack_quantizer_utils.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F +from executorch.backends.xnnpack.utils.utils import is_depthwise_conv from torch._subclasses import FakeTensor from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix from torch.ao.quantization.pt2e.export_utils import _WrapperModule @@ -29,7 +30,6 @@ ) from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - __all__ = [ "OperatorConfig", "OperatorPatternType", @@ -323,6 +323,23 @@ def _do_annotate_conv( assert isinstance(weight, Node) input_qspec_map[weight] = get_weight_qspec(quantization_config) + # Only annotate dynamically quantized conv if it's 2D and not depthwise + if ( + quantization_config + and quantization_config.input_activation + and quantization_config.input_activation.is_dynamic + ): + weight_val = weight.meta.get("val", None) + weight_shape = getattr(weight_val, "shape", None) + + # Skip if not a 4D weight tensor (i.e. 
not conv2d) + if weight_shape is not None and len(weight_shape) != 4: + continue + + # Skip if depthwise (default to groups=1 since it's not an arg) + if is_depthwise_conv(weight_shape, 1, is_conv_transpose): + continue + # adding weight node to the partition as well partition = [conv_node, conv_node.args[1]] diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index c0204831c07..0b187d05df0 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -512,11 +512,6 @@ Error defineTensor( buffer_ptr == nullptr, Internal, "Dynamically quantized tensor should not have constant data but found non-nullptr"); - // TODO(T179441835): Dynamic Quantization with num_nonbatch_dims > 1 - ET_CHECK_OR_RETURN_ERROR( - qparams->num_nonbatch_dims() == 1, - Internal, - "Dynamically Quantized Tensors currently only support per token quantization"); status = xnn_define_dynamically_quantized_tensor_value( /*subgraph=*/subgraph_ptr, /*datatype=*/getDataType(tensor_value->datatype()), @@ -1172,7 +1167,7 @@ Error defineStaticTransposeNode( ET_CHECK_OR_RETURN_ERROR( status == xnn_status_success, Internal, - "Failed to create sigmoid node %i with code: %s", + "Failed to create static transpose node %i with code: %s", node->debug_handle(), xnn_status_to_string(status)); diff --git a/backends/xnnpack/test/ops/test_check_quant_params.py b/backends/xnnpack/test/ops/test_check_quant_params.py index b76935a9f72..d05b1fce540 100644 --- a/backends/xnnpack/test/ops/test_check_quant_params.py +++ b/backends/xnnpack/test/ops/test_check_quant_params.py @@ -52,7 +52,7 @@ def _test_check_quant_message(self, ep_modifier, expected_message): torch._dynamo.reset() mod = torch.nn.Linear(10, 10) quantizer = XNNPACKQuantizer() - captured = export_for_training(mod, (torch.randn(1, 10),)).module() + captured = export_for_training(mod, (torch.randn(1, 10),), strict=True).module() quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True)) prepared = prepare_pt2e(captured, quantizer) @@ -65,10 +65,9 @@ def _test_check_quant_message(self, ep_modifier, expected_message): with self.assertRaises(ValueError) as context: to_edge_transform_and_lower(aten, partitioner=[XnnpackPartitioner()]) - self.assertEquals(str(context.exception), expected_message) + self.assertEqual(str(context.exception), expected_message) def test_in_per_tensor_quant(self): - for invalid_scale in [ float("nan"), float("inf"), diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py index 80b731bd18e..92bb03c907a 100644 --- a/backends/xnnpack/test/ops/test_conv2d.py +++ b/backends/xnnpack/test/ops/test_conv2d.py @@ -18,6 +18,10 @@ except: has_quantized_ops = False +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, +) +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, ) @@ -26,7 +30,7 @@ ) from executorch.backends.xnnpack.test.test_xnnpack_utils import randomize_bn from executorch.backends.xnnpack.test.tester import Quantize, Tester - +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower from executorch.exir.dialects._ops import ops as exir_ops @@ -169,6 +173,43 @@ def get_inputs(self): return (torch.randn(2, 2, 4, 4),) +class Conv2dDQSeq(torch.nn.Module): + def __init__(self): + super().__init__() + self.first = 
torch.nn.Conv2d( + in_channels=3, out_channels=8, kernel_size=3, padding=1 + ) + self.second = torch.nn.Conv2d( + in_channels=8, out_channels=10, kernel_size=3, padding=1 + ) + + def forward(self, x): + y = self.first(x) + return self.second(y) + + def get_inputs(self): + return (torch.randn(1, 3, 8, 8),) + + +class Conv2dDQParallel(torch.nn.Module): + def __init__(self): + super().__init__() + self.first = torch.nn.Conv2d( + in_channels=3, out_channels=8, kernel_size=3, padding=1 + ) + self.second = torch.nn.Conv2d( + in_channels=3, out_channels=8, kernel_size=3, padding=1 + ) + + def forward(self, x): + first = self.first(x) + second = self.second(x) + return first, second + + def get_inputs(self): + return (torch.randn(1, 3, 8, 8),) + + class TestConv2d(unittest.TestCase): def setUp(self): torch._dynamo.reset() @@ -223,6 +264,37 @@ def _test( .run_method_and_compare_outputs(qtol=1) ) + def _test_dq( + self, + m: torch.nn.Module, + conv_count=1, + dynamic_shapes=None, + ): + quant_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=True, + ) + + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=True, + ) + + tester = Tester(m, m.get_inputs(), dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) + tester.export() + tester.check(["torch.ops.quantized_decomposed.choose_qparams"]) + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) + ) + tester.check_count( + {"torch.ops.higher_order.executorch_call_delegate": conv_count} + ) + tester.check_not(["executorch_exir_dialects_edge__ops_aten_conv2d_default"]) + tester.to_executorch() + tester.serialize() + tester.run_method_and_compare_outputs(qtol=1) + def test_fp16_conv2d(self) -> None: for transpose in (True, False): for has_bias in (True, False): @@ -699,3 +771,26 @@ def forward(self, x): .serialize() .run_method_and_compare_outputs(qtol=1) ) + + def test_dq_conv2d(self) -> None: + model = Conv2d( + in_channels=3, + out_channels=10, + kernel_size=(3, 3), + stride=(1, 1), + padding=(0, 0), + batches=1, + width=8, + height=8, + ) + self._test_dq(model) + + def test_dq_conv2d_seq(self) -> None: + model = Conv2dDQSeq() + conv_count = sum(1 for m in model.modules() if type(m) is torch.nn.Conv2d) + self._test_dq(model, conv_count) + + def test_dq_conv2d_parallel(self) -> None: + model = Conv2dDQParallel() + conv_count = sum(1 for m in model.modules() if type(m) is torch.nn.Conv2d) + self._test_dq(model, conv_count) diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index bcaf2e82a08..421e59c0b08 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -402,9 +402,9 @@ def _test_groupwise_dq_linear( .export() .check_count( { - "torch.ops.quant.choose_qparams_affine.default": 1 * num_linears, - "torch.ops.quant.quantize_affine.default": 1 * num_linears, - "torch.ops.quant.dequantize_affine.default": 2 * num_linears, + "torch.ops.torchao.choose_qparams_affine.default": 1 * num_linears, + "torch.ops.torchao.quantize_affine.default": 1 * num_linears, + "torch.ops.torchao.dequantize_affine.default": 2 * num_linears, "torch.ops.aten.linear.default": 1 * num_linears, } ) diff --git a/backends/xnnpack/test/ops/test_slice_copy.py b/backends/xnnpack/test/ops/test_slice_copy.py index ea65571b1e8..857c78480ad 100644 --- a/backends/xnnpack/test/ops/test_slice_copy.py +++ 
b/backends/xnnpack/test/ops/test_slice_copy.py @@ -69,6 +69,18 @@ def forward(self, x): # Note that two of the slices are optimized away as they are identity. self._test_slice_copy(ConvSlice(), inputs, 4, 2) + def test_fp32_slice_copy_default_start(self): + """ + XNNPACK supports default start in slice op. + """ + + class Slice(torch.nn.Module): + def forward(self, x): + return torch.ops.aten.slice.Tensor(x, 0, None, 2) + + inputs = (torch.randn(5, 5),) + self._test_slice_copy(Slice(), inputs, 1, 1) + def test_fp32_slice_copy_stride_non_1(self): """ XNNPACK does not support strided slicing. diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py index 6d60f9d76b5..a00209f4ea6 100644 --- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py +++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py @@ -10,10 +10,13 @@ from executorch.backends.xnnpack._passes.channels_last_tagged_reshape_pass import ( ChannelsLastTaggedReshapePass, ) +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, +) from executorch.backends.xnnpack.test.test_xnnpack_utils_classes import ( OpSequencesAddConv2d, ) -from executorch.backends.xnnpack.test.tester import RunPasses, Tester +from executorch.backends.xnnpack.test.tester import Quantize, RunPasses, Tester class TestChannelsLastTaggedReshapePass(unittest.TestCase): @@ -35,6 +38,10 @@ def setUp(self): dequant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default" conv_name = "executorch_exir_dialects_edge__ops_aten_convolution_default" relu_name = "executorch_exir_dialects_edge__ops_aten_relu_default" + choose_qparams_name = ( + "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_tensor" + ) + dynamic_quant_name = "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_tensor" def test_fp32_channels_last_tagged_reshape_pass(self): for module, num_reshape in self.modules.items(): @@ -179,3 +186,37 @@ def test_fp32_channels_last_tagged_reshape_pass_conv_bn_hardtanh_mean_seq(self): ) .run_method_and_compare_outputs() ) + + class Conv2dDynamicQuant(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 10, 3) + + def forward(self, x): + return self.conv(x) + + def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None: + ( + Tester(self.Conv2dDynamicQuant().eval(), (torch.randn(1, 3, 8, 8),)) + .quantize( + Quantize( + quantization_config=get_symmetric_quantization_config( + is_dynamic=True + ) + ) + ) + .export() + .to_edge() + .run_passes(self.PassStage) + .check( + [ + self.to_copy_name, + self.choose_qparams_name, + self.dynamic_quant_name, + self.dequant_name, + self.conv_name, + self.to_copy_name, + ] + ) + .run_method_and_compare_outputs() + ) diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index db1914e3910..cb91b78c123 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -12,6 +12,7 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from torch.fx.experimental.symbolic_shapes import free_symbols, has_free_symbols _Q_OPS = { "quantize_per_tensor.tensor", @@ -126,8 +127,8 @@ def is_affine_qdq(node: torch.fx.Node) -> bool: def _get_block_size_input_scale(node: torch.fx.Node): assert is_affine_qdq(node) block_size = node.args[1] 
- input_val = node.all_input_nodes[0].meta["val"] - scale_val = node.all_input_nodes[1].meta["val"] + input_val = cast(torch.fx.Node, node.args[0]).meta["val"] + scale_val = cast(torch.fx.Node, node.args[2]).meta["val"] return block_size, input_val, scale_val @@ -145,7 +146,21 @@ def is_per_token(node: torch.fx.Node): flag &= block_size[i] == 1 scale_numel_expected *= input_val.shape[i] - flag &= block_size[-1] == input_val.shape[-1] + ic_block_size = block_size[-1] + if isinstance(ic_block_size, torch.fx.Node): + ic_block_size = ic_block_size.meta["val"] + assert free_symbols( + ic_block_size + ), f"block_size: {block_size} given, but {block_size[-1]} is not a dynamic symint" + + ic_dim = input_val.shape[-1] + if isinstance(ic_dim, torch.fx.Node): + ic_dim = ic_dim.meta["val"] + assert free_symbols( + ic_dim + ), f"input_shape: {input_val.shape} given, but {input_val.shape[-1]} is not a dynamic symint" + + flag &= ic_dim == ic_block_size flag &= scale_val.numel() == scale_numel_expected return flag @@ -160,6 +175,11 @@ def is_per_channel_group(node: torch.fx.Node): return True elif is_affine_qdq(node): block_size, input_val, scale_val = _get_block_size_input_scale(node) + # per channel group is only valid on static weights + # so scales and weights can't have dynamic shape + if has_free_symbols(input_val.shape) or has_free_symbols(scale_val.shape): + return False + flag = True flag &= len(block_size) == 2 flag &= block_size[0] == 1 diff --git a/backends/xnnpack/utils/utils.py b/backends/xnnpack/utils/utils.py index fab95618807..b23fd444117 100644 --- a/backends/xnnpack/utils/utils.py +++ b/backends/xnnpack/utils/utils.py @@ -158,3 +158,33 @@ def get_source_fn(node: torch.fx.Node) -> Optional[torch.fx.Node]: return None source_fn = source_fn_st[-1] return source_fn[1] + + +def is_depthwise_conv( + kernel_shape: Tuple[int, ...], groups: int = 1, is_transpose: bool = False +) -> bool: + """ + A convolution is depthwise if: + 1) groups = input_channels (i.e. group_input_channels = 1) + 2) output_channels is a positive integer multiple of input channels + + For standard convolutions: + weight shape = (out_channels, in_channels_per_group, height, width) + For transposed convolutions: + weight shape = (in_channels, out_channels_per_group, height, width) + + Returns True if the convolution is depthwise + """ + if len(kernel_shape) < 2 or groups < 1: + return False + + if is_transpose: + group_input_channels = int(kernel_shape[0] / groups) + group_output_channels = kernel_shape[1] + else: + group_input_channels = kernel_shape[1] + group_output_channels = int(kernel_shape[0] / groups) + + return ( + group_input_channels == 1 and group_output_channels % group_input_channels == 0 + ) diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index 3076cde1a99..180baf9b2a9 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -22,8 +22,8 @@ // JIT op registry instead of c10 dispatcher. JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. 
-using KernelSpan = - ::executorch::runtime::Span; +using KernelSpan = ::executorch::runtime::Span< + const ::executorch::ET_RUNTIME_NAMESPACE::Kernel>; namespace torch { namespace executor { namespace function { diff --git a/devtools/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp index f12262f7dd0..df4124e0038 100644 --- a/devtools/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -27,13 +27,13 @@ using executorch::aten::ArrayRef; using executorch::aten::Half; using executorch::aten::ScalarType; using executorch::aten::Tensor; +using ::executorch::ET_RUNTIME_NAMESPACE::Method; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; -using ::executorch::runtime::Method; using ::executorch::runtime::Result; namespace executorch { -namespace bundled_program { +namespace BUNDLED_PROGRAM_NAMESPACE { namespace { @@ -332,8 +332,9 @@ ET_NODISCARD Error load_bundled_input( static_cast(status)); } - ::executorch::runtime::internal::event_tracer_set_bundled_input_index( - method.get_event_tracer(), testset_idx); + ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_set_bundled_input_index( + method.get_event_tracer(), testset_idx); return Error::Ok; } @@ -432,5 +433,5 @@ bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { file_data); } -} // namespace bundled_program +} // namespace BUNDLED_PROGRAM_NAMESPACE } // namespace executorch diff --git a/devtools/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h index 884ca6f21bc..14f26ce00f7 100644 --- a/devtools/bundled_program/bundled_program.h +++ b/devtools/bundled_program/bundled_program.h @@ -10,15 +10,20 @@ #include #include +#ifdef USE_ATEN_LIB +#define BUNDLED_PROGRAM_NAMESPACE bundled_program::aten +#else // !USE_ATEN_LIB +#define BUNDLED_PROGRAM_NAMESPACE bundled_program +#endif // USE_ATEN_LIB namespace executorch { -namespace bundled_program { +namespace BUNDLED_PROGRAM_NAMESPACE { /** * An opaque pointer to a serialized bundled program. */ using SerializedBundledProgram = const void; - +using ::executorch::ET_RUNTIME_NAMESPACE::Method; /** * Load testset_idx-th bundled input of method_idx-th Method test in * bundled_program_ptr to given Method. @@ -31,7 +36,7 @@ using SerializedBundledProgram = const void; * execution. */ ET_NODISCARD ::executorch::runtime::Error load_bundled_input( - ::executorch::runtime::Method& method, + Method& method, SerializedBundledProgram* bundled_program_ptr, size_t testset_idx); @@ -49,7 +54,7 @@ ET_NODISCARD ::executorch::runtime::Error load_bundled_input( * execution. */ ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( - ::executorch::runtime::Method& method, + Method& method, SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, @@ -94,7 +99,7 @@ ET_DEPRECATED inline bool is_bundled_program(void* file_data) { return is_bundled_program(file_data, 128); } -} // namespace bundled_program +} // namespace BUNDLED_PROGRAM_NAMESPACE } // namespace executorch namespace torch { @@ -103,24 +108,24 @@ namespace bundled_program { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
using serialized_bundled_program = - ::executorch::bundled_program::SerializedBundledProgram; + ::executorch::BUNDLED_PROGRAM_NAMESPACE::SerializedBundledProgram; ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( - ::executorch::runtime::Method& method, + Method& method, serialized_bundled_program* bundled_program_ptr, size_t testset_idx) { - return ::executorch::bundled_program::load_bundled_input( + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( method, bundled_program_ptr, testset_idx); } ET_NODISCARD inline ::executorch::runtime::Error VerifyResultWithBundledExpectedOutput( - ::executorch::runtime::Method& method, + Method& method, serialized_bundled_program* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8) { - return ::executorch::bundled_program::verify_method_outputs( + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs( method, bundled_program_ptr, testset_idx, rtol, atol); } @@ -129,13 +134,14 @@ ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( size_t file_data_len, const void** out_program_data, size_t* out_program_data_len) { - return ::executorch::bundled_program::get_program_data( + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::get_program_data( file_data, file_data_len, out_program_data, out_program_data_len); } inline bool IsBundledProgram(void* file_data) { // 128 is enough data to contain the identifier in the flatbuffer header. - return ::executorch::bundled_program::is_bundled_program(file_data, 128); + return ::executorch::BUNDLED_PROGRAM_NAMESPACE::is_bundled_program( + file_data, 128); } } // namespace bundled_program } // namespace executor diff --git a/devtools/etdump/etdump_filter.h b/devtools/etdump/etdump_filter.h index 545823a5556..29db43be8b9 100644 --- a/devtools/etdump/etdump_filter.h +++ b/devtools/etdump/etdump_filter.h @@ -77,8 +77,9 @@ class ETDumpFilter : public ::executorch::runtime::EventTracerFilterBase { * * @return A Result indicating whether the event matches the filter * criteria. - * - True if the event matches the filter, or filter is unset. - * - False if the event does not match or is unknown. + * - True if the event matches the filter. + * - False if the event does not match, or is unknown, or filter is + * unset. * - An error code if an error occurs during filtering. 
*/ Result filter( diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp index ea464f2f5ce..4b5da78550e 100644 --- a/devtools/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -42,7 +42,7 @@ namespace executorch { namespace etdump { namespace { -executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( +Result get_flatbuffer_scalar_type( executorch::aten::ScalarType tensor_scalar_type) { switch (tensor_scalar_type) { case executorch::aten::ScalarType::Byte: @@ -66,21 +66,26 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( case executorch::aten::ScalarType::UInt16: return executorch_flatbuffer_ScalarType_UINT16; default: - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( 0, + InvalidArgument, "This ScalarType = %hhd is not yet supported in ETDump", static_cast(tensor_scalar_type)); } } -etdump_Tensor_ref_t add_tensor_entry( +Result add_tensor_entry( flatcc_builder_t* builder_, const executorch::aten::Tensor& tensor, long offset) { etdump_Tensor_start(builder_); - etdump_Tensor_scalar_type_add( - builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + Result scalar_type = + get_flatbuffer_scalar_type(tensor.scalar_type()); + if (!scalar_type.ok()) { + return scalar_type.error(); + } + etdump_Tensor_scalar_type_add(builder_, scalar_type.get()); etdump_Tensor_sizes_start(builder_); for (auto dim : tensor.sizes()) { @@ -323,40 +328,32 @@ Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const ArrayRef output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const int& output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const bool& output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } Result ETDumpGen::log_intermediate_output_delegate( const char* name, DelegateDebugIntId delegate_debug_index, const double& output) { - log_intermediate_output_delegate_helper(name, delegate_debug_index, output); - Result result = log_intermediate_output_delegate_helper( + return log_intermediate_output_delegate_helper( name, delegate_debug_index, output); - return result; } template @@ -369,6 +366,19 @@ Result ETDumpGen::log_intermediate_output_delegate_helper( InvalidArgument, "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); + if (filter_) { + Result result = filter_->filter(name, delegate_debug_index); + if (!result.ok()) { + return result; + } + + // If the filter returns true, meaning this event should be filtered out and + // we should not log it. + if (result.get()) { + return false; + } + } + check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; @@ -385,18 +395,26 @@ Result ETDumpGen::log_intermediate_output_delegate_helper( // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { long offset = write_tensor_or_raise_error(output); - etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); + Result tensor_ref = + add_tensor_entry(builder_, output, offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } etdump_Value_start(builder_); etdump_Value_val_add(builder_, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder_, tensor_ref); + etdump_Value_tensor_add(builder_, tensor_ref.get()); } else if constexpr (std::is_same>::value) { etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { long offset = write_tensor_or_raise_error(output[i]); - etdump_Tensor_vec_push( - builder_, add_tensor_entry(builder_, output[i], offset)); + Result tensor_ref = + add_tensor_entry(builder_, output[i], offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } + etdump_Tensor_vec_push(builder_, tensor_ref.get()); } etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = @@ -518,22 +536,26 @@ ETDumpResult ETDumpGen::get_etdump_data() { return result; } -void ETDumpGen::set_debug_buffer(Span buffer) { +Result ETDumpGen::set_debug_buffer(Span buffer) { Result bds_ret = BufferDataSink::create(buffer); - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( bds_ret.ok(), + InvalidArgument, "Failed to create data sink from debug buffer with error 0x%" PRIx32, static_cast(bds_ret.error())); buffer_data_sink_ = std::move(bds_ret.get()); data_sink_ = &buffer_data_sink_; + return true; } void ETDumpGen::set_data_sink(DataSinkBase* data_sink) { data_sink_ = data_sink; } -void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { +Result ETDumpGen::log_evalue( + const EValue& evalue, + LoggedEValueType evalue_type) { check_ready_to_add_events(); etdump_DebugEvent_start(builder_); @@ -545,12 +567,15 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { case Tag::Tensor: { executorch::aten::Tensor tensor = evalue.toTensor(); long offset = write_tensor_or_raise_error(tensor); - etdump_Tensor_ref_t tensor_ref = + Result tensor_ref = add_tensor_entry(builder_, tensor, offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } etdump_Value_start(builder_); etdump_Value_val_add(builder_, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder_, tensor_ref); + etdump_Value_tensor_add(builder_, tensor_ref.get()); if (evalue_type == LoggedEValueType::kProgramOutput) { auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); etdump_Value_output_add(builder_, bool_ref); @@ -567,8 +592,12 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { long offset = write_tensor_or_raise_error(tensors[i]); - etdump_Tensor_vec_push( - builder_, add_tensor_entry(builder_, tensors[i], offset)); + Result tensor_ref = + add_tensor_entry(builder_, tensors[i], offset); + if (!tensor_ref.ok()) { + return tensor_ref.error(); + } + etdump_Tensor_vec_push(builder_, tensor_ref.get()); } etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = @@ -640,6 +669,7 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { 
etdump_RunData_events_push_start(builder_); etdump_Event_debug_event_add(builder_, debug_event); etdump_RunData_events_push_end(builder_); + return true; } size_t ETDumpGen::get_num_blocks() { @@ -654,6 +684,11 @@ DataSinkBase* ETDumpGen::get_data_sink() { return data_sink_; } +void ETDumpGen::set_delegation_intermediate_output_filter( + EventTracerFilterBase* filter) { + filter_ = filter; +} + long ETDumpGen::write_tensor_or_raise_error(Tensor tensor) { // Previously, the function copy_tensor_to_debug_buffer returned 0xFF..F when // given an empty tensor, which is an invalid offset for most buffers. In our diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h index 6b51745eee3..ea0c1cb653d 100644 --- a/devtools/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -25,6 +25,7 @@ namespace executorch { namespace etdump { using ::executorch::runtime::DelegateDebugIntId; +using ::executorch::runtime::EventTracerFilterBase; using ::executorch::runtime::Result; namespace internal { @@ -101,7 +102,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { size_t size) override; virtual ::executorch::runtime::AllocatorID track_allocator( const char* name) override; - virtual void log_evalue( + virtual Result log_evalue( const ::executorch::runtime::EValue& evalue, ::executorch::runtime::LoggedEValueType evalue_type = ::executorch::runtime::LoggedEValueType::kIntermediateOutput) @@ -146,7 +147,14 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { const char* name, DelegateDebugIntId delegate_debug_index, const double& output) override; - void set_debug_buffer(::executorch::runtime::Span buffer); + + /** + * Set the filter of event tracer for delegation intermediate outputs. + */ + virtual void set_delegation_intermediate_output_filter( + EventTracerFilterBase* event_tracer_filter) override; + + Result set_debug_buffer(::executorch::runtime::Span buffer); void set_data_sink(DataSinkBase* data_sink); ETDumpResult get_etdump_data(); size_t get_num_blocks(); @@ -188,6 +196,8 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { int bundled_input_index_ = -1; State state_ = State::Init; struct internal::ETDumpStaticAllocator alloc_; + + EventTracerFilterBase* filter_ = nullptr; }; } // namespace etdump diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp index c64bab0448c..9d39a8bbde1 100644 --- a/devtools/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ using ::executorch::runtime::AllocatorID; using ::executorch::runtime::ArrayRef; using ::executorch::runtime::BoxedEvalueList; using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::DelegateDebugIntId; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerEntry; @@ -45,6 +47,8 @@ using ::executorch::runtime::testing::TensorFactory; using ::executorch::etdump::BufferDataSink; using ::executorch::etdump::FileDataSink; +using ::executorch::etdump::ETDumpFilter; + class ProfilerETDumpTest : public ::testing::Test { protected: void SetUp() override { @@ -75,6 +79,70 @@ class ProfilerETDumpTest : public ::testing::Test { "Must set data sink before writing tensor-like data"); } + void check_log_with_filter( + const char* name, + DelegateDebugIntId delegate_debug_index, + bool use_tensor_input, + bool expected_log, + bool expected_ok) { + TensorFactory tf; + 
for (size_t i = 0; i < 2; i++) { + const size_t buffer_size = 2048; + + void* ptr = malloc(buffer_size); + auto buffer_data_sink = BufferDataSink::create(ptr, buffer_size); + auto filter = ETDumpFilter(); + filter.add_regex("filtered.*"); + filter.set_debug_handle_range(1, 10); + etdump_gen[i]->set_delegation_intermediate_output_filter(&filter); + + etdump_gen[i]->create_event_block("test_block"); + etdump_gen[i]->set_data_sink(&buffer_data_sink.get()); + + // size of empty etdump + size_t initial_size = 68; + + // Perform logging + + if (use_tensor_input) { + auto tensor = tf.ones({3, 2}); + auto result = etdump_gen[i]->log_intermediate_output_delegate( + name, delegate_debug_index, tensor); + ASSERT_EQ(result.ok(), expected_ok); + if (expected_ok) { + ASSERT_EQ(result.get(), expected_log); + } + } else { // use tensor_list instead + std::vector tensors = {tf.ones({5, 4}), tf.ones({7, 6})}; + Result result = etdump_gen[i]->log_intermediate_output_delegate( + name, + delegate_debug_index, + ArrayRef(tensors.data(), tensors.size())); + ASSERT_EQ(result.ok(), expected_ok); + if (expected_ok) { + ASSERT_EQ(result.get(), expected_log); + } + } + + // Get final size of etdump + ETDumpResult final_result = etdump_gen[i]->get_etdump_data(); + size_t final_size = final_result.size; + // Check if the size of etdump has changed based on logging success + if (expected_log) { + ASSERT_NE(initial_size, final_size); // Expect size change if logged + } else { + ASSERT_EQ( + initial_size, final_size); // Expect no size change if not logged + } + + if (!etdump_gen[i]->is_static_etdump()) { + free(final_result.buf); + } + + free(ptr); + } + } + ETDumpGen* etdump_gen[2]; uint8_t* buf = nullptr; std::unique_ptr temp_file; @@ -652,7 +720,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { void* ptr = malloc(2048); Span buffer((uint8_t*)ptr, 2048); - ; + auto buffer_data_sink = BufferDataSink::create(ptr, 2048); auto file_data_sink = FileDataSink::create(dump_file_path.c_str()); @@ -892,3 +960,62 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } + +TEST_F(ProfilerETDumpTest, LogWithRegexAndUnsetDelegateDebugIdOnTensor) { + check_log_with_filter( + "filtered_event", + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/true); +} + +TEST_F(ProfilerETDumpTest, LogWithRegexAndUnsetDelegateDebugIdOnTensorList) { + check_log_with_filter( + "filtered_event", + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/true); +} + +TEST_F(ProfilerETDumpTest, LogWithNullptrAndInRange) { + check_log_with_filter( + nullptr, + 5, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/true); +} +TEST_F(ProfilerETDumpTest, LogWithNonMatchingRegexAndOutOfRange) { + check_log_with_filter( + "unfiltered_event", + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/true, + /*expected_ok=*/true); +} +TEST_F(ProfilerETDumpTest, LogWithNullptrAndOutOfRange) { + check_log_with_filter( + nullptr, + 20, + /*use_tensor_input=*/true, + /*expected_log=*/true, + /*expected_ok=*/true); +} +TEST_F(ProfilerETDumpTest, LogWithRegexAndInRange) { + check_log_with_filter( + "filtered_event", + 5, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/false); +} +TEST_F(ProfilerETDumpTest, LogWithNullptrAndUnsetDebugHandle) { + check_log_with_filter( + nullptr, + kUnsetDelegateDebugIntId, + /*use_tensor_input=*/true, + /*expected_log=*/false, + /*expected_ok=*/false); +} 
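The new filter tests above combine a name regex (`filtered.*`) with a delegate debug-handle range (1, 10) and then check whether an intermediate output is actually logged. As a conceptual summary only, here is a minimal Python sketch of the decision rule those tests imply: a regex match on the event name or an in-range debug handle means the event is dropped, and supplying both identifiers or neither is an error. The real implementation is the C++ `ETDumpFilter`, and the exact regex-matching semantics here are an assumption.

```python
import re
from typing import Optional


class IntermediateOutputFilterSketch:
    """Conceptual mirror of the filtering behaviour exercised by the tests above."""

    def __init__(self, name_pattern: str, handle_range: range) -> None:
        self.name_pattern = re.compile(name_pattern)
        self.handle_range = handle_range

    def should_drop(self, name: Optional[str], handle: Optional[int]) -> bool:
        # Exactly one identifier may be set, mirroring the "Only name or
        # delegate_debug_index can be valid" check in the ETDump generator.
        if (name is None) == (handle is None):
            raise ValueError("provide exactly one of: event name, delegate debug handle")
        if name is not None:
            return self.name_pattern.fullmatch(name) is not None
        return handle in self.handle_range


f = IntermediateOutputFilterSketch(r"filtered.*", range(1, 10))
assert f.should_drop("filtered_event", None)        # regex match     -> dropped
assert not f.should_drop("unfiltered_event", None)  # no match        -> logged
assert f.should_drop(None, 5)                       # handle in range -> dropped
assert not f.should_drop(None, 20)                  # out of range    -> logged
```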
diff --git a/devtools/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl index 7f266eed5a7..10eb8608362 100644 --- a/devtools/etdump/tests/targets.bzl +++ b/devtools/etdump/tests/targets.bzl @@ -19,6 +19,7 @@ def define_common_targets(): "//executorch/extension/testing_util:temp_file", "//executorch/runtime/platform:platform", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/devtools/etdump:etdump_filter", ], ) diff --git a/devtools/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py index 5e224415bb6..ee571e365fe 100644 --- a/devtools/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -205,7 +205,7 @@ def test_compare_results(self): self.assertAlmostEqual(calculate_cosine_similarity([a], [b])[0], 1.0) def test_compare_results_uint8(self): - a = torch.randint(0, 255, (4, 4), dtype=torch.uint8) + a = torch.randint(1, 255, (4, 4), dtype=torch.uint8) # Create tensor b which has very close value to tensor a b = a.clone() diff --git a/docs/README.md b/docs/README.md index e6dc66d335e..e30decb9362 100644 --- a/docs/README.md +++ b/docs/README.md @@ -102,7 +102,7 @@ The current version of PyTorch is ${executorch_version:pytorch}. This will result in the following output: - + Right now we only support PyTorch version as custom variable, but will support others in the future. @@ -130,7 +130,7 @@ Use the to contribute to the documentation. In addition to that, see -[Markdown in Sphinx Tips and Tricks](https://pytorch.org/executorch/markdown-sphinx-tips-tricks.html) +[Markdown in Sphinx Tips and Tricks](source/markdown-sphinx-tips-tricks.md) for tips on how to author high-quality markdown pages with Myst Parser. ## Adding Tutorials @@ -143,12 +143,12 @@ directory. Use one of the following templates: - [Markdown template](https://github.com/pytorch/executorch/blob/main/docs/source/tutorial-template.md) After creating a tutorial, make sure to add the corresponding path in the -[index.rst](./source/index.rst) file in the following places: +[index.md](source/index.md) file in the following places: - In the - [tutorials torctree](https://github.com/pytorch/executorch/blob/main/docs/source/index.rst?plain=1#L183) + [tutorials torctree](https://github.com/pytorch/executorch/blob/main/docs/source/index.md?plain=1#L185) - In the - [customcard section](https://github.com/pytorch/executorch/blob/main/docs/source/index.rst?plain=1#L201) + [customcard section](https://github.com/pytorch/executorch/blob/main/docs/source/index.md?plain=1#L201) If you want to include a Markdown tutorial that is stored in another directory outside of the `docs/source` directory, complete the following steps: @@ -163,7 +163,7 @@ outside of the `docs/source` directory, complete the following steps: **NOTE:** Your tutorial source file needs to follow the tutorial template. -3. Add the file that you have created in **Step 1** to the `index.rst` toctree +3. Add the file that you have created in **Step 1** to the `index.md` toctree and add a `customcarditem` with the link to that file. For example, if I wanted to include the `README.md` file from @@ -176,7 +176,7 @@ file: ```{include} ../../../examples/selective_build/README.md ```` -In the `index.rst` file, I would add `tutorials/selective-build-tutorial` in +In the `index.md` file, I would add `tutorials/selective-build-tutorial` in both the `toctree` and the `cusotmcarditem` sections. 
# Auto-generated API documentation @@ -211,7 +211,7 @@ executorch.exir ``` These separate `.rst` files should all be linked together, with the initial -landing page under `index.rst`. +landing page under `index.md`. ### C++ APIs @@ -236,4 +236,4 @@ important/relevant parts are: If you need to include new files, simply add them to the `INPUT` in the `Doxyfile`. The generated output is included to the ExecuTorch documentation -build and referenced in `index.rst`. +build and referenced in `index.md`. diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index 0d60bf51c7e..ef076e5794d 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -399,9 +399,9 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. +# https://python-sip.readthedocs.io/en/stable/introduction.html) sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO @@ -1483,8 +1483,9 @@ HTML_INDEX_NUM_ENTRIES = 100 # output directory. Running make will produce the docset in that directory and # running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy -# genXcode/_index.html for more information. +# startup. See +# https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html +# for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. 
diff --git a/docs/source/_static/img/demo_ios_app.jpg b/docs/source/_static/img/demo_ios_app.jpg deleted file mode 100644 index d45b3dd38f6..00000000000 Binary files a/docs/source/_static/img/demo_ios_app.jpg and /dev/null differ diff --git a/docs/source/_static/img/demo_ios_app.png b/docs/source/_static/img/demo_ios_app.png new file mode 100644 index 00000000000..97622123093 Binary files /dev/null and b/docs/source/_static/img/demo_ios_app.png differ diff --git a/docs/source/_static/img/llama_ios_app.mp4 b/docs/source/_static/img/llama_ios_app.mp4 index 2f5df08984d..b4bf23cfdf6 100644 Binary files a/docs/source/_static/img/llama_ios_app.mp4 and b/docs/source/_static/img/llama_ios_app.mp4 differ diff --git a/docs/source/_static/img/llama_ios_app.png b/docs/source/_static/img/llama_ios_app.png index d9088abc4f9..fff399cfe1d 100644 Binary files a/docs/source/_static/img/llama_ios_app.png and b/docs/source/_static/img/llama_ios_app.png differ diff --git a/docs/source/_static/img/new-contributor-guide/ci1.png b/docs/source/_static/img/new-contributor-guide/ci1.png new file mode 100644 index 00000000000..ba26f572913 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/ci1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/cla1.png b/docs/source/_static/img/new-contributor-guide/cla1.png new file mode 100644 index 00000000000..0e9918bd542 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/cla1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/cla2.png b/docs/source/_static/img/new-contributor-guide/cla2.png new file mode 100644 index 00000000000..e62d90b46fd Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/cla2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/end_of_draft_pr1.png b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr1.png new file mode 100644 index 00000000000..4bd8d085a9f Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/end_of_draft_pr2.png b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr2.png new file mode 100644 index 00000000000..0de46229b22 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/end_of_draft_pr2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/good_first_issues.png b/docs/source/_static/img/new-contributor-guide/good_first_issues.png new file mode 100644 index 00000000000..0c3a0564678 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/good_first_issues.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_clone.png b/docs/source/_static/img/new-contributor-guide/how_to_clone.png new file mode 100644 index 00000000000..6a8ba7e9a35 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_clone.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_draft_pr1.png b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr1.png new file mode 100644 index 00000000000..b92a7016d52 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_draft_pr2.png b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr2.png new file mode 100644 index 00000000000..46110ba7886 Binary files /dev/null and 
b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_draft_pr3.png b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr3.png new file mode 100644 index 00000000000..ca5bb03436c Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_draft_pr3.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_fork1.png b/docs/source/_static/img/new-contributor-guide/how_to_fork1.png new file mode 100644 index 00000000000..c8f56d5a841 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_fork1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_fork2.png b/docs/source/_static/img/new-contributor-guide/how_to_fork2.png new file mode 100644 index 00000000000..ea4b2e9dfa2 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_fork2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_label1.png b/docs/source/_static/img/new-contributor-guide/how_to_label1.png new file mode 100644 index 00000000000..fb2d4e03868 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_label1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_label2.png b/docs/source/_static/img/new-contributor-guide/how_to_label2.png new file mode 100644 index 00000000000..f5d38561744 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_label2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_merge1.png b/docs/source/_static/img/new-contributor-guide/how_to_merge1.png new file mode 100644 index 00000000000..6f06911db97 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_merge1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_merge2.png b/docs/source/_static/img/new-contributor-guide/how_to_merge2.png new file mode 100644 index 00000000000..e7a38177b36 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_merge2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_merge3.png b/docs/source/_static/img/new-contributor-guide/how_to_merge3.png new file mode 100644 index 00000000000..88911271f04 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_merge3.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_pr1.png b/docs/source/_static/img/new-contributor-guide/how_to_pr1.png new file mode 100644 index 00000000000..454c86a6a02 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_pr1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_pr2.png b/docs/source/_static/img/new-contributor-guide/how_to_pr2.png new file mode 100644 index 00000000000..b3eb7900e81 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_pr2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/how_to_pr3.png b/docs/source/_static/img/new-contributor-guide/how_to_pr3.png new file mode 100644 index 00000000000..6c5037f78f4 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/how_to_pr3.png differ diff --git a/docs/source/_static/img/new-contributor-guide/pr_approval1.png b/docs/source/_static/img/new-contributor-guide/pr_approval1.png new file mode 100644 index 00000000000..d21ddd966ba Binary files /dev/null and 
b/docs/source/_static/img/new-contributor-guide/pr_approval1.png differ diff --git a/docs/source/_static/img/new-contributor-guide/pr_approval2.png b/docs/source/_static/img/new-contributor-guide/pr_approval2.png new file mode 100644 index 00000000000..88c0c2389d4 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/pr_approval2.png differ diff --git a/docs/source/_static/img/new-contributor-guide/release_notes.png b/docs/source/_static/img/new-contributor-guide/release_notes.png new file mode 100644 index 00000000000..8f5d34cf03d Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/release_notes.png differ diff --git a/docs/source/_static/img/new-contributor-guide/synced_fork.png b/docs/source/_static/img/new-contributor-guide/synced_fork.png new file mode 100644 index 00000000000..a2ba263df84 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/synced_fork.png differ diff --git a/docs/source/_static/img/new-contributor-guide/unsynced_fork.png b/docs/source/_static/img/new-contributor-guide/unsynced_fork.png new file mode 100644 index 00000000000..916b08424d5 Binary files /dev/null and b/docs/source/_static/img/new-contributor-guide/unsynced_fork.png differ diff --git a/docs/source/_static/img/swiftpm_xcode1.png b/docs/source/_static/img/swiftpm_xcode1.png index 11b9c237827..3fcad383610 100644 Binary files a/docs/source/_static/img/swiftpm_xcode1.png and b/docs/source/_static/img/swiftpm_xcode1.png differ diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html index 210153e123c..55f91103b35 100644 --- a/docs/source/_templates/layout.html +++ b/docs/source/_templates/layout.html @@ -2,8 +2,8 @@ {% block extrahead %} {% if 'getting-started-setup' in pagename%} - - + + {% elif 'compiler-delegate-and-partitioner' in pagename%} @@ -74,7 +74,7 @@
{{ toc }}
{% endif %} {% endblock %} - + {% block footer %} {{ super() }} @@ -131,14 +131,14 @@ $(".main-menu a:contains('GitHub')").each(overwrite); // Overwrite link to Tutorials and Get Started top navigation. If these sections are moved // this overrides need to be updated. - $(".main-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/stable/index.html#tutorials-and-examples"); - $(".main-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/stable/getting-started-setup.html"); + $(".main-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/main/index#tutorials-and-examples"); + $(".main-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/main/getting-started-setup"); // Mobile $(".mobile-menu a:contains('Github')").each(overwrite); // Overwrite link to Tutorials and Get Started top navigation. If these sections are moved // this overrides need to be updated. - $(".mobile-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/stable/index.html#tutorials-and-examples"); - $(".mobile-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/stable/getting-started-setup.html"); + $(".mobile-menu a:contains('Tutorials')").attr("href", "https://pytorch.org/executorch/main/index#tutorials-and-examples"); + $(".mobile-menu a:contains('Get Started')").attr("href", "https://pytorch.org/executorch/main/getting-started-setup"); }); diff --git a/docs/source/backend-delegates-integration.md b/docs/source/backend-delegates-integration.md index c127252e2f4..0179ceff872 100644 --- a/docs/source/backend-delegates-integration.md +++ b/docs/source/backend-delegates-integration.md @@ -16,7 +16,7 @@ Delegate Python files such as those implementing `preprocess()` or `partition()` functions for ExecuTorch AOT flow, excluding any external third-party dependencies and their files, should be installed and available with the top level ExecuTorch package. For third-party dependencies, please refer to -[this](./backend-delegates-dependencies.md). +[this](backend-delegates-dependencies.md). ## C++ Source Files @@ -28,7 +28,7 @@ top level `CMakeLists.txt` file using `add_subdirectory` CMake command, and should be built conditionally with an ExecuTorch build flag like `EXECUTORCH_BUILD_`, see `EXECUTORCH_BUILD_XNNPACK` for example. For third-party dependencies, please refer to -[this](./backend-delegates-dependencies.md). +[this](backend-delegates-dependencies.md). Scheme --> Edit Scheme --> Info --> Build Configuration We recommend that you only use the Debug build scheme during development, where you might need to access additional logs. Debug build has logging overhead and will impact inferencing performance, while release build has compiler optimizations enabled and all logging overhead removed. -For more details integrating and Running ExecuTorch on Apple Platforms or building the package locally, checkout this [link](https://pytorch.org/executorch/main/apple-runtime.html). +For more details integrating and Running ExecuTorch on Apple Platforms or building the package locally, checkout this [link](https://pytorch.org/executorch/main/using-executorch-ios). ### 4. Build and Run the project diff --git a/examples/demo-apps/react-native/rnllama/README.md b/examples/demo-apps/react-native/rnllama/README.md index 33c607d635f..f017c8bfa22 100644 --- a/examples/demo-apps/react-native/rnllama/README.md +++ b/examples/demo-apps/react-native/rnllama/README.md @@ -1,7 +1,7 @@ # React Native Llama

- rnllama Logo + rnllama Logo

A React Native mobile application for running LLaMA language models using ExecuTorch. This example is for iOS only for now. diff --git a/examples/devtools/README.md b/examples/devtools/README.md index e4fbadfcca0..0b516ad629e 100644 --- a/examples/devtools/README.md +++ b/examples/devtools/README.md @@ -17,7 +17,7 @@ examples/devtools We will use an example model (in `torch.nn.Module`) and its representative inputs, both from [`models/`](../models) directory, to generate a [BundledProgram(`.bpte`)](../../docs/source/bundled-io.md) file using the [script](scripts/export_bundled_program.py). Then we will use [devtools/example_runner](example_runner/example_runner.cpp) to execute the `.bpte` model on the ExecuTorch runtime and verify the model on BundledProgram API. -1. Sets up the basic development environment for ExecuTorch by [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/stable/getting-started-setup). +1. Sets up the basic development environment for ExecuTorch by [Setting up ExecuTorch from GitHub](https://pytorch.org/executorch/main/getting-started-setup). 2. Using the [script](scripts/export_bundled_program.py) to generate a BundledProgram binary file by retreiving a `torch.nn.Module` model and its representative inputs from the list of available models in the [`models/`](../models) dir. diff --git a/examples/llm_manual/README.md b/examples/llm_manual/README.md index e465255fc66..6318bbe7e84 100644 --- a/examples/llm_manual/README.md +++ b/examples/llm_manual/README.md @@ -1,3 +1,3 @@ # LLM Manual -This repository is a storage place for the files that [LLM Manual](https://pytorch.org/executorch/main/llm/getting-started.html) needs. Please refer to the documentation website for more information. +This repository is a storage place for the files that [LLM Manual](https://pytorch.org/executorch/main/llm/getting-started) needs. Please refer to the documentation website for more information. diff --git a/examples/llm_manual/export_nanogpt.py b/examples/llm_manual/export_nanogpt.py index 9de2e831e25..8c948479f2a 100644 --- a/examples/llm_manual/export_nanogpt.py +++ b/examples/llm_manual/export_nanogpt.py @@ -28,7 +28,7 @@ # The torch.no_grad() call tells PyTorch to exclude training-specific logic. with sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): m = export_for_training( - model, example_inputs, dynamic_shapes=dynamic_shape + model, example_inputs, dynamic_shapes=dynamic_shape, strict=True ).module() traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape, strict=True) diff --git a/examples/llm_pte_finetuning/README.md b/examples/llm_pte_finetuning/README.md index 8aeea31608c..b8d0b1eac1a 100644 --- a/examples/llm_pte_finetuning/README.md +++ b/examples/llm_pte_finetuning/README.md @@ -63,7 +63,7 @@ shuffle: True batch_size: 1 ``` -Torchtune supports datasets using huggingface dataloaders, so custom datasets could also be defined. For examples on defining your own datasets, review the [torchtune docs](https://pytorch.org/torchtune/stable/tutorials/datasets.html#hugging-face-datasets). +Torchtune supports datasets using huggingface dataloaders, so custom datasets could also be defined. For examples on defining your own datasets, review the [torchtune docs](https://pytorch.org/torchtune/stable/basics/text_completion_datasets.html#loading-text-completion-datasets-from-hugging-face). 
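The devtools README above generates a BundledProgram (`.bpte`) with `scripts/export_bundled_program.py` and replays it with `devtools/example_runner`. The sketch below walks that flow end to end for a toy module; the `BundledProgram`, `MethodTestCase`, `MethodTestSuite`, and serializer names are recalled from the bundled-io docs rather than taken from this patch, so treat them as assumptions and defer to the referenced script for the canonical flow.

```python
import torch
from torch.export import export

from executorch.exir import to_edge
# The helper names below are recalled from the bundled-io docs; treat them as assumptions.
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)


class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1


example_inputs = (torch.randn(2, 2),)
et_program = to_edge(export(AddOne(), example_inputs)).to_executorch()

# One test suite per method; the runner replays these inputs and checks the outputs.
suites = [
    MethodTestSuite(
        method_name="forward",
        test_cases=[
            MethodTestCase(
                inputs=example_inputs,
                expected_outputs=(AddOne()(*example_inputs),),
            )
        ],
    )
]

with open("add_one.bpte", "wb") as f:
    f.write(serialize_from_bundled_program_to_flatbuffer(BundledProgram(et_program, suites)))
```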
### Loss diff --git a/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py b/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py index 9b5ef2c0c85..00ca0bf5b77 100644 --- a/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py +++ b/examples/mediatek/aot_utils/llm_utils/tokenizers_/tokenization_llama.py @@ -454,7 +454,7 @@ def create_token_type_ids_from_sequences( Optional second list of IDs for sequence pairs. Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + `List[int]`: List of token type IDs according to the given sequence(s). """ bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py index 2246b8eeb15..25362788e31 100755 --- a/examples/mediatek/aot_utils/oss_utils/utils.py +++ b/examples/mediatek/aot_utils/oss_utils/utils.py @@ -30,7 +30,9 @@ def build_executorch_binary( if quant_dtype not in Precision: raise AssertionError(f"No support for Precision {quant_dtype}.") - captured_model = torch.export.export_for_training(model, inputs).module() + captured_model = torch.export.export_for_training( + model, inputs, strict=True + ).module() annotated_model = prepare_pt2e(captured_model, quantizer) print("Quantizing the model...") # calibration diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 5274d0925ae..131ad95e34b 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -80,11 +80,9 @@ bool MTKLlamaRunner::is_loaded() const { Error MTKLlamaRunner::generate( const std::string& prompt, - int32_t seq_len, + executorch::extension::llm::GenerationConfig config, std::function token_callback, - std::function stats_callback, - bool echo, - bool warming) { + std::function stats_callback) { if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index 0f76f610a7e..5dd8a85005e 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -43,11 +43,9 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner { Error load(); Error generate( const std::string& prompt, - int32_t seq_len = 128, + executorch::extension::llm::GenerationConfig config, std::function token_callback = {}, - std::function stats_callback = {}, - bool echo = true, - bool warming = false); + std::function stats_callback = {}); void stop(); LlamaModelOptions get_model_options(); diff --git a/examples/mediatek/model_export_scripts/llama.py b/examples/mediatek/model_export_scripts/llama.py index 5da17727075..413df21d5cc 100644 --- a/examples/mediatek/model_export_scripts/llama.py +++ b/examples/mediatek/model_export_scripts/llama.py @@ -319,7 +319,7 @@ def export_to_et_ir( ) print("Getting pre autograd ATen Dialect Graph") pre_autograd_aten_dialect = torch.export.export_for_training( - model, example_inputs, dynamic_shapes=dynamic_shapes + model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True ).module() # NOTE: Will be replaced with export quantizer = NeuropilotQuantizer() quantizer.setup_precision(getattr(Precision, precision)) diff --git 
a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md index 3a7a723c73b..5fd47ad61ec 100644 --- a/examples/models/deepseek-r1-distill-llama-8B/README.md +++ b/examples/models/deepseek-r1-distill-llama-8B/README.md @@ -17,7 +17,7 @@ pip install -U "huggingface_hub[cli]" huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-8B --local-dir /target_dir/DeepSeek-R1-Distill-Llama-8B --local-dir-use-symlinks False ``` -2. Download the [tokenizer.model](https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/original/tokenizer.model) from the Llama3.1 repo which will be needed later on when running the model using the runtime. +2. Download the [tokenizer.model](https://huggingface.co/meta-llama/Llama-3.1-8B/tree/main/original) from the Llama3.1 repo which will be needed later on when running the model using the runtime. 3. Convert the model to pth file. ``` @@ -48,16 +48,13 @@ print("saving checkpoint") torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth") ``` -4. Download and save the params.json file -``` -wget https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/original/params.json -o /tmp/params.json -``` +4. Download and save the [params.json](https://huggingface.co/meta-llama/Llama-3.1-8B/tree/main/original) file. 5. Generate a PTE file for use with the Llama runner. ``` python -m examples.models.llama.export_llama \ --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \ - -p /tmp/params.json \ + -p params.json \ -kv \ --use_sdpa_with_kv_cache \ -X \ diff --git a/examples/models/efficient_sam/README.md b/examples/models/efficient_sam/README.md index bce1f7c5319..1f89a3ec5b3 100644 --- a/examples/models/efficient_sam/README.md +++ b/examples/models/efficient_sam/README.md @@ -12,7 +12,7 @@ Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup# ### Exporting to Core ML -Make sure to install the [required dependencies](https://pytorch.org/executorch/main/build-run-coreml.html#setting-up-your-developer-environment) for Core ML export. +Make sure to install the [required dependencies](https://pytorch.org/executorch/main/backends-coreml#development-requirements) for Core ML export. To export the model to Core ML, run the following command: @@ -32,7 +32,7 @@ python -m examples.xnnpack.aot_compiler -m efficient_sam # Performance -Tests were conducted on an Apple M1 Pro chip using the instructions for building and running Executorch with [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html#runtime) and [XNNPACK](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering.html#running-the-xnnpack-model-with-cmake) backends. +Tests were conducted on an Apple M1 Pro chip using the instructions for building and running Executorch with [Core ML](https://pytorch.org/executorch/main/backends-coreml#runtime-integration) and [XNNPACK](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering#running-the-xnnpack-model-with-cmake) backends. | Backend Configuration | Average Inference Time (seconds) | | ---------------------- | -------------------------------- | @@ -46,4 +46,4 @@ All models were tested with `float32` precision. # Licensing -The code in the `efficient_sam_core` directory is licensed under the [Apache License 2.0](./efficient_sam_core/LICENSE.txt). +The code in the `efficient_sam_core` directory is licensed under the [Apache License 2.0](efficient_sam_core/LICENSE.txt). 
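Several export scripts in this patch (the nanoGPT manual, the MediaTek OSS utilities and llama export script) now pass `strict=True` to `torch.export.export_for_training` explicitly. A minimal, self-contained sketch of that pattern on a toy module, independent of any of the real export scripts:

```python
import torch
from torch.export import export_for_training


class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) * 2


example_inputs = (torch.randn(2, 8),)
# strict=True requests the TorchDynamo-backed strict tracing mode, now spelled out
# explicitly in this patch instead of relying on the default.
captured = export_for_training(Toy(), example_inputs, strict=True).module()
print(captured(torch.randn(2, 8)).shape)
```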
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index e6d45424bd4..12385f32d20 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -111,7 +111,8 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - list(APPEND link_libraries $) + target_link_options_shared_lib(custom_ops) + list(APPEND link_libraries custom_ops) endif() if(EXECUTORCH_BUILD_TORCHAO) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 0bef45ea3ae..3f616b86e19 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -11,7 +11,7 @@ Here are supported models: Pretrained models are not included in this repo. Users are suggested to download them [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). -This page contains the basic recipe for running Llama. See [Llama utils page](./UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes. +This page contains the basic recipe for running Llama. See [Llama utils page](UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes. # What is Llama? Llama is a collection of large language models that use publicly available data for training. These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence. @@ -80,12 +80,12 @@ Llama 3.2 1B and 3B performance was measured on Android OnePlus 12 device. The p
- +
Llama3.2 1B, unquantized, BF16 on Android phone.
- +
Llama3.2 3B, 4bit quantized (SpinQuant) on Android phone @@ -129,7 +129,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus


- +
Llama3.1 8B, 4bit quantized on Android phone @@ -143,7 +143,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus ## Tested on - MacOS M1/M2, Linux. -- For Llama 3 8B, your device may require at least 32GB RAM. If this is a constraint for you, please try the [smaller stories model](./UTILS.md). +- For Llama 3 8B, your device may require at least 32GB RAM. If this is a constraint for you, please try the [smaller stories model](UTILS.md). ## Step 1: Setup > :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. @@ -177,6 +177,7 @@ python -m examples.models.llama.export_llama \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ --output_name="llama3_2.pte" ``` +For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/llama3_2-1B.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-ET/blob/main/ExportRecipe_1B.ipynb). - To use **SpinQuant**, here are two ways: - Download directly from [Llama website](https://www.llama.com/llama-downloads). The model weights are prequantized and can be exported to `pte` file directly. @@ -206,6 +207,8 @@ python -m examples.models.llama.export_llama \ --use_spin_quant native \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' ``` +For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_SpinQuant_INT4_EO8.ipynb). + - To use **QAT+LoRA**, download directly from [Llama website](https://www.llama.com/llama-downloads). The model weights are prequantized and can be exported to `pte` file directly by: @@ -234,6 +237,7 @@ python -m examples.models.llama.export_llama \ --output_name "llama3_2.pte" \ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' ``` +For convenience, an [exported ExecuTorch QAT+LoRA model](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Llama-3.2-1B-Instruct-QLORA_INT4_EO8.pte) is available on Hugging Face. The export was created using [this detailed recipe notebook](https://huggingface.co/executorch-community/Llama-3.2-1B-Instruct-QLORA_INT4_EO8-ET/blob/main/Export_Recipe_Llama_3_2_1B_Instruct_QLORA_INT4_EO8.ipynb). ### Option B: Download and export Llama 3 8B instruct model @@ -371,14 +375,14 @@ adb push cmake-out-android/examples/models/llama/llama_main /data/local/tmp/llam ``` adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"What is the capital of France?\" --seq_len 120" --warmup=1 ``` -## Step 6: Build Mobile apps +## Step 5: Build Mobile apps ### iOS -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios) to for full instructions on building the iOS LLAMA Demo App. 
Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. ### Android -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android) to for full instructions on building the Android LLAMA Demo App. ## Running with low-bit kernels @@ -412,7 +416,7 @@ python -m examples.models.llama.export_llama \ ``` A few notes: -- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized with weight zeros or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and uses weight zeros (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32, but is quantized with scales-only. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. +- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized asymmetrically or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and is asymmetric (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32 and is symmetric. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. - To do channelwise quantization, specify group_size to 0. This works for both linear and embedding layers. Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels. @@ -427,7 +431,7 @@ cmake -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ @@ -492,7 +496,7 @@ python -m examples.models.llama.eval_llama \ --max_context_len ``` -See [Llama utils page](./UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes, and quick iteration and verification. +See [Llama utils page](UTILS.md) page for more advanced use-cases such as fine-tuning and running smaller models for educational purposes, and quick iteration and verification. # What is coming next? ## Quantization @@ -544,3 +548,22 @@ clang: error: linker command failed with exit code 1 (use -v to see invocation) ``` It's a known issue for Xcode version 15.1. Mitigation: update to most recent Xcode version, clean and rebuild. 
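The `-E "torchao:4,32,true"` option described in the low-bit kernels section above packs bitwidth, group size, and the symmetric-vs-asymmetric choice into one string. Below is a toy parse of that string into the torchao objects used elsewhere in this patch (`PerAxis`, `PerGroup`, `MappingType`); the real parsing lives in `get_quant_embedding_transform` in `source_transformation/quantize.py`, and the string-to-bool handling here is this sketch's own choice rather than the exported script's.

```python
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import MappingType


def parse_embedding_quant(option: str):
    """Toy parser for '-E torchao:<bits>,<group_size>[,<asymmetric>]' style strings."""
    bitwidth, group_size, *rest = option.split(":")[1].split(",")
    group_size = int(group_size)
    # Third field defaults to asymmetric when omitted, matching the README text above.
    asymmetric = rest[0].strip().lower() == "true" if rest else True
    granularity = PerAxis(0) if group_size == 0 else PerGroup(group_size)
    mapping = MappingType.ASYMMETRIC if asymmetric else MappingType.SYMMETRIC
    return int(bitwidth), granularity, mapping


print(parse_embedding_quant("torchao:4,32,false"))  # -> 4-bit, PerGroup(32), symmetric
```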
+ +- If you encounter issues with missing abseil-cpp or re2, try running `git submodule update --init --recursive` to pull in those submodules. +Example error: +``` +CMake Error at runner/CMakeLists.txt:68 (add_subdirectory): + The source directory + + /Users/../executorch/extension/llm/tokenizers/third-party/abseil-cpp + + does not contain a CMakeLists.txt file. + + +CMake Error at runner/CMakeLists.txt:72 (add_subdirectory): + The source directory + + /Users/../executorch/extension/llm/tokenizers/third-party/re2 + + does not contain a CMakeLists.txt file. +``` diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index 93ac18c993d..f2aa396f7a1 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -3,7 +3,7 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_common_targets") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") oncall("executorch") @@ -90,7 +90,7 @@ runtime.python_binary( runtime.command_alias( name = "export_llama_qnn", env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), }, exe = ":export_llama", ) @@ -108,7 +108,7 @@ runtime.python_library( "source_transformation/pre_quantization.py", "source_transformation/prune_vocab.py", "source_transformation/quantize.py", - "source_transformation/quantized_kv_cache.py", + "source_transformation/custom_kv_cache.py", "source_transformation/rms_norm.py", "source_transformation/rope.py", "source_transformation/sdpa.py", @@ -208,9 +208,9 @@ runtime.python_library( ) runtime.python_library( - name = "quantized_kv_cache", + name = "custom_kv_cache", srcs = [ - "source_transformation/quantized_kv_cache.py", + "source_transformation/custom_kv_cache.py", ], _is_external_target = True, visibility = ["//executorch/..."], @@ -240,7 +240,7 @@ runtime.python_test( "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", ], deps = [ - ":quantized_kv_cache", + ":custom_kv_cache", "//caffe2:torch", "//executorch/examples/models/llama:llama_transformer", ], @@ -255,7 +255,7 @@ runtime.python_test( "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", ], deps = [ - ":quantized_kv_cache", + ":custom_kv_cache", ":sdpa", "//caffe2:torch", "//executorch/examples/models/llama:llama_transformer", @@ -274,3 +274,20 @@ runtime.python_test( ":export_library", ], ) + +runtime.python_test( + name = "quantized_sdpa_source_transform_test", + srcs = [ + "source_transformation/test_quantized_sdpa.py", + ], + preload_deps = [ + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/extension/llm/custom_ops:custom_ops_aot_py", + ], + deps = [ + ":custom_kv_cache", + ":sdpa", + "//caffe2:torch", + "//executorch/examples/models/llama:llama_transformer", + ], +) diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index dd014240ace..5f760ad7670 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -25,7 +25,7 @@ From `executorch` root: ## Smaller model delegated to other backends Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. 
Please refer to the instruction -for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is +for each backend ([CoreML](https://pytorch.org/executorch/main/backends-coreml), [MPS](https://pytorch.org/executorch/main/backends-mps), [QNN](https://pytorch.org/executorch/main/backends-qualcomm)) before trying to lower them. After the backend library is installed, the script to export a lowered model is - Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` - MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 8e6d4fefb0e..79a225232e0 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -59,14 +59,15 @@ ) from .source_transformation.attention import replace_attention_to_attention_sha +from .source_transformation.custom_kv_cache import ( + replace_kv_cache_with_custom_kv_cache, + replace_kv_cache_with_quantized_kv_cache, +) + from .source_transformation.quantize import ( get_quant_embedding_transform, get_quant_weight_transform, ) -from .source_transformation.quantized_kv_cache import ( - replace_kv_cache_with_custom_kv_cache, - replace_kv_cache_with_quantized_kv_cache, -) from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis @@ -77,6 +78,7 @@ replace_sdpa_with_coreml_sdpa, replace_sdpa_with_custom_op, replace_sdpa_with_flex_sdpa, + replace_sdpa_with_quantized_sdpa, replace_sdpa_with_simple_sdpa, ) from .source_transformation.vulkan_rope import replace_with_vulkan_rotary_emb @@ -651,7 +653,7 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager: _get_source_transforms( modelname=args.model, dtype_override=dtype_override, - checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), + checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), # type: ignore args=args, ) ) @@ -793,10 +795,6 @@ def _to_edge_and_lower_llama( # noqa: C901 args.enable_dynamic_shape, ) ) - # Apply XNNPACK after Vulkan so that undelegated ops can be accelerated by XNNPACK - partitioners.append( - get_xnnpack_partitioner(dynamic_quant_only_partitioner=False) - ) modelname = f"vulkan_{modelname}" # Need to remove asserts from the graph to prevent graph breaks @@ -818,6 +816,10 @@ def _to_edge_and_lower_llama( # noqa: C901 modelname = f"coreml_{modelname}" if args.qnn: + logging.warning( + "The model definition in current repro is not performant, please refer to the instruction" + " in https://github.com/pytorch/executorch/tree/main/examples/qualcomm/oss_scripts/llama/README.md for better performance." 
+ ) from executorch.extension.llm.custom_ops import model_sharding partitioners.append( @@ -1104,7 +1106,7 @@ def _load_llama_model( return LLMEdgeManager( model=model, modelname=modelname, - max_seq_len=model.max_seq_len, + max_seq_len=model.max_seq_len, # type: ignore dtype=dtype_override, use_kv_cache=use_kv_cache, generate_full_logits=generate_full_logits, @@ -1117,6 +1119,8 @@ def _load_llama_model( calibration_seq_length=calibration_seq_length, calibration_data=calibration_data, tokenizer_path=tokenizer_path, + use_legacy_export=args.qnn, + save_exported_program=args.export_only, verbose=verbose, metadata=_load_llama_model_metadata( weight_type, @@ -1137,7 +1141,6 @@ def _load_llama_model( model.vocab_size, metadata_str, ), - args=args, ) @@ -1224,13 +1227,28 @@ def _get_source_transforms( # noqa if args.expand_rope_table: transforms.append(materialze_broadcast_of_rope_freq_cis) + use_attention_mask_for_custom_sdpa = False + if isinstance(args, argparse.Namespace): + if getattr(args, "use_custom_sdpa_with_attention_mask", None): + use_attention_mask_for_custom_sdpa = True + if args.use_sdpa_with_kv_cache: transforms.append(replace_kv_cache_with_custom_kv_cache) - transforms.append(replace_sdpa_with_custom_op) + # todo: do this optionally + # if use attention mask instead of causal attention + # then create partial function that sets use_attention_mask=True + if use_attention_mask_for_custom_sdpa: + transforms.append( + partial(replace_sdpa_with_custom_op, use_attention_mask=True) + ) + else: + transforms.append(replace_sdpa_with_custom_op) if args.quantize_kv_cache: assert args.use_kv_cache, "quantize_kv_cache requires use_kv_cache=True" transforms.append(replace_kv_cache_with_quantized_kv_cache) + # Right now + transforms.append(replace_sdpa_with_quantized_sdpa) if args.use_kv_cache: if args.qnn: diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 5fe0ce93cf6..5179bf28fc7 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -53,7 +53,7 @@ int32_t main(int32_t argc, char** argv) { const char* prompt = FLAGS_prompt.c_str(); - double temperature = FLAGS_temperature; + float temperature = FLAGS_temperature; int32_t seq_len = FLAGS_seq_len; @@ -73,13 +73,18 @@ int32_t main(int32_t argc, char** argv) { } #endif // create llama runner - example::Runner runner(model_path, tokenizer_path, temperature); + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + example::Runner runner(model_path, tokenizer_path); if (warmup) { - runner.warmup(prompt, seq_len); + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + runner.warmup(prompt, /*max_new_tokens=*/seq_len); } // generate - runner.generate(prompt, seq_len); + executorch::extension::llm::GenerationConfig config{ + .seq_len = seq_len, .temperature = temperature}; + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + runner.generate(prompt, config); return 0; } diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index 19829576482..2c82841c573 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -18,6 +18,7 @@ from executorch.examples.models.llama.llama_transformer import Transformer from executorch.examples.models.llama.model_args import ModelArgs +from torchao.utils import TorchAOBaseTensor try: from .fairseq2 import convert_to_llama_checkpoint @@ -257,6 +258,9 @@ def __init__(self, **kwargs): strict=False, assign=True, ) # self.model_ = Transformer(gptconf) + for param in self.model_.parameters(): + if isinstance(param, 
TorchAOBaseTensor): + param.requires_grad = False else: print("Checkpoint not provided, defaulting weights to zeros.") self.model_.to_empty(device="cpu") diff --git a/examples/models/llama/non_cpu_backends.md b/examples/models/llama/non_cpu_backends.md index 1ee594ebd83..f414582a3c1 100644 --- a/examples/models/llama/non_cpu_backends.md +++ b/examples/models/llama/non_cpu_backends.md @@ -2,7 +2,7 @@ # Running Llama 3/3.1 8B on non-CPU backends ### QNN -Please follow [the instructions](https://pytorch.org/executorch/stable/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.html) to deploy Llama 3 8B to an Android smartphone with Qualcomm SoCs. +Please follow [the instructions](https://pytorch.org/executorch/main/llm/build-run-llama3-qualcomm-ai-engine-direct-backend) to deploy Llama 3 8B to an Android smartphone with Qualcomm SoCs. ### MPS Export: @@ -10,7 +10,7 @@ Export: python -m examples.models.llama2.export_llama --checkpoint llama3.pt --params params.json -kv --disable_dynamic_shape --mps --use_sdpa_with_kv_cache -d fp32 -qmode 8da4w -G 32 --embedding-quantize 4,32 ``` -After exporting the MPS model .pte file, the [iOS LLAMA](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) app can support running the model. ` --embedding-quantize 4,32` is an optional args for quantizing embedding to reduce the model size. +After exporting the MPS model .pte file, the [iOS LLAMA](https://pytorch.org/executorch/main/llm/llama-demo-ios) app can support running the model. ` --embedding-quantize 4,32` is an optional args for quantizing embedding to reduce the model size. ### CoreML Export: diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index 429e4b61c36..53c777fa80b 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -41,13 +41,11 @@ static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; Runner::Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature, std::optional data_path) // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. - : temperature_(temperature), - tokenizer_path_(tokenizer_path), + : tokenizer_path_(tokenizer_path), metadata_({ {kEnableDynamicShape, false}, {kMaxSeqLen, 128}, @@ -68,6 +66,17 @@ Runner::Runner( tokenizer_path.c_str()); } +[[deprecated( + "This constructor is deprecated. 
Use the constructor without temperature parameter instead.")]] +Runner::Runner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature, + std::optional data_path) + : Runner(model_path, tokenizer_path, std::move(data_path)) { + temperature_ = temperature; +} + bool Runner::is_loaded() const { return module_->is_loaded() && tokenizer_ && text_decoder_runner_ && text_prefiller_ && text_token_generator_; @@ -133,11 +142,9 @@ Error Runner::load() { ET_LOG(Info, "eos_id = %" PRId64, value); } } + // @lint-ignore CLANGTIDY facebook-hte-Deprecated text_decoder_runner_ = std::make_unique( - module_.get(), - metadata_.at(kUseKVCache), - metadata_.at(kVocabSize), - temperature_); + module_.get(), metadata_.at(kUseKVCache)); text_prefiller_ = std::make_unique( text_decoder_runner_.get(), metadata_.at(kUseKVCache), @@ -164,11 +171,9 @@ Error Runner::load() { Error Runner::generate( const std::string& prompt, - int32_t seq_len, + const ::executorch::extension::llm::GenerationConfig& config, std::function token_callback, - std::function stats_callback, - bool echo, - bool warmup) { + std::function stats_callback) { // Prepare the inputs. // Use ones-initialized inputs. ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); @@ -178,19 +183,19 @@ Error Runner::generate( stats_.model_load_end_ms = llm::time_in_ms(); } - if (warmup) { + if (config.warming) { ET_LOG(Info, "Doing a warmup run..."); } RUNNER_ET_LOG( - warmup, + config.warming, "RSS after loading model: %f MiB (0 if unsupported)", llm::get_rss_bytes() / 1024.0 / 1024.0); // Wrap the token_callback with print function std::function wrapped_callback = - [token_callback, warmup](const std::string& piece) { - if (!warmup) { + [token_callback, config](const std::string& piece) { + if (!config.warming) { llm::safe_printf(piece.c_str()); fflush(stdout); } @@ -204,11 +209,6 @@ Error Runner::generate( stats_.inference_start_ms = llm::time_in_ms(); shouldStop_ = false; - // Set the sequence length to the max seq length if not provided - seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxContextLen)) - ? seq_len - : metadata_.at(kMaxContextLen); - ::tokenizers::Result> encode_res = tokenizer_->encode( prompt, /* bos */ 0, @@ -225,21 +225,22 @@ Error Runner::generate( ET_CHECK_MSG( num_prompt_tokens < metadata_.at(kMaxContextLen), "num_prompt_tokens %d >= max_seq_len_ %" PRId64 - ", Max seq length exceeded - please increase max seq len value in .../llama2/model.py", + ", Max seq length exceeded - please increase max seq len value in your export script", num_prompt_tokens, metadata_.at(kMaxContextLen)); - ET_CHECK_MSG( - num_prompt_tokens < seq_len, - "num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - please increase the seq_len value passed to generate()", - num_prompt_tokens, - seq_len); + + // Determine max_new_tokens using the GenerationConfig's resolve method + int max_new_tokens = config.resolve_max_new_tokens( + metadata_.at(kMaxContextLen), num_prompt_tokens); + + ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); // Prefill first // Here feed all tokens to the model and get the next predicted token // after the prompt. After that we will enter generate loop. 
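The refactored runner above delegates sequence-length handling to `GenerationConfig::resolve_max_new_tokens(...)`, whose implementation is not part of this patch. The Python sketch below is only an assumed analog of how such a resolution could reconcile `seq_len` and `max_new_tokens` with the context budget; it is not the actual ExecuTorch class.

```python
from dataclasses import dataclass


@dataclass
class GenerationConfig:
    echo: bool = True
    warming: bool = False
    seq_len: int = -1          # total tokens (prompt + generated); -1 means unset
    max_new_tokens: int = -1   # -1 means unset
    temperature: float = 0.8

    def resolve_max_new_tokens(self, max_context_len: int, num_prompt_tokens: int) -> int:
        # Never generate past the model's context window.
        budget = max_context_len - num_prompt_tokens
        if self.max_new_tokens > 0:
            return min(self.max_new_tokens, budget)
        if self.seq_len > 0:
            return min(self.seq_len - num_prompt_tokens, budget)
        return budget


cfg = GenerationConfig(seq_len=120)
print(cfg.resolve_max_new_tokens(max_context_len=2048, num_prompt_tokens=16))  # 104
```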
// print prompts - if (echo) { + if (config.echo) { wrapped_callback(prompt); } int64_t pos = 0; @@ -253,32 +254,38 @@ Error Runner::generate( wrapped_callback( ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); RUNNER_ET_LOG( - warmup, + config.warming, "RSS after prompt prefill: %f MiB (0 if unsupported)", llm::get_rss_bytes() / 1024.0 / 1024.0); // start the main loop prompt_tokens.push_back(cur_token); + + // Generate max_new_tokens - 1 because prefill already generated 1 token. int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback)); + prompt_tokens, + num_prompt_tokens, + max_new_tokens - 1, + temperature_ == -1.0f ? config.temperature : temperature_, + wrapped_callback)); stats_.inference_end_ms = llm::time_in_ms(); - if (!warmup) { + if (!config.warming) { printf("\n"); } RUNNER_ET_LOG( - warmup, + config.warming, "RSS after finishing text generation: %f MiB (0 if unsupported)", llm::get_rss_bytes() / 1024.0 / 1024.0); - if (num_prompt_tokens + num_generated_tokens == seq_len) { - RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len); + if (num_generated_tokens == max_new_tokens) { + RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); } stats_.num_prompt_tokens = num_prompt_tokens; stats_.num_generated_tokens = num_generated_tokens; - if (warmup) { + if (config.warming) { ET_LOG(Info, "Warmup run finished!"); } else { // Do not print report during warmup @@ -291,14 +298,15 @@ Error Runner::generate( return Error::Ok; } -Error Runner::warmup(const std::string& prompt, int32_t seq_len) { - Error err = generate( - prompt, - seq_len, - /*token_callback=*/nullptr, - /*stats_callbak=*/nullptr, - /*echo=*/false, - /*warmup=*/true); +Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) { + // Create a GenerationConfig for warmup + llm::GenerationConfig config{ + .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; + + // Call generate with the warmup config + Error err = generate(prompt, config); + + // Reset stats after warmup stats_.reset(); return err; } diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index 509fe234027..97ffe4b98b7 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -33,26 +33,30 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { explicit Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature = 0.8f, std::optional data_path = std::nullopt); - bool is_loaded() const; - ::executorch::runtime::Error load(); + [[deprecated( + "This constructor is deprecated. 
Use the constructor without temperature parameter instead.")]] + explicit Runner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature, + std::optional data_path = std::nullopt); + + bool is_loaded() const override; + ::executorch::runtime::Error load() override; ::executorch::runtime::Error generate( const std::string& prompt, - int32_t seq_len = 128, + const ::executorch::extension::llm::GenerationConfig& config, std::function token_callback = {}, std::function - stats_callback = {}, - bool echo = true, - bool warming = false); + stats_callback = {}) override; ::executorch::runtime::Error warmup( const std::string& prompt, - int32_t seq_len = 128); - void stop(); + int32_t max_new_tokens); + void stop() override; private: - float temperature_; bool shouldStop_{false}; // model @@ -68,6 +72,10 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { // stats ::executorch::extension::llm::Stats stats_; + + // temperature. + // Deprecated, we should rely on the temperature in GenerationConfig instead. + float temperature_ = -1.0f; }; } // namespace example diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/custom_kv_cache.py similarity index 88% rename from examples/models/llama/source_transformation/quantized_kv_cache.py rename to examples/models/llama/source_transformation/custom_kv_cache.py index e7138622ed9..1158a8ba7a6 100644 --- a/examples/models/llama/source_transformation/quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/custom_kv_cache.py @@ -52,6 +52,8 @@ def __init__( self.use_custom_update_cache_op = use_custom_update_cache_op self.quantized_cache_dtype = torch.int8 self.cache_fp_type = torch.float32 + self.return_float_values = True + self.max_context_length = max_context_length cache_shape = (max_batch_size, max_context_length, n_heads, head_dim) scale_shape = (max_batch_size, max_context_length, n_heads, 1) self.register_buffer( @@ -61,17 +63,17 @@ def __init__( "v_cache", torch.zeros(cache_shape, dtype=self.quantized_cache_dtype) ) self.register_buffer( - "k_cache_scales", torch.ones(scale_shape, dtype=torch.float64) + "k_cache_scales", torch.ones(scale_shape, dtype=torch.float32) ) self.register_buffer( - "v_cache_scales", torch.ones(scale_shape, dtype=torch.float64) + "v_cache_scales", torch.ones(scale_shape, dtype=torch.float32) ) if cache_type == QuantizedCacheType.AffineAsymmetric: self.register_buffer( - "k_cache_zero_points", torch.ones(scale_shape, dtype=torch.int64) + "k_cache_zero_points", torch.ones(scale_shape, dtype=torch.int8) ) self.register_buffer( - "v_cache_zero_points", torch.ones(scale_shape, dtype=torch.int64) + "v_cache_zero_points", torch.ones(scale_shape, dtype=torch.int8) ) def _quantize(self, value): @@ -91,20 +93,15 @@ def _quantize(self, value): ) return quantized_value, scales, zero_points - def update(self, input_pos, k_val, v_val): - """ - k_val, v_val: [B, H, S, D] - return: [B, H, S, D] - However the storage is [B, S, H, D] so we incur transpose in, transpose out - This shall be removed by subsequent post-export graph pass - """ - k_val = k_val.transpose(1, 2) - v_val = v_val.transpose(1, 2) - # quantize current k_val and store it in the cache + def _quantize_and_update(self, input_pos, k_val, v_val): quantized_k_val, k_scales, k_zero_points = self._quantize(k_val) - quantized_v_val, v_scales, v_zero_points = self._quantize(v_val) + k_scales = k_scales.to(torch.float32) + 
k_zero_points = k_zero_points.to(self.quantized_cache_dtype) + v_scales = v_scales.to(torch.float32) + v_zero_points = v_zero_points.to(self.quantized_cache_dtype) + if self.use_custom_update_cache_op: start_pos = input_pos[0].item() _ = torch.ops.llama.update_cache(quantized_k_val, self.k_cache, start_pos) @@ -125,10 +122,13 @@ def update(self, input_pos, k_val, v_val): self.v_cache_scales[:, input_pos] = v_scales self.v_cache_zero_points[:, input_pos] = v_zero_points + def _update_and_return_float_values(self, input_pos, k_val, v_val): + self._quantize_and_update(input_pos, k_val, v_val) + k_out = torch.ops.quantized_decomposed.dequantize_per_token( self.k_cache, - self.k_cache_scales, - self.k_cache_zero_points, + self.k_cache_scales.to(torch.float64), + self.k_cache_zero_points.to(torch.int64), torch.iinfo(self.quantized_cache_dtype).min, torch.iinfo(self.quantized_cache_dtype).max, self.quantized_cache_dtype, @@ -136,14 +136,16 @@ def update(self, input_pos, k_val, v_val): ) v_out = torch.ops.quantized_decomposed.dequantize_per_token( self.v_cache, - self.v_cache_scales, - self.v_cache_zero_points, + self.v_cache_scales.to(torch.float64), + self.v_cache_zero_points.to(torch.int64), torch.iinfo(self.quantized_cache_dtype).min, torch.iinfo(self.quantized_cache_dtype).max, self.quantized_cache_dtype, self.cache_fp_type, ) + # When returning float values we jsut use the last value + # instead of dequantized value. start_pos = input_pos[0].item() if self.use_custom_update_cache_op: _ = torch.ops.llama.update_cache(k_val, k_out, start_pos) @@ -152,6 +154,29 @@ def update(self, input_pos, k_val, v_val): k_out[:, input_pos] = k_val v_out[:, input_pos] = v_val + return k_out, v_out + + def _update_and_return_quantized_values(self, input_pos, k_val, v_val): + self._quantize_and_update(input_pos, k_val, v_val) + + return self.k_cache, self.v_cache + + def update(self, input_pos, k_val, v_val): + """ + k_val, v_val: [B, H, S, D] + return: [B, H, S, D] + However the storage is [B, S, H, D] so we incur transpose in, transpose out + This shall be removed by subsequent post-export graph pass + """ + k_val = k_val.transpose(1, 2) + v_val = v_val.transpose(1, 2) + + if self.return_float_values: + k_out, v_out = self._update_and_return_float_values(input_pos, k_val, v_val) + else: + k_out, v_out = self._update_and_return_quantized_values( + input_pos, k_val, v_val + ) return k_out.transpose(1, 2), v_out.transpose(1, 2) @classmethod diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index 2ef016de097..ec02f442217 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -107,14 +107,24 @@ def quantize( # noqa C901 print("quantized model:", model) return model elif qmode.startswith("torchao:8da"): + # Check for required args + if group_size is None: + raise Exception( + "For torchao:8daxw quantization, group size must be specified." 
+ ) + pattern = r"torchao:8da(\d+)w" matches = re.findall(pattern, qmode) assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}" bitwidth = int(matches[0][0]) - from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig - from torchao.quantization.granularity import PerGroup, PerRow - from torchao.quantization.quant_api import quantize_ + from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout + from torchao.quantization.granularity import PerAxis, PerGroup + from torchao.quantization.quant_api import ( + Int8DynamicActivationIntxWeightConfig, + MappingType, + quantize_, + ) from torchao.utils import unwrap_tensor_subclass with torch.no_grad(): @@ -124,8 +134,11 @@ def quantize( # noqa C901 model, Int8DynamicActivationIntxWeightConfig( weight_dtype=getattr(torch, f"int{bitwidth}"), - granularity=(PerRow() if group_size == 0 else PerGroup(group_size)), - has_weight_zeros=False, + weight_granularity=( + PerAxis(0) if group_size == 0 else PerGroup(group_size) + ), + weight_mapping_type=MappingType.SYMMETRIC, + layout=PackedLinearInt8DynamicActivationIntxWeightLayout(), ), ) model = unwrap_tensor_subclass(model) @@ -164,7 +177,7 @@ def quantize( # noqa C901 try: # torchao 0.3+ - from torchao._eval import InputRecorder # pyre-fixme[21] + from torchao._models._eval import InputRecorder except ImportError: from torchao.quantization.GPTQ import InputRecorder # pyre-ignore @@ -206,17 +219,6 @@ def quantize( # noqa C901 q_group_size = 256 if group_size is None else group_size model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model) - # Apply additional quantizer for linear layers that aren't lowered to Vulkan - # at the moment - from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer - - # 1. Quantize in checkpoint dtype. - model = Int8DynActInt4WeightQuantizer( - precision=checkpoint_torch_dtype, groupsize=q_group_size - ).quantize(model) - # 2. Set the computation dtype (what weights/acts dequantize to). 
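The updated `torchao:8da<bits>w` path above now builds an `Int8DynamicActivationIntxWeightConfig` with a packed layout instead of the older experimental config. A stand-alone sketch of that call on a toy linear stack, using only names that appear in this patch; it assumes a torchao build that ships the experimental packed low-bit kernels.

```python
import torch
from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
    MappingType,
    quantize_,
)

# Toy float model; the real path applies this to the exported Llama modules.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 16))

group_size = 32  # 0 would mean channelwise quantization (PerAxis(0)), per the README note above
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,  # the "4" in 8da4w
        weight_granularity=PerAxis(0) if group_size == 0 else PerGroup(group_size),
        weight_mapping_type=MappingType.SYMMETRIC,
        layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
    ),
)
print(model)
```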
- model = set_8da4w_computation_dtype(model, computation_torch_dtype) - return model else: raise Exception(f"Unrecognized quantize mode: {qmode}") @@ -788,23 +790,27 @@ def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None): EmbeddingQuantizer, SharedEmbeddingQuantizer, ) - from torchao.quantization.granularity import PerGroup, PerRow + from torchao.quantization.granularity import PerAxis, PerGroup + from torchao.quantization.quant_api import MappingType quant_args = args.embedding_quantize.split(":")[1].split(",") if len(quant_args) == 2: bitwidth, group_size = quant_args - has_weight_zeros = True + is_asymmetric = True else: - bitwidth, group_size, has_weight_zeros = quant_args + bitwidth, group_size, is_asymmetric = quant_args if group_size in ["none", "None", "0"]: group_size = 0 group_size = int(group_size) bitwidth = int(bitwidth) - has_weight_zeros = bool(has_weight_zeros) + is_asymmetric = bool(is_asymmetric) weight_dtype = getattr(torch, f"int{bitwidth}") - granularity = PerRow() if group_size == 0 else PerGroup(group_size) + granularity = PerAxis(0) if group_size == 0 else PerGroup(group_size) + mapping_type = ( + MappingType.ASYMMETRIC if is_asymmetric else MappingType.SYMMETRIC + ) def _torchao_embedding_quantizer(model): with torch.no_grad(): @@ -812,14 +818,14 @@ def _torchao_embedding_quantizer(model): EmbeddingQuantizer( weight_dtype=weight_dtype, granularity=granularity, - has_weight_zeros=has_weight_zeros, + mapping_type=mapping_type, use_fallback=False, ).quantize(model) else: SharedEmbeddingQuantizer( weight_dtype=weight_dtype, granularity=granularity, - has_weight_zeros=has_weight_zeros, + mapping_type=mapping_type, ).quantize(model) return model diff --git a/examples/models/llama/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py index 1bb7d277545..1bc54198fba 100644 --- a/examples/models/llama/source_transformation/sdpa.py +++ b/examples/models/llama/source_transformation/sdpa.py @@ -13,16 +13,24 @@ import torch -from executorch.examples.models.llama.attention import KVCache, SDPA +from executorch.examples.models.llama.attention import Attention, KVCache, SDPA + +from .custom_kv_cache import QuantizedKVCache class SDPACustom(torch.nn.Module): def __init__( self, dim: int, + max_context_len, + enable_dynamic_shape, + use_attention_mask: bool = False, ): super().__init__() self.dim = dim + self.max_context_len = max_context_len + self.use_attention_mask = use_attention_mask + self.enable_dynamic_shape = enable_dynamic_shape def forward( self, @@ -34,6 +42,16 @@ def forward( seqlen, mask, ): + if self.use_attention_mask: + if self.enable_dynamic_shape: + start_pos = input_pos[-1].item() + torch._check_is_size(start_pos) + torch._check(start_pos < self.max_context_len) + seq_length = q.size(2) + mask = mask.narrow(0, start_pos, seq_length) + else: + mask = mask[input_pos] + q = q.transpose(1, 2) # (bs, seqlen, n_local_heads, head_dim) k = k.transpose(1, 2) v = v.transpose(1, 2) @@ -45,34 +63,172 @@ def forward( k = k.to(dtype=torch.float) v = v.to(dtype=torch.float) - output = torch.ops.llama.custom_sdpa( - q, - k, - v, - input_pos[0].item(), - None, # Attention mask - 0, # dropout probability. Ignored by the code - True, # is_causal - ) + if self.use_attention_mask: + output = torch.ops.llama.custom_sdpa( + q, + k, + v, + input_pos[0].item(), + mask, # Attention mask + 0, # dropout probability. 
Ignored by the code + False, # is_causal + ) + else: + output = torch.ops.llama.custom_sdpa( + q, + k, + v, + input_pos[0].item(), + None, # Attention mask + 0, # dropout probability. Ignored by the code + True, # is_causal + ) return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype) -def _replace_sdpa_with_custom_op(module: torch.nn.Module): +def _replace_sdpa_with_custom_op( + module: torch.nn.Module, use_attention_mask: bool = False +): for name, child in module.named_children(): if isinstance(child, SDPA): setattr( module, name, - SDPACustom(child.dim), + SDPACustom( + child.dim, + child.max_context_len, + child.enable_dynamic_shape, + use_attention_mask=use_attention_mask, + ), + ) + else: + _replace_sdpa_with_custom_op(child, use_attention_mask=use_attention_mask) + + +def replace_sdpa_with_custom_op( + module: torch.nn.Module, use_attention_mask: bool = False +) -> torch.nn.Module: + from executorch.extension.llm.custom_ops import custom_ops # noqa + + _replace_sdpa_with_custom_op(module, use_attention_mask=use_attention_mask) + return module + + +class QuantizedSDPA(torch.nn.Module): + """ + A quantized version of the SDPA (Scaled Dot Product Attention) module. + + This module implements attention computation using quantized key-value pairs + to reduce memory footprint and potentially improve performance. It works with + a QuantizedKVCache to store and retrieve quantized key-value tensors. + + The quantization process converts floating point tensors to int8, which requires + maintaining scale and zero point values for proper dequantization during computation. + + Args: + dim (int): The dimension of the model + kv_cache (QuantizedKVCache): The cache for storing quantized key-value pairs + Note that it needs to own kv_cache to access scales and zero points, and since + SDPA forward signature only accepts q, k and v, to allow accessing scales and + zero points, we need to pass kv_cache to SDPA. 
+ """ + + def __init__(self, dim: int, kv_cache: QuantizedKVCache): + super().__init__() + self.dim = dim + self.quantized_dtype = torch.int8 + self.float_dtype = torch.float32 + self.kv_cache = kv_cache + + def forward( + self, + input_pos: torch.Tensor, + q: torch.Tensor, + k_quantized: torch.Tensor, + v_quantized: torch.Tensor, + bsz, + seqlen, + mask, + ): + q = q.transpose(1, 2) # (bs, seqlen, n_local_heads, head_dim) + k_quantized = k_quantized.transpose(1, 2) + v_quantized = v_quantized.transpose(1, 2) + + q_scale, q_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + q, self.quantized_dtype ) + ) + q_quantized = torch.ops.quantized_decomposed.quantize_per_token( + q, + q_scale, + q_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + ) + q_zero_point_int8 = q_zero_point.to(dtype=torch.int8) + q_scale_fp32 = q_scale.to(dtype=torch.float32) + + k_zero_point_int8 = self.kv_cache.k_cache_zero_points + k_scale_fp32 = self.kv_cache.k_cache_scales + v_zero_point_int8 = self.kv_cache.v_cache_zero_points + v_scale_fp32 = self.kv_cache.v_cache_scales + + start_pos = input_pos[0].item() + output = torch.ops.llama.custom_quantized_sdpa( + q_quantized, + k_quantized, + v_quantized, + start_pos, + None, + 0, + True, + None, + q_zero_point_int8, + q_scale_fp32, + k_zero_point_int8, + k_scale_fp32, + v_zero_point_int8, + v_scale_fp32, + ) + + return output.view(bsz, seqlen, self.dim) + + +def _update_attention_module_with_quantized_sdpa( + module: torch.nn.Module, kv_cache: QuantizedKVCache +): + sdpa = getattr(module, "SDPA", None) + assert sdpa is not None + # pyre-ignore + setattr(module, "SDPA", QuantizedSDPA(sdpa.dim, kv_cache)) # noqa: B010 + + +def _replace_sdpa_with_quantized_sdpa(module: torch.nn.Module): + for _, child in module.named_children(): + if isinstance(child, Attention): + kv_cache = getattr(child, "kv_cache", None) + if kv_cache is None: + continue + if not isinstance(kv_cache, QuantizedKVCache): + continue + # Only when kv_cache is QuantizedKVCache, we replace SDPA with QuantizedSDPA + sdpa = getattr(child, "SDPA", None) + if sdpa is None: + continue + if not isinstance(sdpa, SDPACustom): + continue + kv_cache.return_float_values = False + _update_attention_module_with_quantized_sdpa(child, kv_cache) else: - _replace_sdpa_with_custom_op(child) + _replace_sdpa_with_quantized_sdpa(child) -def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: +def replace_sdpa_with_quantized_sdpa(module: torch.nn.Module) -> torch.nn.Module: from executorch.extension.llm.custom_ops import custom_ops # noqa - _replace_sdpa_with_custom_op(module) + _replace_sdpa_with_quantized_sdpa(module) return module diff --git a/examples/models/llama/source_transformation/test_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_quantized_kv_cache.py index 4252518a4ee..07c8e1bf9a0 100644 --- a/examples/models/llama/source_transformation/test_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_quantized_kv_cache.py @@ -10,7 +10,7 @@ from executorch.examples.models.llama.attention import KVCache -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( QuantizedCacheType, QuantizedKVCache, ) diff --git a/examples/models/llama/source_transformation/test_quantized_sdpa.py 
b/examples/models/llama/source_transformation/test_quantized_sdpa.py new file mode 100644 index 00000000000..242f3a0876d --- /dev/null +++ b/examples/models/llama/source_transformation/test_quantized_sdpa.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch + +from executorch.examples.models.llama.attention import Attention, KVCache, SDPA +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( + QuantizedCacheType, + QuantizedKVCache, +) +from executorch.examples.models.llama.source_transformation.sdpa import ( + QuantizedSDPA, + replace_sdpa_with_custom_op, + replace_sdpa_with_quantized_sdpa, + SDPACustom, +) + + +class MockAttention(Attention): + """Mock Attention class for testing purposes.""" + + def __init__( + self, dim, head_dim, n_rep, max_context_len=100, enable_dynamic_shape=False + ): + super().__init__() + self.dim = dim + self.head_dim = head_dim + self.n_rep = n_rep + self.SDPA = SDPA(dim, head_dim, n_rep, max_context_len, enable_dynamic_shape) + self.kv_cache = None + + def forward(self, x, freqs_cos, freqs_sin, **kwargs): + # Not used in tests + pass + + +class QuantizedSDPATest(unittest.TestCase): + def setUp(self): + torch.manual_seed(42) + self.max_batch_size = 1 + self.max_context_len = 5 + self.n_kv_heads = 4 + self.n_heads = 8 + self.head_dim = 16 + self.dim = self.n_heads * self.head_dim + self.enable_dynamic_shape = False + self.dtype = torch.float32 + + def _create_test_model(self): + """Create a simple model with SDPA modules for testing.""" + model = torch.nn.Module() + attention = MockAttention( + self.dim, self.head_dim, self.n_heads // self.n_kv_heads + ) + # Add KVCache to the attention module + attention.kv_cache = KVCache( + self.max_batch_size, + self.max_context_len, + self.n_kv_heads, + self.head_dim, + self.enable_dynamic_shape, + dtype=self.dtype, + ) + model.attention = attention + return model + + def test_replace_sdpa_with_quantized_sdpa(self): + """Test that replace_sdpa_with_quantized_sdpa correctly transforms SDPA to QuantizedSDPA.""" + # Create a model with SDPA + model = self._create_test_model() + + # First replace standard SDPA with SDPACustom (required before quantization) + model = replace_sdpa_with_custom_op(model) + self.assertIsInstance(model.attention.SDPA, SDPACustom) + + # Replace KVCache with QuantizedKVCache + model.attention.kv_cache = QuantizedKVCache.from_float( + model.attention.kv_cache, + QuantizedCacheType.AffineAsymmetric, + use_custom_update_cache_op=True, + ) + self.assertIsInstance(model.attention.kv_cache, QuantizedKVCache) + + # Set return_float_values to False to enable quantized operation + model.attention.kv_cache.return_float_values = False + + # Apply the transformation + model = replace_sdpa_with_quantized_sdpa(model) + + # Verify that SDPA has been replaced with QuantizedSDPA + self.assertIsInstance(model.attention.SDPA, QuantizedSDPA) + + # Verify that the QuantizedSDPA has the correct properties + self.assertEqual(model.attention.SDPA.dim, self.dim) + self.assertEqual(model.attention.SDPA.quantized_dtype, torch.int8) + self.assertEqual(model.attention.SDPA.float_dtype, torch.float32) + self.assertIs(model.attention.SDPA.kv_cache, model.attention.kv_cache) + + def test_no_replacement_when_no_quantized_kv_cache(self): + """Test that SDPA is not replaced when there's no 
QuantizedKVCache.""" + # Create a model with SDPA + model = self._create_test_model() + + # First replace standard SDPA with SDPACustom + model = replace_sdpa_with_custom_op(model) + self.assertIsInstance(model.attention.SDPA, SDPACustom) + + # Apply the transformation without replacing KVCache + model = replace_sdpa_with_quantized_sdpa(model) + + # Verify that SDPA has NOT been replaced with QuantizedSDPA + self.assertIsInstance(model.attention.SDPA, SDPACustom) + self.assertNotIsInstance(model.attention.SDPA, QuantizedSDPA) + + def test_forward_functionality(self): + """Test that the QuantizedSDPA forward function works correctly.""" + # This test requires the custom ops to be loaded, so we'll check if they're available + try: + from executorch.extension.llm.custom_ops import custom_ops # noqa + except ImportError: + self.skipTest( + "Custom ops not available, skipping forward functionality test" + ) + + # Create a model with SDPA + model = self._create_test_model() + + # First replace standard SDPA with SDPACustom + model = replace_sdpa_with_custom_op(model) + + # Replace KVCache with QuantizedKVCache + model.attention.kv_cache = QuantizedKVCache.from_float( + model.attention.kv_cache, + QuantizedCacheType.AffineAsymmetric, + use_custom_update_cache_op=True, + ) + + # Set return_float_values to False to enable quantized operation + model.attention.kv_cache.return_float_values = False + + # Save the original SDPACustom for comparison + # Apply the transformation + model = replace_sdpa_with_quantized_sdpa(model) + + # Create test inputs + input_pos = torch.tensor([0], dtype=torch.int64) + bsz = 1 + seqlen = 1 + q = torch.randn(bsz, self.n_heads, seqlen, self.head_dim, dtype=self.dtype) + k = torch.randn(bsz, self.n_kv_heads, seqlen, self.head_dim, dtype=self.dtype) + v = torch.randn(bsz, self.n_kv_heads, seqlen, self.head_dim, dtype=self.dtype) + + # Update the KV cache + k_quantized, v_quantized = model.attention.kv_cache.update(input_pos, k, v) + + # Run the forward pass with the quantized SDPA + try: + output = model.attention.SDPA( + input_pos, q, k_quantized, v_quantized, bsz, seqlen, None + ) + + # Verify the output shape + self.assertEqual(output.shape, (bsz, seqlen, self.dim)) + except Exception: + # If the forward pass fails, it might be due to missing custom ops + self.skipTest( + "Custom ops not available, skipping forward functionality test" + ) diff --git a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py index 35c88e10b6b..e5e278f8ce8 100644 --- a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py @@ -10,7 +10,7 @@ from executorch.examples.models.llama.attention import KVCache -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( CustomKVCache, QuantizedCacheType, QuantizedKVCache, @@ -71,8 +71,8 @@ def test_simple(self, is_dynamic_shape=False): self.seq_len = 3 self._init_cache() q, k_val, v_val = self._init_kv() - self.float_sdpa = SDPACustom(self.dim) - self.quantized_sdpa = SDPACustom(self.dim) + self.float_sdpa = SDPACustom(self.dim, self.max_context_len, True) + self.quantized_sdpa = SDPACustom(self.dim, self.max_context_len, True) k, v = self.custom_kv_cache.update(input_pos, k_val, v_val) float_out = 
self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None) k, v = self.quantized_kv_cache.update(input_pos, k_val, v_val) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 92ddbf74d94..615ad3948fc 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -41,7 +41,7 @@ You can export and run the original Llama 2 7B model. ``` 4. Create tokenizer.bin. ``` - python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin + python -m pytorch_tokenizers.tools.llama2c.convert -t -o tokenizer.bin ``` Pass the converted `tokenizer.bin` file instead of `tokenizer.model` for subsequent steps. diff --git a/examples/models/llama3_2_vision/preprocess/test_preprocess.py b/examples/models/llama3_2_vision/preprocess/test_preprocess.py index 4c0a5635e5c..220b0dc9b6f 100644 --- a/examples/models/llama3_2_vision/preprocess/test_preprocess.py +++ b/examples/models/llama3_2_vision/preprocess/test_preprocess.py @@ -124,9 +124,9 @@ class TestImageTransform: same output as the reference model. Reference model: CLIPImageTransform - https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L115 + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transform.py#L127 Eager and exported models: _CLIPImageTransform - https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L26 + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transform.py#L28 """ models_no_resize = initialize_models(resize_to_max_canvas=False) @@ -147,7 +147,7 @@ def prepare_inputs( without distortion. These calculations are done by the reference model inside __init__ and __call__ - https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L115 + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transform.py#L198 """ image_tensor = F.to_dtype( F.grayscale_to_rgb_image(F.to_image(image)), scale=True diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 6003f3a000d..eeb6c296dd5 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -15,7 +15,7 @@ # ~~~ # It should also be cmake-lint clean. # -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE project(llava) # Duplicating options as root CMakeLists.txt @@ -124,7 +124,7 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - list(APPEND link_libraries custom_ops) + list(APPEND link_libraries $) endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) diff --git a/examples/models/llava/README.md b/examples/models/llava/README.md index d0dc71c0a85..6ba9ef21555 100644 --- a/examples/models/llava/README.md +++ b/examples/models/llava/README.md @@ -11,7 +11,7 @@ huggingface page [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llav
Running Llava1.5 7B on Android phone @@ -26,17 +26,26 @@ model) for general-purpose visual and language understanding, achieving impressive chat capabilities mimicking spirits of the cutting edge multimodal models and setting a high bar for accuracy on Science QA. -## Instructions +## Instructions to run Llava on Android/iOS First you need to generate a .PTE file for the model, along with input image, and other artifacts. Then you need either a C++ runner, or Android or iOS application to test things out on device. +### Host machine requirements + +The biggest requirement is to have a host machine with at least 32GiB memory, preferably 64GiB. + +The model weights is 15GiB, and the other memory usage at export stage (`export_llava`) is around 10GiB. So you need at least 25GiB memory to run the export script. + + ### Generate ExecuTorch .PTE and other artifacts Run the following command to generate `llava.pte`, `tokenizer.bin` and an image tensor (serialized in TorchScript) `image.pt`. +> **Warning**: The C++ runner `llava_main` binary cannot process raw image inputs such as JPEG, PNG, or BMP files directly. You must convert these images to a `.pt` file format using the `examples/models/llava/image_util.py` script before using them with `llava_main`. + Prerequisite: run `install_executorch.sh` to install ExecuTorch and run `examples/models/llava/install_requirements.sh` to install dependencies. @@ -69,6 +78,13 @@ cmake-out/examples/models/llava/llava_main ### Build Mobile Apps +#### Device Requirements + +To run the Android/iOS apps, you need a device with at least 12GiB memory. + +- iPhone 13 Pro or above +- Samsung Galaxy S23 or above + #### Android We can run LLAVA using the LLAMA Demo Apps. Please refer to [this diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 63ae0f4a118..66b61840866 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -20,13 +20,13 @@ build_args_parser, get_quantizer_and_quant_params, ) +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( + replace_kv_cache_with_custom_kv_cache, +) from executorch.examples.models.llama.source_transformation.quantize import ( EmbeddingQuantHandler, get_quant_weight_transform, ) -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( - replace_kv_cache_with_custom_kv_cache, -) from executorch.examples.models.llama.source_transformation.sdpa import ( replace_sdpa_with_custom_op, ) @@ -92,7 +92,6 @@ def forward(self, input_pos, embeddings): use_kv_cache=True, example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings), dynamic_shapes=dynamic_shapes, - args=llava.text_model_args, ) dtype_override = DType.fp32 @@ -161,7 +160,6 @@ def forward(self, images): use_kv_cache=True, example_inputs=(resized,), dynamic_shapes=dynamic_shapes, - args=None, ) .export() .pt2e_quantize([quantizer]) diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 6ce4b701bbe..351356607c8 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -15,7 +15,7 @@ from executorch.examples.models.llama.llama_transformer import Transformer from executorch.examples.models.llama.model_args import ModelArgs -from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.custom_kv_cache import ( replace_kv_cache_with_custom_kv_cache, ) from 
executorch.examples.models.llama.source_transformation.sdpa import ( diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 971e126a14c..aab5bfb4720 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -47,8 +47,10 @@ Error LlavaRunner::load() { tokenizer_->load(tokenizer_path_); // Load the text decoder runner - text_decoder_runner_ = std::make_unique( - module_.get(), tokenizer_->vocab_size(), temperature_); + text_decoder_runner_ = + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + std::make_unique(module_.get()); + // @lint-ignore CLANGTIDY facebook-hte-Deprecated text_decoder_runner_->load(); // Load the text prefiller @@ -117,7 +119,11 @@ Error LlavaRunner::generate_from_pos( // Generate tokens int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - {prefill_next_token}, start_pos, seq_len, token_callback)); + /*tokens=*/{prefill_next_token}, + /*start_pos=*/start_pos, + /*max_new_tokens=*/seq_len - start_pos + 1, + /*temperature=*/temperature_, + /*token_callback=*/token_callback)); // Bookkeeping stats_.num_generated_tokens = num_generated_tokens; diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index 4c7809361b0..3de418b57ea 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -17,11 +17,8 @@ namespace example { class ET_EXPERIMENTAL LlavaTextDecoderRunner : public executorch::extension::llm::TextDecoderRunner { public: - LlavaTextDecoderRunner( - executorch::extension::Module* module, - int32_t vocab_size, - float temperature) - : TextDecoderRunner(module, true, vocab_size, temperature){}; + explicit LlavaTextDecoderRunner(executorch::extension::Module* module) + : TextDecoderRunner(module, true) {} inline executorch::runtime::Result step( executorch::extension::TensorPtr& tokens, diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index 5fd60399415..36381b27124 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -131,7 +131,7 @@ def test_llava_export(self): # being tested, using llama_transformer new_tokens = [torch.argmax(pte_prefill_after_img).item()] # TODO: uncomment this line - # self.assertEquals(new_tokens[0], 1932) # When + # self.assertEqual(new_tokens[0], 1932) # When for i in range(4): print(i, llava_model.tokenizer.decode(new_tokens[i])) token_embeds = llava_module.run_method( diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py index 69859fa39bc..7e2cfb14c49 100644 --- a/examples/models/moshi/mimi/test_mimi.py +++ b/examples/models/moshi/mimi/test_mimi.py @@ -135,16 +135,28 @@ def test_streaming_encoding_decoding(self): all_codes_th = torch.cat(all_codes, dim=-1) + pcm_ref = self.mimi.decode(all_codes_th) + all_pcms = [] + for i in range(all_codes_th.shape[-1]): + codes = all_codes_th[..., i : i + 1] + pcm = self.mimi.decode(codes) + all_pcms.append(pcm) + all_pcms = torch.cat(all_pcms, dim=-1) + sqnr = compute_sqnr(pcm_ref, all_pcms) + print(f"sqnr = {sqnr} dB") + self.assertTrue(sqnr > 4) + + all_pcms_streaming = [] with self.mimi.streaming(1): for i in range(all_codes_th.shape[-1]): codes = all_codes_th[..., i : i + 1] - pcm = self.mimi.decode(codes) - all_pcms.append(pcm) - all_pcms = torch.cat(all_pcms, dim=-1) - - pcm_ref = 
self.mimi.decode(all_codes_th) - self.assertTrue(torch.allclose(pcm_ref, all_pcms, atol=1e-5)) + pcm_streaming = self.mimi.decode(codes) + all_pcms_streaming.append(pcm_streaming) + all_pcms_streaming = torch.cat(all_pcms_streaming, dim=-1) + sqnr_streaming = compute_sqnr(pcm_ref, all_pcms_streaming) + print(f"sqnr_streaming = {sqnr_streaming} dB") + self.assertTrue(sqnr_streaming > 100) def test_exported_encoding(self): """Ensure exported encoding model is consistent with reference output.""" diff --git a/examples/models/phi-3-mini-lora/README.md b/examples/models/phi-3-mini-lora/README.md index 2b7cc0ba401..62efda6c3dc 100644 --- a/examples/models/phi-3-mini-lora/README.md +++ b/examples/models/phi-3-mini-lora/README.md @@ -16,8 +16,9 @@ To see how you can use the model exported for training in a fully involved finet python export_model.py ``` -2. Run the inference model using an example runtime. For more detailed steps on this, check out [Build & Run](https://pytorch.org/executorch/stable/getting-started-setup.html#build-run). +2. Run the inference model using an example runtime. For more detailed steps on this, check out [Building from Source](https://pytorch.org/executorch/main/using-executorch-building-from-source). ``` + # Clean and configure the CMake build system. Compiled programs will appear in the executorch/cmake-out directory we create here. ./install_executorch.sh --clean (mkdir cmake-out && cd cmake-out && cmake ..) diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index ba878d42a3f..f52f2a3a06d 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -13,7 +13,7 @@ pip uninstall -y transformers ; pip install transformers==4.44.2 ``` cd executorch wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true" -python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin ``` 2. Export the model. This step will take a few minutes to finish. 
``` diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py index 8fa948e7dc7..11c2f3834eb 100644 --- a/examples/models/phi-3-mini/export_phi-3-mini.py +++ b/examples/models/phi-3-mini/export_phi-3-mini.py @@ -65,7 +65,7 @@ def export(args) -> None: xnnpack_quantizer.set_global(xnnpack_quant_config) model = export_for_training( - model, example_inputs, dynamic_shapes=dynamic_shapes + model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True ).module() model = prepare_pt2e(model, xnnpack_quantizer) # pyre-fixme[6] model(*example_inputs) diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py index 18f82957f94..3d91747f468 100644 --- a/examples/models/phi_4_mini/convert_weights.py +++ b/examples/models/phi_4_mini/convert_weights.py @@ -1,4 +1,5 @@ import argparse +import os from typing import Dict import torch @@ -7,6 +8,63 @@ from torchtune.training import FullModelHFCheckpointer +_HF_PHI_4_FROM_META = { + "tok_embeddings.weight": "model.embed_tokens.weight", + "norm.weight": "model.norm.weight", + "layers.{}.attention.wq.weight": "model.layers.{}.self_attn.q_proj.weight", + "layers.{}.attention.wk.weight": "model.layers.{}.self_attn.k_proj.weight", + "layers.{}.attention.wv.weight": "model.layers.{}.self_attn.v_proj.weight", + "layers.{}.attention.wo.weight": "model.layers.{}.self_attn.o_proj.weight", + "layers.{}.attention_norm.weight": "model.layers.{}.input_layernorm.weight", + "layers.{}.ffn_norm.weight": "model.layers.{}.post_attention_layernorm.weight", + "layers.{}.feed_forward.w1.weight": "model.layers.{}.mlp.gate_proj.weight", + "layers.{}.feed_forward.w3.weight": "model.layers.{}.mlp.up_proj.weight", + "layers.{}.feed_forward.w2.weight": "model.layers.{}.mlp.down_proj.weight", + "output.weight": "lm_head.weight", +} + + +def phi_4_hf_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Convert a state dict from hf's format to Meta's format. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in hf's format. + + Returns: + Dict[str, torch.Tensor]: State dict in Meta's format. + """ + converted_state_dict = {} + inverted_mapping_dict = {v: k for k, v in _HF_PHI_4_FROM_META.items()} + + for key, value in state_dict.items(): + if key.endswith("mlp.gate_up_proj.weight"): + # Split the gate_up_proj into gate_proj and up_proj + hidden_dim = value.shape[0] // 2 + assert 2 * hidden_dim == value.shape[0] + gate = value[0:hidden_dim, :] + up = value[hidden_dim:, :] + for new_key, new_value in [("gate_proj", gate), ("up_proj", up)]: + new_key = key.replace("gate_up_proj", new_key) + new_key = get_mapped_key(new_key, inverted_mapping_dict) + converted_state_dict[new_key] = new_value + elif key.endswith("self_attn.qkv_proj.weight"): + # Split the qkv_proj into q_proj, k_proj, and v_proj + q_dim = value.shape[1] + kv_dim = (value.shape[0] - q_dim) // 2 + assert 2 * kv_dim + q_dim == value.shape[0] + q = value[0:q_dim, :] + k = value[q_dim : (q_dim + kv_dim), :] + v = value[(q_dim + kv_dim) :, :] + for new_key, new_value in [("q_proj", q), ("k_proj", k), ("v_proj", v)]: + new_key = key.replace("qkv_proj", new_key) + new_key = get_mapped_key(new_key, inverted_mapping_dict) + converted_state_dict[new_key] = new_value + else: + new_key = get_mapped_key(key, inverted_mapping_dict) + converted_state_dict[new_key] = value + return converted_state_dict + # Standard _FROM_META weight mapping of Meta weights to TorchTune. 
_PHI_4_FROM_META = { @@ -51,22 +109,30 @@ def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.T return converted_state_dict -def convert_weights(input_dir: str, output_file: str) -> None: - # Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. - checkpointer = FullModelHFCheckpointer( - checkpoint_dir=input_dir, - checkpoint_files=[ - "model-00001-of-00002.safetensors", - "model-00002-of-00002.safetensors", - ], - output_dir=".", - model_type="PHI4", - ) +def convert_weights(input_dir_or_checkpoint: str, output_file: str) -> None: + # If input_dir_or_checkpoint is a directory downloaded from HF, FullModelHFCheckpointer is used to extract the state dict + # If input_dir_or_checkpoint is a checkpoint (from eager model model), it is loaded directly + if os.path.isdir(input_dir_or_checkpoint): + checkpointer = FullModelHFCheckpointer( + checkpoint_dir=input_dir_or_checkpoint, + checkpoint_files=[ + "model-00001-of-00002.safetensors", + "model-00002-of-00002.safetensors", + ], + output_dir=".", + model_type="PHI4", + ) + print("Loading checkpoint from directory...") + sd = checkpointer.load_checkpoint() + sd = sd["model"] + print("Converting checkpoint...") + sd = phi_4_tune_to_meta(sd) + else: + print("Loading checkpoint from file...") + sd = torch.load(input_dir_or_checkpoint, map_location="cpu", weights_only=True) + print("Converting checkpoint...") + sd = phi_4_hf_to_meta(sd) - print("Loading checkpoint...") - sd = checkpointer.load_checkpoint() - print("Converting checkpoint...") - sd = phi_4_tune_to_meta(sd["model"]) print("Saving checkpoint...") torch.save(sd, output_file) print("Done.") @@ -79,7 +145,7 @@ def main(): parser.add_argument( "input_dir", type=str, - help="Path to directory containing checkpoint files", + help="Path to directory containing checkpoint files, or path to a single checkpoint file.", ) parser.add_argument("output", type=str, help="Path to the output checkpoint") diff --git a/examples/models/test/test_export.py b/examples/models/test/test_export.py index 9a4ff7a35ed..306f54c0e89 100644 --- a/examples/models/test/test_export.py +++ b/examples/models/test/test_export.py @@ -29,7 +29,9 @@ def collect_executorch_and_eager_outputs( Returns a tuple containing the outputs of the eager mode model and the executorch mode model. """ eager_model = eager_model.eval() - model = torch.export.export_for_training(eager_model, example_inputs).module() + model = torch.export.export_for_training( + eager_model, example_inputs, strict=True + ).module() edge_model = export_to_edge(model, example_inputs) executorch_prog = edge_model.to_executorch() diff --git a/examples/portable/README.md b/examples/portable/README.md index a6658197da3..ef9b44a48a3 100644 --- a/examples/portable/README.md +++ b/examples/portable/README.md @@ -20,7 +20,7 @@ We will walk through an example model to generate a `.pte` file in [portable mod from the [`models/`](../models) directory using scripts in the `portable/scripts` directory. Then we will run on the `.pte` model on the ExecuTorch runtime. For that we will use `executor_runner`. -1. Following the setup guide in [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) +1. Following the setup guide in [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup) you should be able to get the basic development environment for ExecuTorch working. 2. 
Using the script `portable/scripts/export.py` generate a model binary file by selecting a @@ -78,4 +78,4 @@ Output 0: tensor(sizes=[1, 1000], [ ## Custom Operator Registration -Explore the demos in the [`custom_ops/`](./custom_ops) directory to learn how to register custom operators into ExecuTorch as well as register its kernels into ExecuTorch runtime. +Explore the demos in the [`custom_ops/`](custom_ops) directory to learn how to register custom operators into ExecuTorch as well as register its kernels into ExecuTorch runtime. diff --git a/examples/portable/custom_ops/README.md b/examples/portable/custom_ops/README.md index db517e84a0c..bf17d6a6753 100644 --- a/examples/portable/custom_ops/README.md +++ b/examples/portable/custom_ops/README.md @@ -3,7 +3,7 @@ This folder contains examples to register custom operators into PyTorch as well ## How to run -Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/stable/getting-started-setup). +Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/main/getting-started-setup). Run: diff --git a/examples/portable/custom_ops/custom_ops_2_out.cpp b/examples/portable/custom_ops/custom_ops_2_out.cpp index 138a8eeed89..2fb50e521c1 100644 --- a/examples/portable/custom_ops/custom_ops_2_out.cpp +++ b/examples/portable/custom_ops/custom_ops_2_out.cpp @@ -13,7 +13,7 @@ namespace native { using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; namespace { void check_preconditions(const Tensor& in, Tensor& out) { diff --git a/examples/portable/scripts/export_and_delegate.py b/examples/portable/scripts/export_and_delegate.py index 6a8a28d5338..1c2adf67688 100644 --- a/examples/portable/scripts/export_and_delegate.py +++ b/examples/portable/scripts/export_and_delegate.py @@ -61,7 +61,7 @@ def export_composite_module_with_lower_graph(): m_compile_spec = m.get_compile_spec() # pre-autograd export. eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") @@ -84,7 +84,7 @@ def forward(self, *args): m = CompositeModule() m = m.eval() # pre-autograd export. eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() composited_edge = export_to_edge(m, m_inputs) # The graph module is still runnerable @@ -134,7 +134,7 @@ def get_example_inputs(self): m = Model() m_inputs = m.get_example_inputs() # pre-autograd export. eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") @@ -171,7 +171,7 @@ def export_and_lower_the_whole_graph(): m_inputs = m.get_example_inputs() # pre-autograd export. 
eventually this will become torch.export - m = torch.export.export_for_training(m, m_inputs).module() + m = torch.export.export_for_training(m, m_inputs, strict=True).module() edge = export_to_edge(m, m_inputs) logging.info(f"Exported graph:\n{edge.exported_program().graph}") diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index bdac58d2bfc..04354cda3f6 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -4,12 +4,12 @@ This directory contains examples for some AI models. We have seperated the example scripts into the following subfolders, please refer to [README.md](../../backends/qualcomm/README.md) for the example scripts' directory structure: -1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama](./oss_scripts/llama/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. +1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama](oss_scripts/llama/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. 2. oss_scripts: OSS stands for Open Source Software. This folder contains python scripts for open source models. Some models under this folder might also have their own customized runner. - For example, [llama](./oss_scripts/llama/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. + For example, [llama](oss_scripts/llama/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. -3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. Different from oss_scripts & scripts, which the example scripts are converting a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionaly, users can find customized example runners specific to the QAIHub models for execution. For example [qaihub_llama2_7b](./qaihub_scripts/llama2/qaihub_llama2_7b.py) is a script converting context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](./qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner to execute llama2 .pte files. Please be aware that context-binaries downloaded from QAIHub are tied to a specific QNN SDK version. +3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. 
Different from oss_scripts & scripts, which the example scripts are converting a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionaly, users can find customized example runners specific to the QAIHub models for execution. For example [qaihub_llama2_7b](qaihub_scripts/llama/llama2/qaihub_llama2_7b.py) is a script converting context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner to execute llama2 .pte files. Please be aware that context-binaries downloaded from QAIHub are tied to a specific QNN SDK version. Before executing the scripts and runner, please ensure that you are using the QNN SDK version that is matching the context binary. Please refer to [Check context binary version](#check-context-binary-version) for tutorial on how to check the QNN Version for a context binary. 4. scripts: This folder contains scripts to build models provided by Executorch. @@ -22,15 +22,15 @@ Here are some general information and limitations. ## Prerequisite -Please finish tutorial [Setting up executorch](https://pytorch.org/executorch/stable/getting-started-setup). +Please finish tutorial [Setting up executorch](https://pytorch.org/executorch/main/getting-started-setup). -Please finish [setup QNN backend](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). +Please finish [setup QNN backend](../../docs/source/backends-qualcomm.md). ## Environment Please set up `QNN_SDK_ROOT` environment variable. Note that this version should be exactly same as building QNN backend. -Please check [setup](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). +Please check [setup](../../docs/source/backends-qualcomm.md). Please set up `LD_LIBRARY_PATH` to `$QNN_SDK_ROOT/lib/x86_64-linux-clang`. Or, you could put QNN libraries to default search path of the dynamic linker. 
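As a minimal sketch of the environment setup described above (the SDK install location below is a placeholder, not a path shipped with this repository; substitute the directory of the QNN SDK version that was actually used to build the QNN backend):

```bash
# Placeholder path: point this at your local QNN SDK install, and make sure the
# SDK version matches the one used when building the QNN backend.
export QNN_SDK_ROOT=/path/to/qnn-sdk
# Make the host (x86_64 Linux) QNN libraries visible to the dynamic linker.
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang:${LD_LIBRARY_PATH}"
```

Alternatively, as noted above, the QNN libraries can be copied into a directory that is already on the dynamic linker's default search path instead of extending `LD_LIBRARY_PATH`.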
diff --git a/examples/qualcomm/TARGETS b/examples/qualcomm/TARGETS index 47f4fa422ce..43ca4db6be5 100644 --- a/examples/qualcomm/TARGETS +++ b/examples/qualcomm/TARGETS @@ -4,7 +4,7 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") oncall("executorch") @@ -27,8 +27,8 @@ python_binary( runtime.command_alias( name = "export_example_qnn", env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), - "QNN_SDK_ROOT": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:__dir__)".format(get_qnn_library_verision()), + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), + "QNN_SDK_ROOT": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:__dir__)".format(get_qnn_library_version()), }, exe = ":export_example", ) diff --git a/examples/qualcomm/oss_scripts/conv_former.py b/examples/qualcomm/oss_scripts/conv_former.py index 76131d659df..8ce16abcc87 100644 --- a/examples/qualcomm/oss_scripts/conv_former.py +++ b/examples/qualcomm/oss_scripts/conv_former.py @@ -12,10 +12,14 @@ import numpy as np import timm import torch -from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.utils.constants import ( - QCOM_PASS_EXPAND_BROADCAST_SHAPE, +from executorch.backends.qualcomm._passes.expand_broadcast_tensor_shape import ( + ExpandBroadcastTensorShape, +) +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, ) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY from executorch.examples.qualcomm.utils import ( build_executorch_binary, get_imagenet_dataset, @@ -55,6 +59,9 @@ def main(args): model = model.eval() + # lower to QNN + passes_job = get_capture_program_passes() + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True build_executorch_binary( model, inputs[0], @@ -62,7 +69,7 @@ def main(args): f"{args.artifact}/{pte_filename}", inputs, quant_dtype=QuantDtype.use_8a8w, - custom_pass_config={QCOM_PASS_EXPAND_BROADCAST_SHAPE}, + passes_job=passes_job, ) if args.compile_only: diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index 2eb26e6cece..18b5ade8b35 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -10,7 +10,12 @@ import numpy as np import torch +from executorch.backends.qualcomm._passes import ConvertUpsampleBicubicWithBilinear +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY from executorch.examples.qualcomm.utils import ( build_executorch_binary, @@ -56,6 +61,8 @@ def main(args): pte_filename = "dino_v2" instance = get_instance() + passes_job = get_capture_program_passes() + passes_job[ConvertUpsampleBicubicWithBilinear][QCOM_PASS_ACTIVATE_KEY] = True 
build_executorch_binary( instance, sample_input, @@ -65,6 +72,7 @@ def main(args): skip_node_id_set=skip_node_id_set, skip_node_op_set=skip_node_op_set, quant_dtype=QuantDtype.use_8a8w, + passes_job=passes_job, shared_buffer=args.shared_buffer, ) diff --git a/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py new file mode 100644 index 00000000000..ea65917dcd9 --- /dev/null +++ b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py @@ -0,0 +1,357 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import getpass +import json +import os +import zipfile +from multiprocessing.connection import Client +from typing import Callable, List + +import numpy as np +import torch +from executorch.backends.qualcomm._passes import ( + ConvertUpsampleBicubicWithBilinear, + ExpandBroadcastTensorShape, +) +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY +from executorch.examples.qualcomm.oss_scripts.efficientSAM.source_transformation import ( + replace_maskdecoder_with_custom_op, + replace_pos_emb_with_custom_op, +) + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + class_agnostic_mIoU, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) +from PIL import Image, ImageDraw +from scipy.ndimage import label +from torch.utils.data import DataLoader, Dataset +from torchvision import datasets, transforms + + +def load_dataset(dataset_path): + image_shape = (224, 224) + preprocess = transforms.Compose( + [ + transforms.Resize(image_shape), + transforms.ToTensor(), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + + return list(imagenet_data) + + +class EfficientSAMDataset(Dataset): + def __init__(self, dataset_path, data_size=1) -> None: + self.to_tensor = transforms.ToTensor() + dataset = load_dataset(dataset_path) + self.inputs = self.get_val_dataset(dataset, data_size) + self.data_size = data_size + + def get_val_dataset(self, dataset, data_size): + imgs, pt_prompts, pt_labels = [], [], [] + for i, data in enumerate(dataset): + if i >= data_size: + break + img = data[0] + h, w = img.shape[-2:] + + # Assuming the main object usually appears in the middle of the image, this default value is set for better demo visualization. + # Users can modify/add the point prompt here. + pt_prompt = torch.tensor([[w / 2, (h * 2 / 3)]], dtype=torch.float32)[ + None, ... + ] + # Users can increase the tensor size by adding more labels (0 for negative samples, 1 for positive samples) to label the corresponding points. + # The default label is [[1]], indicating that the point is a positive sample. 
+ pt_label = torch.tensor([[1]], dtype=torch.float32) + + imgs.append(img) + pt_prompts.append(pt_prompt) + pt_labels.append(pt_label) + + imgs = torch.stack(imgs) + pt_prompts = torch.stack(pt_prompts) + pt_labels = torch.stack(pt_labels) + inputs = (imgs, pt_prompts, pt_labels) + return inputs + + def __getitem__(self, idx): + return self.inputs[0][idx], self.inputs[1][idx], self.inputs[2][idx] + + def __len__(self): + return self.data_size + + +def get_dataset(dataset_path, data_size=1): + + dataset = EfficientSAMDataset(dataset_path, data_size=data_size) + dataloader = DataLoader(dataset) + + # prepare input data + inputs, input_list = [], "" + for index, data in enumerate(dataloader): + if index >= data_size: + break + inputs.append(tuple(data)) + num_feature = len(data) + for idx, _ in enumerate(data): + input_name = f"input_{index}_{idx}.raw" + input_list += input_name + " " if idx < num_feature - 1 else input_name + + input_list = input_list + "\n" + + return inputs, input_list + + +def source_transform( + model, transforms: List[Callable[[torch.nn.Module], torch.nn.Module]] +): + for transform in transforms: + model = transform(model) + return model + + +def get_instance(args): + import sys + + sys.path.insert(0, args.oss_repo) + from efficient_sam.efficient_sam import build_efficient_sam + + ckpt = args.pretrained_weight + file_path, file_extension = os.path.splitext(ckpt) + file_dir, filename = os.path.split(file_path) + + if file_extension == ".zip": + with zipfile.ZipFile(ckpt, "r") as zip_ref: + zip_ref.extractall(file_dir) + ckpt = file_path + filename = os.path.splitext(filename)[0] + + model_arch = filename.split("_")[-1] + + if model_arch == "vitt": + encoder_patch_embed_dim, encoder_num_heads = (192, 3) + elif model_arch == "vits": + encoder_patch_embed_dim, encoder_num_heads = (384, 6) + else: + raise ValueError(f"Unsupported model architecture: {model_arch}") + + model = build_efficient_sam( + encoder_patch_embed_dim=encoder_patch_embed_dim, + encoder_num_heads=encoder_num_heads, + checkpoint=ckpt, + ).eval() + + return model + + +def generate_mask(predicted_logits, predicted_iou): + sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True) + predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2) + predicted_logits = torch.take_along_dim( + predicted_logits, sorted_ids[..., None, None], dim=2 + ) + + # The masks are already sorted by their predicted IOUs. + # We use the first mask. 
+ mask = torch.ge(predicted_logits[0, 0, 0, :, :], 0).cpu().detach().numpy() + return mask + + +def save_mask(mask, input, save_path): + image, prompt, pt_label = input + original_image_tensor = image[0] + + # Convert tensor to numpy array if necessary + if not isinstance(original_image_tensor, np.ndarray): + original_image_tensor = original_image_tensor.detach().numpy() + + # Transpose if the image has 3 channels + if original_image_tensor.shape[0] == 3: + original_image_tensor = original_image_tensor.transpose(1, 2, 0) + + original_img = Image.fromarray( + (original_image_tensor * 255).astype(np.uint8) + ).convert("RGBA") + + # Create an empty RGBA image for the mask + mask_img = np.ones((mask.shape[0], mask.shape[1], 4)) + mask_img[:, :, 3] = 0 + + colors = [ + [1, 0, 0, 0.5], + [0, 1, 0, 0.5], + [0, 0, 1, 0.5], + [1, 1, 0, 0.5], + [1, 0, 1, 0.5], + [0, 1, 1, 0.5], + ] + + # Apply mask + labeled_mask, num_feature = label(mask) + for i in range(1, num_feature + 1): + mask_img[labeled_mask == i] = colors[(i - 1) % len(colors)] + + mask_img = Image.fromarray((mask_img * 255).astype(np.uint8), "RGBA") + + # Combine original image with mask + combined_img = Image.alpha_composite(original_img, mask_img) + + # Draw prompts point ("green" for positive samples, "red" for negative samples) + draw = ImageDraw.Draw(combined_img) + for pt, l in zip(prompt[0][0], pt_label[0][0]): + color = "green" if l else "red" + point_size = 3 + x1, y1 = max(0, int(pt[0]) - point_size), max(0, int(pt[1]) - point_size) + x2, y2 = min(combined_img.size[0], int(pt[0]) + point_size), min( + combined_img.size[1], int(pt[1]) + point_size + ) + draw.ellipse((x1, y1, x2, y2), fill=color, outline=color) + + combined_img.save(save_path) + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + + data_size = 1 + inputs, input_list = get_dataset(args.dataset, data_size) + assert args.pretrained_weight, "Checkpoint params can't be empty" + + # Get the EfficientSAM model. 
+ model = get_instance(args) + model = source_transform( + model, + [ + replace_maskdecoder_with_custom_op, + replace_pos_emb_with_custom_op, + ], + ) + + pte_filename = "efficientSAM_qnn" + + # lower to QNN + passes_job = get_capture_program_passes() + passes_job[ConvertUpsampleBicubicWithBilinear][QCOM_PASS_ACTIVATE_KEY] = True + passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" + pte_path = f"{args.artifact}/{pte_filename}.pte" + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + outputs = [] + + def post_process(): + for i, f in enumerate(sorted(os.listdir(output_data_folder))): + filename = os.path.join(output_data_folder, f) + output = np.fromfile(filename, dtype=np.float32) + output_shape = [1, 1, 3] if i % 2 else [1, 1, 3, 224, 224] + output = torch.from_numpy(output).reshape(output_shape) + outputs.append(output) + + adb.pull(output_path=args.artifact, callback=post_process) + + # MIoU analysis + miou = 0 + targets = [model(img, pt, pt_label) for img, pt, pt_label in inputs] + for i in range(data_size): + pred_mask = generate_mask(outputs[i * 2], outputs[i * 2 + 1]) + save_mask(pred_mask, inputs[i], f"{args.artifact}/output_{i}.png") + target_mask = generate_mask(targets[i][0], targets[i][1]) + miou += class_agnostic_mIoU([pred_mask], [target_mask]) + miou /= data_size + + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"MIoU": miou})) + else: + print(f"MIoU->{miou}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./EfficientSAM_qnn", + default="./EfficientSAM_qnn", + type=str, + ) + + parser.add_argument( + "--pretrained_weight", + help="Path to ESAM checkpoint, such as ./efficient_sam_vitt.pt or ./efficient_sam_vits.pt.zip", + type=str, + required=True, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. 
--dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "--oss_repo", + help="Path to clone https://github.com/yformer/EfficientSAM", + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py new file mode 100644 index 00000000000..fd54a727136 --- /dev/null +++ b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from executorch.examples.qualcomm.oss_scripts.efficientSAM.source_transformation.mask_decoder import ( + replace_maskdecoder_with_custom_op, +) +from executorch.examples.qualcomm.oss_scripts.efficientSAM.source_transformation.pos_emb import ( + replace_pos_emb_with_custom_op, +) + + +__all__ = [ + replace_maskdecoder_with_custom_op, + replace_pos_emb_with_custom_op, +] diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py new file mode 100644 index 00000000000..c70d51a48fe --- /dev/null +++ b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py @@ -0,0 +1,125 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Tuple + +import torch +import torch.nn as nn + + +class MaskDecoderCustom(nn.Module): + def forward( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + multimask_output: bool, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Arguments: + image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W] + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings (the batch dimension is broadcastable). + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + multimask_output (bool): Whether to return multiple masks or a single + mask. 
+ + Returns: + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality + """ + + ( + batch_size, + max_num_queries, + sparse_embed_dim_1, + sparse_embed_dim_2, + ) = sparse_prompt_embeddings.shape + + ( + _, + image_embed_dim_c, + image_embed_dim_h, + image_embed_dim_w, + ) = image_embeddings.shape + + # QNN don't support dim greater than 4 + image_embeddings_expanded = image_embeddings.expand(max_num_queries, -1, -1, -1) + image_embeddings_tiled = image_embeddings_expanded.contiguous().view( + batch_size * max_num_queries, + image_embed_dim_c, + image_embed_dim_h, + image_embed_dim_w, + ) + sparse_prompt_embeddings = sparse_prompt_embeddings.reshape( + batch_size * max_num_queries, sparse_embed_dim_1, sparse_embed_dim_2 + ) + masks, iou_pred = self.predict_masks( + image_embeddings=image_embeddings_tiled, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + ) + if multimask_output and self.num_multimask_outputs > 1: + return masks[:, 1:, :], iou_pred[:, 1:] + else: + return masks[:, :1, :], iou_pred[:, :1] + + def predict_masks( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predicts masks. See 'forward' for more details.""" + # Concatenate output tokens + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight], dim=0 + ) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1 + ) + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + # Expand per-image data in batch direction to be per-mask + # QNN don't support dim greater than 4, + pos_src = image_pe.expand([tokens.shape[0]] + [*image_pe.shape[1:]]) + b, c, h, w = image_embeddings.shape + hs, src = self.transformer(image_embeddings, pos_src, tokens) + iou_token_out = hs[:, 0, :] + mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + upscaled_embedding = src.transpose(1, 2).view(b, c, h, w) + + for upscaling_layer in self.final_output_upscaling_layers: + upscaled_embedding = upscaling_layer(upscaled_embedding) + hyper_in_list: List[torch.Tensor] = [] + for i, output_hypernetworks_mlp in enumerate(self.output_hypernetworks_mlps): + hyper_in_list.append(output_hypernetworks_mlp(mask_tokens_out[:, i, :])) + hyper_in = torch.stack(hyper_in_list, dim=1) + b, c, h, w = upscaled_embedding.shape + masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + return masks, iou_pred + + +def _replace_maskdecoder_with_custom_op(module: torch.nn.Module): + from efficient_sam.efficient_sam_decoder import MaskDecoder # B007 + + for _, child in module.named_children(): + if isinstance(child, MaskDecoder): + child.forward = MaskDecoderCustom.forward.__get__(child, MaskDecoder) + child.predict_masks = MaskDecoderCustom.predict_masks.__get__( + child, MaskDecoder + ) + else: + _replace_maskdecoder_with_custom_op(child) + + +def replace_maskdecoder_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: + + _replace_maskdecoder_with_custom_op(module) + return module diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py new file mode 100644 index 00000000000..7a3a91c7607 --- /dev/null +++ 
b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py @@ -0,0 +1,64 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import Tuple + +import numpy as np + +import torch +import torch.nn as nn + + +class PositionEmbeddingRandomCustom(nn.Module): + """ + Positional encoding using random spatial frequencies. + """ + + def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor: + """Positionally encode points that are normalized to [0,1].""" + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coords = 2 * coords - 1 + coords = coords.unsqueeze(0) + coords = torch.matmul( + coords, self.positional_encoding_gaussian_matrix.unsqueeze(0) + ) + coords = coords.squeeze(0) + coords = 2 * np.pi * coords + # outputs d_1 x ... x d_n x C shape + return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1) + + def forward_with_coords( + self, coords_input: torch.Tensor, image_size: Tuple[int, int] + ) -> torch.Tensor: + """Positionally encode points that are not normalized to [0,1].""" + coords = coords_input.clone() + coords_0 = coords[:, :, 0] / image_size[1] + coords_1 = coords[:, :, 1] / image_size[0] + coords = torch.stack((coords_0, coords_1), dim=-1) + + return self._pe_encoding(coords.to(torch.float)) # B x N x C + + +def _replace_pos_emb_with_custom_op(module: torch.nn.Module): + from efficient_sam.efficient_sam_decoder import PositionEmbeddingRandom # B007 + + for _, child in module.named_children(): + if isinstance(child, PositionEmbeddingRandom): + child._pe_encoding = PositionEmbeddingRandomCustom._pe_encoding.__get__( + child, PositionEmbeddingRandom + ) + child.forward_with_coords = ( + PositionEmbeddingRandomCustom.forward_with_coords.__get__( + child, PositionEmbeddingRandom + ) + ) + else: + _replace_pos_emb_with_custom_op(child) + + +def replace_pos_emb_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: + + _replace_pos_emb_with_custom_op(module) + return module diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py index 501ea522acd..ee062735fbd 100644 --- a/examples/qualcomm/oss_scripts/fastvit.py +++ b/examples/qualcomm/oss_scripts/fastvit.py @@ -101,20 +101,22 @@ def main(args): ), ) # rewrite default per-channel ptq config - quantizer.per_channel_quant_config = QuantizationConfig( + quantizer.default_quant_config.per_channel_quant_config = QuantizationConfig( input_activation=act_qspec, output_activation=act_qspec, weight=weight_qspec, bias=_derived_bias_quant_spec, ) + # rewrite default ptq config - q_config = quantizer.bit8_quant_config - quantizer.bit8_quant_config = QuantizationConfig( + q_config = quantizer.default_quant_config.quant_config + quantizer.default_quant_config.quant_config = QuantizationConfig( input_activation=act_qspec, output_activation=act_qspec, weight=q_config.weight, bias=q_config.bias, ) + # lower to QNN passes_job = get_capture_program_passes() passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index cd468eebb26..3ee2d3789e4 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -14,7 +14,7 @@ Hybrid Mode: Hybrid mode leverages the strengths of both AR-N model and KV cache - AR-N model: The 
auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce. Use it to process the prompt and generate the key-value (kv) cache, which serves as a prompt processor in hybrid mode. - Prompt processing with AR-N model:

- Prompt Processing With AR-N Model + Prompt Processing With AR-N Model
Prompt processing is done using a for-loop. An N-token block is taken, and the KV cache is updated for that block. This process is repeated until all tokens are consumed, with the last block potentially requiring padding. For flexibility, the AR-N model can handle any input length less than the maximum sequence length. For TTFT, the input length (or number of blocks) will vary depending on the actual input length, rather than always being the same.
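
As a rough illustration of the loop described above, the sketch below processes a prompt in AR_LEN-sized blocks. The names `ar_n_model`, `kv_cache`, `AR_LEN`, and `PAD_ID` are placeholders chosen for this example, not the actual runner API in this repository.

```python
# Illustrative sketch only; the real AR-N runner and its I/O names differ.
from typing import Any, List, Tuple

AR_LEN = 32   # tokens consumed per AR-N forward pass (assumed value)
PAD_ID = 0    # padding token id (assumed value)


def process_prompt(prompt_tokens: List[int], ar_n_model, kv_cache) -> Tuple[Any, Any]:
    """Consume the prompt in AR_LEN-sized blocks, updating the KV cache per block."""
    logits = None
    for start in range(0, len(prompt_tokens), AR_LEN):
        block = prompt_tokens[start : start + AR_LEN]
        # Only the last block may be shorter than AR_LEN, so pad it.
        block = block + [PAD_ID] * (AR_LEN - len(block))
        # One forward pass emits the block's logits and its KV-cache entries.
        logits, kv_cache = ar_n_model(block, kv_cache)
    return logits, kv_cache
```

The logits of the last real prompt token then seed the KV cache model, which generates the remaining tokens one at a time.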
@@ -28,7 +28,7 @@ Hybrid Mode: Hybrid mode leverages the strengths of both AR-N model and KV cache ### Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. ### Step 2: Prepare Model @@ -41,7 +41,7 @@ wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" # tokenizer.bin: -python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin # params.json: echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json @@ -70,14 +70,14 @@ We have two distinct mechanisms for updating the key-value (KV) cache, which can #### Shift Pointer mechanism
- Shift Pointer mechanism
+ Shift Pointer mechanism
The figure illustrates the process of updating the key and value caches during each inference step. In the key cache update process, we initially allocate memory for each layer with num_head buffers of size (head_dim + 1) * (seq_len - 1). After a single inference, the new key cache is copied from the key output pointer k_out and appended to the key cache. Subsequently, the buffer start pointer of the key cache, k_in, moves to the next token, making the previous position of the buffer start pointer unused. This process is repeated for each subsequent inference step. For the value cache update process, we first allocate a contiguous memory region of size (num_head + 1) * head_dim * (seq_len - 1) for each layer, with the last head reserved for I/O shifting. After the first inference, the cache is updated by simply shifting the pointers of all heads to the next token position, making only the previous head_dim * 1 section of the buffer start pointer v_in of the first head unused. This process is repeated for each subsequent inference step.
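
The toy NumPy sketch below shows only the pointer-advance idea for a single head's key cache; the names (`k_buffer`, `k_in`) and the flat layout are simplifications and do not reflect the actual QNN shared-buffer layout described above.

```python
# Toy illustration of the shift-pointer update; not the actual runner code.
import numpy as np

head_dim = 4
cache_len = 7   # plays the role of seq_len - 1 (assumed value)
max_steps = 8   # extra columns so the window can keep sliding without reallocation

# One flat buffer per head; only the window [k_in, k_in + cache_len) is fed to the model.
k_buffer = np.zeros((head_dim, cache_len + max_steps), dtype=np.float32)
k_in = 0


def append_and_shift(k_out_column: np.ndarray) -> np.ndarray:
    """Copy this step's key output next to the current window, then advance k_in."""
    global k_in
    k_buffer[:, k_in + cache_len] = k_out_column  # append the new token's key
    k_in += 1                                     # the column just before the new window is now unused
    return k_buffer[:, k_in : k_in + cache_len]   # window consumed by the next inference
```

Per the description above, the value cache goes one step further: after the first inference it avoids even the copy and only shifts the per-head pointers to the next token position.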
#### Smart Mask mechanism
- Smart Mask mechanism + Smart Mask mechanism
The Smart Mask mechanism streamlines the process of updating tokens in the cache. Unlike the Shift Pointer mechanism, which requires moving the buffer start pointer k_in/v_in of the cache, the Smart Mask mechanism updates only the new token at the specified position. This approach eliminates the need to adjust the buffer start pointer. This mechanism is beneficial for shared buffers but requires CPU memory copying.
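
For contrast, a minimal sketch of a smart-mask style update is shown below. The array names and shapes are assumptions made for illustration, and the attention-mask handling is inferred from the mechanism's name; the exact details live in the runner.

```python
# Toy illustration of the smart-mask update; not the actual runner code.
import numpy as np

head_dim, max_seq_len = 4, 16
k_cache = np.zeros((head_dim, max_seq_len), dtype=np.float32)
# Every position starts masked out; 0.0 means "may attend to this position".
attn_mask = np.full(max_seq_len, -np.inf, dtype=np.float32)


def smart_mask_update(k_out_column: np.ndarray, pos: int) -> None:
    """Write the new token's key at its fixed position and unmask that position."""
    k_cache[:, pos] = k_out_column  # CPU memory copy into the shared buffer
    attn_mask[pos] = 0.0            # no buffer start pointer ever moves
```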
diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS index e4bad10a234..024b45b65cd 100644 --- a/examples/qualcomm/oss_scripts/llama/TARGETS +++ b/examples/qualcomm/oss_scripts/llama/TARGETS @@ -1,5 +1,5 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") @@ -48,7 +48,7 @@ python_binary( runtime.command_alias( name = "llama_qnn", env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), # Place holder to pass the QNN_SDK_ROOT check in executorch/examples/qualcomm/utils.py "QNN_SDK_ROOT": "", }, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 7e1b6872882..375edf9fb6c 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -390,6 +390,14 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): fx_graph_module = torch.export.export( self.llama_graph_module, self.inputs, strict=True ).module() + + if quant_dtype == QuantDtype.use_16a4w_block: + conv_nodes = [ + n for n in fx_graph_module.graph.nodes if "conv" in n.name + ] + block_size_map = {n.name: (1, 64, 1, 1) for n in conv_nodes} + quantizer.set_block_size_map(block_size_map) + fx_graph_module = prepare_pt2e(fx_graph_module, quantizer) logging.info("Quantizing the model...") @@ -574,13 +582,14 @@ def permute(w, heads): fixed_point_type["kv_type"] = torch.uint8 if args.ptq == "8a8w": fixed_point_type["io_type"] = torch.uint8 - elif args.ptq == "16a4w": + elif args.ptq in ("16a4w", "16a4w_block"): fixed_point_type["io_type"] = torch.uint16 else: assert args.ptq in [ "8a8w", "16a4w", - ], f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." + "16a4w_block", + ], f"No support for quant type {args.ptq}. Support 8a8w, 16a4w and 16a4w_block." quant_dtype = getattr(QuantDtype, f"use_{args.ptq}") assert args.tokenizer_model is not None, "Need tokenizer model for calibration" @@ -954,7 +963,7 @@ def _build_parser(): parser.add_argument( "-P", "--ptq", - help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. Support 8a8w and 16a4w.", + help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. 
Support 8a8w, 16a4w and 16a4w_block.", type=str, ) diff --git a/examples/qualcomm/oss_scripts/llama/targets.bzl b/examples/qualcomm/oss_scripts/llama/targets.bzl index c3f7e7fbbda..a67281e7e66 100644 --- a/examples/qualcomm/oss_scripts/llama/targets.bzl +++ b/examples/qualcomm/oss_scripts/llama/targets.bzl @@ -1,5 +1,5 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_oss_build_kwargs", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_version") def define_common_targets(): runtime.cxx_library( @@ -20,7 +20,7 @@ def define_common_targets(): "//executorch/extension/llm/runner:stats", "//executorch/extension/tensor:tensor", "//executorch/kernels/quantized:generated_lib", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_version()), ], exported_deps = [ "//executorch/extension/module:module", diff --git a/examples/qualcomm/oss_scripts/moshi/mimi.py b/examples/qualcomm/oss_scripts/moshi/mimi.py new file mode 100644 index 00000000000..6b59a71ae64 --- /dev/null +++ b/examples/qualcomm/oss_scripts/moshi/mimi.py @@ -0,0 +1,402 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# import argparse +import io +import json +import os +import random +from multiprocessing.connection import Client + +import numpy as np +import requests + +import sphn +import torch + +import torch.nn as nn +import torchaudio + +from executorch.backends.qualcomm.quantizer.custom_annotation import ( + annotate_mimi_decoder, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype + +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + make_output_dir, + make_quantizer, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, +) + +from huggingface_hub import hf_hub_download +from moshi.models import loaders + +from torch.ao.quantization.observer import MinMaxObserver + + +def seed_all(seed): + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # for multi-GPU setups + random.seed(seed) + np.random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def read_mp3_from_url(url): + response = requests.get(url) + response.raise_for_status() # Ensure request is successful + + # Convert to a file-like object + audio_stream = io.BytesIO(response.content) + + # Load audio using torchaudio + waveform, sample_rate = torchaudio.load(audio_stream, format="mp3") + + return waveform.numpy(), sample_rate + + +def compute_scores(cpu_decode_res: torch.Tensor, htp_decode_res: torch.Tensor): + assert cpu_decode_res.shape == htp_decode_res.shape, "Tensor shapes do not match" + abs_diff = torch.abs(cpu_decode_res - htp_decode_res) + atol = torch.max(abs_diff) + print("Atol: ", atol) + + cpu_decode_res = cpu_decode_res.float() + htp_decode_res = htp_decode_res.float() + error = cpu_decode_res - htp_decode_res + original_power = torch.mean(torch.pow(cpu_decode_res, 2)) + error_power = torch.mean(torch.pow(error, 2)) + sqnr = 10 * torch.log10(original_power / error_power) + print("SQNR: ", sqnr) + + +def 
test_decoder_with_emb_input(mimi, args): + class MimiDecode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + x = x.transpose(1, 2) + x = self.mimi_model.upsample(x) + (emb,) = self.mimi_model.decoder_transformer(x) + emb.transpose(1, 2) + with self.mimi_model._context_for_encoder_decoder: + out = self.mimi_model.decoder(emb) + return out + + emb_input = torch.rand(1, 1, 512, device="cpu") + mimi_decode = MimiDecode(mimi).eval() + cpu_res = mimi_decode(emb_input) + pte_filename = "mimi_decoder_emb_qnn" + + quantizer = make_quantizer( + quant_dtype=QuantDtype.use_16a8w, + per_channel_conv=True, + per_channel_linear=True, + act_observer=MinMaxObserver, + ) + quantizer.add_custom_quant_annotations((annotate_mimi_decoder,)) + + emb_inputs = [(emb_input,)] + build_executorch_binary( + mimi_decode, + emb_inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + emb_inputs, + custom_quantizer=quantizer, + quant_dtype=QuantDtype.use_16a8w, + shared_buffer=args.shared_buffer, + ) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + ) + adb.push(inputs=emb_inputs, input_list="input_0_0.raw\n") + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + emb_predictions = [] + for i in range(len(emb_inputs)): + np_arr = np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + emb_predictions.append(torch.from_numpy(np_arr).view(1, 1, 1920)) + print("Emb input test results") + compute_scores(cpu_res, emb_predictions[0]) + + +def mimi_encode( + mimi, + encode_inputs, + encoder_input_list, + pcm_chunk_size, + skip_node_id_set, + skip_node_op_set, +) -> torch.Tensor: + class MimiEncode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.encode(x) + + mimi_encode_model = MimiEncode(mimi) + + pte_filename = "mimi_encoder_qnn" + build_executorch_binary( + mimi_encode_model.eval(), + encode_inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + encode_inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_8a8w, + shared_buffer=args.shared_buffer, + ) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + ) + adb.push(inputs=encode_inputs, input_list=encoder_input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + encoder_predictions = [] + # Num chunks should align with args.chunks_per_batch + num_chunks = encode_inputs[0][0].shape[-1] // pcm_chunk_size + for i in range(len(encode_inputs)): + np_arr = np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.int64 + ) + encoder_predictions.append(torch.from_numpy(np_arr).view(1, 8, num_chunks)) + return encoder_predictions 
+ + +def mimi_decode( + mimi, encode_res_list, pcm_chunk_size, skip_node_id_set, skip_node_op_set +) -> torch.Tensor: + class MimiDecode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.decode(x) + + mimi_decode_model = MimiDecode(mimi) + decode_inputs, decode_input_list = [], "" + for index, encoder_res in enumerate(encode_res_list): + decode_inputs.append((encoder_res.to(torch.int32),)) + decode_input_list += f"input_{index}_0.raw\n" + + pte_filename = "mimi_decoder_qnn" + + quantizer = make_quantizer( + quant_dtype=QuantDtype.use_16a8w, + per_channel_conv=True, + per_channel_linear=True, + act_observer=MinMaxObserver, + ) + quantizer.add_custom_quant_annotations((annotate_mimi_decoder,)) + + build_executorch_binary( + mimi_decode_model.eval(), + decode_inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + decode_inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + custom_quantizer=quantizer, + quant_dtype=QuantDtype.use_16a8w, + shared_buffer=args.shared_buffer, + ) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + ) + adb.push(inputs=decode_inputs, input_list=decode_input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + decoder_predictions = [] + # Num chunks should align with args.chunks_per_batch + num_chunks = decode_inputs[0][0].shape[-1] + shape = num_chunks * pcm_chunk_size + for i in range(len(decode_inputs)): + np_arr = np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + decoder_predictions.append(torch.from_numpy(np_arr).view(1, 1, shape)) + htp_decode_res = torch.cat(decoder_predictions, dim=-1) + + return htp_decode_res + + +def export_mimi(mimi, args, max_duration_sec=10.0): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + os.makedirs(args.artifact, exist_ok=True) + + if args.emb_input_test: + test_decoder_with_emb_input(mimi, args) + return + + sample_rate = mimi.sample_rate + url = "https://huggingface.co/lmz/moshi-swift/resolve/main/bria-24khz.mp3" + sample_pcm, sample_sr = read_mp3_from_url(url) + sample_rate = mimi.sample_rate + sample_pcm = torch.tensor(sample_pcm, device="cpu") + max_duration_len = int(sample_rate * max_duration_sec) + if sample_pcm.shape[-1] > max_duration_len: + sample_pcm = sample_pcm[..., :max_duration_len] + sample_pcm = sample_pcm[None].to(device="cpu") + + encoder_inputs, encoder_input_list = [], "" + # 1920 chunk_size = 0.08sec + pcm_chunk_size = int(mimi.sample_rate / mimi.frame_rate) + batch_size = pcm_chunk_size * args.chunks_per_batch + count = 0 + for start_idx in range(0, sample_pcm.shape[-1], batch_size): + end_idx = min(sample_pcm.shape[-1], start_idx + batch_size) + chunk = sample_pcm[..., start_idx:end_idx] + encoder_inputs.append((chunk,)) + encoder_input_list += f"input_{count}_0.raw\n" + count += 1 + + print("streaming encoding...") + cpu_encode_res = mimi.encode(sample_pcm) + htp_encode_res = mimi_encode( + mimi, + encoder_inputs, + encoder_input_list, + pcm_chunk_size, + skip_node_id_set, + skip_node_op_set, + ) + + # Leave it here for now, 
uncomment this to check htp_encoder with cpu_decoder + # htp_res = torch.cat(htp_encode_res, dim=-1) + # cpu_decode_htp_encode = mimi.decode(htp_res) + # sphn.write_wav("cpu_decode_htp_encode.wav", cpu_decode_htp_encode[0, 0].cpu().numpy(), sample_rate) + + print("streaming decoding...") + cpu_decode_res = mimi.decode(cpu_encode_res) + # TODO: Enable streaming mode, which is the correct way to execute 1 chunk at a time. + # with mimi.streaming(1): + htp_decode_res = mimi_decode( + mimi, htp_encode_res, pcm_chunk_size, skip_node_id_set, skip_node_op_set + ) + compute_scores(cpu_decode_res, htp_decode_res) + + sphn.write_wav( + f"{args.artifact}/cpu_decode_res.wav", + cpu_decode_res[0, 0].cpu().numpy(), + sample_rate, + ) + sphn.write_wav( + f"{args.artifact}/htp_decode_res.wav", + htp_decode_res[0, 0].cpu().numpy(), + sample_rate, + ) + + +def main(args): + seed_all(42424242) + + print("loading mimi") + if args.mimi_weight is None: + args.mimi_weight = hf_hub_download(args.hf_repo, loaders.MIMI_NAME) + mimi = loaders.get_mimi(args.mimi_weight, "cpu") + print("mimi loaded") + + with torch.no_grad(): + export_mimi(mimi, args) + + +if __name__ == "__main__": + + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./mimi", + default="./mimi", + type=str, + ) + + parser.add_argument( + "--chunks_per_batch", + help="Number of chunks to process per time. Default is 1 chunk per batch, which equals to 0.08 second", + default=1, + type=int, + ) + + parser.add_argument( + "--emb_input_test", + help="This is just a metrics used to compute accuracy scores, not recommended for general users.", + action="store_true", + default=False, + ) + + parser.add_argument("--mimi-weight", type=str) + parser.add_argument("--hf-repo", type=str, default=loaders.DEFAULT_REPO) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/qaihub_scripts/llama/README.md b/examples/qualcomm/qaihub_scripts/llama/README.md index 0fec6ea867f..1e08b97bccb 100644 --- a/examples/qualcomm/qaihub_scripts/llama/README.md +++ b/examples/qualcomm/qaihub_scripts/llama/README.md @@ -12,14 +12,14 @@ Note that the pre-compiled context binaries could not be futher fine-tuned for o ### Instructions #### Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. #### Step2: Prepare Model 1. Create account for https://aihub.qualcomm.com/ 2. Follow instructions in https://huggingface.co/qualcomm/Llama-v2-7B-Chat to export context binaries (will take some time to finish) ```bash -# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model +# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main # tokenizer.bin: python -m examples.models.llama.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin ``` @@ -40,7 +40,7 @@ Note that the pre-compiled context binaries could not be futher fine-tuned for o ### Instructions #### Step 1: Setup 1. 
Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. #### Step2: Prepare Model 1. Create account for https://aihub.qualcomm.com/ @@ -54,4 +54,4 @@ Please refer to [Check context binary version](../../README.md#check-context-bin ```bash # AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v3_8b_chat_quantized python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?" -``` \ No newline at end of file +``` diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md index b008d3135d4..d2649cf72c2 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md @@ -11,7 +11,7 @@ The model architecture, scheduler, and time embedding are from the [stabilityai/ ### Instructions #### Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend. #### Step2: Prepare Model 1. Download the context binaries for TextEncoder, UNet, and VAEDecoder under https://huggingface.co/qualcomm/Stable-Diffusion-v2.1/tree/main diff --git a/examples/qualcomm/qaihub_scripts/utils/README.md b/examples/qualcomm/qaihub_scripts/utils/README.md index 61f465f3926..996b72f7937 100644 --- a/examples/qualcomm/qaihub_scripts/utils/README.md +++ b/examples/qualcomm/qaihub_scripts/utils/README.md @@ -1,6 +1,6 @@ # CLI Tool for Compile / Deploy Pre-Built QNN Artifacts -An easy-to-use tool for generating / executing .pte program from pre-built model libraries / context binaries from Qualcomm AI Engine Direct. Tool is verified with [host environement](../../../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md#host-os). +An easy-to-use tool for generating / executing .pte program from pre-built model libraries / context binaries from Qualcomm AI Engine Direct. Tool is verified with [host environement](../../../../docs/source/backends-qualcomm.md#host-os). ## Description @@ -20,7 +20,7 @@ If users are interested in well-known applications, [Qualcomm AI HUB](https://ai ### Dependencies * Register for Qualcomm AI HUB. -* Download the corresponding QNN SDK via [link](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) which your favorite model is compiled with. Ths link will automatically download the latest version at this moment (users should be able to specify version soon, please refer to [this](../../../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md#software) for earlier releases). +* Download the corresponding QNN SDK via [link](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) which your favorite model is compiled with. 
Ths link will automatically download the latest version at this moment (users should be able to specify version soon, please refer to [this](../../../../docs/source/backends-qualcomm.md#software) for earlier releases). ### Target Model diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index fa7efc0c459..a12a5069c3f 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -24,7 +24,7 @@ from PIL import Image from torch.utils.data import Dataset -from torchsr.datasets import B100 +from torchsr.datasets import B100, Div2K from torchvision.transforms.functional import to_pil_image, to_tensor @@ -75,6 +75,16 @@ def get_b100( return SrDataset(hr_dir, lr_dir) +def get_Div2K( + dataset_dir: str, +): + hr_dir = f"{dataset_dir}/sr_bm_dataset/DIV2K/DIV2K_valid_HR" + lr_dir = f"{dataset_dir}/sr_bm_dataset/DIV2K/DIV2K_valid_LR_bicubic/X2" + if not os.path.exists(hr_dir) or not os.path.exists(lr_dir): + Div2K(root=f"{dataset_dir}/sr_bm_dataset", scale=2, download=True) + return SrDataset(hr_dir, lr_dir) + + def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str): if not (lr_dir and hr_dir) and not default_dataset: raise RuntimeError( @@ -85,7 +95,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str raise RuntimeError("Either use custom dataset, or use default dataset.") if default_dataset: - return get_b100(dataset_dir) + return get_Div2K(dataset_dir) return SrDataset(hr_dir, lr_dir) diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 47a489f6d52..bd0b6dfbcf2 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -23,7 +23,6 @@ make_output_dir, make_quantizer, parse_skip_delegation_node, - QnnPartitioner, setup_common_args_and_variables, SimpleADB, ) @@ -103,10 +102,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): from transformers import get_linear_schedule_with_warmup # grab dataset - url = ( - "https://raw.githubusercontent.com/susanli2016/NLP-with-Python" - "/master/data/title_conference.csv" - ) + url = "https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv" content = requests.get(url, allow_redirects=True).content data = pd.read_csv(BytesIO(content)) @@ -273,19 +269,15 @@ def calibrator(gm): quantizer = make_quantizer(quant_dtype=quant_dtype) backend_options = generate_htp_compiler_spec(quant_dtype is not None) - partitioner = QnnPartitioner( - generate_qnn_executorch_compiler_spec( - soc_model=getattr(QcomChipset, args.model), - backend_options=backend_options, - ), - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=getattr(QcomChipset, args.model), + backend_options=backend_options, ) # skip embedding layer cause it's quantization sensitive graph_module, _ = skip_annotation( nn_module=model, quantizer=quantizer, - partitioner=partitioner, + compiler_specs=compiler_specs, sample_input=inputs[0], calibration_cb=calibrator, fp_node_op_set={torch.ops.aten.embedding.default}, diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 2b2f32b037b..242170712e1 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -14,12 +14,16 @@ import tempfile from pathlib import Path -from typing import Callable, List, Optional +from typing import Callable, 
List, Optional, Tuple import numpy as np import torch -from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype +from executorch.backends.qualcomm.quantizer.quantizer import ( + ModuleQConfig, + QnnQuantizer, + QuantDtype, +) from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.utils import ( generate_htp_compiler_spec, @@ -254,18 +258,23 @@ def qat_train(ori_model, captured_model, quantizer, dataset): def make_quantizer( quant_dtype: Optional[QuantDtype] = QuantDtype.use_8a8w, custom_annotations=(), - per_block_conv=False, per_channel_conv=True, per_channel_linear=False, act_observer=MovingAverageMinMaxObserver, is_qat=False, + submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None, ): quantizer = QnnQuantizer() quantizer.add_custom_quant_annotations(custom_annotations) - quantizer.set_per_block_conv_quant(per_block_conv) - quantizer.set_per_channel_conv_quant(per_channel_conv) - quantizer.set_per_channel_linear_quant(per_channel_linear) - quantizer.set_quant_config(quant_dtype, is_qat, act_observer) + quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=per_channel_conv, + is_linear_per_channel=per_channel_linear, + act_observer=act_observer, + ) + submodule_qconfig_list = submodule_qconfig_list or [] + quantizer.set_submodule_qconfig_list(submodule_qconfig_list) return quantizer @@ -279,7 +288,7 @@ def build_executorch_binary( skip_node_id_set=None, skip_node_op_set=None, quant_dtype: Optional[QuantDtype] = None, - custom_quantizer=None, + custom_quantizer: Optional[QnnQuantizer] = None, shared_buffer=False, metadata=None, dump_intermediate_outputs=False, @@ -316,8 +325,8 @@ def build_executorch_binary( shared_buffer=shared_buffer, dump_intermediate_outputs=dump_intermediate_outputs, ) - if quant_dtype is not None: - captured_model = torch.export.export(model, inputs, strict=True).module() + if quant_dtype is not None or custom_quantizer is not None: + captured_model = torch.export.export(model, inputs, strict=False).module() if qat_training_data: quantizer = custom_quantizer or make_quantizer( quant_dtype=quant_dtype, is_qat=True @@ -419,6 +428,15 @@ def histogram(golden, predict): return (pa, mpa, miou, cls_iou) +def class_agnostic_mIoU(predictions, targets): + total_iou = 0 + for pred, tar in zip(predictions, targets): + inter = np.count_nonzero(pred & tar) + union = np.count_nonzero(pred | tar) + total_iou += inter / (union + 1e-10) + return total_iou / len(predictions) + + def get_imagenet_dataset( dataset_path, data_size, image_shape, crop_size=None, shuffle=True ): diff --git a/examples/selective_build/README.md b/examples/selective_build/README.md index 6c655e18a3d..97706d70c48 100644 --- a/examples/selective_build/README.md +++ b/examples/selective_build/README.md @@ -3,7 +3,7 @@ To optimize binary size of ExecuTorch runtime, selective build can be used. This ## How to run -Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/stable/getting-started-setup). +Prerequisite: finish the [setting up wiki](https://pytorch.org/executorch/main/getting-started-setup). Run: diff --git a/examples/xnnpack/README.md b/examples/xnnpack/README.md index 179e47004a1..5c307d34717 100644 --- a/examples/xnnpack/README.md +++ b/examples/xnnpack/README.md @@ -1,8 +1,8 @@ # XNNPACK Backend [XNNPACK](https://github.com/google/XNNPACK) is a library of optimized neural network operators for ARM and x86 CPU platforms. 
Our delegate lowers models to run using these highly optimized CPU operators. You can try out lowering and running some example models in the demo. Please refer to the following docs for information on the XNNPACK Delegate -- [XNNPACK Backend Delegate Overview](https://pytorch.org/executorch/stable/native-delegates-executorch-xnnpack-delegate.html) -- [XNNPACK Delegate Export Tutorial](https://pytorch.org/executorch/stable/tutorial-xnnpack-delegate-lowering.html) +- [XNNPACK Backend Delegate Overview](https://pytorch.org/executorch/main/backends-xnnpack) +- [XNNPACK Delegate Export Tutorial](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering) ## Directory structure @@ -60,7 +60,7 @@ Now finally you should be able to run this model with the following command ``` ## Quantization -First, learn more about the generic PyTorch 2 Export Quantization workflow in the [Quantization Flow Docs](https://pytorch.org/executorch/stable/quantization-overview.html), if you are not familiar already. +First, learn more about the generic PyTorch 2 Export Quantization workflow in the [Quantization Flow Docs](https://pytorch.org/executorch/main/quantization-overview), if you are not familiar already. Here we will discuss quantizing a model suitable for XNNPACK delegation using XNNPACKQuantizer. diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py index 6db0d82a274..f67150169dc 100644 --- a/examples/xnnpack/aot_compiler.py +++ b/examples/xnnpack/aot_compiler.py @@ -87,14 +87,14 @@ model = model.eval() # pre-autograd export. eventually this will become torch.export - ep = torch.export.export_for_training(model, example_inputs) + ep = torch.export.export_for_training(model, example_inputs, strict=True) model = ep.module() if args.quantize: logging.info("Quantizing Model...") # TODO(T165162973): This pass shall eventually be folded into quantizer model = quantize(model, example_inputs, quant_type) - ep = torch.export.export_for_training(model, example_inputs) + ep = torch.export.export_for_training(model, example_inputs, strict=True) edge = to_edge_transform_and_lower( ep, diff --git a/examples/xnnpack/quantization/example.py b/examples/xnnpack/quantization/example.py index 3e30c239215..90a6b94d02b 100644 --- a/examples/xnnpack/quantization/example.py +++ b/examples/xnnpack/quantization/example.py @@ -60,7 +60,9 @@ def verify_xnnpack_quantizer_matching_fx_quant_model(model_name, model, example_ m = model # 1. pytorch 2.0 export quantization flow (recommended/default flow) - m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() + m = torch.export.export_for_training( + m, copy.deepcopy(example_inputs), strict=True + ).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config(is_per_channel=True) quantizer.set_global(quantization_config) @@ -177,7 +179,9 @@ def main() -> None: model = model.eval() # pre-autograd export. 
eventually this will become torch.export - model = torch.export.export_for_training(model, example_inputs).module() + model = torch.export.export_for_training( + model, example_inputs, strict=True + ).module() start = time.perf_counter() quantized_model = quantize(model, example_inputs) end = time.perf_counter() diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index ab2e66f7885..310e5ea9379 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -8,8 +8,9 @@ import copy import logging from contextlib import contextmanager, nullcontext +from dataclasses import dataclass from functools import singledispatch -from typing import Generator, List +from typing import Dict, Generator, List, Mapping import torch @@ -36,7 +37,7 @@ update_to_real_program, ) from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param -from torch.export import ExportedProgram +from torch.export.exported_program import ExportedProgram, InputSpec, OutputSpec @singledispatch @@ -190,6 +191,65 @@ def _get_node_list_with_same_tag( return node_list +def _insert_lowered_submodule( + submodule_program: ExportedProgram, + owning_program: ExportedProgram, + call_submodule_node: torch.fx.Node, + submodule_output_node: torch.fx.Node, + lowered_module: LoweredBackendModule, + is_submodule: bool, + toplevel_input_specs_to_delete: Dict[str, InputSpec], + toplevel_output_specs_to_delete: Dict[str, OutputSpec], +): + owning_graph_module = call_submodule_node.graph.owning_module + # call delegate args should only use user_inputs + call_delegate_args = [] + # Preserve input order as user_inputs + for inp_name in submodule_program.graph_signature.user_inputs: + for inp_node in call_submodule_node.all_input_nodes: + if inp_node.name == inp_name: + call_delegate_args.append(inp_node) + break + + def generate_debug_handle(ep: ExportedProgram) -> int: + """ + Generate a debug handle for the given ExportedProgram. 
+ """ + debug_handle = 0 + for node in ep.graph_module.graph.nodes: + debug_handle = max(debug_handle, node.meta.get("debug_handle", 0)) + return debug_handle + 1 + + # Replace the partitioned submodule with a lowered submodule + # Add call_method node with function "forward" + with owning_graph_module.graph.inserting_before(call_submodule_node): + lowered_name = get_lowered_module_name(owning_graph_module, lowered_module) + lowered_node = owning_graph_module.graph.get_attr(lowered_name) + call_delegate_node = owning_graph_module.graph.call_function( + executorch_call_delegate, + (lowered_node,) + tuple(call_delegate_args), + call_submodule_node.kwargs, + ) + call_delegate_node.meta["debug_handle"] = generate_debug_handle(owning_program) + call_delegate_node.meta["val"] = submodule_output_node.meta["val"] + call_submodule_node.replace_all_uses_with(call_delegate_node) + owning_graph_module.graph.erase_node(call_submodule_node) + + if is_submodule: + assert len(toplevel_input_specs_to_delete) == 0 + assert len(toplevel_output_specs_to_delete) == 0 + elif ( + len(toplevel_input_specs_to_delete) > 0 + or len(toplevel_output_specs_to_delete) > 0 + ): + _unsafe_adjust_original_program( + owning_program, + call_delegate_node, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) + + def _partition_and_lower_one_graph_module( tagged_graph_module: torch.fx.GraphModule, partition_result: PartitionResult, @@ -254,56 +314,16 @@ def _partition_and_lower_one_graph_module( delegation_spec.compile_specs, ) - # call delegate args should only use user_inputs - call_delegate_args = [] - # Preserve input order as user_inputs - for inp_name in submodule_program.graph_signature.user_inputs: - for inp_node in call_module_node.all_input_nodes: - if inp_node.name == inp_name: - call_delegate_args.append(inp_node) - break - - def generate_debug_handle(ep: ExportedProgram) -> int: - """ - Generate a debug handle for the given ExportedProgram. 
- """ - debug_handle = 0 - for node in ep.graph_module.graph.nodes: - debug_handle = max(debug_handle, node.meta.get("debug_handle", 0)) - return debug_handle + 1 - - # Replace the partitioned submodule with a lowered submodule - # Add call_method node with function "forward" - with tagged_graph_module.graph.inserting_before(call_module_node): - lowered_name = get_lowered_module_name( - tagged_graph_module, lowered_submodule - ) - lowered_node = tagged_graph_module.graph.get_attr(lowered_name) - call_delegate_node = tagged_graph_module.graph.call_function( - executorch_call_delegate, - (lowered_node,) + tuple(call_delegate_args), - call_module_node.kwargs, - ) - call_delegate_node.meta["debug_handle"] = generate_debug_handle( - owning_program - ) - call_delegate_node.meta["val"] = submodule_output_node.meta["val"] - call_module_node.replace_all_uses_with(call_delegate_node) - tagged_graph_module.graph.erase_node(call_module_node) - - if is_submodule: - assert len(toplevel_input_specs_to_delete) == 0 - assert len(toplevel_output_specs_to_delete) == 0 - elif ( - len(toplevel_input_specs_to_delete) > 0 - or len(toplevel_output_specs_to_delete) > 0 - ): - _unsafe_adjust_original_program( - owning_program, - call_delegate_node, - toplevel_input_specs_to_delete, - toplevel_output_specs_to_delete, - ) + _insert_lowered_submodule( + submodule_program, + owning_program, + call_module_node, + submodule_output_node, + lowered_submodule, + is_submodule, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) return tagged_graph_module @@ -417,3 +437,330 @@ def to_backend( constants=tagged_exported_program.constants, verifiers=[tagged_exported_program.verifier], ) + + +def _create_partitions_in_graph_module( + tagged_graph_module: torch.fx.GraphModule, + partition_result: PartitionResult, + owning_program: ExportedProgram, + is_submodule: bool, +) -> Dict[str, List[torch.fx.Node]]: + backend_id_to_submodule_name = {} + for tag, delegation_spec in partition_result.partition_tags.items(): + # Create partition with nodes containing this tag. 
There should only be + # one contained submodule per tag + node_list = _get_node_list_with_same_tag( + tagged_graph_module, tag, owning_program + ) + + if len(node_list) == 0: + logging.debug(f"Did not find any nodes for tag {tag}") + continue + + logging.debug(f"For tag {tag}, found nodes {node_list}") + # Tag the nodes that are params as buffers, so we can order the submodule as (Parms + Buffers) (User Inputs) + + replace_ctx = ( + tagged_graph_module._set_replace_hook( + owning_program.graph_signature.get_replace_hook() + ) + if not is_submodule + else nullcontext() + ) + with replace_ctx: + submodule, call_module_node = create_submodule_from_nodes( + tagged_graph_module, node_list, tag + ) + + tagged_graph_module_output_node = [ + node for node in tagged_graph_module.graph.nodes if node.op == "output" + ][0] + submodule_output_node = [ + node for node in submodule.graph.nodes if node.op == "output" + ][0] + # Copy the output node meta from the original output node, because + # create_submodule_from_nodes doesn't cover the meta field + submodule_output_node.meta = tagged_graph_module_output_node.meta + logging.debug(f"Partitioned graph module: {tagged_graph_module}") + ( + submodule_program, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) = create_exported_program_from_submodule( + submodule, + owning_program, + tag, + call_module_node, + is_submodule, + ) + call_module_node.meta["backend_id"] = delegation_spec.backend_id + call_module_node.meta["compile_spec"] = delegation_spec.compile_specs + call_module_node.meta["submodule_program"] = submodule_program + call_module_node.meta["toplevel_input_specs_to_delete"] = ( + toplevel_input_specs_to_delete + ) + call_module_node.meta["toplevel_output_specs_to_delete"] = ( + toplevel_output_specs_to_delete + ) + call_module_node.meta["is_submodule"] = is_submodule + call_module_node.meta["submodule_output_node"] = submodule_output_node + + if delegation_spec.backend_id not in backend_id_to_submodule_name: + backend_id_to_submodule_name[delegation_spec.backend_id] = [] + + # The call_module_node created here might not be the same node instance as + # the one in the final graph module. This is because this node might be replaced + # in future edits to the graph. 
As a result, we just keep track of the node's name + # and at the end we search for this node in our final graph module + backend_id_to_submodule_name[delegation_spec.backend_id].append( + call_module_node.target + ) + + created_submodule_nodes = {key: [] for key in backend_id_to_submodule_name.keys()} + for backend_id, submodule_name in backend_id_to_submodule_name.items(): + for node in tagged_graph_module.graph.nodes: + if node.op == "call_module" and node.target in submodule_name: + created_submodule_nodes[backend_id].append(node) + + # check the number of submodule_names and submodule_nodes are equal + for backend_id in created_submodule_nodes.keys(): + assert len(created_submodule_nodes[backend_id]) == len( + backend_id_to_submodule_name[backend_id] + ) + + return created_submodule_nodes + + +def _create_partitions( + tagged_graph_module: torch.fx.GraphModule, + partition_result: PartitionResult, + owning_program: ExportedProgram, + is_submodule: bool = False, +) -> Dict[str, List[torch.fx.Node]]: + backend_id_to_call_submodules = _create_partitions_in_graph_module( + tagged_graph_module, partition_result, owning_program, is_submodule + ) + + # Recursively partition and lower for submodules + for _, submod, _ in get_control_flow_submodules(tagged_graph_module): + nested_backend_id_to_call_submodules = _create_partitions( + submod, partition_result, owning_program, is_submodule=True + ) + for ( + backend_id, + nested_submodules, + ) in nested_backend_id_to_call_submodules.items(): + if backend_id not in backend_id_to_call_submodules: + backend_id_to_call_submodules[backend_id] = nested_submodules + else: + backend_id_to_call_submodules[backend_id].extend(nested_submodules) + + return backend_id_to_call_submodules + + +def lower_all_submodules_to_backend( + backend_id: str, + method_to_submodules_nodes: Dict[str, List[torch.fx.Node]], + method_to_tagged_edge_program: Dict[str, ExportedProgram], +) -> None: + """ + Lower all submodules nodes given in the method_to_submodule_nodes map to backend_id. 
+ """ + # The created exported program for the submodules are in the call_module node's meta data + # We just map the method_to_submodule_nodes directly to the method_to_partitioned_exported_programs + method_to_partitioned_program = { + method_name: [node.meta["submodule_program"] for node in call_submodule_nodes] + for method_name, call_submodule_nodes in method_to_submodules_nodes.items() + } + method_to_compile_specs = { + method_name: [node.meta["compile_spec"] for node in call_submodule_nodes] + for method_name, call_submodule_nodes in method_to_submodules_nodes.items() + } + backend_found = False + for cls in BackendDetails.__subclasses__(): + if backend_id == cls.__name__: + method_to_preprocess_result: dict[str, List[PreprocessResult]] = ( + cls.preprocess_multimethod( + method_to_partitioned_program, method_to_compile_specs + ) + ) + backend_found = True + + if not backend_found: + raise NotImplementedError(f"Backend {backend_id} was not found.") + + for method_name in method_to_preprocess_result.keys(): + owning_program = method_to_tagged_edge_program[method_name] + list_of_preprocess_results = method_to_preprocess_result[method_name] + list_of_call_submodule_nodes = method_to_submodules_nodes[method_name] + list_of_compile_specs = method_to_compile_specs[method_name] + for preprocess_result, call_submodule_node, compile_spec in zip( + list_of_preprocess_results, + list_of_call_submodule_nodes, + list_of_compile_specs, + ): + submodule_program = call_submodule_node.meta["submodule_program"] + lowered_module = LoweredBackendModule( + edge_program=submodule_program, + backend_id=backend_id, + processed_bytes=preprocess_result.processed_bytes, + compile_specs=compile_spec, + named_data_store_output=preprocess_result.data_store_output, + ) + is_submodule = call_submodule_node.meta["is_submodule"] + toplevel_input_specs_to_delete = call_submodule_node.meta[ + "toplevel_input_specs_to_delete" + ] + toplevel_output_specs_to_delete = call_submodule_node.meta[ + "toplevel_output_specs_to_delete" + ] + submodule_output_node = call_submodule_node.meta["submodule_output_node"] + + _insert_lowered_submodule( + submodule_program, + owning_program, + call_submodule_node, + submodule_output_node, + lowered_module, + is_submodule, + toplevel_input_specs_to_delete, + toplevel_output_specs_to_delete, + ) + + +@dataclass +class MethodProgramsPartitionerSpec: + """ + Since single dispatch for to_backend requires the first argument to be a + valid class, we create the following dataclass spec to hold the dictionaries + mapping the method name to the corresponding program, partitioner + """ + + method_to_edge_program: Mapping[str, ExportedProgram] + method_to_partitioner: Mapping[str, Partitioner] + + +@to_backend.register +def _( + method_edge_program_partitioners: MethodProgramsPartitionerSpec, +) -> Dict[str, ExportedProgram]: + """ + Add overloaded implementations for to_backend: + + :: + + def to_backend( + method_edge_program_partitioners: MethodProgramsPartitionerSpec + ) -> Dict[str, ExportedProgram]: + + Returns a semantically-equivalent dictionary of programs to the programs given as input (represented + as a graph module in Edge dialect), but with portions of the program targeted for + delegation as determined by the partitioner. + + Args: + method_edge_program_partitioners: contains two mappings, + - method_to_edge_program: mapping of method names to their respective programs in Edge dialect. 
+ - method_to_partitioner: mapping of method names to an instance of the partitioner, in charge with tagging + portions of the specified program for delegation. A valid partitioner must return PartitionerResult + including both tagged exported program and partitioner_tag: Dict[str, DelegationSpec], where each key is a tag name and + the nodes with same tag will be fused a one subgraph and delegated to backend specififed in delegation spec. + + + Returns: + ExportedProgram: The input program, with some portions targeted for delegation. + """ + method_to_edge_program = method_edge_program_partitioners.method_to_edge_program + method_to_partitioner = method_edge_program_partitioners.method_to_partitioner + + partitioned_and_lowered_exported_programs = {} + backend_id_to_method_submodules_map = {} + method_to_tagged_exported_program = {} + + for method_name, partitioner_instance in method_to_partitioner.items(): + assert ( + method_name in method_to_edge_program + ), f"Partitioner for method {method_name} is not provided" + edge_program = method_to_edge_program[method_name] + edge_program._validate() + + # Use fake program, with FakeTensors in the state dict, to avoid copying large constant values. + # Fall back to deepcopy if no fake mode is found. TODO(T182910699): Remove this fallback. + try: + fake_edge_program = get_fake_program(edge_program) + except Exception as e: + logging.warning( + f"Error in get_fake_program for graph {edge_program.graph_module}, fallback to deepcopy: {e}" + ) + fake_edge_program = copy.deepcopy(edge_program) + partitioner_result = partitioner_instance(fake_edge_program) + tagged_exported_program = partitioner_result.tagged_exported_program + method_to_tagged_exported_program[method_name] = tagged_exported_program + + # Check that the partitioner did not modify the original graph + if _ENABLE_VALIDATION: + assert is_identical_graph( + tagged_exported_program.graph_module, + edge_program.graph_module, + ), f"The partitioner {partitioner_instance} should not modify the graph module" + else: + logging.warning("Disabled validating the partitioner.") + + assert ( + partitioner_result.partition_tags is not None + ), f"Partitioner {partitioner_instance} needs a `partition_tags` field containing a mapping of tags to delegate spec" + + update_to_real_program(tagged_exported_program, edge_program) + + for tag, _ in partitioner_result.partition_tags.items(): + _maybe_duplicate_constant_nodes(tagged_exported_program, tag) + + backend_id_to_call_submodule_nodes = _create_partitions( + tagged_exported_program.graph_module, + partitioner_result, + tagged_exported_program, + ) + for ( + backend_id, + call_submodule_nodes, + ) in backend_id_to_call_submodule_nodes.items(): + if backend_id not in backend_id_to_method_submodules_map: + backend_id_to_method_submodules_map[backend_id] = {} + backend_id_to_method_submodules_map[backend_id][ + method_name + ] = call_submodule_nodes + + for ( + backend_id, + method_to_submodule_nodes, + ) in backend_id_to_method_submodules_map.items(): + lower_all_submodules_to_backend( + backend_id, + method_to_submodule_nodes, + method_to_tagged_exported_program, + ) + + for method_name in method_to_edge_program.keys(): + if method_name in method_to_tagged_exported_program: + tagged_exported_program = method_to_tagged_exported_program[method_name] + partitioned_and_lowered_exported_programs[method_name] = ExportedProgram( + root=tagged_exported_program.graph_module, + graph=tagged_exported_program.graph_module.graph, + 
graph_signature=tagged_exported_program.graph_signature, + state_dict=tagged_exported_program.state_dict, + range_constraints=copy.deepcopy( + tagged_exported_program.range_constraints + ), + module_call_graph=copy.deepcopy( + tagged_exported_program.module_call_graph + ), + example_inputs=None, + constants=tagged_exported_program.constants, + verifiers=[tagged_exported_program.verifier], + ) + else: + # this edge program wasn't partitioned, so we can just return it as is + partitioned_and_lowered_exported_programs[method_name] = ( + method_to_edge_program[method_name] + ) + + return partitioned_and_lowered_exported_programs diff --git a/exir/backend/backend_details.py b/exir/backend/backend_details.py index 248d03f2b05..513ae7c64b3 100644 --- a/exir/backend/backend_details.py +++ b/exir/backend/backend_details.py @@ -50,15 +50,6 @@ class BackendDetails(ABC): the decorators, this interface will be static, abstract and all inheritances are enforced to implement this method. - Args: - edge_program: The original exported program. It will not be modified in place. - compile_specs: List of values needed for compilation - - Returns: - PreprocessResult: It wraps the following information: - processed_bytes -> bytes: A compiled blob - a binary that can run the desired program in the backend. - debug_handle_map (Optional[Dict[int, Tuple[int]]]): For profiling purposes, a map from the node_id in the final graph (either EXIR or the user's self-defined IR) - to debug handle id attached in the original exported program. """ @staticmethod @@ -70,6 +61,69 @@ def preprocess( edge_program: ExportedProgram, compile_specs: List[CompileSpec], ) -> PreprocessResult: + """ + Preprocesses an edge program and returns the preprocess result fo the given backend + + Args: + edge_program: The original exported program. It will not be modified in place. + compile_specs: List of values needed for compilation + + Returns: + PreprocessResult: It wraps the following information: + processed_bytes -> bytes: A compiled blob - a binary that can run the desired + program in the backend. + debug_handle_map (Optional[Dict[int, Tuple[int]]]): For profiling purposes, a + map from the node_id in the final graph (either EXIR or the user's self-defined + IR) to debug handle id attached in the original exported program. + """ # Users should return a compiled blob - a binary that can run the desired # program in the backend. pass + + @classmethod + def preprocess_multimethod( + cls, + edge_programs: Dict[str, List[ExportedProgram]], + compile_specs: Dict[str, List[List[CompileSpec]]], + ) -> Dict[str, list[PreprocessResult]]: + """ + Runs preprocess on all partitioned Edge Programs across multiple methods. This allows + backends to share information across partitioned graphs. Backend can serialize shared + data by putting the shared data into the data_store_output of the preprocess results. + This will record the shared data used by that specific partition. + + Default implementation is running the existing preprocess implementation on all + + Args: + edge_programs: Dictionary mapping the method name to a list of all the partitioned + edge_programs from that method to be lowered. + compile_specs: Dictionary mapping the method name to a list of compile_specs. The + list of compile specs maps directly to the list of edge_programs for the + same given method name i.e. edge_program[method_name][i] --> compile_specs[method_name][i] + + Returns: + Dictionary mapping the method name to a list of PreprocessResults. 
The list of + PreprocessResults maps directly to the list of edge_programs for the same given + method name. i.e. edge_program[method_name][i] --> result[method_name][i] + + + """ + preprocess_results = {} + for method_name, programs in edge_programs.items(): + assert ( + method_name in compile_specs + ), f"Error: missing compile specs for {method_name}" + compile_specs_for_method = compile_specs[method_name] + assert len(compile_specs_for_method) == len( + programs + ), f"Error: method {method_name} has {len(programs)} partitions but only {len(compile_specs_for_method)}" + results_for_method = [] + for program, compile_spec_for_program in zip( + programs, compile_specs_for_method + ): + preprocess_result = cls.preprocess(program, compile_spec_for_program) + results_for_method.append(preprocess_result) + + preprocess_results[method_name] = results_for_method + + return preprocess_results diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index f0ba618936d..5b12d673f7c 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -189,6 +189,59 @@ python_unittest( ], ) +python_unittest( + name = "test_to_backend_multi_method", + srcs = [ + "test_to_backend_multi_method.py", + ], + preload_deps = [ + "//executorch/kernels/portable:custom_ops_generated_lib", + "//executorch/kernels/quantized:custom_ops_generated_lib", + "//executorch/runtime/executor/test:test_backend_compiler_lib", + ], + deps = [ + ":backend_with_preprocess_all_demo", + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir/backend/test:backend_with_compiler_demo", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + ], +) + +python_library( + name = "backend_with_preprocess_all_demo", + srcs = [ + "backend_with_preprocess_all_demo.py" + ], + deps = [ + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir:delegate", + "//executorch/exir:graph_module", + "//executorch/exir:lib", + "//executorch/exir:lowered_backend_module", + "//executorch/exir:print_program", + "//executorch/exir:schema", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + ], +) + python_unittest( name = "test_debug_handle_map", srcs = [ diff --git a/exir/backend/test/backend_with_preprocess_all_demo.py b/exir/backend/test/backend_with_preprocess_all_demo.py new file mode 100644 index 00000000000..ae9a8174be5 --- /dev/null +++ b/exir/backend/test/backend_with_preprocess_all_demo.py @@ -0,0 +1,266 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
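As context for the preprocess_multimethod hook documented above, here is a minimal sketch of how a backend might override it; the MyBackend class is hypothetical (not part of this patch) and its body simply mirrors the default per-method, per-partition contract, i.e. results[method][i] corresponds to edge_programs[method][i] and compile_specs[method][i].

from typing import Dict, List

from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
from executorch.exir.backend.compile_spec_schema import CompileSpec
from torch.export.exported_program import ExportedProgram


class MyBackend(BackendDetails):  # hypothetical backend, for illustration only
    @staticmethod
    def preprocess(
        edge_program: ExportedProgram,
        compile_specs: List[CompileSpec],
    ) -> PreprocessResult:
        # Single-partition entry point; a real backend would compile the program here.
        return PreprocessResult(processed_bytes=b"", debug_handle_map={})

    @classmethod
    def preprocess_multimethod(
        cls,
        edge_programs: Dict[str, List[ExportedProgram]],
        compile_specs: Dict[str, List[List[CompileSpec]]],
    ) -> Dict[str, List[PreprocessResult]]:
        # edge_programs[m][i] pairs with compile_specs[m][i]; the returned
        # results[m][i] must follow the same ordering.
        results: Dict[str, List[PreprocessResult]] = {}
        for method_name, programs in edge_programs.items():
            results[method_name] = [
                cls.preprocess(program, specs)
                for program, specs in zip(programs, compile_specs[method_name])
            ]
        return results

Overriding the hook, rather than preprocess alone, is what lets a backend compute state shared across all partitions and methods before serializing, which is what the data_store_output mechanism described in the docstring above is for.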
+ +from typing import Dict, final, List, Tuple + +import torch + +from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult +from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( + generate_pattern_op_partitions, +) + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.graph_module import get_control_flow_submodules +from torch.export.exported_program import ExportedProgram +from torch.fx.passes.operator_support import any_chain, OperatorSupportBase + + +def _preprocess_multimethod( + edge_programs: Dict[str, List[ExportedProgram]], + compile_specs: Dict[str, List[List[CompileSpec]]], + supported_ops: List[torch._ops.OpOverload], + backend_name: str, +) -> Dict[str, List[PreprocessResult]]: + """ + Helper function to abstract out the logic to be shared between the two backends: + FirstBackendWithPreprocessAll and SecondBackendWithPreprocessAll. This will be used + in testing for a partitioner which tags different partitions for different backends + to be lowered to + """ + total_number_of_ops = 0 + for edge_program in edge_programs.values(): + for partitioned_program in edge_program: + for node in partitioned_program.graph.nodes: + if node.op == "call_function": + if node.target in supported_ops: + total_number_of_ops += 1 + all_processed_results = {key: [] for key in edge_programs.keys()} + + for method_name, partitioned_programs in edge_programs.items(): + compile_specs_for_method = compile_specs[method_name] + + assert len(compile_specs_for_method) == len(partitioned_programs) + for compile_spec_for_partition, partitioned_program in zip( + compile_specs_for_method, partitioned_programs + ): + debug_handle_map = {} + processed_bytes = f"{backend_name}#{total_number_of_ops}#" + for node in partitioned_program.graph.nodes: + if node.op == "call_function": + if node.target in supported_ops: + op_name = node.target.__name__ + processed_bytes += f"{op_name}:" + original_debug_id = node.meta["debug_handle"] + new_debug_id = original_debug_id + debug_handle_map[new_debug_id] = (original_debug_id,) + else: + raise RuntimeError( + f"{node.op} {node.target.__name__} is not supported in backend {backend_name}" + ) + + processed_bytes += "#" + for cs in compile_spec_for_partition: + processed_bytes += f"{cs.key}:{cs.value};" + + all_processed_results[method_name].append( + PreprocessResult( + processed_bytes=bytes(processed_bytes, encoding="utf8"), + debug_handle_map=debug_handle_map, + ) + ) + + return all_processed_results + + +@final +class FirstBackendWithPreprocessAll(BackendDetails): + """ + Backend used to test the preprocess_multimethod for multi methods lowering. 
+    lowered modules are returned in the format:
+    FirstBackendWithPreprocessAll#<total_number_ops across methods>#<op_target_name>:<op_target_name>:...#<compile_spec.key>:<compile_spec.value>;...
+
+
+    lowered blobs are not functional, and are purely used for testing purposes
+    """
+
+    @staticmethod
+    def preprocess(
+        edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> PreprocessResult:
+        """
+        Not used for testing
+        """
+        return PreprocessResult(
+            processed_bytes=bytes(b"\x00"),
+            debug_handle_map={},
+        )
+
+    @staticmethod
+    def preprocess_multimethod(
+        edge_programs: Dict[str, List[ExportedProgram]],
+        compile_specs: Dict[str, List[List[CompileSpec]]],
+    ) -> Dict[str, list[PreprocessResult]]:
+        """
+        Preprocess all the edge programs in the given dictionary and return a dictionary
+        of preprocess results. The preprocess result is a tuple of processed bytes and
+        a map from the node name to the original debug handle.
+        """
+        match_ops = [
+            exir_ops.edge.aten.sin.default,
+            exir_ops.edge.aten.add.Tensor,
+        ]
+
+        return _preprocess_multimethod(
+            edge_programs, compile_specs, match_ops, "FirstBackendWithPreprocessAll"
+        )
+
+
+@final
+class SecondBackendWithPreprocessAll(BackendDetails):
+    """
+    Backend used to test the preprocess_multimethod for multi methods lowering.
+    lowered modules are returned in the format:
+    SecondBackendWithPreprocessAll#<total_number_ops across methods>#<op_target_name>:<op_target_name>:...#<compile_spec.key>:<compile_spec.value>;...
+
+
+    lowered blobs are not functional, and are purely used for testing purposes
+    """
+
+    @staticmethod
+    def preprocess(
+        edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> PreprocessResult:
+        """
+        Not used for testing
+        """
+        return PreprocessResult(
+            processed_bytes=bytes(b"\x00"),
+            debug_handle_map={},
+        )
+
+    @staticmethod
+    def preprocess_multimethod(
+        edge_programs: Dict[str, List[ExportedProgram]],
+        compile_specs: Dict[str, List[List[CompileSpec]]],
+    ) -> Dict[str, list[PreprocessResult]]:
+        """
+        Preprocess all the edge programs in the given dictionary and return a dictionary
+        of preprocess results. The preprocess result is a tuple of processed bytes and
+        a map from the node name to the original debug handle.
+        """
+        match_ops = [
+            exir_ops.edge.aten.cos.default,
+            exir_ops.edge.aten.sub.Tensor,
+        ]
+
+        return _preprocess_multimethod(
+            edge_programs, compile_specs, match_ops, "SecondBackendWithPreprocessAll"
+        )
+
+
+class AddSinOperatorSupport(OperatorSupportBase):
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        return node.op == "call_function" and node.target in [
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.sin.default,
+        ]
+
+
+class SubCosOperatorSupport(OperatorSupportBase):
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        return node.op == "call_function" and node.target in [
+            exir_ops.edge.aten.sub.Tensor,
+            exir_ops.edge.aten.cos.default,
+        ]
+
+
+@final
+class BackendWithPreprocessAllPartitioner(Partitioner):
+    """
+    Partitioner that partitions for both FirstBackendWithPreprocessAll
+    and SecondBackendWithPreprocessAll.
+ + - The partitioner tags all sin and add nodes for delegation to + FirstBackendWithPreprocessAll + - The partitioner tags all cos and sub nodes for delegation to + SecondBackendWithPreprocessAll + """ + + def __init__(self) -> None: + self.add_sin_support = any_chain(AddSinOperatorSupport()) + self.add_sin_backend_id = FirstBackendWithPreprocessAll.__name__ + + self.sub_cos_support = any_chain(SubCosOperatorSupport()) + self.sub_cos_backend_id = SecondBackendWithPreprocessAll.__name__ + + def _partition_graph_module( + self, + graph_module: torch.fx.GraphModule, + id_start=0, + ) -> Tuple[Dict[str, DelegationSpec], int]: + partition_tags: Dict[str, DelegationSpec] = {} + + num_partitions_in_gm = 0 + for op_support, backend_id, tag_prefix in [ + (self.add_sin_support, self.add_sin_backend_id, "first"), + (self.sub_cos_support, self.sub_cos_backend_id, "second"), + ]: + partition_list = generate_pattern_op_partitions( + graph_module, op_support=op_support + ) + num_partitions_in_gm = num_partitions_in_gm + len(partition_list) + for partition in partition_list: + compile_specs = [] + delegation_tag = f"{tag_prefix}_tag{id_start + partition.id}" + for node in partition.nodes: + node.meta["delegation_tag"] = delegation_tag + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.add.Tensor + ): + compile_specs.append(CompileSpec("add", bytes(b"\x00"))) + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.sin.default + ): + compile_specs.append(CompileSpec("sin", bytes(b"\x01"))) + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.sub.Tensor + ): + compile_specs.append(CompileSpec("sub", bytes(b"\x02"))) + if ( + node.op == "call_function" + and node.target == exir_ops.edge.aten.cos.default + ): + compile_specs.append(CompileSpec("cos", bytes(b"\x03"))) + + delegation_spec = DelegationSpec(backend_id, compile_specs) + partition_tags[delegation_tag] = delegation_spec + + start_idx_for_submodules = num_partitions_in_gm + for _, submodule, _ in get_control_flow_submodules(graph_module): + ret_partition_tags, start_idx_for_submodules = self._partition_graph_module( + submodule, id_start=start_idx_for_submodules + ) + partition_tags.update(ret_partition_tags) + + return partition_tags, start_idx_for_submodules + + def partition(self, exported_program: ExportedProgram) -> PartitionResult: + partition_tags, _ = self._partition_graph_module(exported_program.graph_module) + return PartitionResult( + tagged_exported_program=exported_program, partition_tags=partition_tags + ) diff --git a/exir/backend/test/test_lowered_backend_module.py b/exir/backend/test/test_lowered_backend_module.py index dcc5841bc3e..6cdaf92b3d2 100644 --- a/exir/backend/test/test_lowered_backend_module.py +++ b/exir/backend/test/test_lowered_backend_module.py @@ -22,7 +22,6 @@ from executorch.extension.pybindings.portable_lib import ( # @manual _load_for_executorch_from_buffer, ) -from hypothesis import given, settings, strategies as st from torch.export import export @@ -65,7 +64,6 @@ def forward(self, *args): .executorch_program ) - @settings(deadline=500000) def test_emit_lowered_backend_module_end_to_end(self): class SinModule(torch.nn.Module): def __init__(self): @@ -109,11 +107,7 @@ def forward(self, x): torch.allclose(model_outputs[0], expected_res, atol=1e-03, rtol=1e-03) ) - @given( - unlift=st.booleans(), # verify both lifted and unlifted graph - ) - @settings(deadline=500000) - def test_emit_lowered_backend_module(self, unlift): + def 
test_emit_lowered_backend_module(self): module_list = [ models.Emformer(), models.Repeat(), @@ -166,11 +160,7 @@ def test_emit_lowered_backend_module(self, unlift): _ = lowered_model.buffer() self.validate_lowered_module_program(program) - @given( - unlift=st.booleans(), # verify both lifted and unlifted graph - ) - @settings(deadline=500000) - def test_emit_nested_lowered_backend_module(self, unlift): + def test_emit_nested_lowered_backend_module(self): module_list = [ models.Emformer(), models.Repeat(), diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 917dae32d74..e9320cf415d 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -76,7 +76,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = export_for_training(mlp, example_inputs).module() + model = export_for_training(mlp, example_inputs, strict=True).module() aten = export(model, example_inputs, strict=True) spec_key = "path" spec_value = "/a/b/c/d" @@ -137,7 +137,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = export_for_training(mlp, example_inputs).module() + model = export_for_training(mlp, example_inputs, strict=True).module() aten = export(model, example_inputs, strict=True) edge = exir.to_edge(aten) @@ -177,7 +177,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = export_for_training(mlp, example_inputs).module() + model = export_for_training(mlp, example_inputs, strict=True).module() edge = exir.to_edge(export(model, example_inputs, strict=True)) with self.assertRaisesRegex( @@ -229,7 +229,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() + model = export_for_training( + self.AddConst(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) delegated = edge.to_backend(PartitionerNoTagData()) @@ -308,7 +310,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() + model = export_for_training( + self.AddConst(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) delegated = edge.to_backend(PartitionerTagData()) @@ -383,7 +387,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() + model = export_for_training( + self.AddConst(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) delegated = edge.to_backend(PartitionerTagData()) @@ -471,7 +477,9 @@ def partition( ) inputs = (torch.ones(2, 2),) - model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() + model = export_for_training( + ReuseConstData(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) exec_prog = edge.to_backend(PartitionerTagData()).to_executorch() executorch_module = _load_for_executorch_from_buffer(exec_prog.buffer) @@ -531,7 +539,9 @@ def partition( partition_tags=partition_tags, ) - model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() + model = export_for_training( + ReuseConstData(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),), strict=True)) with self.assertRaises(RuntimeError) as error: _ = 
edge.to_backend(PartitionerTagData()) diff --git a/exir/backend/test/test_passes.py b/exir/backend/test/test_passes.py index bc18f090238..1cdf494fa01 100644 --- a/exir/backend/test/test_passes.py +++ b/exir/backend/test/test_passes.py @@ -28,7 +28,9 @@ def forward(self, x): z = x - self.const return y, z - model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() + model = export_for_training( + ReuseConstData(), (torch.ones(2, 2),), strict=True + ).module() edge = exir.to_edge( torch.export.export(model, (torch.ones(2, 2),), strict=True) ) diff --git a/exir/backend/test/test_to_backend_multi_method.py b/exir/backend/test/test_to_backend_multi_method.py new file mode 100644 index 00000000000..d4f8fccb8f2 --- /dev/null +++ b/exir/backend/test/test_to_backend_multi_method.py @@ -0,0 +1,524 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Dict, List, Tuple + +import torch + +from executorch.exir import EdgeProgramManager, to_edge +from executorch.exir.backend.backend_api import ( + MethodProgramsPartitionerSpec, + to_backend, +) + +from executorch.exir.backend.canonical_partitioners.all_node_partitioner import ( + AllNodePartitioner, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import Partitioner +from executorch.exir.backend.test.backend_with_compiler_demo import ( + BackendWithCompilerDemo, +) + +from executorch.exir.backend.test.backend_with_preprocess_all_demo import ( + BackendWithPreprocessAllPartitioner, +) +from executorch.exir.graph_module import get_control_flow_submodules +from executorch.exir.lowered_backend_module import ( + get_lowered_submodules, + LoweredBackendModule, +) +from executorch.exir.schema import ( + BackendDelegate, + BackendDelegateDataReference, + DataLocation, + Program, +) +from executorch.extension.pybindings.portable_lib import ( # @manual + _load_for_executorch_from_buffer, +) +from torch.export.exported_program import ExportedProgram + +from torch.testing import FileCheck + + +class TestToBackendMultiMethod(unittest.TestCase): + """ + Testing suite used to test multi method to_backend lowering. The test suite uses demo backends + FirstBackendWithPreprocessAll and SecondBackendWithPreprocessAll. + - FirstBackendWithPreprocessAll: supports add + sin + - SecondBackendWithPreprocessAll: supports sub + cos + + Both backends lower exported programs into payloads in the string format: + - (backend_id)#(total_number_ops across methods)#[op_target_name;]#[compile_spec.key:compile_spec.value;] + + We leverage the above expectation to test various lowering across different modules, and ensure + that the right exported programs and compile specs are given when lowering a specifed exported program + + We leverage the demo partitioner BackendWithPreprocessAll which partitions add + sin nodes to + FirstBackendWithPreprocessAll and sub + cos nodes to SecondBackendWithPreprocessAll. This allows + us to test cases in which multiple backends are being lowered. 
+ """ + + def _get_lowered_submodules_across_controlflow( + self, graph_module: torch.fx.GraphModule + ) -> List[Tuple[str, LoweredBackendModule, torch.fx.Node]]: + top_level_submodules = get_lowered_submodules(graph_module) + + for _, submodule, _ in get_control_flow_submodules(graph_module): + top_level_submodules.extend( + self._get_lowered_submodules_across_controlflow(submodule) + ) + + return top_level_submodules + + def check_backend_delegate( + self, + program: Program, + delegate: BackendDelegate, + expected_id: str, + expected_processed: bytes, + ) -> None: + self.assertEqual(delegate.id, expected_id) + processed: BackendDelegateDataReference = delegate.processed + self.assertEqual(processed.location, DataLocation.INLINE) + self.assertLess(processed.index, len(program.backend_delegate_data)) + self.assertEqual( + program.backend_delegate_data[processed.index].data, expected_processed + ) + + def _test( + self, test_set: Dict[str, Tuple[ExportedProgram, Partitioner, List[str]]] + ): + method_to_edge_program = { + method_name: ep for method_name, (ep, _, _) in test_set.items() + } + + method_to_partitioner = { + method_name: partitioner + for method_name, (_, partitioner, _) in test_set.items() + } + + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec( + method_to_edge_program, + method_to_partitioner, + ) + ) + + self.assertEqual(len(lowered_ep_dict.keys()), len(test_set.keys())) + for method_name in test_set.keys(): + self.assertTrue(method_name in lowered_ep_dict.keys()) + (_, _, list_of_payload_as_string) = test_set[method_name] + lowered_ep = lowered_ep_dict[method_name] + FileCheck().check_count( + "torch.ops.higher_order.executorch_call_delegate", + len(list_of_payload_as_string), + exactly=True, + ).run(str(lowered_ep)) + lowered_submodules = self._get_lowered_submodules_across_controlflow( + lowered_ep.graph_module + ) + self.assertEqual(len(lowered_submodules), len(list_of_payload_as_string)) + + for idx, (_, lowered_backend_module, _) in enumerate(lowered_submodules): + self.assertEqual( + lowered_backend_module.processed_bytes.decode("utf-8"), + list_of_payload_as_string[idx], + ) + + def test_multi_method_to_backend_single_method(self): + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + # Payload String: + # [Number of Ops lowered across all methods/partitions]#OpTargetNames#CompileSpecs; + test_set = { + "forward": ( + edgeir_m.exported_program(), + AllNodePartitioner( + "FirstBackendWithPreprocessAll", + [CompileSpec("max_value", bytes([1]))], + ), + [ + "FirstBackendWithPreprocessAll#1#aten.sin.default:#max_value:b'\\x01';" + ], + ) + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods(self): + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x + x + + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + add_edgeir_m = to_edge(torch.export.export(AddModule(), (torch.ones(1),))) + sin_partitioner = AllNodePartitioner( + "FirstBackendWithPreprocessAll", [CompileSpec("sin", bytes([2]))] + ) + add_partitioner = AllNodePartitioner( + "FirstBackendWithPreprocessAll", [CompileSpec("add", bytes([3]))] + ) + # Payload String: + # [Number of Ops lowered across all 
methods/partitions]#OpTargetNames#CompileSpecs; + test_set = { + "sin": ( + sin_edgeir_m.exported_program(), + sin_partitioner, + ["FirstBackendWithPreprocessAll#2#aten.sin.default:#sin:b'\\x02';"], + ), + "add": ( + add_edgeir_m.exported_program(), + add_partitioner, + ["FirstBackendWithPreprocessAll#2#aten.add.Tensor:#add:b'\\x03';"], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods_multiple_partitions(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = x + x + y = y * y + y = y + y + return y + + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = torch.sin(x) + y = y * y + return torch.sin(y) + + add_edgeir_m = to_edge(torch.export.export(AddModule(), (torch.ones(1),))) + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + test_set = { + "add": ( + add_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + ], + ), + "sin": ( + sin_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods_different_partitions(self): + class AddSinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = x + x + y = y * y + y = torch.sin(y) + return y + + class SinAddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = torch.sin(x) + y = y * y + return y + y + + add_sin_edgeir_m = to_edge( + torch.export.export(AddSinModule(), (torch.ones(1),)) + ) + sin_add_edgeir_m = to_edge( + torch.export.export(SinAddModule(), (torch.ones(1),)) + ) + test_set = { + "add_sin": ( + add_sin_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + ], + ), + "sin_add": ( + sin_add_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_two_methods_different_backends(self): + class AddSinCosSubModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = x + x + y = torch.sin(y) + y = torch.cos(y) + y = y - x + return y + + class CosSubAddSinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + y = torch.cos(x) + y = y - x + y = y + y + y = torch.sin(y) + return y + + first_second_edgeir_m = to_edge( + torch.export.export(AddSinCosSubModule(), (torch.ones(1),)) + ) + second_first_edgeir_m = to_edge( + torch.export.export(CosSubAddSinModule(), (torch.ones(1),)) + ) + test_set = { + "first_second": ( + first_second_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:aten.sin.default:#add:b'\\x00';sin:b'\\x01';", + "SecondBackendWithPreprocessAll#4#aten.cos.default:aten.sub.Tensor:#cos:b'\\x03';sub:b'\\x02';", + ], + ), + "second_first": ( + 
second_first_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "SecondBackendWithPreprocessAll#4#aten.cos.default:aten.sub.Tensor:#cos:b'\\x03';sub:b'\\x02';", + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:aten.sin.default:#add:b'\\x00';sin:b'\\x01';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_control_flow(self): + class SinCosModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def true_fn(self, x): + return torch.sin(x) + + def false_fn(self, x): + return torch.cos(x) + + def forward(self, x): + x = x + x + return torch.cond(x > 0, self.true_fn, self.false_fn, [x]) + + class SinAddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def true_fn(self, x): + return torch.sin(x) + + def false_fn(self, x): + return x + x + + def forward(self, x): + return torch.cond(x > 0, self.true_fn, self.false_fn, [x]) + + sin_cos_edgeir_m = to_edge( + torch.export.export(SinCosModule(), (torch.ones(1),)) + ) + sin_add_edgeir_m = to_edge( + torch.export.export(SinAddModule(), (torch.ones(1),)) + ) + + test_set = { + "sin_cos": ( + sin_cos_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + # True Module Partition + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + # False Module Partition + "SecondBackendWithPreprocessAll#1#aten.cos.default:#cos:b'\\x03';", + ], + ), + "sin_add": ( + sin_add_edgeir_m.exported_program(), + BackendWithPreprocessAllPartitioner(), + [ + # True Module Partition + "FirstBackendWithPreprocessAll#4#aten.sin.default:#sin:b'\\x01';", + # False Module Partition + "FirstBackendWithPreprocessAll#4#aten.add.Tensor:#add:b'\\x00';", + ], + ), + } + self._test(test_set) + + def test_multi_method_to_backend_not_found(self): + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x + x + + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + add_edgeir_m = to_edge(torch.export.export(AddModule(), (torch.ones(1),))) + sin_partitioner = AllNodePartitioner( + "Invalid", [CompileSpec("sin", bytes([2]))] + ) + add_partitioner = AllNodePartitioner( + "FirstBackendWithPreprocessAll", [CompileSpec("add", bytes([3]))] + ) + + test_set = { + "sin": ( + sin_edgeir_m.exported_program(), + sin_partitioner, + [], + ), + "add": ( + add_edgeir_m.exported_program(), + add_partitioner, + [], + ), + } + with self.assertRaisesRegex( + NotImplementedError, "Backend Invalid was not found." + ): + self._test(test_set) + + def test_multi_method_end_to_end(self): + """ + Tests multi method lowering end-to-end. Lowers the same Sin Module for two methods + "forward" and "forward_copy". Ensures that the lowered program has two delegates + but only one serialized blob. Ensures that the lowered program runs correctly. 
+ """ + + class SinModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.sin(x) + + sin_edgeir_m = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + sin_edgeir_m_copy = to_edge(torch.export.export(SinModule(), (torch.ones(1),))) + + method_edge_program = { + "forward": sin_edgeir_m.exported_program(), + "forward_copy": sin_edgeir_m_copy.exported_program(), + } + compile_specs = [CompileSpec("max_value", bytes([1]))] + + method_partitioner = { + "forward": AllNodePartitioner( + BackendWithCompilerDemo.__name__, compile_specs + ), + "forward_copy": AllNodePartitioner( + BackendWithCompilerDemo.__name__, compile_specs + ), + } + + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec( + method_edge_program, + method_partitioner, + ) + ) + + new_edge_manager = EdgeProgramManager(lowered_ep_dict) + + exec_prog = new_edge_manager.to_executorch() + + program = exec_prog.executorch_program + # Since the preprocessed bytes are the same, there should only be on copy + self.assertEqual(len(program.backend_delegate_data), 1) + + self.check_backend_delegate( + program=program, + delegate=program.execution_plan[0].delegates[0], + expected_id=BackendWithCompilerDemo.__name__, + expected_processed=b"1version:0#op:demo::aten.sin.default, numel:1, dtype:torch.float322#", + ) + self.check_backend_delegate( + program=program, + delegate=program.execution_plan[1].delegates[0], + expected_id=BackendWithCompilerDemo.__name__, + expected_processed=b"1version:0#op:demo::aten.sin.default, numel:1, dtype:torch.float322#", + ) + + # Check that there are two methods + self.assertEqual(len(program.execution_plan), 2) + + delegate_method_1 = program.execution_plan[0].delegates + delegate_method_2 = program.execution_plan[1].delegates + + # 1 delegate blob for each method + self.assertEqual(len(delegate_method_1), 1) + self.assertEqual(len(delegate_method_2), 1) + + # Delegate Blobs reference the same underlying bytes + delegate_reference1 = delegate_method_1[0].processed + delegate_reference2 = delegate_method_2[0].processed + self.assertEqual(delegate_reference1.index, delegate_reference2.index) + + et_module = _load_for_executorch_from_buffer(exec_prog.buffer) + model_inputs = torch.ones(1) + model_outputs = et_module.run_method("forward", [model_inputs]) + self.assertEqual(model_inputs, torch.ones(1)) + model_outputs_from_copy_method = et_module.run_method( + "forward_copy", [model_inputs] + ) + self.assertEqual(model_inputs, torch.ones(1)) + self.assertEqual(model_outputs, model_outputs_from_copy_method) + self.assertTrue( + torch.allclose( + model_outputs[0], 0.8333 * torch.ones(1), atol=1e-03, rtol=1e-03 + ) + ) diff --git a/exir/capture/_config.py b/exir/capture/_config.py index abb7aa74b93..9267af4f2dc 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -97,3 +97,8 @@ class ExecutorchBackendConfig: # If set to true, all trainable weights will be stored in a separate file, # external to the PTE file. external_mutable_weights: bool = False + + # If set to true, all mutable buffers will have their fully qualified names + # serialized in the PTE file. Its value is ignored if mutable buffers are not + # memory planned as the names must be serialized in that case. 
+ emit_mutable_buffer_names: bool = False diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index f9571143a1b..f456626feed 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -118,6 +118,7 @@ def emit_program( methods: Union[ExportedProgram, Dict[str, ExportedProgram]], emit_stacktrace: bool = False, prim_getters: Optional[Dict[str, Any]] = None, + emit_mutable_buffer_names: bool = False, ) -> EmitterOutput: """ Given a exported program, it returns the program in the format @@ -163,6 +164,7 @@ def emit_program( operator_cache={}, delegate_cache={}, emit_stacktrace=emit_stacktrace, + emit_mutable_buffer_names=emit_mutable_buffer_names, ) gm = _remove_non_user_outputs(exported_program) diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index 0cbc63bde21..fe18e49a623 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -149,6 +149,7 @@ class _EmitterState: # delegate_cache: the key is hash(delegated_payload) and the value is the index in delegates delegate_cache: Dict[str, int] emit_stacktrace: bool + emit_mutable_buffer_names: bool spec2id_dict: Dict[TensorSpec, int] = field(default_factory=dict) @@ -1610,7 +1611,7 @@ def _find_fqn_for_placeholder( ) return fqn, is_mutable_buffer - def placeholder( + def placeholder( # noqa: C901 self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> _AbstractValue: """Emits the value within the placeholder node. @@ -1640,6 +1641,26 @@ def placeholder( spec.extra_tensor_info.fully_qualified_name = fqn spec.extra_tensor_info.location = TensorDataLocation.EXTERNAL + if is_mutable_buffer: + # Emit names if we are supposed to. + if self.emitter_state.emit_mutable_buffer_names: + if spec.extra_tensor_info is None: + spec.extra_tensor_info = ExtraTensorInfo( + fully_qualified_name=fqn, + location=TensorDataLocation.SEGMENT, + ) + else: + spec.extra_tensor_info.fully_qualified_name = fqn + # if We aren't emitting the name then it needs to be memory planned. + elif spec.mem_id is None or spec.mem_offset is None: + raise InternalError( + self._emit_node_specific_error( + self.node, + # [2:] to remove the b_ prefix buffers get + f'Mutable buffer "{target[2:]}" must have a memory id and offset if we are emitting it without a name. 
Please either memory plan your mutable buffers or call to_executorch with config=ExecutorchBackendConfig(emit_mutable_buffer_names=True)', + ) + ) + # From the fqn find the corresponding tensor real_tensor = None if fqn in self.exported_program.state_dict: diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index 362796146ee..186c5a402ab 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -1751,8 +1751,8 @@ def forward(self, x): module_1(*example_inputs) module_2(*example_inputs) - ep1 = export_for_training(module_1, example_inputs) - ep2 = export_for_training(module_2, example_inputs) + ep1 = export_for_training(module_1, example_inputs, strict=True) + ep2 = export_for_training(module_2, example_inputs, strict=True) edge_program_manager = exir.to_edge( {"forward1": ep1, "forward2": ep2}, @@ -1819,3 +1819,59 @@ def forward(self, input, label): ] self.assertEqual(external_map["net.linear.weight"], 0) self.assertEqual(external_map["net.linear.bias"], 1) + + def test_emit_mutable_buffer_names(self) -> None: + class Net(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + self.register_buffer("buffer", torch.zeros(1, 2)) + + def forward(self, x): + self.buffer.add_(1) + return self.linear(x) + self.buffer + + net = Net() + + ep = export(net, (torch.randn(1, 2),), strict=True) + # Lower the graph to edge dialect. + ep = to_edge(ep) + # Lower the graph to executorch. + ep = ep.to_executorch( + config=ExecutorchBackendConfig( + emit_mutable_buffer_names=True, + memory_planning_pass=MemoryPlanningPass(alloc_mutable_buffers=False), + ) + ) + for val in ep.executorch_program.execution_plan[0].values: + if isinstance(val, Tensor) and val.extra_tensor_info: + self.assertEqual(val.extra_tensor_info.fully_qualified_name, "buffer") + self.assertEqual(val.allocation_info, None) + + def test_emit_mutable_buffer_names_fails(self) -> None: + class Net(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + self.register_buffer("buffer", torch.zeros(1, 2)) + + def forward(self, x): + self.buffer.add_(1) + return self.linear(x) + self.buffer + + net = Net() + + ep = export(net, (torch.randn(1, 2),), strict=True) + # Lower the graph to edge dialect. + ep = to_edge(ep) + # Lower the graph to executorch. + # Must emit mutable buffer names if we don't allocate mutable buffers + with self.assertRaises(InternalError): + ep.to_executorch( + config=ExecutorchBackendConfig( + emit_mutable_buffer_names=False, + memory_planning_pass=MemoryPlanningPass( + alloc_mutable_buffers=False + ), + ) + ) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 6bcc1b2f3d8..78b031a238e 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -766,15 +766,15 @@ def create_submodule_from_nodes( gm = insert_subgm(gm, sub_gm, orig_inputs, orig_outputs) submodule_node = None for node in gm.graph.nodes: - if node.op == "call_module": - if node.target == submodule_name: - submodule_node = node - else: - raise RuntimeError( - f"The submodule created with nodes {node_list} did not form \ - one fully contained subgraph. Check that these nodes form a \ - fully contained graph. Partitioned graph: {gm.graph}." - ) + if node.op == "call_module" and node.target == submodule_name: + submodule_node = node + + if submodule_node is None: + raise RuntimeError( + f"The submodule created with nodes {node_list} did not form \ + one fully contained subgraph. 
Check that these nodes form a \ + fully contained graph. Partitioned graph: {gm.graph}." + ) if len(orig_outputs) == 1 and isinstance(orig_outputs[0].meta["val"], FakeTensor): # If the original output is a single tensor, it has been @@ -809,12 +809,13 @@ def create_submodule_from_nodes( for node in gm.graph.nodes: if node.op == "call_module" and node.target == submodule_name: submodule_node = node - elif node.op == "call_module": - raise RuntimeError( - f"The submodule created with nodes {node_list} did not form \ - one fully contained subgraph. Check that these nodes form a \ - fully contained graph. Partitioned graph: {gm.graph}." - ) + + if submodule_node is None: + raise RuntimeError( + f"The submodule created with nodes {node_list} did not form \ + one fully contained subgraph. Check that these nodes form a \ + fully contained graph. Partitioned graph: {gm.graph}." + ) assert ( submodule_node is not None diff --git a/exir/memory_planning.py b/exir/memory_planning.py index 3f45276c9e2..83598940882 100644 --- a/exir/memory_planning.py +++ b/exir/memory_planning.py @@ -44,12 +44,14 @@ def __init__( graph_module: torch.fx.GraphModule, alloc_graph_input: bool, alloc_graph_output: bool, + alloc_mutable_buffers: bool, graph_signature: Optional[ExportGraphSignature] = None, ) -> None: self.graph_module = graph_module self.graph_signature = graph_signature self.alloc_graph_input = alloc_graph_input self.alloc_graph_output = alloc_graph_output + self.alloc_mutable_buffers = alloc_mutable_buffers @classmethod def mem_obj_id_match( @@ -149,6 +151,7 @@ def verify_storage_reuse( ignore_const=True, ignore_graph_input=not self.alloc_graph_input, ignore_graph_output=not self.alloc_graph_output, + ignore_mutable_buffers=not self.alloc_mutable_buffers, do_assertion=False, ignore_out_var_node=False, dedup=True, @@ -374,6 +377,7 @@ def collect_specs_from_nodes( # noqa: C901 graph_signature: Optional[ExportGraphSignature] = None, ignore_graph_input: bool = False, ignore_graph_output: bool = False, + ignore_mutable_buffers: bool = False, ignore_const: bool = True, ignore_out_var_node: bool = True, dedup: bool = True, @@ -414,6 +418,9 @@ def collect_specs_from_nodes( # noqa: C901 if _is_inplace_node(node): continue + if _is_mutable_buffer(node, graph_signature) and ignore_mutable_buffers: + continue + if do_assertion: internal_assert( node.op in ("placeholder", "output") @@ -469,6 +476,7 @@ def update_all_tensors_lifetime( Set the lifetime for all the tensors encountered in the Fx graph. """ specs = set() + for node_idx, node in enumerate(graph_module.graph.nodes): for spec in collect_specs_from_nodes( filter_nodes(itertools.chain([node], node.args, node.kwargs.values())), @@ -731,53 +739,43 @@ def _contains_xnnpack_delegate(graph_module: torch.fx.GraphModule) -> bool: def greedy( - graph_module: torch.fx.GraphModule, alignment: int, - graph_signature: Optional[ExportGraphSignature] = None, - alloc_graph_input: bool = True, - alloc_graph_output: bool = True, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, + extra_padding: int = 0, + *, allow_overlapping_allocations: bool = True, ) -> MemoryAlgoResult: r"""Greedy algorithm to allocate memory for tensors in the graph. - alloc_graph_input: If set to true, the algorithm will allocate memory for graph input. - alloc_graph_output: If set to true, the algorithm will allocate memory for graph output. 
- allow_overlapping_allocations: If set to true, allows for allocations that overlap - in their lifetime but are at different offsets in the storage. By default true. - This flag is added to allow for Vulkan to use MemoryPlanningPass with overlapping - allocations disabled + + Args: + alignment: Memory alignment requirement + specs: Set of TensorSpec objects with updated lifetimes + graph_module: Graph module + graph_signature: Graph signature + extra_padding: Additional padding to add to each memory buffer (in bytes) + allow_overlapping_allocations: If set to true, allows for allocations that overlap + in their lifetime but are at different offsets in the storage. By default true. + This flag is added to allow for Vulkan to use MemoryPlanningPass with overlapping + allocations disabled + + Returns: + MemoryAlgoResult containing the allocation decisions """ greedy_result = MemoryAlgoResult({}, []) - # padding allocation with 64 bytes. - # this requirement is really for XNNPACK backend which can read tensors - # beyond the end of the tensor. This is done for performance - # optimizations in XNNPACK. - # While accounting for backend specific requirement is not the right choice - # in backend agnostic memory planning, we do it here as it seems most appropriate. - # Right now this applies to greedy only so any other - # algorithm that plans memory for XNNPACK backend will - # not have this. - extra_padded_bytes = 0 - if _contains_xnnpack_delegate(graph_module): - extra_padded_bytes = 64 spec2obj = {} shared_objects = defaultdict(list) - # Don't do assertion in collect_specs_from_nodes if we have already encountered - # and ignored some to_out_variant errors. - do_assertion = not getattr(graph_module, "encounter_to_out_var_failure", False) + # For each tensor, pick the available shared object with closest size to # the tensor. If there are no available shared object left, create a new # one. import bisect sorted_specs = [] - for spec in collect_specs_from_nodes( - graph_module.graph.nodes, - graph_signature, - do_assertion=do_assertion, - ignore_graph_input=not alloc_graph_input, - ignore_graph_output=not alloc_graph_output, - ): + for spec in specs: bisect.insort(sorted_specs, spec, key=lambda x: x.allocated_memory) + sorted_specs.reverse() for spec in sorted_specs: @@ -806,15 +804,13 @@ def greedy( for mem_id in shared_objects: input_total_size = 0 if bufsizes := getattr(graph_module, "input_mem_buffer_sizes", None): - # pyre-fixme[6]: For 1st argument expected - # `pyre_extensions.ReadOnly[Sized]` but got `Union[Tensor, Module]`. + assert isinstance(bufsizes, list) if len(bufsizes) > mem_id: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.Ten... 
input_total_size = bufsizes[mem_id] total_sizes[mem_id] = materialize_buffer( shared_objects[mem_id], input_total_size ) - total_sizes[mem_id] += extra_padded_bytes + total_sizes[mem_id] += extra_padding # Since we now know the number of shared objects we need and the size of # each shared object, we can assign offset in the memory buffer for each @@ -838,72 +834,107 @@ def greedy( return greedy_result -def memory_planning_algorithm_suite( - graph_module: torch.fx.GraphModule, - alignment: int, - graph_signature: Optional[ExportGraphSignature] = None, - alloc_graph_input: bool = True, - alloc_graph_output: bool = True, - allow_overlapping_allocations: bool = True, - algo_list: Optional[List[Callable[..., MemoryAlgoResult]]] = None, -) -> List[int]: - r""" - Memory planning algorithm suite that runs a list of memory planning algorithms - and returns the result of the algorithm that minimizes the total memory usage. - """ - if algo_list is None: - algo_list = [greedy] - mem_algo_results = {} - for algo in algo_list: - if isinstance(algo, functools.partial): - name = algo.func.__name__ - else: - name = getattr(algo, "__name__", None) - # Run this memory planning algorithm and store the result in mem_algo_results - # with the name of the algorithm as the key. - mem_algo_results[name] = algo( - graph_module, - alignment, - graph_signature, - alloc_graph_input, - alloc_graph_output, - ) +class MemoryPlanningAlgorithmSuite: + def __init__( + self, + algo_list: Optional[List[Callable[..., MemoryAlgoResult]]] = None, + ) -> None: + if algo_list is None: + algo_list = [greedy] + self.algo_list: List[Callable[..., MemoryAlgoResult]] = algo_list - # All the algorithms should have the same number of buffers allocated. - assert ( - len( - { - len(mem_algo_result.bufsizes) - for mem_algo_result in mem_algo_results.values() - } + def __call__( + self, + alignment: int, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, + extra_padding: int, + ) -> List[int]: + r""" + Memory planning algorithm suite that runs a list of memory planning algorithms + and returns the result of the algorithm that minimizes the total memory usage. + + Args: + graph_module: The graph module to allocate memory for + alignment: Memory alignment requirement + graph_signature: Optional graph signature + alloc_graph_input: Whether to allocate memory for graph input + alloc_graph_output: Whether to allocate memory for graph output + allow_overlapping_allocations: Whether to allow overlapping allocations + algo_list: List of memory planning algorithms to run + specs: Optional set of TensorSpec objects with updated lifetimes. If None, they will be + calculated from the graph_module. + + Returns: + List of buffer sizes for each memory hierarchy + """ + + mem_algo_results = {} + for algo in self.algo_list: + if isinstance(algo, functools.partial): + name = algo.func.__name__ + else: + name = getattr(algo, "__name__", None) + + mem_algo_results[name] = algo( + alignment, + specs, + graph_module, + graph_signature, + extra_padding, + ) + + # All the algorithms should have the same number of buffers allocated. + assert ( + len( + { + len(mem_algo_result.bufsizes) + for mem_algo_result in mem_algo_results.values() + } + ) + == 1 + ), "Different memory planning algorithms should have the same number of buffers allocated." + + # Find the algorithm that minimizes the total memory usage. 
+ best_algo = min( + mem_algo_results, key=lambda k: sum(mem_algo_results[k].bufsizes) ) - == 1 - ), "Different memory planning algorithms should have the same number of buffers allocated." - - # Find the algorithm that minimizes the total memory usage. - best_algo = min(mem_algo_results, key=lambda k: sum(mem_algo_results[k].bufsizes)) - logging.debug(f"Best memory planning algo for this model is {best_algo}") - bufsizes = mem_algo_results[best_algo].bufsizes - - # Update the mem_id and mem_offset for each spec in the graph module based on the - # values provided by the best memory planning algorithm. - for spec in mem_algo_results[best_algo].spec_dict: - spec_alloc_result = mem_algo_results[best_algo].spec_dict[spec] - spec.mem_id = spec_alloc_result.mem_id - spec.mem_offset = spec_alloc_result.mem_offset - spec.mem_obj_id = spec_alloc_result.mem_obj_id + logging.debug(f"Best memory planning algo for this model is {best_algo}") + bufsizes = mem_algo_results[best_algo].bufsizes - return bufsizes + # Update the mem_id and mem_offset for each spec in the graph module based on the + # values provided by the best memory planning algorithm. + for spec in mem_algo_results[best_algo].spec_dict: + spec_alloc_result = mem_algo_results[best_algo].spec_dict[spec] + spec.mem_id = spec_alloc_result.mem_id + spec.mem_offset = spec_alloc_result.mem_offset + spec.mem_obj_id = spec_alloc_result.mem_obj_id + + return bufsizes def naive( - graph_module: torch.fx.GraphModule, alignment: int, - graph_signature: Optional[ExportGraphSignature] = None, - alloc_graph_input: bool = True, - alloc_graph_output: bool = True, + specs: Set[TensorSpec], + graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, + extra_padding: int, ) -> MemoryAlgoResult: + """Naive algorithm to allocate memory for tensors in the graph. + + This algorithm simply allocates memory for each tensor sequentially without reusing memory. + + Args: + alignment: Memory alignment requirement + specs: Set of TensorSpec objects with updated lifetimes + graph_module: Graph module + graph_signature: Graph signature + extra_padding: Additional padding to add to each memory buffer (in bytes) + Returns: + MemoryAlgoResult containing the allocation decisions + """ naive_result = MemoryAlgoResult({}, []) # allocate 'allocated' bytes from buffer with id mem_id. @@ -918,14 +949,9 @@ def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int: bufsizes = getattr(graph_module, "input_mem_buffer_sizes", None) if bufsizes is None: bufsizes = [0, 0] - bufsizes = typing.cast(List[int], bufsizes) - for spec in collect_specs_from_nodes( - graph_module.graph.nodes, - graph_signature, - ignore_graph_input=not alloc_graph_input, - ignore_graph_output=not alloc_graph_output, - ): + + for spec in specs: spec_alloc_result = naive_result.spec_dict.get(spec, SpecAllocResult(0, 0, 0)) # assume a single memory layer which has mem_id 1 if spec.mem_id is None: @@ -1027,7 +1053,7 @@ def insert_calls_to_free( def apply_algo( algo: Callable[ - [torch.fx.GraphModule, int, Optional[ExportGraphSignature], bool, bool], + ..., List[int], ], graph_module: torch.fx.GraphModule, @@ -1035,6 +1061,7 @@ def apply_algo( graph_signature: Optional[ExportGraphSignature] = None, alloc_graph_input: bool = True, alloc_graph_output: bool = True, + alloc_mutable_buffers: bool = True, ) -> List[int]: """ Recursively apply algo to graph_module and its submodules for control flow. @@ -1047,12 +1074,35 @@ def apply_algo( storage with tensors in the outer module. 
TODO: make these optimizations once we have some baseline working. """ + # Extract the nodes and their lifespans from the graph_module + # Difficult to just filter the list of specs returned by this due to + # how we flag trainable weights. + _ = update_all_tensors_lifetime(graph_module, graph_signature) + # Filter specs based on alloc_graph_input and alloc_graph_output + specs = collect_specs_from_nodes( + graph_module.graph.nodes, + graph_signature, + do_assertion=False, + ignore_graph_input=not alloc_graph_input, + ignore_graph_output=not alloc_graph_output, + ignore_mutable_buffers=not alloc_mutable_buffers, + ) - specs = update_all_tensors_lifetime(graph_module, graph_signature) + # Get extra padding for XNNPACK if needed + extra_padding = 0 + if _contains_xnnpack_delegate(graph_module): + extra_padding = 64 + + # Pass the filtered specs to the algorithm bufsizes: List[int] = algo( - graph_module, alignment, graph_signature, alloc_graph_input, alloc_graph_output + alignment, + specs, + graph_module, + graph_signature, + extra_padding, ) - insert_calls_to_free(graph_module, specs) + + insert_calls_to_free(graph_module, set(specs)) def handle_submodule( submodule_nd: torch.fx.Node, alloc_graph_input: bool = False @@ -1063,6 +1113,7 @@ def handle_submodule( # memory planning for submodule need to be aware of the amount of # buffer already allocated. submodule.input_mem_buffer_sizes = bufsizes + bufsizes = apply_algo( algo, submodule, diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py index f4881e7ab71..9bd4ab20bf5 100644 --- a/exir/passes/memory_planning_pass.py +++ b/exir/passes/memory_planning_pass.py @@ -17,7 +17,7 @@ _is_out_var_node, apply_algo, get_node_tensor_specs, - memory_planning_algorithm_suite, + MemoryPlanningAlgorithmSuite, Verifier, ) from executorch.exir.operator.convert import get_out_args_from_opoverload @@ -40,12 +40,11 @@ def _callable_name(any_callable: Callable[..., Any]) -> str: class MemoryPlanningPass(PassBase): def __init__( self, - memory_planning_algo: Callable[ - ..., List[int] - ] = memory_planning_algorithm_suite, + memory_planning_algo: Optional[Callable[..., List[int]]] = None, allow_lifetime_and_storage_overlap: bool = False, alloc_graph_input: bool = True, alloc_graph_output: bool = True, + alloc_mutable_buffers: bool = True, alignment: int = ALIGNMENT, ) -> None: r""" @@ -54,10 +53,13 @@ def __init__( the graph input/output. The default behavior is the algorithm will allocate memory for both graph input and output. """ - self.memory_planning_algo = memory_planning_algo + if memory_planning_algo is None: + memory_planning_algo = MemoryPlanningAlgorithmSuite() + self.memory_planning_algo: Callable[..., List[int]] = memory_planning_algo self.allow_lifetime_and_storage_overlap = allow_lifetime_and_storage_overlap self.alloc_graph_input = alloc_graph_input self.alloc_graph_output = alloc_graph_output + self.alloc_mutable_buffers = alloc_mutable_buffers self.alignment = alignment def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None: @@ -124,6 +126,7 @@ def run( # customized fields. Using the graph_module object to convey information across # passes/stages is quite natural and avoid yet another 'context' data structure # to do the job. 
+ _ = apply_algo( self.memory_planning_algo, graph_module, @@ -131,6 +134,7 @@ def run( graph_signature, self.alloc_graph_input, self.alloc_graph_output, + self.alloc_mutable_buffers, ) # TODO: make the verifier do the work recursively to handle @@ -139,6 +143,7 @@ def run( graph_module, self.alloc_graph_input, self.alloc_graph_output, + self.alloc_mutable_buffers, graph_signature, ) diff --git a/exir/program/_program.py b/exir/program/_program.py index 7a2120f9e9b..e0484f4f4ff 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -212,7 +212,30 @@ def _get_updated_graph_signature( return new_signature -def _transform(self, *passes: PassType) -> "ExportedProgram": +def _transform( + self, + *passes: PassType, + override_verifiers: None | list[Type[Verifier]] = None, +) -> "ExportedProgram": + """ + Transforms the program according to the provided passes. + + Args: + self: The ExportedProgram instance to transform + *passes: A sequence of passes to apply to the program + override_verifiers: Optional list of verifier classes to use instead of the default verifiers. + This is needed if the transforms yields illegal graph that the default verifier cannot handle. + + Returns: + ExportedProgram: A new ExportedProgram with the transformations applied, or self if no changes were made + """ + # A user friendly check to avoid vararg surprises, PEP 3102 + assert not any( + isinstance(p, (list, Verifier)) for p in passes + ), f"Expected all passes to be of PassType, not list or Verifier. Use override_verifiers kwarg instead. Got: {list(passes)}" + + for p in list(passes): + print(type(p)) pm = PassManager(list(passes)) res = pm(self.graph_module) transformed_gm = res.graph_module if res is not None else self.graph_module @@ -221,7 +244,9 @@ def _transform(self, *passes: PassType) -> "ExportedProgram": if transformed_gm is self.graph_module and not res.modified: return self - return _update_exported_program_graph_module(self, transformed_gm) + return _update_exported_program_graph_module( + self, transformed_gm, override_verifiers + ) def _update_exported_program_graph_module( @@ -986,7 +1011,7 @@ def keep(op): try: # Ops in torch.ops.quant are not always loaded, so we use try/except # Aliases output, but we need to allow it for XNNPACK - allow_list.append(torch.ops.quant.choose_qparams_affine.default) + allow_list.append(torch.ops.torchao.choose_qparams_affine.default) except: pass @@ -1027,6 +1052,7 @@ def keep(op): torch.ops.aten.item.default, torch.ops.aten._local_scalar_dense.default, torch.ops.aten.unbind.int, + torch.ops.aten.split_with_sizes.default, ]: logging.warn( f"Op {op} was requested for preservation by partitioner. This request is ignored because it is in a blocklist." @@ -1325,7 +1351,7 @@ def to_edge( class EdgeProgramManager: """ Package of one or more `ExportedPrograms` in Edge dialect. Designed to simplify - lowering to ExecuTorch. See: https://pytorch.org/executorch/stable/ir-exir.html + lowering to ExecuTorch. See: https://pytorch.org/executorch/main/ir-exir Allows easy applications of transforms across a collection of exported programs including the delegation of subgraphs. @@ -1565,7 +1591,7 @@ def to_executorch( class ExecutorchProgramManager: """ Package of one or more `ExportedPrograms` in Execution dialect. Designed to simplify - lowering to ExecuTorch. See: https://pytorch.org/executorch/stable/ir-exir.html + lowering to ExecuTorch. 
See: https://pytorch.org/executorch/main/ir-exir When the ExecutorchProgramManager is constructed the ExportedPrograms in execution dialect are used to form the executorch binary (in a process called emission) and then serialized @@ -1612,6 +1638,7 @@ def __init__( self._execution_programs, backend_config.emit_stacktrace, self._config_methods, + backend_config.emit_mutable_buffer_names, ) # Serialize emitter output, ready to be written to a file. diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py index fca8bd2212f..9889417c56e 100644 --- a/exir/program/test/test_program.py +++ b/exir/program/test/test_program.py @@ -22,6 +22,7 @@ from executorch.exir.pass_base import ExportPass from executorch.exir.passes import MemoryPlanningPass from executorch.exir.program._program import ( + _transform, EdgeProgramManager, ExecutorchProgramManager, to_edge, @@ -34,6 +35,7 @@ from executorch.extension.pybindings.portable_lib import ( _load_for_executorch_from_buffer, ) +from torch._export.verifier import Verifier from torch.export import Dim, export, ExportedProgram from torch.export._trace import _export @@ -273,7 +275,6 @@ def get_executorch_memory_planning_passes() -> Dict[str, MemoryPlanningPass]: for output_val in method.outputs: evalue = method.values[output_val] self.assertNotEqual(evalue.val.allocation_info, None) - else: for input_val in method.inputs: evalue = method.values[input_val] self.assertEqual(evalue.val.allocation_info, None) @@ -725,17 +726,17 @@ def count_nodes(graph_module, target): ) def test_edge_dialect_non_core_aten_ops(self): - class LinalgNorm(torch.nn.Module): + class LinalgRank(torch.nn.Module): def __init__(self): super().__init__() def forward(self, x: torch.Tensor) -> torch.Tensor: - return torch.linalg.norm(x) + return torch.linalg.matrix_rank(x) from torch._export.verifier import SpecViolationError - input = torch.arange(9, dtype=torch.float) - 4 - ep = torch.export.export(LinalgNorm(), (input,), strict=True) + input = torch.ones((9, 9, 9), dtype=torch.float) + ep = torch.export.export(LinalgRank(), (input,), strict=True) # aten::linalg_norm is not a core op, so it should error out with self.assertRaises(SpecViolationError): @@ -748,9 +749,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ep, compile_config=EdgeCompileConfig( _check_ir_validity=True, - _core_aten_ops_exception_list=[ - torch.ops.aten.linalg_vector_norm.default - ], + _core_aten_ops_exception_list=[torch.ops.aten._linalg_svd.default], ), ) except SpecViolationError: @@ -849,3 +848,23 @@ def test_save_fails(self): et = edge.to_executorch() with self.assertRaises(ValueError): _ = et.save("/tmp/test_save.pt") + + def test__transform_override_verifiers(self): + """Test that _transform can override verifiers in the exported program.""" + + class MyVerifier(Verifier): + dialect: str = "MY_DIALECT" + + def __init__(self): + super().__init__() + + model = TestLinear() + program = torch.export.export(model, model._get_random_inputs(), strict=True) + self.assertFalse(issubclass(program.verifiers[0], MyVerifier)) + + # Apply transformation with custom verifier + transformed = _transform( + program, AddToMulPassEdge(), override_verifiers=[MyVerifier] + ) + self.assertTrue(issubclass(transformed.verifiers[0], MyVerifier)) + self.assertFalse(issubclass(program.verifiers[0], MyVerifier)) diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py index 8df0cfed0bf..b87ae2dfb58 100644 --- a/exir/tests/test_memory_planning.py +++ 
b/exir/tests/test_memory_planning.py @@ -8,7 +8,6 @@ import itertools import unittest -from functools import partial from typing import Any, Callable, List, Optional, Tuple, Type import executorch.exir as exir @@ -20,8 +19,8 @@ filter_nodes, get_node_tensor_specs, greedy, - memory_planning_algorithm_suite, MemoryAlgoResult, + MemoryPlanningAlgorithmSuite, naive, Verifier, ) @@ -242,6 +241,7 @@ def maketest( use_functionalization: bool = True, alloc_graph_input: bool = True, alloc_graph_output: bool = True, + alloc_mutable_buffer: bool = True, has_unused_graph_input: bool = False, ) -> Callable[..., None]: # parameterized.expand is not compatible with maketest. I'll just loop thru @@ -269,7 +269,7 @@ def wrapper(self: "TestMemoryPlanning") -> None: .exported_program() .graph_module ) - mem_algo = partial(memory_planning_algorithm_suite, algo_list=[algo]) + mem_algo = MemoryPlanningAlgorithmSuite(algo_list=[algo]) graph_module = PassManager( passes=[ SpecPropPass(), @@ -283,10 +283,17 @@ def wrapper(self: "TestMemoryPlanning") -> None: )(graph_module).graph_module self.verify_reuse( - graph_module, expect_reuse, alloc_graph_input, alloc_graph_output + graph_module, + expect_reuse, + alloc_graph_input, + alloc_graph_output, + alloc_mutable_buffer, ) self.verify_graph_input_output( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, + alloc_graph_input, + alloc_graph_output, + alloc_mutable_buffer, ) self.verify_overlap_placeholders(has_unused_graph_input, graph_module) @@ -307,6 +314,7 @@ def verify_reuse( expect_reuse: bool, alloc_graph_input: bool, alloc_graph_output: bool, + alloc_mutable_buffer: bool, ) -> None: r""" Do sanity check and verify tensor storage reuse. @@ -322,6 +330,7 @@ def verify_reuse( graph_module, alloc_graph_input=alloc_graph_input, alloc_graph_output=alloc_graph_output, + alloc_mutable_buffers=alloc_mutable_buffer, ).verify_storage_reuse() print(f"num_reuse_pairs is {num_reuse_pairs}") @@ -335,9 +344,10 @@ def verify_graph_input_output( graph_module: torch.fx.GraphModule, alloc_graph_input: bool, alloc_graph_output: bool, + alloc_mutable_buffers: bool, ) -> None: Verifier( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, alloc_graph_input, alloc_graph_output, alloc_mutable_buffers ).verify_graph_input_output() def verify_overlap_placeholders( @@ -405,13 +415,16 @@ def verify_overlap_placeholders( ) def test_graph_input_output(self) -> None: - for alloc_graph_input, alloc_graph_output in itertools.product( - [True, False], [True, False] - ): + for ( + alloc_graph_input, + alloc_graph_output, + alloc_mutable_buffers, + ) in itertools.product([True, False], [True, False], [True, False]): case = maketest( ModelWithDifferentTensorSizes, alloc_graph_input=alloc_graph_input, alloc_graph_output=alloc_graph_output, + alloc_mutable_buffer=alloc_mutable_buffers, ) case(self) @@ -497,7 +510,6 @@ def quantize(self, eager_model: nn.Module) -> nn.Module: ) return quantized_model - # pyre-ignore @parameterized.expand( [ ( @@ -514,7 +526,7 @@ def quantize(self, eager_model: nn.Module) -> nn.Module: ) def test_multiple_pools( self, - algo: Callable[..., List[int]], + algo: Callable[..., MemoryAlgoResult], expected_allocs: List[Tuple[int, int]], expected_bufsizes: List[int], ) -> None: @@ -522,7 +534,7 @@ def test_multiple_pools( export(MultiplePoolsToyModel(), (torch.ones(1),), strict=True) ) - mem_algo = partial(memory_planning_algorithm_suite, algo_list=[algo]) + mem_algo = MemoryPlanningAlgorithmSuite(algo_list=[algo]) 
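The test changes above show the new entry points; as a reference, here is a short configuration sketch of how a pass might now be built with the class-based suite and the added `alloc_mutable_buffers` flag. Import paths are assumed from the modules touched in this diff, and the argument values are illustrative.

```python
from executorch.exir.memory_planning import greedy, MemoryPlanningAlgorithmSuite
from executorch.exir.passes import MemoryPlanningPass

mem_planning_pass = MemoryPlanningPass(
    # Replaces partial(memory_planning_algorithm_suite, algo_list=[greedy])
    memory_planning_algo=MemoryPlanningAlgorithmSuite(algo_list=[greedy]),
    alloc_graph_input=True,
    alloc_graph_output=True,
    alloc_mutable_buffers=False,  # new knob: skip planning for mutable buffers
)
```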
edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=CustomPoolMemoryPlanningPass( @@ -537,6 +549,7 @@ def test_multiple_pools( graph_module, alloc_graph_input=True, alloc_graph_output=True, + alloc_mutable_buffers=True, ) verifier.verify_storage_reuse() verifier.verify_graph_input_output() diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index 39dbd3f51d3..887ca39864a 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -1164,7 +1164,9 @@ def forward(self, query, key, value): value = torch.randn(32, 32, 32, 32) # Capture the model - m = torch.export.export_for_training(M(32), (query, key, value)).module() + m = torch.export.export_for_training( + M(32), (query, key, value), strict=True + ).module() # 8w16a quantization from torch.ao.quantization.observer import ( @@ -1405,8 +1407,7 @@ def quantize_model( ) -> Tuple[EdgeProgramManager, int, int]: # program capture m = torch.export.export_for_training( - m_eager, - example_inputs, + m_eager, example_inputs, strict=True ).module() quantizer = XNNPACKQuantizer() diff --git a/exir/tests/test_quantization.py b/exir/tests/test_quantization.py index 61e3410186e..0a0a85077bb 100644 --- a/exir/tests/test_quantization.py +++ b/exir/tests/test_quantization.py @@ -52,7 +52,7 @@ def test_resnet(self) -> None: m_copy = copy.deepcopy(m) # program capture m = torch.export.export_for_training( - m, copy.deepcopy(example_inputs) + m, copy.deepcopy(example_inputs), strict=True ).module() quantizer = XNNPACKQuantizer() diff --git a/exir/tests/test_quantize_io_pass.py b/exir/tests/test_quantize_io_pass.py index aab941b538c..ddc0294ba68 100644 --- a/exir/tests/test_quantize_io_pass.py +++ b/exir/tests/test_quantize_io_pass.py @@ -39,12 +39,14 @@ def _quantize(self, mod, example_inputs): operator_config = get_symmetric_quantization_config() quantizer.set_global(operator_config) m = torch.export.export_for_training( - mod, copy.deepcopy(example_inputs) + mod, copy.deepcopy(example_inputs), strict=True ).module() m = prepare_pt2e(m, quantizer) _ = m(*example_inputs) m = convert_pt2e(m) - exported_program = torch.export.export_for_training(m, example_inputs) + exported_program = torch.export.export_for_training( + m, example_inputs, strict=True + ) return exported_program def _check_count(self, op, count, epm): diff --git a/exir/tracer.py b/exir/tracer.py index 82f93424a14..c749df510ad 100644 --- a/exir/tracer.py +++ b/exir/tracer.py @@ -631,8 +631,18 @@ def _default_decomposition_table( ] # pyre-fixme[7]: Expected `Dict[OpOverload, typing.Callable[..., executorch.e... return get_decompositions(decomp_opset) + + decomps = default_decompositions() + # Add edge specific decompositions + additional_decomp_ops = [ + # TODO: Eventually this op should be added to the core decompo table, and will not + # need to be added here. + torch.ops.aten.linalg_vector_norm.default, + ] + additional_decomps = get_decompositions(additional_decomp_ops) + decomps.update(additional_decomps) # pyre-fixme[7]: Expected `Dict[OpOverload, typing.Callable[..., executorch.exir.... - return default_decompositions() + return decomps def dynamo_trace( diff --git a/extension/android/README.md b/extension/android/README.md new file mode 100644 index 00000000000..5fc4ba4429d --- /dev/null +++ b/extension/android/README.md @@ -0,0 +1,50 @@ +# ExecuTorch Android + +This directory contains the Android Java/Kotlin binding. 
The final product is an AAR, +which contains the `.so` libraries for c++ runtime, and `.jar` for Java API, and required +metadata `AndroidManifest.xml`. + +## Core contents + +Under `extension/android/`, + +- `executorch_android/` is the root for the Java `org.pytorch.executorch` package + - `src/` + - `androidTest/` contains the android instrumentation test source + - `main/` contains the Java source + - `test/` contains the Java unit test source + - `build.gradle` is the rule to build the Java package. +- `jni/` contains the JNI layer code, which depends on the ExecuTorch c++ runtime library. +- `CMakeLists.txt` is the rule for building the JNI library. + +## Build + +`scripts/build_android_library.sh` is a helper script to build the Java library (into .jar), native library (into .so), and the packaged AAR file. + +The usage is: +```sh +export ANDROID_HOME=/path/to/sdk +export ANDROID_NDK=/path/to/ndk +sh scripts/build_android_library.sh +``` + +The AAR file will be `extension/android/executorch_android/build/outputs/aar/executorch_android-debug.aar`. +If you set an environment variable `BUILD_AAR_DIR`, then the AAR will be copied to `$BUILD_AAR_DIR/executorch.aar`. +Later, you can copy `$BUILD_AAR_DIR/executorch.aar` to your app directory to use as a library. + +Please see [Android building from source](https://pytorch.org/executorch/main/using-executorch-android#building-from-source) for details + +## Test + +After the library is built, + +```sh +# Set up models for testing +sh executorch_android/android_test_setup.sh + +# Run unit test +./gradlew :executorch_android:testDebugUnitTest + +# Run instrumentation test +./gradlew :executorch_android:connectedAndroidTest +``` diff --git a/extension/android/executorch_android/android_test_setup.sh b/extension/android/executorch_android/android_test_setup.sh new file mode 100644 index 00000000000..c1fb2a19386 --- /dev/null +++ b/extension/android/executorch_android/android_test_setup.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -ex + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" + +BASEDIR=$(dirname "$(realpath $0)") + +prepare_add() { + cp "${BASEDIR}/../../../extension/module/test/resources/add.pte" "${BASEDIR}/src/androidTest/resources" +} + +prepare_tinyllama() { + pushd "${BASEDIR}/../../../" + curl -C - -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt" --output stories15M.pt + curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model + # Create params.json file + touch params.json + echo '{"dim": 288, "multiple_of": 32, "n_heads": 6, "n_layers": 6, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json + python -m examples.models.llama.export_llama -c stories15M.pt -p params.json -d fp16 -n stories15m_h.pte -kv + python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin + + cp stories15m_h.pte "${BASEDIR}/src/androidTest/resources/stories.pte" + cp tokenizer.bin "${BASEDIR}/src/androidTest/resources/tokenizer.bin" + popd +} + +prepare_vision() { + pushd "${BASEDIR}/../../../" + python3 -m examples.xnnpack.aot_compiler --model_name "mv2" --delegate + python3 -m examples.xnnpack.aot_compiler --model_name "mv3" --delegate + python3 -m examples.xnnpack.aot_compiler --model_name "resnet50" --quantize --delegate + cp mv2*.pte mv3*.pte resnet50*.pte "${BASEDIR}/src/androidTest/resources/" + popd +} + +prepare_add +prepare_tinyllama +prepare_vision diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle index b284ce3896e..15088f4097f 100644 --- a/extension/android/executorch_android/build.gradle +++ b/extension/android/executorch_android/build.gradle @@ -27,12 +27,19 @@ android { } sourceSets { + main { + jniLibs.srcDirs = ['../../../cmake-out-android-so/'] + } androidTest { resources.srcDirs += [ 'src/androidTest/resources' ] } } } +task copyTestRes(type: Exec) { + commandLine 'bash', 'android_test_setup.sh' +} + dependencies { implementation 'com.facebook.fbjni:fbjni:0.5.1' implementation 'com.facebook.soloader:nativeloader:0.10.5' @@ -40,6 +47,7 @@ dependencies { androidTestImplementation 'androidx.test.ext:junit:1.1.5' androidTestImplementation 'androidx.test:rules:1.2.0' androidTestImplementation 'commons-io:commons-io:2.4' + androidTestImplementation 'org.json:json:20250107' } import com.vanniktech.maven.publish.SonatypeHost @@ -48,7 +56,7 @@ mavenPublishing { publishToMavenCentral(SonatypeHost.DEFAULT) signAllPublications() - coordinates("org.pytorch", "executorch-android", "0.5.0-SNAPSHOT") + coordinates("org.pytorch", "executorch-android", "0.7.0") pom { name = "ExecuTorch Android" diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java index b3b515d7ed0..c0a43b25a98 100644 --- a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java @@ -34,13 +34,15 @@ import org.apache.commons.io.FileUtils; import androidx.test.ext.junit.runners.AndroidJUnit4; import androidx.test.InstrumentationRegistry; +import org.json.JSONException; +import org.json.JSONObject; import org.pytorch.executorch.extension.llm.LlmCallback; 
import org.pytorch.executorch.extension.llm.LlmModule; /** Unit tests for {@link org.pytorch.executorch.extension.llm.LlmModule}. */ @RunWith(AndroidJUnit4.class) public class LlmModuleInstrumentationTest implements LlmCallback { - private static String TEST_FILE_NAME = "/tinyllama_portable_fp16_h.pte"; + private static String TEST_FILE_NAME = "/stories.pte"; private static String TOKENIZER_FILE_NAME = "/tokenizer.bin"; private static String TEST_PROMPT = "Hello"; private static int OK = 0x00; @@ -86,7 +88,6 @@ public void testGenerate() throws IOException, URISyntaxException{ @Test public void testGenerateAndStop() throws IOException, URISyntaxException{ - int seqLen = 32; mModule.generate(TEST_PROMPT, SEQ_LEN, new LlmCallback() { @Override public void onResult(String result) { @@ -95,8 +96,8 @@ public void onResult(String result) { } @Override - public void onStats(float tps) { - LlmModuleInstrumentationTest.this.onStats(tps); + public void onStats(String stats) { + LlmModuleInstrumentationTest.this.onStats(stats); } }); @@ -110,7 +111,16 @@ public void onResult(String result) { } @Override - public void onStats(float tps) { - tokensPerSecond.add(tps); + public void onStats(String stats) { + float tps = 0; + try { + JSONObject jsonObject = new JSONObject(stats); + int numGeneratedTokens = jsonObject.getInt("generated_tokens"); + int inferenceEndMs = jsonObject.getInt("inference_end_ms"); + int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); + tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; + tokensPerSecond.add(tps); + } catch (JSONException e) { + } } } diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.java new file mode 100644 index 00000000000..3a033851be9 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.fail; + +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.os.Environment; +import androidx.test.rule.GrantPermissionRule; +import android.Manifest; +import android.content.Context; +import org.junit.Test; +import org.junit.Before; +import org.junit.Rule; +import org.junit.runner.RunWith; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicInteger; +import java.io.IOException; +import java.io.File; +import java.io.FileOutputStream; +import org.junit.runners.JUnit4; +import org.apache.commons.io.FileUtils; +import androidx.test.ext.junit.runners.AndroidJUnit4; +import androidx.test.InstrumentationRegistry; + +/** Unit tests for {@link Module}. 
*/ +@RunWith(AndroidJUnit4.class) +public class ModuleE2ETest { + private static String getTestFilePath(String fileName) { + return InstrumentationRegistry.getInstrumentation().getTargetContext().getExternalCacheDir() + fileName; + } + + @Rule + public GrantPermissionRule mRuntimePermissionRule = GrantPermissionRule.grant(Manifest.permission.READ_EXTERNAL_STORAGE); + + static int argmax(float[] array) { + if (array.length == 0) { + throw new IllegalArgumentException("Array cannot be empty"); + } + int maxIndex = 0; + float maxValue = array[0]; + for (int i = 1; i < array.length; i++) { + if (array[i] > maxValue) { + maxValue = array[i]; + maxIndex = i; + } + } + return maxIndex; + } + + public void testClassification(String filePath) throws IOException, URISyntaxException { + File pteFile = new File(getTestFilePath(filePath)); + InputStream inputStream = getClass().getResourceAsStream(filePath); + FileUtils.copyInputStreamToFile(inputStream, pteFile); + inputStream.close(); + + InputStream imgInputStream = getClass().getResourceAsStream("/banana.jpeg"); + Bitmap bitmap = BitmapFactory.decodeStream(imgInputStream); + bitmap = Bitmap.createScaledBitmap(bitmap, 224, 224, true); + imgInputStream.close(); + + Tensor inputTensor = + TensorImageUtils.bitmapToFloat32Tensor( + bitmap, + TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, + TensorImageUtils.TORCHVISION_NORM_STD_RGB); + + Module module = Module.load(getTestFilePath(filePath)); + + EValue[] results = module.forward(EValue.from(inputTensor)); + assertTrue(results[0].isTensor()); + float[] scores = results[0].toTensor().getDataAsFloatArray(); + + int bananaClass = 954; // From ImageNet 1K + assertEquals(bananaClass, argmax(scores)); + } + + @Test + public void testMv2Fp32() throws IOException, URISyntaxException { + testClassification("/mv2_xnnpack_fp32.pte"); + } + + @Test + public void testMv3Fp32() throws IOException, URISyntaxException { + testClassification("/mv3_xnnpack_fp32.pte"); + } + + @Test + public void testResnet50() throws IOException, URISyntaxException { + testClassification("/resnet50_xnnpack_q8.pte"); + } +} diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TensorImageUtils.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TensorImageUtils.java new file mode 100644 index 00000000000..95434dcb734 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TensorImageUtils.java @@ -0,0 +1,150 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch; + +import android.graphics.Bitmap; +import android.util.Log; +import java.nio.FloatBuffer; +import org.pytorch.executorch.Tensor; + +/** + * Contains utility functions for {@link Tensor} creation from {@link android.graphics.Bitmap} or + * {@link android.media.Image} source. + */ +public final class TensorImageUtils { + + public static float[] TORCHVISION_NORM_MEAN_RGB = new float[] {0.485f, 0.456f, 0.406f}; + public static float[] TORCHVISION_NORM_STD_RGB = new float[] {0.229f, 0.224f, 0.225f}; + + /** + * Creates new {@link Tensor} from full {@link android.graphics.Bitmap}, normalized with specified + * in parameters mean and std. 
+ * + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB + * order + */ + public static Tensor bitmapToFloat32Tensor( + final Bitmap bitmap, final float[] normMeanRGB, final float normStdRGB[]) { + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + + return bitmapToFloat32Tensor( + bitmap, 0, 0, bitmap.getWidth(), bitmap.getHeight(), normMeanRGB, normStdRGB); + } + + /** + * Writes tensor content from specified {@link android.graphics.Bitmap}, normalized with specified + * in parameters mean and std to specified {@link java.nio.FloatBuffer} with specified offset. + * + * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data + * @param x - x coordinate of top left corner of bitmap's area + * @param y - y coordinate of top left corner of bitmap's area + * @param width - width of bitmap's area + * @param height - height of bitmap's area + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB + * order + */ + public static void bitmapToFloatBuffer( + final Bitmap bitmap, + final int x, + final int y, + final int width, + final int height, + final float[] normMeanRGB, + final float[] normStdRGB, + final FloatBuffer outBuffer, + final int outBufferOffset) { + checkOutBufferCapacity(outBuffer, outBufferOffset, width, height); + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + final int pixelsCount = height * width; + final int[] pixels = new int[pixelsCount]; + bitmap.getPixels(pixels, 0, width, x, y, width, height); + final int offset_g = pixelsCount; + final int offset_b = 2 * pixelsCount; + for (int i = 0; i < 100; i++) { + final int c = pixels[i]; + Log.i("Image", ": " + i + " " + ((c >> 16) & 0xff)); + } + for (int i = 0; i < pixelsCount; i++) { + final int c = pixels[i]; + float r = ((c >> 16) & 0xff) / 255.0f; + float g = ((c >> 8) & 0xff) / 255.0f; + float b = ((c) & 0xff) / 255.0f; + outBuffer.put(outBufferOffset + i, (r - normMeanRGB[0]) / normStdRGB[0]); + outBuffer.put(outBufferOffset + offset_g + i, (g - normMeanRGB[1]) / normStdRGB[1]); + outBuffer.put(outBufferOffset + offset_b + i, (b - normMeanRGB[2]) / normStdRGB[2]); + } + } + + /** + * Creates new {@link Tensor} from specified area of {@link android.graphics.Bitmap}, normalized + * with specified in parameters mean and std. 
+ * + * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data + * @param x - x coordinate of top left corner of bitmap's area + * @param y - y coordinate of top left corner of bitmap's area + * @param width - width of bitmap's area + * @param height - height of bitmap's area + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB + * order + */ + public static Tensor bitmapToFloat32Tensor( + final Bitmap bitmap, + int x, + int y, + int width, + int height, + float[] normMeanRGB, + float[] normStdRGB) { + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + + final FloatBuffer floatBuffer = Tensor.allocateFloatBuffer(3 * width * height); + bitmapToFloatBuffer(bitmap, x, y, width, height, normMeanRGB, normStdRGB, floatBuffer, 0); + return Tensor.fromBlob(floatBuffer, new long[] {1, 3, height, width}); + } + + private static void checkOutBufferCapacity( + FloatBuffer outBuffer, int outBufferOffset, int tensorWidth, int tensorHeight) { + if (outBufferOffset + 3 * tensorWidth * tensorHeight > outBuffer.capacity()) { + throw new IllegalStateException("Buffer underflow"); + } + } + + private static void checkTensorSize(int tensorWidth, int tensorHeight) { + if (tensorHeight <= 0 || tensorWidth <= 0) { + throw new IllegalArgumentException("tensorHeight and tensorWidth must be positive"); + } + } + + private static void checkRotateCWDegrees(int rotateCWDegrees) { + if (rotateCWDegrees != 0 + && rotateCWDegrees != 90 + && rotateCWDegrees != 180 + && rotateCWDegrees != 270) { + throw new IllegalArgumentException("rotateCWDegrees must be one of 0, 90, 180, 270"); + } + } + + private static void checkNormStdArg(float[] normStdRGB) { + if (normStdRGB.length != 3) { + throw new IllegalArgumentException("normStdRGB length must be 3"); + } + } + + private static void checkNormMeanArg(float[] normMeanRGB) { + if (normMeanRGB.length != 3) { + throw new IllegalArgumentException("normMeanRGB length must be 3"); + } + } +} diff --git a/extension/android/executorch_android/src/androidTest/resources/banana.jpeg b/extension/android/executorch_android/src/androidTest/resources/banana.jpeg new file mode 100644 index 00000000000..2b237ce3d14 Binary files /dev/null and b/extension/android/executorch_android/src/androidTest/resources/banana.jpeg differ diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java index c05b30b0625..639fd0812bd 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java @@ -31,8 +31,11 @@ public interface LlmCallback { /** * Called when the statistics for the generate() is available. * - * @param tps Tokens/second for generated tokens. + * The result will be a JSON string. See extension/llm/stats.h for the field + * definitions. 
+ * + * @param stats JSON string containing the statistics for the generate() */ @DoNotStrip - public void onStats(float tps); + default void onStats(String stats) {} } diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index d9ef6d1455e..da2ac49e446 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -72,6 +72,7 @@ non_fbcode_target(_kind = fb_android_cxx_library, "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", "//xplat/executorch/extension/tensor:tensor_static", + "//xplat/executorch/kernels/quantized:generated_lib_static", ], ) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index d6ade74ee1f..83ca1d898ed 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -100,14 +100,12 @@ class ExecuTorchLlmCallbackJni void onStats(const llm::Stats& result) const { static auto cls = ExecuTorchLlmCallbackJni::javaClassStatic(); - static const auto method = cls->getMethod("onStats"); - double eval_time = - (double)(result.inference_end_ms - result.prompt_eval_end_ms); - - float tps = result.num_generated_tokens / eval_time * - result.SCALING_FACTOR_UNITS_PER_SECOND; - - method(self(), tps); + static const auto on_stats_method = + cls->getMethod)>("onStats"); + on_stats_method( + self(), + facebook::jni::make_jstring( + executorch::extension::llm::stats_to_json_string(result))); } }; @@ -149,7 +147,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { facebook::jni::alias_ref data_path = nullptr) { #if defined(ET_USE_THREADPOOL) // Reserve 1 thread for the main thread. - uint32_t num_performant_cores = + int32_t num_performant_cores = ::executorch::extension::cpuinfo::get_num_performant_cores() - 1; if (num_performant_cores > 0) { ET_LOG(Info, "Resetting threadpool to %d threads", num_performant_cores); @@ -219,12 +217,15 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { [callback](const llm::Stats& result) { callback->onStats(result); }, echo); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { + executorch::extension::llm::GenerationConfig config{ + .echo = static_cast(echo), + .seq_len = seq_len, + }; runner_->generate( prompt->toStdString(), - seq_len, + config, [callback](std::string result) { callback->onResult(result); }, - [callback](const llm::Stats& result) { callback->onStats(result); }, - echo); + [callback](const llm::Stats& result) { callback->onStats(result); }); } return 0; } diff --git a/extension/benchmark/README.md b/extension/benchmark/README.md index a9918864e9c..d1367379bb8 100644 --- a/extension/benchmark/README.md +++ b/extension/benchmark/README.md @@ -61,7 +61,7 @@ Users can schedule a benchmarking workflow on a pull request through GitHub Acti ## Retrieving Benchmark Results -The easiest way to view benchmark results is on the [dashboard](./README.md#dashboard), while raw results for individual configurations can be manually accessed by downloading the `Customer_Artifacts.zip` from the CI. +The easiest way to view benchmark results is on the [dashboard](README.md#dashboard), while raw results for individual configurations can be manually accessed by downloading the `Customer_Artifacts.zip` from the CI. 
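Related to the callback change earlier in this diff: `onStats` now delivers a JSON string (fields defined in `extension/llm/stats.h`), and both the instrumentation test and the minibench activity reduce it to tokens/sec. A small Python sketch of that arithmetic, with field names taken from the Java parsing code above and sample values invented:

```python
import json

# Sample payload; the numbers are made up for illustration.
stats = '{"generated_tokens": 64, "inference_end_ms": 2200, "prompt_eval_end_ms": 200}'
s = json.loads(stats)
tps = s["generated_tokens"] / (s["inference_end_ms"] - s["prompt_eval_end_ms"]) * 1000
print(f"{tps:.1f} tokens/s")  # 32.0 tokens/s for these sample values
```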
## Feedback and Issue Reporting diff --git a/extension/benchmark/android/benchmark/app/build.gradle.kts b/extension/benchmark/android/benchmark/app/build.gradle.kts index dcf99ca9cd0..28dfc8ae49d 100644 --- a/extension/benchmark/android/benchmark/app/build.gradle.kts +++ b/extension/benchmark/android/benchmark/app/build.gradle.kts @@ -39,6 +39,7 @@ dependencies { implementation("com.facebook.soloader:soloader:0.10.5") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") + implementation("org.json:json:20250107") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.2.1") androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java index 3bc38aad403..f6a894d6a1f 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java @@ -21,8 +21,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import org.json.JSONException; +import org.json.JSONObject; public class LlmBenchmarkActivity extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -80,7 +80,17 @@ public void onTokenGenerated(String token) {} @Override public void onStats(String stats) { - mStatsInfo.tokens = stats; + float tps = 0; + try { + JSONObject jsonObject = new JSONObject(stats); + int numGeneratedTokens = jsonObject.getInt("generated_tokens"); + int inferenceEndMs = jsonObject.getInt("inference_end_ms"); + int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); + tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; + mStatsInfo.tps = tps; + } catch (JSONException e) { + Log.e("LLM", "Error parsing JSON: " + e.getMessage()); + } } @Override @@ -108,8 +118,7 @@ public void onGenerationStopped() { (mStatsInfo.generateEnd - mStatsInfo.generateStart) * 1e-6, 0.0f)); // Token per second - results.add( - new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsInfo.tokens), 0.0f)); + results.add(new BenchmarkMetric(benchmarkModel, "token_per_sec", mStatsInfo.tps, 0.0f)); try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { Gson gson = new Gson(); @@ -118,15 +127,6 @@ public void onGenerationStopped() { e.printStackTrace(); } } - - private double extractTPS(final String tokens) { - final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); - if (m.find()) { - return Double.parseDouble(m.group()); - } else { - return 0.0f; - } - } } class StatsInfo { @@ -135,7 +135,7 @@ class StatsInfo { long loadEnd; long generateStart; long generateEnd; - String tokens; + float tps; String modelName; @Override @@ -149,6 +149,6 @@ public String toString() { + "\ngenerateEnd: " + generateEnd + "\n" - + tokens; + + tps; } } diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java index 6ba1f57c4f3..0a75b47f3a6 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java +++ 
b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java @@ -68,8 +68,8 @@ public void onResult(String result) { } @Override - public void onStats(float tps) { - mCallback.onStats("tokens/second: " + tps); + public void onStats(String result) { + mCallback.onStats(result); } } diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java index 63701a7bbc6..8503d47ccce 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java @@ -18,7 +18,7 @@ public interface ModelRunnerCallback { void onTokenGenerated(String token); - void onStats(String token); + void onStats(String result); void onGenerationStopped(); } diff --git a/extension/benchmark/apple/Benchmark/README.md b/extension/benchmark/apple/Benchmark/README.md index a68a9bf8abb..4d8e9374634 100644 --- a/extension/benchmark/apple/Benchmark/README.md +++ b/extension/benchmark/apple/Benchmark/README.md @@ -33,7 +33,7 @@ This command performs a shallow clone to speed up the process. The Benchmark App is configured to use a Swift PM package that provides the prebuilt ExecuTorch frameworks. -By default, the app relies on the package referencing locally built binaries. To ensure it functions correctly, you must first build the frameworks by following the [guide](https://pytorch.org/executorch/main/using-executorch-ios.html#building-from-source). +By default, the app relies on the package referencing locally built binaries. To ensure it functions correctly, you must first build the frameworks by following the [guide](https://pytorch.org/executorch/main/using-executorch-ios#building-from-source). 
## Adding Models and Resources diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index 332c3986b0b..985f77956b6 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -60,7 +60,7 @@ @implementation LLaMATests + (NSDictionary *)predicates { return @{ @"model" : ^BOOL(NSString *filename){ - return [filename hasSuffix:@".pte"] && [filename containsString:@"llama"]; + return [filename hasSuffix:@".pte"] && [filename.lowercaseString containsString:@"llama"]; }, @"tokenizer" : ^BOOL(NSString *filename) { return [filename isEqual:@"tokenizer.bin"] || [filename isEqual:@"tokenizer.model"]; @@ -85,14 +85,18 @@ @implementation LLaMATests [testCase measureWithMetrics:@[ tokensPerSecondMetric, [XCTClockMetric new], [XCTMemoryMetric new] ] block:^{ tokensPerSecondMetric.tokenCount = 0; + // Create a GenerationConfig object + ::executorch::extension::llm::GenerationConfig config{ + .max_new_tokens = 50, + .warming = false, + }; + const auto status = runner->generate( "Once upon a time", - 50, + config, [=](const std::string &token) { tokensPerSecondMetric.tokenCount++; - }, - nullptr, - false); + }); XCTAssertEqual(status, Error::Ok); }]; }, diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp index 1a9ddad259f..503539774a5 100644 --- a/extension/data_loader/file_data_loader.cpp +++ b/extension/data_loader/file_data_loader.cpp @@ -86,6 +86,9 @@ Result FileDataLoader::from( "Alignment %zu is not a power of 2", alignment); + ET_CHECK_OR_RETURN_ERROR( + file_name != nullptr, InvalidArgument, "File name cannot be empty."); + // Use open() instead of fopen() to avoid the layer of buffering that // fopen() does. We will be reading large portions of the file in one shot, // so buffering does not help. diff --git a/extension/data_loader/test/file_data_loader_test.cpp b/extension/data_loader/test/file_data_loader_test.cpp index 1d4f4c16196..7dc872995a5 100644 --- a/extension/data_loader/test/file_data_loader_test.cpp +++ b/extension/data_loader/test/file_data_loader_test.cpp @@ -154,6 +154,12 @@ TEST_P(FileDataLoaderTest, FromMissingFileFails) { EXPECT_NE(fdl.error(), Error::Ok); } +TEST_P(FileDataLoaderTest, FromEmptyFilePathFails) { + // Nullptr should fail + Result fdl = FileDataLoader::from(nullptr); + EXPECT_NE(fdl.error(), Error::Ok); +} + TEST_P(FileDataLoaderTest, BadAlignmentFails) { // Create a temp file; contents don't matter. uint8_t data[256] = {}; diff --git a/extension/export_util/utils.py b/extension/export_util/utils.py index 2679930178a..aa3a736af3c 100644 --- a/extension/export_util/utils.py +++ b/extension/export_util/utils.py @@ -108,7 +108,7 @@ def export_to_exec_prog( ) -> ExecutorchProgramManager: m = model.eval() # pre-autograd export. 
eventually this will become torch.export - m = export_for_training(m, example_inputs).module() + m = export_for_training(m, example_inputs, strict=True).module() core_aten_ep = _to_core_aten( m, diff --git a/extension/flat_tensor/flat_tensor_data_map.cpp b/extension/flat_tensor/flat_tensor_data_map.cpp index bf54ae014b5..8aa0af13928 100644 --- a/extension/flat_tensor/flat_tensor_data_map.cpp +++ b/extension/flat_tensor/flat_tensor_data_map.cpp @@ -25,8 +25,8 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::aten::ScalarType; +using executorch::ET_RUNTIME_NAMESPACE::TensorLayout; using executorch::runtime::DataLoader; -using executorch::runtime::TensorLayout; namespace executorch { namespace extension { diff --git a/extension/flat_tensor/flat_tensor_data_map.h b/extension/flat_tensor/flat_tensor_data_map.h index 972a5fa9c55..0e7aee8ffc8 100644 --- a/extension/flat_tensor/flat_tensor_data_map.h +++ b/extension/flat_tensor/flat_tensor_data_map.h @@ -32,7 +32,8 @@ namespace extension { /** * A NamedDataMap implementation for FlatTensor-serialized data. */ -class FlatTensorDataMap final : public executorch::runtime::NamedDataMap { +class FlatTensorDataMap final + : public executorch::ET_RUNTIME_NAMESPACE::NamedDataMap { public: /** * Creates a new DataMap that wraps FlatTensor data. @@ -51,7 +52,8 @@ class FlatTensorDataMap final : public executorch::runtime::NamedDataMap { * @return Error::NotFound if the key is not present. */ ET_NODISCARD - executorch::runtime::Result + executorch::runtime::Result< + const executorch::ET_RUNTIME_NAMESPACE::TensorLayout> get_metadata(const char* key) const override; /** diff --git a/extension/flat_tensor/targets.bzl b/extension/flat_tensor/targets.bzl index 0d49995aa6e..4ac515b7bf0 100644 --- a/extension/flat_tensor/targets.bzl +++ b/extension/flat_tensor/targets.bzl @@ -1,24 +1,26 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): - runtime.cxx_library( - name = "flat_tensor_data_map", - srcs = [ - "flat_tensor_data_map.cpp", - ], - exported_headers = ["flat_tensor_data_map.h"], - deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue", - "//executorch/runtime/core:named_data_map", - "//executorch/runtime/core/exec_aten:lib", - "//executorch/runtime/core/exec_aten/util:tensor_util", - ], - exported_deps = [ - "//executorch/extension/flat_tensor/serialize:flat_tensor_header", - "//executorch/extension/flat_tensor/serialize:generated_headers", - ], - visibility = [ - "//executorch/...", - ], - ) + for aten_mode in [True, False]: + aten_suffix = "_aten" if aten_mode else "" + runtime.cxx_library( + name = "flat_tensor_data_map" + aten_suffix, + srcs = [ + "flat_tensor_data_map.cpp", + ], + exported_headers = ["flat_tensor_data_map.h"], + deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core:evalue", + "//executorch/runtime/core:named_data_map" + aten_suffix, + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], + exported_deps = [ + "//executorch/extension/flat_tensor/serialize:flat_tensor_header", + "//executorch/extension/flat_tensor/serialize:generated_headers", + ], + visibility = [ + "//executorch/...", + ], + ) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index fd2ead6c8b0..42e82dc360f 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -21,6 +21,9 @@ 
if(NOT EXECUTORCH_ROOT) endif() set(_common_compile_options -Wno-deprecated-declarations -fPIC) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + list(APPEND _common_compile_options "-march=armv8.2-a+dotprod") +endif() include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) @@ -38,6 +41,7 @@ include(${EXECUTORCH_SRCS_FILE}) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) +list(APPEND _common_include_directories ${EXECUTORCH_ROOT}/third-party/ao) # Custom op libraries set(custom_ops_libs pthreadpool) diff --git a/extension/llm/custom_ops/TARGETS b/extension/llm/custom_ops/TARGETS index 5d0c0490506..61be3d191a7 100644 --- a/extension/llm/custom_ops/TARGETS +++ b/extension/llm/custom_ops/TARGETS @@ -47,3 +47,17 @@ runtime.python_test( "//caffe2:torch", ], ) + +runtime.python_test( + name = "test_quantized_sdpa", + srcs = [ + "test_quantized_sdpa.py", + ], + preload_deps = [ + ":custom_ops_aot_lib_mkl_noomp", + ":custom_ops_aot_py", + ], + deps = [ + "//caffe2:torch", + ], +) diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py index d299b314816..6d96a926497 100644 --- a/extension/llm/custom_ops/custom_ops.py +++ b/extension/llm/custom_ops/custom_ops.py @@ -229,3 +229,127 @@ def update_cache_meta( # workaround. Should we just return cache instead? But I am afraid that # will result in extra memory allocation return torch.empty((1,), dtype=value.dtype, device="meta") + + +def _validate_quantized_sdpa_params( + query, + key, + value, + start_pos, + seq_len, + attn_mask, + drpout_p, + is_causal, + scale, + q_scale, + q_zero_point, + k_scale, + k_zero_point, + v_scale, + v_zero_point, + is_seq_at_dim_2, +): + assert ( + query.dim() == 4 + ), f"Expected query to be 4 dimensional but got {query.dim()} dimensions." + assert ( + key.dim() == 4 + ), f"Expected key to be 4 dimensional but got {key.dim()} dimensions." + assert ( + value.dim() == 4 + ), f"Expected value to be 4 dimensional but got {value.dim()} dimensions." 
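A compact illustration of the shape and dtype contract that `_validate_quantized_sdpa_params` enforces here and in the assertions that follow: 4-D int8 q/k/v, with float32 scales and int8 zero points that match their tensor on every dimension except the last. The sizes and dimension order below are illustrative assumptions only.

```python
import torch

B, H, S, D = 1, 8, 16, 64  # arbitrary sizes; actual layout depends on is_seq_at_dim_2
query = torch.zeros(B, H, S, D, dtype=torch.int8)
q_scale = torch.ones(B, H, S, 1, dtype=torch.float32)      # one scale per row
q_zero_point = torch.zeros(B, H, S, 1, dtype=torch.int8)   # one zero point per row

assert query.dim() == 4
assert query.size()[:-1] == q_scale.size()[:-1]
assert query.size()[:-1] == q_zero_point.size()[:-1]
```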
+ + assert (q_scale is not None) and ( + q_zero_point is not None + ), "q_scale and q_zero_point must be provided" + assert (k_scale is not None) and ( + k_zero_point is not None + ), "k_scale and k_zero_point must be provided" + assert (v_scale is not None) and ( + v_zero_point is not None + ), "v_scale and v_zero_point must be provided" + + assert query.dtype == torch.int8, f"Expected query to be int8 but got {query.dtype}" + assert key.dtype == torch.int8, f"Expected key to be int8 but got {key.dtype}" + assert value.dtype == torch.int8, f"Expected value to be int8 but got {value.dtype}" + + assert ( + q_scale.dtype == torch.float32 + ), f"Expected q_scale to be float32 but got {q_scale.dtype}" + assert ( + q_zero_point.dtype == torch.int8 + ), f"Expected q_zero_point to be int8 but got {q_zero_point.dtype}" + assert ( + k_scale.dtype == torch.float32 + ), f"Expected k_scale to be float32 but got {k_scale.dtype}" + assert ( + k_zero_point.dtype == torch.int8 + ), f"Expected k_zero_point to be int8 but got {k_zero_point.dtype}" + assert ( + v_scale.dtype == torch.float32 + ), f"Expected v_scale to be float32 but got {v_scale.dtype}" + assert ( + v_zero_point.dtype == torch.int8 + ), f"Expected v_zero_point to be int8 but got {v_zero_point.dtype}" + + assert ( + query.size()[:-1] == q_scale.size()[:-1] + ), f"Expected query and q_scale to have same size except last dimensions but got {query.size()} and {q_scale.size()}" + assert ( + query.size()[:-1] == q_zero_point.size()[:-1] + ), f"Expected query and q_zero_point to have same size except last dimensions but got {query.size()} and {q_zero_point.size()}" + + assert ( + key.size()[:-1] == k_scale.size()[:-1] + ), f"Expected key and k_scale to have same size except last dimensions but got {key.size()} and {k_scale.size()}" + assert ( + key.size()[:-1] == k_zero_point.size()[:-1] + ), f"Expected key and k_zero_point to have same size except last dimensions but got {key.size()} and {k_zero_point.size()}" + + assert ( + value.size()[:-1] == v_scale.size()[:-1] + ), f"Expected value and v_scale to have same size except last dimensions but got {value.size()} and {v_scale.size()}" + assert ( + value.size()[:-1] == v_zero_point.size()[:-1] + ), f"Expected value and v_zero_point to have same size except last dimensions but got {value.size()} and {v_zero_point.size()}" + + +@impl(custom_ops_lib, "custom_quantized_sdpa", "Meta") +def custom_quantized_sdpa_meta( + query, + key, + value, + start_pos, + attn_mask=None, + drpout_p=0.0, + is_causal=False, + scale=None, + q_zero_point=None, + q_scale=None, + k_zero_point=None, + k_scale=None, + v_zero_point=None, + v_scale=None, + is_seq_at_dim_2=False, +): + seq_len = query.size(1) + _validate_quantized_sdpa_params( + query, + key, + value, + start_pos, + seq_len, + attn_mask, + drpout_p, + is_causal, + scale, + q_scale, + q_zero_point, + k_scale, + k_zero_point, + v_scale, + v_zero_point, + is_seq_at_dim_2, + ) + + return torch.empty(query.size(), dtype=torch.float32, device="meta") diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index 202ff17188d..4a2c464eb56 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -44,7 +44,9 @@ bool validate_flash_attention_args( "scaled_dot_product_attention_flash_attention: Q/K/V should have the same head size"); ET_CHECK_OR_RETURN_FALSE( - (query.scalar_type() == ScalarType::Float), "Query must be Float type"); + (query.scalar_type() == ScalarType::Float) || + (query.scalar_type() == 
ScalarType::Char), + "Query must be Float type"); ET_CHECK_OR_RETURN_FALSE( (query.scalar_type() == key.scalar_type()) && @@ -262,14 +264,14 @@ Tensor& flash_attention_kernel_out( InvalidArgument, output); - auto q_seq_len = query.size(2); + auto seq_len = query.size(2); ET_SWITCH_FLOAT_TYPES( query.scalar_type(), ctx, "flash_attention", CTYPE, [&] { // TODO we need to re-evaluate this for ARM CPUs // And there can be many so instead of templatizing // we might consider another appraoch - if (q_seq_len >= 768) { + if (seq_len >= 768) { sdpa::impl::cpu_flash_attention( output, query, @@ -285,7 +287,7 @@ Tensor& flash_attention_kernel_out( nullopt, nullopt, nullopt); - } else if (q_seq_len >= 192) { + } else if (seq_len >= 192) { sdpa::impl::cpu_flash_attention( output, query, @@ -339,7 +341,8 @@ Tensor& custom_sdpa_out_impl( const optional& k_zero_points = nullopt, const optional& k_scales = nullopt, const optional& v_zero_points = nullopt, - const optional& v_scales = nullopt) { + const optional& v_scales = nullopt, + bool is_seq_at_dim_2 = false) { ET_KERNEL_CHECK_MSG( ctx, !attn_mask.has_value() || !is_causal, @@ -354,9 +357,16 @@ Tensor& custom_sdpa_out_impl( output, "Invalid arguments"); - bool is_seq_at_dim_1{true}; + int64_t seq_len = q.size(1); + SeqDim seq_dim{SeqDim::TWO}; + if (!is_seq_at_dim_2) { + seq_dim = SeqDim::ONE; + } + if (q.scalar_type() == ScalarType::Char) { - is_seq_at_dim_1 = false; + if (seq_dim == SeqDim::TWO) { + seq_len = q.size(2); + } ET_KERNEL_CHECK_MSG( ctx, q_scales.has_value() && q_zero_points.has_value() && @@ -390,10 +400,8 @@ Tensor& custom_sdpa_out_impl( ET_CHECK_MSG(q.dim() == 4, "query must be a 4D tensor"); - const int64_t seq_len = q.size(1); - auto q_seq_len = q.size(1); - - const int64_t num_keys_for_causal_attention = start_pos + seq_len; + const int64_t num_keys_for_causal_attention = + attn_mask.has_value() ? 
-1 : start_pos + seq_len; ET_KERNEL_CHECK( ctx, @@ -408,7 +416,7 @@ Tensor& custom_sdpa_out_impl( // TODO we need to re-evaluate this for ARM CPUs // And there can be many so instead of templatizing // we might consider another appraoch - if (q_seq_len >= 768) { + if (seq_len >= 768) { sdpa::impl::cpu_flash_attention( output, q, @@ -418,16 +426,16 @@ Tensor& custom_sdpa_out_impl( is_causal, attn_mask, scale, - nullopt, // q_zero_points - nullopt, // q_scales - nullopt, // k_zero_points - nullopt, // k_scales - nullopt, // v_zero_points - nullopt, // v_scales - is_seq_at_dim_1, /* is_seq_at_dim_1 */ + q_zero_points, // q_zero_points + q_scales, // q_scales + k_zero_points, // k_zero_points + k_scales, // k_scales + v_zero_points, // v_zero_points + v_scales, // v_scales + seq_dim, /* seq_dim */ start_pos, num_keys_for_causal_attention); - } else if (q_seq_len >= 192) { + } else if (seq_len >= 192) { sdpa::impl::cpu_flash_attention( output, q, @@ -437,13 +445,13 @@ Tensor& custom_sdpa_out_impl( is_causal, attn_mask, scale, - nullopt, // q_zero_points - nullopt, // q_scales - nullopt, // k_zero_points - nullopt, // k_scales - nullopt, // v_zero_points - nullopt, // v_scales - is_seq_at_dim_1, /* is_seq_at_dim_1 */ + q_zero_points, // q_zero_points + q_scales, // q_scales + k_zero_points, // k_zero_points + k_scales, // k_scales + v_zero_points, // v_zero_points + v_scales, // v_scales + seq_dim, /* seq_dim */ start_pos, num_keys_for_causal_attention); } else { @@ -456,13 +464,13 @@ Tensor& custom_sdpa_out_impl( is_causal, attn_mask, scale, - nullopt, // q_zero_points - nullopt, // q_scales - nullopt, // k_zero_points - nullopt, // k_scales - nullopt, // v_zero_points - nullopt, // v_scales - is_seq_at_dim_1, /* is_seq_at_dim_1 */ + q_zero_points, // q_zero_points + q_scales, // q_scales + k_zero_points, // k_zero_points + k_scales, // k_scales + v_zero_points, // v_zero_points + v_scales, // v_scales + seq_dim, /* seq_dim */ start_pos, num_keys_for_causal_attention); } @@ -470,6 +478,45 @@ Tensor& custom_sdpa_out_impl( return output; } +Tensor& custom_quantized_sdpa_out( + RuntimeContext& ctx, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional& q_zero_points, + const optional& q_scales, + const optional& k_zero_points, + const optional& k_scales, + const optional& v_zero_points, + const optional& v_scales, + const bool is_seq_at_dim_2, + Tensor& output) { + return custom_sdpa_out_impl( + ctx, + q, + k, + v, + start_pos, + attn_mask, + dropout_p, + is_causal, + scale, + output, + q_zero_points, + q_scales, + k_zero_points, + k_scales, + v_zero_points, + v_scales, + is_seq_at_dim_2); +} + /* Input params @param[in] q_projected Projected query with query weights. 
@@ -570,3 +617,8 @@ EXECUTORCH_LIBRARY( llama, "custom_sdpa.out", torch::executor::native::custom_sdpa_out); + +EXECUTORCH_LIBRARY( + llama, + "custom_quantized_sdpa.out", + torch::executor::native::custom_quantized_sdpa_out); diff --git a/extension/llm/custom_ops/op_sdpa.h b/extension/llm/custom_ops/op_sdpa.h index bc2202b9bd8..9d357eb6ea1 100644 --- a/extension/llm/custom_ops/op_sdpa.h +++ b/extension/llm/custom_ops/op_sdpa.h @@ -56,6 +56,25 @@ Tensor& flash_attention_kernel_out( const optional scale, Tensor& output); +Tensor& custom_quantized_sdpa_out( + RuntimeContext& ctx, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + const optional& attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional& q_zero_points, + const optional& q_scales, + const optional& k_zero_points, + const optional& k_scales, + const optional& v_zero_points, + const optional& v_scales, + const bool is_seq_at_dim_1, + Tensor& output); } // namespace native } // namespace executor } // namespace torch diff --git a/extension/llm/custom_ops/op_sdpa_aot.cpp b/extension/llm/custom_ops/op_sdpa_aot.cpp index 213adf1c8ab..ff367c85c8a 100644 --- a/extension/llm/custom_ops/op_sdpa_aot.cpp +++ b/extension/llm/custom_ops/op_sdpa_aot.cpp @@ -77,6 +77,47 @@ at::Tensor custom_sdpa_aten( // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy const std::optional scale); +Tensor& custom_quantized_sdpa_out_no_context( + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional q_zero_points, + const optional q_scales, + const optional k_zero_points, + const optional k_scales, + const optional v_zero_points, + const optional v_scales, + const bool is_seq_at_dim_2, + Tensor& output); + +at::Tensor custom_quantized_sdpa_aten( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional scale, + const std::optional& q_zero_points, + const std::optional& q_scales, + const std::optional& k_zero_points, + const std::optional& k_scales, + const std::optional& v_zero_points, + const std::optional& v_scales, + const bool is_seq_at_dim_2); + Tensor& update_cache_out_no_context( const Tensor& value, Tensor& cache, @@ -198,6 +239,87 @@ at::Tensor custom_sdpa_aten( return output; } +Tensor& custom_quantized_sdpa_out_no_context( + const Tensor& q, + const Tensor& k, + const Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const optional scale, + const optional q_zero_points, + const optional q_scales, + const optional k_zero_points, + const optional 
k_scales, + const optional v_zero_points, + const optional v_scales, + const bool is_seq_at_dim_2, + Tensor& output) { + executorch::aten::RuntimeContext context{}; + return torch::executor::native::custom_quantized_sdpa_out( + context, + q, + k, + v, + start_pos, + attn_mask, + dropout_p, + is_causal, + scale, + q_zero_points, + q_scales, + k_zero_points, + k_scales, + v_zero_points, + v_scales, + is_seq_at_dim_2, + output); +} + +at::Tensor custom_quantized_sdpa_aten( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + const int64_t start_pos, + // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional attn_mask, + const double dropout_p, + const bool is_causal, + // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy + const std::optional scale, + const std::optional& q_zero_points, + const std::optional& q_scales, + const std::optional& k_zero_points, + const std::optional& k_scales, + const std::optional& v_zero_points, + const std::optional& v_scales, + const bool is_seq_at_dim_2) { + auto output = at::empty(q.sizes()); + WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 15) + (q, + k, + v, + start_pos, + attn_mask, + dropout_p, + is_causal, + scale, + q_zero_points, + q_scales, + k_zero_points, + k_scales, + v_zero_points, + v_scales, + is_seq_at_dim_2, + output); + return output; +} + Tensor& update_cache_out_no_context( const Tensor& value, Tensor& cache, @@ -245,6 +367,18 @@ TORCH_LIBRARY_FRAGMENT(llama, m) { m.def( "update_cache.out(Tensor value, Tensor(a!) cache, " "SymInt start_pos, *, Tensor(b!) out) -> Tensor(b!)"); + m.def( + "custom_quantized_sdpa(Tensor query, Tensor key, Tensor value, SymInt start_pos, " + "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, " + "float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, " + "Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, " + "Tensor? v_scales=None, bool is_seq_at_dim_2=False) -> Tensor"); + m.def( + "custom_quantized_sdpa.out(Tensor query, Tensor key, Tensor value, SymInt start_pos, " + "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, " + "float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, " + "Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, " + "Tensor? v_scales=None, bool is_seq_at_dim_2=False, *, Tensor(a!) 
out) -> Tensor(a!)"); } // TODO: Rename this file to op_custom_ops_aot.cpp @@ -263,4 +397,11 @@ TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) { m.impl( "update_cache.out", WRAP_TO_ATEN(torch::executor::native::update_cache_out_no_context, 3)); + m.impl( + "custom_quantized_sdpa", + torch::executor::native::custom_quantized_sdpa_aten); + m.impl( + "custom_quantized_sdpa.out", + WRAP_TO_ATEN( + torch::executor::native::custom_quantized_sdpa_out_no_context, 15)); } diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h index 0639c539ed1..c907a84f14c 100644 --- a/extension/llm/custom_ops/op_sdpa_impl.h +++ b/extension/llm/custom_ops/op_sdpa_impl.h @@ -23,11 +23,15 @@ #endif #include +#include + namespace torch { namespace executor { namespace native { +enum class SeqDim { ONE = 1, TWO }; + namespace sdpa::impl { struct MaybeQuantizedMatrixData { @@ -35,6 +39,8 @@ struct MaybeQuantizedMatrixData { const int8_t* zero_points{nullptr}; const float* scales{nullptr}; int64_t m = 0, n = 0; + const int64_t zero_points_stride{1}; + const int64_t scales_stride{1}; ScalarType dtype{ScalarType::Float}; MaybeQuantizedMatrixData() = default; MaybeQuantizedMatrixData( @@ -43,12 +49,15 @@ struct MaybeQuantizedMatrixData { const float* scales_, int64_t m_, int64_t n_, + int64_t qparams_stride, ScalarType dtype_) : data(data_), zero_points(zero_points_), scales(scales_), m(m_), n(n_), + zero_points_stride(qparams_stride), + scales_stride(qparams_stride), dtype(dtype_) {} }; @@ -67,7 +76,32 @@ void _q_at_k_gemm( q_data.dtype == ScalarType::Char || q_data.dtype == ScalarType::Float, "q and k must be either int8 or float"); if (q_data.dtype == ScalarType::Char) { - ET_CHECK_MSG(false, "int8 not supported yet"); + if constexpr (std::is_same::value) { + int a_stride_m_tmp, b_stride_n_tmp; + auto kernel = torchao::kernels::cpu::quantized_matmul:: + get_int8_a_int8_b_channelwise_qmatmul( + q_m, k_n, qk_k, false, true, a_stride_m_tmp, b_stride_n_tmp); + kernel( + q_m, + k_n, + qk_k, + static_cast(q_data.data), + q_stride_m, + static_cast(k_data.data), + k_stride_n, + qk_data, + k_n, + static_cast(q_data.zero_points), + static_cast(k_data.zero_points), + static_cast(q_data.scales), + static_cast(k_data.scales), + // LHS and RHS are assumed to have same stride for qparams + q_data.zero_points_stride, + k_data.zero_points_stride); + } else { + ET_CHECK_MSG( + false, "Accumulation in dtype other than float not supported yet"); + } } else { ::executorch::cpublas::gemm( ::executorch::cpublas::TransposeType::Transpose, @@ -86,6 +120,131 @@ void _q_at_k_gemm( } } +// Refactor op_dequantize.cpp to avoid code duplication +void dequantize_optimized( + const int8_t* in, + const float scale, + const int8_t zero_point, + float* out, + int64_t quant_min, + int64_t quant_max, + size_t numel) { + size_t i = 0; +#if defined(__aarch64__) || defined(__ARM_NEON) + int8x8_t zero_point_vec = vdup_n_s8(zero_point); + float32x4_t scales = vdupq_n_f32(static_cast(scale)); + constexpr int32_t kVecSize = 16; + const size_t num_vecs = numel / kVecSize; + const int8_t* in_copy = in; + float* out_copy = out; + for (; i < num_vecs; i++) { + int8x16_t in_vec = vld1q_s8(in_copy); + int16x8_t sub_vec_0_7 = vsubl_s8(vget_low_s8(in_vec), zero_point_vec); + int32x4_t sub_vec_0_3 = vmovl_s16(vget_low_s16(sub_vec_0_7)); + int32x4_t sub_vec_4_7 = vmovl_s16(vget_high_s16(sub_vec_0_7)); + float32x4_t out_vec_0_3 = vmulq_f32(vcvtq_f32_s32(sub_vec_0_3), scales); + float32x4_t out_vec_4_7 = 
vmulq_f32(vcvtq_f32_s32(sub_vec_4_7), scales); + + int16x8_t sub_vec_8_15 = vsubl_s8(vget_high_s8(in_vec), zero_point_vec); + int32x4_t sub_vec_8_11 = vmovl_s16(vget_low_s16(sub_vec_8_15)); + int32x4_t sub_vec_12_15 = vmovl_s16(vget_high_s16(sub_vec_8_15)); + float32x4_t out_vec_8_11 = vmulq_f32(vcvtq_f32_s32(sub_vec_8_11), scales); + float32x4_t out_vec_12_15 = vmulq_f32(vcvtq_f32_s32(sub_vec_12_15), scales); + vst1q_f32(out_copy + 0, out_vec_0_3); + vst1q_f32(out_copy + 4, out_vec_4_7); + vst1q_f32(out_copy + 8, out_vec_8_11); + vst1q_f32(out_copy + 12, out_vec_12_15); + in_copy += kVecSize; + out_copy += kVecSize; + } + i = i * kVecSize; +#endif + for (; i < numel; i++) { + out[i] = (static_cast(in[i]) - static_cast(zero_point)) * + scale; + } +} + +void dequantize_per_channel_optimized( + const int8_t* in_data, + const float* scales_data, + const int8_t* zero_points_data, + float* out_data, + int64_t quant_min, + int64_t quant_max, + size_t outer_size, + size_t in_outer_stride, + size_t out_outer_stride, + size_t num_channels, + size_t in_channel_stride, + size_t out_channel_stride, + size_t channel_size, + size_t qparams_stride) { + for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + // Loop through dim + for (size_t channel_idx = 0; channel_idx < num_channels; ++channel_idx) { + const int8_t* in_data_local = in_data + outer_idx * in_outer_stride + + channel_idx * in_channel_stride; + const float scale = *(scales_data + channel_idx * qparams_stride); + const int8_t zero_point = + *(zero_points_data + channel_idx * qparams_stride); + float* out_data_local = out_data + outer_idx * out_outer_stride + + channel_idx * out_channel_stride; + dequantize_optimized( + in_data_local, + scale, + zero_point, + out_data_local, + quant_min, + quant_max, + channel_size); + } + } +} + +void dequant_and_gemm( + const int64_t m, + const int64_t n, + const int64_t k, + float* qk_data, + const int64_t qk_stride_m, + const MaybeQuantizedMatrixData& v_data, + const int64_t v_stride_n, + float* o_data, + const int64_t o_stride_m, + const float beta) { + std::vector dequantized_v_data(v_data.m * v_data.n); + dequantize_per_channel_optimized( + static_cast(v_data.data), + static_cast(v_data.scales), + static_cast(v_data.zero_points), + dequantized_v_data.data(), + -128, + 127, + 1, + 0, + 0, + v_data.m, + v_stride_n, + v_data.n, + v_data.n, + v_data.zero_points_stride); + ::executorch::cpublas::gemm( + ::executorch::cpublas::TransposeType::NoTranspose, + ::executorch::cpublas::TransposeType::NoTranspose, + n, + m, + k, + static_cast(1), + dequantized_v_data.data(), + v_data.n, + qk_data, + qk_stride_m, + beta, + o_data, + o_stride_m); +} + template void _qk_at_v_gemm( const int64_t m, @@ -99,7 +258,46 @@ void _qk_at_v_gemm( const int64_t o_stride_m, const accum_t beta) { if (v_data.dtype == ScalarType::Char) { - ET_CHECK_MSG(false, "int8 not supported yet"); + if constexpr (std::is_same::value) { + if (m > 4) { + // For larger batch sizes, dequantize and use BLAS for better + // performance + dequant_and_gemm( + m, + n, + k, + const_cast(qk_data), + qk_stride_m, + v_data, + v_stride_n, + o_data, + o_stride_m, + beta); + } else { + // For smaller batch sizes, use quantized gemm + int a_stride_m_tmp, b_stride_n_tmp; + auto kernel = torchao::kernels::cpu::quantized_matmul:: + get_fp32_a_input_channelwise_8bit_b_f32_c_matmul( + m, n, k, false, false, a_stride_m_tmp, b_stride_n_tmp); + kernel( + m, + n, + k, + qk_data, + qk_stride_m /*lhs_stride_m*/, + static_cast(v_data.data), + v_stride_n 
/*rhs_stride_n*/, + o_data, + o_stride_m /*out_stride_n*/, + static_cast(v_data.zero_points), + static_cast(v_data.scales), + beta, + v_data.zero_points_stride); + } + } else { + ET_CHECK_MSG( + false, "Accumulation in dtype other than float not supported yet"); + } } else { ::executorch::cpublas::gemm( ::executorch::cpublas::TransposeType::NoTranspose, @@ -289,6 +487,40 @@ sdpa_with_kv_cache does not use attn_mask. TODO: Just handle conversion of bool mask to float */ +/** + * @brief Implements Flash Attention algorithm on CPU + * + * This function computes scaled dot-product attention with optimizations for + CPU. + * It supports both regular and quantized attention computation. + * + * @tparam scalar_t The data type for computation (e.g., float) + * @tparam q_split_size Block size for query matrix in tiling algorithm + * @tparam kv_split_size Block size for key/value matrices in tiling algorithm + * + * @param output Output tensor to store attention results + * @param query Query tensor [Batch x Num_heads x Q_seq_len x Dim_per_head] + * @param key Key tensor [Batch x Num_heads_kv x KV_seq_len x Dim_per_head] + * @param value Value tensor [Batch x Num_heads_kv x KV_seq_len x Dim_per_head] + * @param dropout_p Dropout probability (not used in current implementation) + * @param is_causal Whether to apply causal mask (lower triangular) + * @param attn_mask Optional explicit attention mask + * @param scale Optional custom scaling factor (default: 1/sqrt(head_dim)) + * @param q_zero_points Optional zero points for quantized query + * @param q_scales Optional scales for quantized query + * @param k_zero_points Optional zero points for quantized key + * @param k_scales Optional scales for quantized key + * @param v_zero_points Optional zero points for quantized value + * @param v_scales Optional scales for quantized value + * @param seq_dim Which dimension is sequence dimension. 
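The int8 paths added to _q_at_k_gemm and _qk_at_v_gemm rely on the standard expansion for channelwise int8 matmul: accumulate the raw integer product, correct it with row sums and zero points, and only then apply the scales. This is the same identity the new test's _int_matmul reference uses. A small self-contained check of that identity, with toy sizes that are purely illustrative:

import torch

torch.manual_seed(0)
m, n, d = 3, 5, 8  # toy block sizes
q = torch.randint(-128, 128, (m, d), dtype=torch.int32)
k = torch.randint(-128, 128, (n, d), dtype=torch.int32)
q_zp = torch.randint(-10, 10, (m, 1), dtype=torch.int32)  # per-token zero points
k_zp = torch.randint(-10, 10, (n, 1), dtype=torch.int32)
q_s, k_s = torch.rand(m, 1).double(), torch.rand(n, 1).double()  # per-token scales

# Reference: dequantize each operand, then matmul in floating point.
ref = ((q - q_zp).double() * q_s) @ ((k - k_zp).double() * k_s).t()

# Kernel-style computation: exact integer accumulation of q @ k^T, corrected
# with row sums and zero points, scales applied only at the end.
int_acc = (q.unsqueeze(1) * k.unsqueeze(0)).sum(-1)  # q @ k^T without dequantizing
int_acc = (
    int_acc
    - q.sum(-1, keepdim=True) * k_zp.t()
    - q_zp * k.sum(-1, keepdim=True).t()
    + d * q_zp * k_zp.t()
)
out = int_acc.double() * (q_s @ k_s.t())

assert torch.allclose(ref, out)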
+ If SeqDim::One, then query, key, value are + expected to be in shape [Batch x Q_seq_len x Dim_per_head x Num_heads] and + output is expected to be in shape [Batch x Q_seq_len x Dim_per_head x + Num_heads] + * @param start_pos Starting position for causal masking in generation + * @param num_keys_for_causal_attention Number of keys to consider for causal + attention (-1 for all) + */ template void cpu_flash_attention( Tensor& output, @@ -305,22 +537,10 @@ void cpu_flash_attention( const optional& k_scales, const optional& v_zero_points, const optional& v_scales, - bool is_seq_at_dim_1 = false, + const SeqDim seq_dim = SeqDim::TWO, const int64_t start_pos = 0, const int64_t num_keys_for_causal_attention = -1) { (void)dropout_p; - // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) - // Key (Batch x Num_heads x KV_seq_len x Dim_per_head) - // Value (Batch x Num_heads x KV_seq_len x Dim_per_head) - - /* - // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) - at::Tensor query = q.transpose(1, 2); - // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) - at::Tensor key = k.transpose(1, 2); - // -> (Batch x KV_seq_len x Num_heads x Dim_per_head) - at::Tensor value = v.transpose(1, 2); - */ // Without this we have out-of-bounds writes for // causal masking @@ -346,7 +566,7 @@ void cpu_flash_attention( int64_t kvSize = value.size(2); int64_t num_heads_kv = key.size(1); - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { num_head = query.size(2); num_heads_kv = key.size(2); qSize = query.size(1); @@ -385,7 +605,11 @@ void cpu_flash_attention( */ ET_CHECK_MSG(attn_mask.value().dim() == 2, "attn_mask must be 2D"); ET_CHECK_MSG( - attn_mask.value().size(0) == qSize, "attn_mask shape mismatch"); + attn_mask.value().size(0) == qSize, + "attn_mask shape mismatch" + "attn_mask.size(0)=%zd qSize=%" PRId64, + attn_mask.value().size(0), + qSize); ET_CHECK_MSG( attn_mask.value().size(1) == kvSize, "attn_mask shape mismatch" @@ -394,14 +618,15 @@ void cpu_flash_attention( kvSize); } - bool is_quantized_sdpa = query.scalar_type() == ScalarType::Char; + bool is_quantized_sdpa = false; + is_quantized_sdpa = query.scalar_type() == ScalarType::Char; auto strides = query.strides(); int64_t qStrideB = strides[0]; int64_t qStrideH = strides[1]; int64_t qStrideM = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { qStrideH = strides[2]; qStrideM = strides[1]; } @@ -411,7 +636,7 @@ void cpu_flash_attention( int64_t kStrideH = strides[1]; int64_t kStrideN = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { kStrideH = strides[2]; kStrideN = strides[1]; } @@ -421,17 +646,60 @@ void cpu_flash_attention( int64_t vStrideH = strides[1]; int64_t vStrideN = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { vStrideH = strides[2]; vStrideN = strides[1]; } + int64_t q_quant_params_StrideB = 0; + int64_t q_quant_params_StrideH = 0; + int64_t q_quant_params_StrideM = 0; + int64_t k_quant_params_StrideB = 0; + int64_t k_quant_params_StrideH = 0; + int64_t k_quant_params_StrideN = 0; + int64_t v_quant_params_StrideB = 0; + int64_t v_quant_params_StrideH = 0; + int64_t v_quant_params_StrideN = 0; + + if (is_quantized_sdpa) { + auto q_strides = q_zero_points.value().strides(); + q_quant_params_StrideB = q_strides[0]; + q_quant_params_StrideH = q_strides[1]; + q_quant_params_StrideM = q_strides[2]; + + auto k_strides = k_zero_points.value().strides(); + k_quant_params_StrideB = k_strides[0]; + k_quant_params_StrideH = k_strides[1]; + k_quant_params_StrideN = k_strides[2]; 
+ + auto v_strides = v_zero_points.value().strides(); + v_quant_params_StrideB = v_strides[0]; + v_quant_params_StrideH = v_strides[1]; + v_quant_params_StrideN = v_strides[2]; + + ET_CHECK_MSG( + (v_quant_params_StrideN == k_quant_params_StrideN) && + (v_quant_params_StrideN == q_quant_params_StrideM), + "Quant params strides must be same for seq dim"); + + if (seq_dim == SeqDim::ONE) { + q_quant_params_StrideH = q_strides[2]; + q_quant_params_StrideM = q_strides[1]; + + k_quant_params_StrideH = k_strides[2]; + k_quant_params_StrideN = k_strides[1]; + + v_quant_params_StrideH = v_strides[2]; + v_quant_params_StrideN = v_strides[1]; + } + } + strides = output.strides(); int64_t oStrideB = strides[0]; int64_t oStrideH = strides[1]; int64_t oStrideM = strides[2]; - if (is_seq_at_dim_1) { + if (seq_dim == SeqDim::ONE) { oStrideH = strides[2]; oStrideM = strides[1]; } @@ -473,7 +741,11 @@ void cpu_flash_attention( /* qk_sum */ qSplitSize + /* dst */ qSplitSize * headSize; - int64_t size_bytes = size_per_thread * num_thread * query.element_size(); + // Since all intermediate compute is accum_t, we need to + // allocate a buffer accordingly. + int64_t size_of_intermediate_precision = sizeof(accum_t); + int64_t size_bytes = size_per_thread * num_thread * query.element_size() * + size_of_intermediate_precision; std::vector buf_vec(size_bytes); void* buf = reinterpret_cast(buf_vec.data()); // Need to double check the following @@ -559,14 +831,18 @@ void cpu_flash_attention( int64_t q_offset = i * qStrideB + j * qStrideH + m * qStrideM; int64_t k_offset = i * kStrideB + j_kv * kStrideH + n * kStrideN; if (is_quantized_sdpa) { - ET_CHECK_MSG( - !is_seq_at_dim_1, "For quantized SDPA, seq_len must be at dim 2"); - q_scales_ptr = q_scales.value().const_data_ptr() + q_offset; - k_scales_ptr = k_scales.value().const_data_ptr() + k_offset; - q_zero_points_ptr = - q_zero_points.value().const_data_ptr() + q_offset; - k_zero_points_ptr = - k_zero_points.value().const_data_ptr() + k_offset; + int64_t q_quant_params_offset = i * q_quant_params_StrideB + + j * q_quant_params_StrideH + m * q_quant_params_StrideM; + int64_t k_quant_params_offset = i * k_quant_params_StrideB + + j_kv * k_quant_params_StrideH + n * k_quant_params_StrideN; + q_scales_ptr = + q_scales.value().const_data_ptr() + q_quant_params_offset; + k_scales_ptr = + k_scales.value().const_data_ptr() + k_quant_params_offset; + q_zero_points_ptr = q_zero_points.value().const_data_ptr() + + q_quant_params_offset; + k_zero_points_ptr = k_zero_points.value().const_data_ptr() + + k_quant_params_offset; q_sub_matrix_data_ptr = (const int8_t*)(q_data) + q_offset; k_sub_matrix_data_ptr = (const int8_t*)(k_data) + k_offset; } else { @@ -579,6 +855,7 @@ void cpu_flash_attention( q_scales_ptr, qBlockSize, headSize, + q_quant_params_StrideM, query.scalar_type()); MaybeQuantizedMatrixData k_sub_matrix_data = MaybeQuantizedMatrixData( static_cast(k_sub_matrix_data_ptr), @@ -586,6 +863,7 @@ void cpu_flash_attention( k_scales_ptr, kvBlockSize, headSize, + k_quant_params_StrideN, key.scalar_type()); _q_at_k_gemm( qBlockSize, @@ -719,11 +997,12 @@ void cpu_flash_attention( const int8_t* v_zero_points_ptr = nullptr; int64_t v_offset = i * vStrideB + j_kv * vStrideH + n * vStrideN; if (is_quantized_sdpa) { - ET_CHECK_MSG( - !is_seq_at_dim_1, "For quantized SDPA, seq_len must be at dim 2"); - v_scales_ptr = v_scales.value().const_data_ptr() + v_offset; - v_zero_points_ptr = - v_zero_points.value().const_data_ptr() + v_offset; + int64_t v_quant_params_offset = i * 
v_quant_params_StrideB + + j_kv * v_quant_params_StrideH + n * v_quant_params_StrideN; + v_scales_ptr = + v_scales.value().const_data_ptr() + v_quant_params_offset; + v_zero_points_ptr = v_zero_points.value().const_data_ptr() + + v_quant_params_offset; v_sub_matrix_data_ptr = (const int8_t*)(v_data) + v_offset; } else { v_sub_matrix_data_ptr = (const scalar_t*)(v_data) + v_offset; @@ -734,6 +1013,7 @@ void cpu_flash_attention( v_scales_ptr, kvBlockSize, headSize, + v_quant_params_StrideN, value.scalar_type()); // Calculate Softmax(q @ k.T) @ v _qk_at_v_gemm( diff --git a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp index 435cf44e66f..6c0496af32d 100644 --- a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp +++ b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp @@ -524,289 +524,6 @@ TEST(OpScaledDotProductAttentionTest, LargerTest) { EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_5, 1e-4, 1e-4); } -TEST(OpScaledDotProductAttentionTest, BasicTestWithAttnMask) { - TensorFactory tfFloat; - - executorch::aten::Tensor query = tfFloat.make( - {1, 1, 4, 4}, - {0.8823, - 0.9150, - 0.3829, - 0.9593, - 0.3904, - 0.6009, - 0.2566, - 0.7936, - 0.9408, - 0.1332, - 0.9346, - 0.5936, - 0.8694, - 0.5677, - 0.7411, - 0.4294}); - executorch::aten::Tensor key = tfFloat.make( - {1, 1, 4, 4}, - {0.8854, - 0.5739, - 0.2666, - 0.6274, - 0.2696, - 0.4414, - 0.2969, - 0.8317, - 0.1053, - 0.2695, - 0.3588, - 0.1994, - 0.5472, - 0.0062, - 0.9516, - 0.0753}); - executorch::aten::Tensor value = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - executorch::aten::Tensor attn_mask = tfFloat.make({1, 1}, {0}); - executorch::aten::Tensor key_cache_0 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor value_cache_0 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor key_cache_1 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor value_cache_1 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor key_cache_2 = tfFloat.zeros({1, 5, 4, 4}); - executorch::aten::Tensor value_cache_2 = tfFloat.zeros({1, 5, 4, 4}); - double dropout_p = 0; - bool is_causal = false; - executorch::aten::optional scale; - - // start pos: 0 layer id 0 - executorch::aten::Tensor ret_expected_0 = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - - std::vector out_size = {1, 1, 4, 4}; - executorch::aten::Tensor out = tfFloat.zeros(out_size); - executorch::aten::Tensor ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_0, - value_cache_0, - 0, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_0, 1e-4, 1e-4); - - // start pos: 0 layer id 2 - executorch::aten::Tensor ret_expected_1 = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_2, - value_cache_2, - 0, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_1, 1e-4, 1e-4); - - attn_mask = tfFloat.make({1, 2}, {0, 0}); - // start pos: 1 layer id 0 - 
executorch::aten::Tensor ret_expected_2 = tfFloat.make( - {1, 1, 4, 4}, - {0.8860, - 0.5832, - 0.3376, - 0.8090, - 0.5779, - 0.9040, - 0.5547, - 0.3423, - 0.6343, - 0.3644, - 0.7104, - 0.9464, - 0.7890, - 0.2814, - 0.7886, - 0.5895}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_0, - value_cache_0, - 1, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_2, 1e-4, 1e-4); - - // start pos: 1 layer id 1 - executorch::aten::Tensor ret_expected_3 = tfFloat.make( - {1, 1, 4, 4}, - {0.6486, - 0.4270, - 0.2472, - 0.5922, - 0.3669, - 0.5740, - 0.3522, - 0.2173, - 0.3635, - 0.2088, - 0.4071, - 0.5423, - 0.5110, - 0.1822, - 0.5107, - 0.3817}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_1, - value_cache_1, - 1, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_3, 1e-4, 1e-4); - - attn_mask = tfFloat.make({1, 3}, {0, 0, 0}); - // start pos: 2 layer id 1 - executorch::aten::Tensor ret_expected_4 = tfFloat.make( - {1, 1, 4, 4}, - {0.7490, - 0.4930, - 0.2854, - 0.6838, - 0.4489, - 0.7021, - 0.4308, - 0.2659, - 0.4622, - 0.2655, - 0.5176, - 0.6895, - 0.6202, - 0.2212, - 0.6199, - 0.4634}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_1, - value_cache_1, - 2, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_4, 1e-4, 1e-4); - - // start pos: 2 layer id 2 - executorch::aten::Tensor ret_expected_5 = tfFloat.make( - {1, 1, 4, 4}, - {0.7490, - 0.4930, - 0.2854, - 0.6838, - 0.4489, - 0.7021, - 0.4308, - 0.2659, - 0.4622, - 0.2655, - 0.5176, - 0.6895, - 0.6202, - 0.2212, - 0.6199, - 0.4634}); - out = tfFloat.zeros(out_size); - ret = op_sdpa_with_kv_cache( - query, - key, - value, - key_cache_2, - value_cache_2, - 2, - 1, - attn_mask, - dropout_p, - is_causal, - scale, - out); - EXPECT_TENSOR_CLOSE_WITH_TOL(ret, ret_expected_5, 1e-4, 1e-4); -} - TEST(OpScaledDotProductAttentionTest, SequenceTest) { TensorFactory tfFloat; diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index 5b68715e401..545f6516bb7 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -9,6 +9,18 @@ load( "get_compiler_optimization_flags", ) +def _get_quantized_sdpa_deps(): + if runtime.is_oss: + return [] + else: + return ["//pytorch/ao/torchao/experimental/kernels/cpu/interface:interface"] + +def _get_quantized_preproc_flags(): + if runtime.is_oss: + return [] + else: + return ["-DENABLE_CUSTOM_QUANTIZED_SDPA"] + def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -33,7 +45,8 @@ def define_common_targets(): headers = [ "op_sdpa_impl.h", ], - preprocessor_flags = get_vec_preprocessor_flags(), + exported_preprocessor_flags = get_vec_preprocessor_flags() + + _get_quantized_preproc_flags(), exported_deps = [ "//executorch/runtime/kernel:kernel_includes", "//executorch/kernels/portable/cpu:scalar_utils", @@ -45,8 +58,12 @@ def define_common_targets(): deps = [ "//executorch/kernels/portable/cpu/util:reduce_util", "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform", - ] + get_vec_deps(), - compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags(), + ] + get_vec_deps() + _get_quantized_sdpa_deps(), + compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags() + + select({ + "DEFAULT": [], + "ovr_config//cpu:arm64": ["-march=armv8.2-a+dotprod"], + }), visibility = [ "//executorch/...", "//executorch/extension/llm/custom_ops/...", diff --git a/extension/llm/custom_ops/test_quantized_sdpa.py b/extension/llm/custom_ops/test_quantized_sdpa.py new file mode 100644 index 00000000000..f7b28e1508f --- /dev/null +++ b/extension/llm/custom_ops/test_quantized_sdpa.py @@ -0,0 +1,536 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import unittest + +import torch +import torch.nn.functional as F + +from .custom_ops import custom_ops_lib # noqa + + +class SDPATestForCustomQuantizedSDPA(unittest.TestCase): + """ + This test is to test the custom quantized SDPA op + Tensors are in [B, H, S, D] format + """ + + def setUp(self): + from torch.ao.quantization.fx._decomposed import ( # noqa: F401 + quantized_decomposed_lib, + ) + + torch.manual_seed(42) + self.n_batch = 1 + self.n_heads_kv = 32 + self.n_heads_q = 32 + self.head_dim = 128 + self.max_seq_len = 2048 + self.quantized_dtype = torch.int8 + self.float_dtype = torch.float32 + self.q_shape = None + self.kv_shape = None + self.is_seq_at_dim_2 = True + + def _scale_tensor(self, tensor, min_value, max_value, scale=True): + normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min()) + + scaled_tensor = normalized_tensor * (max_value - min_value) + min_value + + return scaled_tensor if scale else tensor + + def setup_caches_and_mask(self, tensor_scale_max, tensor_scale_min, scale_tensors): + self.mask = torch.full( + (self.max_seq_len, self.max_seq_len), + float("-inf"), + ) + self.mask = torch.triu(self.mask, diagonal=1) + + self.k = self._scale_tensor( + torch.rand(self.kv_shape), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + self.v = self._scale_tensor( + torch.rand(self.kv_shape), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + + def _sdpa_ref( + self, + q_quantized, + k_quantized, + v_quantized, + start_pos, + q_zero_point, + q_scale, + k_zero_point, + k_scale, + v_zero_point, + v_scale, + attn_mask, + ): + q = torch.ops.quantized_decomposed.dequantize_per_token( + q_quantized, + q_scale, + q_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + self.float_dtype, + ) + k = torch.ops.quantized_decomposed.dequantize_per_token( + k_quantized, + k_scale, + k_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + self.float_dtype, + ) + v = 
torch.ops.quantized_decomposed.dequantize_per_token( + v_quantized, + v_scale, + v_zero_point, + torch.iinfo(self.quantized_dtype).min, + torch.iinfo(self.quantized_dtype).max, + self.quantized_dtype, + self.float_dtype, + ) + + if not self.is_seq_at_dim_2: + q = q.transpose(1, 2).contiguous() + k = k.transpose(1, 2).contiguous() + v = v.transpose(1, 2).contiguous() + num_heads_q = q.size(1) + num_heads_kv = k.size(1) + seq_len = q.size(2) + k = torch.narrow(k, 2, 0, start_pos + seq_len) + v = torch.narrow(v, 2, 0, start_pos + seq_len) + if num_heads_q != num_heads_kv: + assert ( + num_heads_q % num_heads_kv == 0 + ), f"{num_heads_q} not divisible by {num_heads_kv}" + n_reps = num_heads_q // num_heads_kv + if n_reps > 1: + k = k.repeat_interleave(n_reps, dim=1) + v = v.repeat_interleave(n_reps, dim=1) + out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) + if not self.is_seq_at_dim_2: + out = out.transpose(1, 2).contiguous() + return out + + def _int_matmul( + self, quantized_q, quantized_k, q_zero_points, q_scale, k_zero_point, k_scale + ): + row_sum_q = torch.sum(quantized_q, dim=-1, keepdim=True) + row_sum_k = torch.sum(quantized_k, dim=-1, keepdim=True) + q_at_k = torch.matmul(quantized_q, quantized_k.transpose(-2, -1)) + row_sum_q_scaled = row_sum_q * k_zero_point.squeeze(-1).unsqueeze(0) + row_sum_k_scaled = q_zero_points * row_sum_k.squeeze(-1).unsqueeze(0) + zero_points_product = ( + quantized_q.size(-1) * q_zero_points * k_zero_point.squeeze(-1).unsqueeze(0) + ) + res = q_at_k - row_sum_q_scaled - row_sum_k_scaled + zero_points_product + q_scale_mul_k_scale = q_scale * k_scale.squeeze(-1).unsqueeze(0) + res = res.to(torch.float32) * q_scale_mul_k_scale + return res + + def _quantized_sdpa_ref( + self, + quantized_q, + quantized_k, + quantized_v, + q_zero_points, + q_scale, + k_scale, + k_zero_point, + v_scale, + v_zero_point, + attn_mask, + ): + import math + + quantized_q = quantized_q.to(torch.int32) + quantized_k = quantized_k.to(torch.int32) + quantized_v = quantized_v.to(torch.int32) + batch_size = quantized_q.size(0) + num_heads_q = quantized_q.size(1) + num_heads_kv = quantized_k.size(1) + q_scale = q_scale.to(torch.float32) + k_scale = k_scale.to(torch.float32) + q_zero_points = q_zero_points.to(torch.int32) + k_zero_point = k_zero_point.to(torch.int32) + if num_heads_q != num_heads_kv: + assert ( + num_heads_q % num_heads_kv == 0 + ), f"{num_heads_q} not divisible by {num_heads_kv}" + n_reps = num_heads_q // num_heads_kv + if n_reps > 1: + quantized_k = quantized_k.repeat_interleave(n_reps, dim=1) + quantized_v = quantized_v.repeat_interleave(n_reps, dim=1) + res_b = [] + scale_factor = 1 / math.sqrt(quantized_k.size(-1)) + dequantized_v = torch.ops.quantized_decomposed.dequantize_per_token( + quantized_v, + v_scale, + v_zero_point, + torch.iinfo(torch.int8).min, + torch.iinfo(torch.int8).max, + torch.int8, + torch.float32, + ) + for b in range(batch_size): + res_h = [] + for h in range(num_heads_q): + q_at_k = self._int_matmul( + quantized_q[b][h], + quantized_k[b][h], + q_zero_points[b][h], + q_scale[b][h], + k_zero_point[b][h], + k_scale[b][h], + ) + q_at_k = q_at_k * scale_factor + q_at_k += attn_mask + attn_weight = torch.softmax(q_at_k, dim=-1) + y = torch.matmul(attn_weight, dequantized_v[b][h]) + res_h.append(y) + res = torch.stack(res_h, dim=0) + res_b.append(res.unsqueeze(0)) + res = torch.cat(res_b, dim=0) + return res + + def _test_sdpa_common( + self, + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + 
scale_tensors=False, + atol=1e-5, + is_seq_at_dim_2=False, + ): + # Range arbitrarily chosen to reproduce a numerical error on x86 in some of the long context tests + tensor_scale_max = 15 + tensor_scale_min = -15 + self.n_heads_kv = n_heads_kv + self.n_heads_q = n_heads_q + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.is_seq_at_dim_2 = is_seq_at_dim_2 + seq_dim = 2 + self.q_shape = (self.n_batch, self.n_heads_q, seq_len, self.head_dim) + self.kv_shape = (self.n_batch, self.n_heads_kv, self.max_seq_len, self.head_dim) + if not is_seq_at_dim_2: + seq_dim = 1 + self.q_shape = (self.n_batch, seq_len, self.n_heads_q, self.head_dim) + self.kv_shape = ( + self.n_batch, + self.max_seq_len, + self.n_heads_kv, + self.head_dim, + ) + + q = self._scale_tensor( + torch.rand(self.q_shape), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + self.setup_caches_and_mask(tensor_scale_max, tensor_scale_min, scale_tensors) + k = self.k + v = self.v + + quantized_dtype = self.quantized_dtype + q_scale, q_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + q, quantized_dtype + ) + ) + k_scale, k_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + k, quantized_dtype + ) + ) + v_scale, v_zero_point = ( + torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + v, quantized_dtype + ) + ) + + q_quantized = torch.ops.quantized_decomposed.quantize_per_token( + q, + q_scale, + q_zero_point, + torch.iinfo(quantized_dtype).min, + torch.iinfo(quantized_dtype).max, + quantized_dtype, + ) + k_quantized = torch.ops.quantized_decomposed.quantize_per_token( + k, + k_scale, + k_zero_point, + torch.iinfo(quantized_dtype).min, + torch.iinfo(quantized_dtype).max, + quantized_dtype, + ) + v_quantized = torch.ops.quantized_decomposed.quantize_per_token( + v, + v_scale, + v_zero_point, + torch.iinfo(quantized_dtype).min, + torch.iinfo(quantized_dtype).max, + quantized_dtype, + ) + + seq_len = q.size(seq_dim) + attn_mask = self.mask[start_pos : start_pos + seq_len, :] + attn_mask = attn_mask[:, : start_pos + seq_len] + + # quantized_sdpa_ref_output = self._quantized_sdpa_ref(q_quantized, k_quantized, v_quantized, q_zero_point, q_scale, k_scale, k_zero_point, v_scale, v_zero_point, attn_mask) + + from torch.nn.attention import SDPBackend + + with torch.nn.attention.sdpa_kernel( + [SDPBackend.FLASH_ATTENTION] + ), torch.no_grad(): + ref_output = self._sdpa_ref( + q_quantized, + k_quantized, + v_quantized, + start_pos, + q_zero_point, + q_scale, + k_zero_point, + k_scale, + v_zero_point, + v_scale, + attn_mask, + ) + + q_zero_point_int8 = q_zero_point.to(dtype=torch.int8) + k_zero_point_int8 = k_zero_point.to(dtype=torch.int8) + v_zero_point_int8 = v_zero_point.to(dtype=torch.int8) + q_scale_fp32 = q_scale.to(dtype=torch.float32) + k_scale_fp32 = k_scale.to(dtype=torch.float32) + v_scale_fp32 = v_scale.to(dtype=torch.float32) + + op_output = torch.ops.llama.custom_quantized_sdpa( + q_quantized, + k_quantized, + v_quantized, + start_pos, + None, + 0, + True, + None, + q_zero_point_int8, + q_scale_fp32, + k_zero_point_int8, + k_scale_fp32, + v_zero_point_int8, + v_scale_fp32, + is_seq_at_dim_2, + ) + self.assertTrue(torch.allclose(ref_output, op_output, atol=atol)) + # Following line crashes due to some weird issues in mkldnn with crash in mkl_sgemm with `wild jump` + # self.assertTrue(torch.allclose(ref_output, quantized_sdpa_ref_output, atol=1e-3)) + + start_pos = seq_len + seq_len = q.size(seq_dim) 
+ attn_mask = self.mask[start_pos : start_pos + seq_len, :] + attn_mask = attn_mask[:, : start_pos + seq_len] + with torch.nn.attention.sdpa_kernel( + [SDPBackend.FLASH_ATTENTION] + ), torch.no_grad(): + ref_output = self._sdpa_ref( + q_quantized, + k_quantized, + v_quantized, + start_pos, + q_zero_point, + q_scale, + k_zero_point, + k_scale, + v_zero_point, + v_scale, + attn_mask, + ) + op_output = torch.ops.llama.custom_quantized_sdpa( + q_quantized, + k_quantized, + v_quantized, + start_pos, + None, + 0, + True, + None, + q_zero_point_int8, + q_scale_fp32, + k_zero_point_int8, + k_scale_fp32, + v_zero_point_int8, + v_scale_fp32, + is_seq_at_dim_2, + ) + self.assertTrue(torch.allclose(ref_output, op_output, atol=atol)) + + def test_sdpa_with_custom_quantized(self): + n_heads_kv = 8 + n_heads_q = 8 + head_dim = 128 + max_seq_len = 2048 + seq_len = 24 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + True, + atol=1e-4, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + True, + atol=1e-4, + is_seq_at_dim_2=False, + ) + + def test_sdpa_with_custom_quantized_seq_len_1(self): + n_heads_kv = 4 + n_heads_q = 4 + head_dim = 4 + max_seq_len = 8 + seq_len = 1 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=False, + ) + + def test_sdpa_with_custom_quantized_seq_len_small(self): + n_heads_kv = 4 + n_heads_q = 4 + head_dim = 4 + max_seq_len = 8 + seq_len = 4 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=False, + ) + + def test_sdpa_with_custom_quantized_seq_len_llava_example(self): + n_heads_kv = 32 + n_heads_q = 32 + head_dim = 128 + max_seq_len = 2048 + seq_len = 634 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, n_heads_q, head_dim, max_seq_len, start_pos, seq_len + ) + + def test_sdpa_with_custom_quantized_seq_len_130_gqa(self): + n_heads_kv = 8 + n_heads_q = 32 + head_dim = 128 + max_seq_len = 2048 + seq_len = 130 + start_pos = 0 + # For some reason when scaling tensors, the test fails with smaller atol + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + True, + atol=1e-3, + ) + + def test_sdpa_with_custom_quantized_seq_len_llava_example_gqa(self): + n_heads_kv = 16 + n_heads_q = 32 + head_dim = 128 + max_seq_len = 2048 + seq_len = 634 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, n_heads_q, head_dim, max_seq_len, start_pos, seq_len + ) + + def test_sdpa_with_cache_mqa(self): + n_heads_kv = 1 + n_heads_q = 8 + head_dim = 128 + max_seq_len = 2048 + seq_len = 24 + start_pos = 0 + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=True, + ) + self._test_sdpa_common( + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + start_pos, + seq_len, + is_seq_at_dim_2=False, + ) diff --git a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py index a1f054a153e..334e53c437f 100644 --- 
a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py +++ b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py @@ -67,12 +67,14 @@ def test_sdpa_with_cache_no_mqa_1(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -108,12 +110,14 @@ def test_sdpa_with_cache_no_mqa_2(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -150,12 +154,14 @@ def test_sdpa_with_cache_no_mqa_3(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -191,12 +197,14 @@ def test_sdpa_with_cache_no_mqa_4(self): ) if self.use_mask_with_custom_op: attn_mask = attn_mask.contiguous() + sliced_k_cache = self.k_cache[:, : start_pos + seq_len, :, :] + sliced_v_cache = self.v_cache[:, : start_pos + seq_len, :, :] op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, - self.k_cache, - self.v_cache, + sliced_k_cache, + sliced_v_cache, start_pos, seq_len, attn_mask, @@ -489,11 +497,11 @@ def _test_sdpa_common( class SDPATestForLargeSeqLength(SDPATestCommon): def test_sdpa_with_cache_seq_len_130(self): - n_heads_kv = 32 - n_heads_q = 32 + n_heads_kv = 8 + n_heads_q = 8 head_dim = 128 max_seq_len = 2048 - seq_len = 130 + seq_len = 24 self._test_sdpa_common( n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, True ) diff --git a/extension/llm/export/TARGETS b/extension/llm/export/TARGETS index 40f8599e9e0..298a64ba328 100644 --- a/extension/llm/export/TARGETS +++ b/extension/llm/export/TARGETS @@ -22,6 +22,7 @@ runtime.python_library( "//bento/...", "//bento_kernels/...", "//executorch/examples/...", + "//executorch/extension/llm/...", "//meta_intern_odllm/...", ], deps = [ diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index cf3a1087cfb..2dee6b0954a 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -13,7 +13,7 @@ import contextlib import logging from enum import Enum -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple from unittest.mock import patch import torch @@ -41,6 +41,7 @@ from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer from torch.export import export_for_training, ExportedProgram from torch.nn.attention import SDPBackend +from torchao.utils import unwrap_tensor_subclass FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -80,14 +81,13 @@ class LLMEdgeManager: def __init__( self, - model, - modelname, - max_seq_len, - dtype, - use_kv_cache, - example_inputs, + model: torch.nn.Module, + modelname: str, + max_seq_len: int, + use_kv_cache: bool, + 
example_inputs: Tuple[torch.Tensor, ...], + dtype: Optional[DType] = None, example_kwarg_inputs: Optional[Dict] = None, - args: Optional[Any] = None, enable_dynamic_shape: bool = False, generate_full_logits: bool = False, calibration_tasks: Optional[List[str]] = None, @@ -98,36 +98,42 @@ def __init__( verbose: bool = False, metadata: Optional[dict] = None, dynamic_shapes: Optional[Any] = None, + use_legacy_export: bool = False, + save_exported_program: bool = False, ): + # Store necessary constructor arguments. self.model = model - # Note: treat this as the source of truth for the result of - # torch.export'ing a model. If the overall ExportedProgram is needed, - # make sure to re-export this graph module to persist any changes. See - # https://github.com/pytorch/pytorch/blob/main/torch/export/exported_program.py#L921 - self.pre_autograd_graph_module: Optional[torch.nn.Module] = None self.modelname = modelname self.max_seq_len = max_seq_len - self.dtype = dtype + self.use_kv_cache = use_kv_cache self.example_inputs = example_inputs + self.dtype = dtype self.example_kwarg_inputs = example_kwarg_inputs - self.use_kv_cache = use_kv_cache - self.generate_full_logits = generate_full_logits self.enable_dynamic_shape = enable_dynamic_shape - self.verbose = verbose - self.metadata = metadata - self.applied_source_transforms = [] - self.edge_manager: Optional[EdgeProgramManager] = None - self.export_program = None - self.output_dir = "." - self.dynamic_shapes = dynamic_shapes - self._saved_pte_filename = None - self.args = args + self.generate_full_logits = generate_full_logits self.calibration_tasks = calibration_tasks self.calibration_limit = calibration_limit self.calibration_seq_length = calibration_seq_length self.calibration_data = calibration_data self.tokenizer_path = tokenizer_path - self.canonical_passes = [RemoveRedundantTransposes()] + self.verbose = verbose + self.metadata = metadata + self.dynamic_shapes = dynamic_shapes + self.use_legacy_export = use_legacy_export + self.save_exported_program = save_exported_program + + # Note: treat this as the source of truth for the result of + # torch.export'ing a model. If the overall ExportedProgram is needed, + # make sure to re-export this graph module to persist any changes. See + # https://github.com/pytorch/pytorch/blob/main/torch/export/exported_program.py#L921 + self.pre_autograd_graph_module: Optional[torch.nn.Module] = None + self.edge_manager: Optional[EdgeProgramManager] = None + self.canonical_passes = [ + RemoveRedundantTransposes() + ] # Graph transformations optimizations. + self.export_program = None # Final result of lowering to executorch. + self.output_dir = "." 
+ self._saved_pte_filename = None def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -166,10 +172,9 @@ def source_transform( """ for transform in transforms: self.model = transform(self.model) - self.applied_source_transforms.extend(transforms) if self.verbose: - logging.info(f"Applied source transforms: {self.applied_source_transforms}") + logging.info(f"Applied source transforms: {transforms}") logging.info(f"Model after source transforms: {self.model}") return self @@ -178,13 +183,13 @@ def _get_dynamic_shape(self) -> Any: return self.dynamic_shapes dim = torch.export.Dim("token_dim", max=self.max_seq_len - 1) - - if not self.use_kv_cache: - # Only one input argument: tokens - self.dynamic_shapes = ({1: dim},) - elif self.enable_dynamic_shape: - # Two input arguments: tokens and input_pos but input_pos is static shape - self.dynamic_shapes = ({1: dim}, {"input_pos": {0: 1}}) + if self.enable_dynamic_shape: + if not self.use_kv_cache: + # Only one input argument: tokens + self.dynamic_shapes = ({1: dim},) + else: + # Two input arguments: tokens and input_pos but input_pos is static shape + self.dynamic_shapes = ({1: dim}, {"input_pos": {0: 1}}) else: # Two input arguments: tokens and input_pos but both are of static shape self.dynamic_shapes = None @@ -199,12 +204,17 @@ def _get_edge_config(self) -> EdgeCompileConfig: return edge_config def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram: + if module is not None: + unwrap_tensor_subclass(module) + else: + unwrap_tensor_subclass(self.model) + dynamic_shape = self._get_dynamic_shape() # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): - if hasattr(self.args, "qnn") and self.args.qnn: - # TODO: this is temporary, as qnn flow does not work with new, non-functional export IR. + if self.use_legacy_export: + # TODO: for use cases such as qnn, which does not work with new, non-functional export IR. # See issue: https://github.com/pytorch/executorch/issues/7373 with patch.object( @@ -234,6 +244,7 @@ def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram: self.example_inputs, kwargs=self.example_kwarg_inputs, dynamic_shapes=dynamic_shape, + strict=True, ) return exported_module @@ -249,8 +260,12 @@ def export(self) -> "LLMEdgeManager": # Persisting those changes back to an ExportedProgram will require # an additional export(). 
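The reworked _get_dynamic_shape now only emits a dynamic-shapes spec when enable_dynamic_shape is set: dim 1 of the first input (the token dimension) gets a torch.export.Dim capped at max_seq_len - 1, and with a KV cache the extra input_pos argument stays static. A self-contained toy sketch of the no-KV-cache spec; the module below is illustrative, not the actual model:

import torch
from torch.export import Dim, export_for_training


class ToyTokenModel(torch.nn.Module):
    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        return tokens.float().sum(dim=1)


max_seq_len = 2048
token_dim = Dim("token_dim", max=max_seq_len - 1)

# One positional input whose dim 1 (the token/sequence dimension) is dynamic,
# mirroring self.dynamic_shapes = ({1: dim},) above.
ep = export_for_training(
    ToyTokenModel(),
    (torch.zeros((1, 10), dtype=torch.long),),
    dynamic_shapes=({1: token_dim},),
    strict=True,
)
# The exported module now accepts any sequence length up to max_seq_len - 1.
print(ep.module()(torch.zeros((1, 33), dtype=torch.long)).shape)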
self.pre_autograd_graph_module = exported_module.module() - if hasattr(self.args, "export_only") and self.args.export_only: - torch.export.save(exported_module, self.args.output_name) + if self.save_exported_program: + export_output = f"{self.modelname}.pt2" + logging.info( + f"Saving torch.export()/export_for_training() result to {export_output}" + ) + torch.export.save(exported_module, export_output) return self def run_canonical_optimizations(self): @@ -414,7 +429,7 @@ def export_to_edge(self) -> "LLMEdgeManager": self.export() override_export_behaviour = contextlib.nullcontext() - if hasattr(self.args, "qnn") and self.args.qnn: + if self.use_legacy_export: override_export_behaviour = patch.object( torch._utils_internal, "export_training_ir_rollout_check", diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 76e8c357119..20604bbf635 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -57,7 +57,7 @@ def get_mps_partitioner(use_kv_cache: bool = False): ) except ImportError: raise ImportError( - "Please install the MPS backend follwing https://pytorch.org/executorch/main/build-run-mps.html" + "Please install the MPS backend follwing https://pytorch.org/executorch/main/backends-mps" ) compile_specs = [CompileSpec("use_fp16", bytes([True]))] @@ -81,7 +81,7 @@ def get_coreml_partitioner( ) except ImportError: raise ImportError( - "Please install the CoreML backend follwing https://pytorch.org/executorch/main/build-run-coreml.html" + "Please install the CoreML backend follwing https://pytorch.org/executorch/main/backends-coreml" + "; for buck users, please add example dependancies: //executorch/backends/apple/coreml:backend, and etc" ) @@ -195,7 +195,7 @@ def get_qnn_partitioner( ) except ImportError: raise ImportError( - "Please install the Qualcomm backend following https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" + "Please install the Qualcomm backend following https://pytorch.org/executorch/main/backends-qualcomm" ) use_fp16 = True diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 40d81075d9f..24c3be2e802 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -158,7 +158,7 @@ def get_qnn_quantizer( except ImportError: raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html" + "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/backends-qualcomm" ) backend, quant_config = pt2e_quantize.split("_") @@ -166,30 +166,39 @@ def get_qnn_quantizer( backend == "qnn" ), f"The quantization config is for backend {backend} instead of qnn." qnn_quantizer = QnnQuantizer() # pyre-fixme[16] - qnn_quantizer.set_per_channel_conv_quant(enable=True) - qnn_quantizer.set_per_channel_linear_quant(enable=True) + # more custom quantization are supported including 16a4w etc. 
default to 8bit quantized custom_annotations = () if quant_config == "8a8w": quant_dtype = QuantDtype.use_8a8w # pyre-fixme[16] - qnn_quantizer.set_quant_config(quant_dtype, is_qat=is_qat) + qnn_quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=True, + is_linear_per_channel=True, + ) elif quant_config == "16a16w": - quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] # Due to the error with 16a16w in Qnn Htp, we need to disable per channel linear quantization when use 16a16w # TODO: enable it after the issue is fixed logging.warning( "Disable per channel quantization for linear and conv due to the error with QNN HTP 16a16w." ) - qnn_quantizer.set_per_channel_conv_quant(enable=False) - qnn_quantizer.set_per_channel_linear_quant(enable=False) - qnn_quantizer.set_quant_config( - quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver + quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] + qnn_quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=False, + is_linear_per_channel=False, + act_observer=MinMaxObserver, ) elif quant_config == "16a4w": - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. - quant_dtype = QuantDtype.use_16a4w - qnn_quantizer.set_quant_config( - quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver + quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] + qnn_quantizer.set_default_quant_config( + quant_dtype, + is_qat=is_qat, + is_conv_per_channel=True, + is_linear_per_channel=True, + act_observer=MinMaxObserver, ) # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. custom_annotations = (custom_annotate_llama_matmul_16a8w,) @@ -217,7 +226,7 @@ def get_coreml_quantizer(pt2e_quantize: str): from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer except ImportError: raise ImportError( - "Please install the CoreML backend follwing https://pytorch.org/executorch/main/build-run-coreml.html" + "Please install the CoreML backend follwing https://pytorch.org/executorch/main/backends-coreml" ) if pt2e_quantize == "coreml_8a_c8w": diff --git a/extension/llm/export/test/TARGETS b/extension/llm/export/test/TARGETS new file mode 100644 index 00000000000..63efce84119 --- /dev/null +++ b/extension/llm/export/test/TARGETS @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_test( + name = "test_builder", + srcs = ["test_builder.py"], + deps = [ + "//executorch/extension/llm/export:export_lib", + "//caffe2:torch", + ], +) diff --git a/extension/llm/export/test/__init__.py b/extension/llm/export/test/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/extension/llm/export/test/test_builder.py b/extension/llm/export/test/test_builder.py new file mode 100644 index 00000000000..7883480c1e7 --- /dev/null +++ b/extension/llm/export/test/test_builder.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
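The QNN quantizer hookup above now routes the per-channel conv/linear choices through a single set_default_quant_config call instead of the separate set_per_channel_*_quant toggles. A hedged sketch of the 8a8w case follows; the import paths are assumptions (they are not visible in this hunk) and resolving them requires the Qualcomm backend to be installed.

# Import path assumed; adjust to your install of the Qualcomm backend.
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype

quantizer = QnnQuantizer()
quantizer.set_default_quant_config(
    QuantDtype.use_8a8w,
    is_qat=False,
    is_conv_per_channel=True,
    is_linear_per_channel=True,
)
# The 16a16w path shown above instead disables per-channel conv/linear and
# passes act_observer=MinMaxObserver.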
+ +# pyre-strict +import unittest +from unittest.mock import MagicMock + +import torch + +from executorch.extension.llm.export.builder import DType, LLMEdgeManager + + +class TestLLMEdgeManager(unittest.TestCase): + def setUp(self) -> None: + # Create a mock model + self.mock_model = MagicMock() + self.modelname = "test_model" + self.max_seq_len = 2048 + self.dtype = DType.fp32 + self.example_inputs = (torch.zeros((1, 10), dtype=torch.long),) + self.example_kwarg_inputs = {"input_pos": torch.tensor([0])} + + def test_get_dynamic_shape_with_preset_dynamic_shapes(self) -> None: + """Test that _get_dynamic_shape returns preset dynamic_shapes if available.""" + # Create a manager with preset dynamic_shapes + preset_dynamic_shapes = {"preset": "shapes"} + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=False, + example_inputs=self.example_inputs, + dynamic_shapes=preset_dynamic_shapes, + ) + + # Call _get_dynamic_shape and verify it returns the preset value + result = manager._get_dynamic_shape() + self.assertEqual(result, preset_dynamic_shapes) + + def test_get_dynamic_shape_with_dynamic_shape_enabled_no_kv_cache(self) -> None: + """Test _get_dynamic_shape when enable_dynamic_shape=True and use_kv_cache=False.""" + # Create a manager with enable_dynamic_shape=True and use_kv_cache=False + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=False, + example_inputs=self.example_inputs, + enable_dynamic_shape=True, + ) + + # Call _get_dynamic_shape + result = manager._get_dynamic_shape() + + # Verify the result has the expected structure + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 1) + self.assertIsInstance(result[0], dict) + self.assertIn(1, result[0]) + # Check that the value at key 1 is a torch.export.Dim with the correct max value + self.assertEqual(result[0][1].max, self.max_seq_len - 1) + + def test_get_dynamic_shape_with_dynamic_shape_enabled_with_kv_cache(self) -> None: + """Test _get_dynamic_shape when enable_dynamic_shape=True and use_kv_cache=True.""" + # Create a manager with enable_dynamic_shape=True and use_kv_cache=True + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=True, + example_inputs=self.example_inputs, + enable_dynamic_shape=True, + ) + + # Call _get_dynamic_shape + result = manager._get_dynamic_shape() + + # Verify the result has the expected structure + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 2) + + # Check first element (tokens dimension) + self.assertIsInstance(result[0], dict) + self.assertIn(1, result[0]) + self.assertEqual(result[0][1].max, self.max_seq_len - 1) + + # Check second element (input_pos dimension) + self.assertIsInstance(result[1], dict) + self.assertIn("input_pos", result[1]) + self.assertIsInstance(result[1]["input_pos"], dict) + self.assertIn(0, result[1]["input_pos"]) + self.assertEqual(result[1]["input_pos"][0], 1) + + def test_get_dynamic_shape_with_dynamic_shape_disabled(self) -> None: + """Test _get_dynamic_shape when enable_dynamic_shape=False.""" + # Create a manager with enable_dynamic_shape=False + manager = LLMEdgeManager( + model=self.mock_model, + modelname=self.modelname, + max_seq_len=self.max_seq_len, + dtype=self.dtype, + use_kv_cache=True, # Doesn't matter for this test + 
example_inputs=self.example_inputs, + enable_dynamic_shape=False, + ) + + # Call _get_dynamic_shape + result = manager._get_dynamic_shape() + + # Verify the result is None + self.assertIsNone(result) diff --git a/extension/llm/export/test_export_passes.py b/extension/llm/export/test_export_passes.py index 12ce18ebb79..b0c5af7e65f 100644 --- a/extension/llm/export/test_export_passes.py +++ b/extension/llm/export/test_export_passes.py @@ -10,10 +10,7 @@ class RemoveRedundantTransposesPassTest(unittest.TestCase): def _export(self, model, example_inputs): - exported_module = export_for_training( - model, - example_inputs, - ) + exported_module = export_for_training(model, example_inputs, strict=True) return exported_module.module() def _check(self, model, example_inputs, key, before_count, after_count): diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 993314ccd63..75c30cff71b 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -53,3 +53,7 @@ target_include_directories( extension_llm_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/llm/runner/irunner.h b/extension/llm/runner/irunner.h index 35d87e997a0..c3ed668a4be 100644 --- a/extension/llm/runner/irunner.h +++ b/extension/llm/runner/irunner.h @@ -6,42 +6,124 @@ * LICENSE file in the root directory of this source tree. */ -// An interface for LLM runners. Developers can create their own runner that -// implements their own load and generation logic to run the model. +// Interface for text generation runners. #pragma once +#include #include +#include #include #include -#include +#include namespace executorch { namespace extension { namespace llm { +// Configuration struct for generation parameters, fields should be sorted in +// alphabetic order +struct GenerationConfig { + // Whether to echo the input prompt in the output + bool echo = true; + + // Maximum number of new tokens to generate + // If the max_context_len metadata that's serialized in the .pte file exists, + // then the number of prompt tokens + max_new_tokens won't exceed + // max_context_len. If this field is -1, it means we will rely on + // max_context_len metadata and seq_len value. Check resolve_max_new_tokens + // for details. + int32_t max_new_tokens = -1; + + // Whether this is a warmup run (affects perf benchmarking) + bool warming = false; + + // Maximum number of total tokens + // If the .pte file contains the max_context_len metadata, it will override + // this value if it's smaller. If this field is -1, we will use the + // max_context_len metadata directly. Check resolve_max_new_tokens for + // details. + int32_t seq_len = -1; + + // Temperature for sampling (higher = more random) + float temperature = 0.8f; + + /** + * Resolve the maximum number of new tokens to generate based on constraints. + * + * This method calculates the maximum number of new tokens that can be + * generated considering both seq_len and max_new_tokens constraints, as well + * as the model's maximum context length and the number of tokens in the + * prompt. 
+ * + * @param max_context_len The maximum context length supported by the model + * @param num_prompt_tokens The number of tokens in the input prompt + * @return The resolved maximum number of new tokens to generate + */ + int32_t resolve_max_new_tokens( + int32_t max_context_len, + int32_t num_prompt_tokens) const { + int32_t result; + + if (seq_len == -1 && max_new_tokens == -1) { + // Both are -1, use max context len minus prompt tokens + result = max_context_len - num_prompt_tokens; + } else if (seq_len == -1 && max_new_tokens != -1) { + // Only max_new_tokens is specified + result = std::min(max_new_tokens, max_context_len - num_prompt_tokens); + } else if (seq_len != -1 && max_new_tokens == -1) { + // Only seq_len is specified + result = std::min(seq_len, max_context_len) - num_prompt_tokens; + } else { + // Both are specified + result = std::min( + std::min(seq_len, max_context_len) - num_prompt_tokens, + max_new_tokens); + } + + // Ensure result is not negative + return std::max(0, result); + } +}; + +// Base interface for LLM runners class ET_EXPERIMENTAL IRunner { public: virtual ~IRunner() = default; - // Checks if the model is loaded. + /** + * Check if the runner is loaded and ready for inference. + * + * @return true if the runner is loaded, false otherwise + */ virtual bool is_loaded() const = 0; - // Load the model and tokenizer. - virtual ::executorch::runtime::Error load() = 0; + /** + * Load the model and prepare for inference. + * + * @return Error::Ok if successful, an error otherwise + */ + virtual runtime::Error load() = 0; - // Generate the output tokens. - virtual ::executorch::runtime::Error generate( + /** + * Generate text based on the provided prompt and generation config. + * + * @param prompt The input prompt to generate from + * @param config Generation configuration parameters + * @param token_callback Callback function called for each generated token + * @param stats_callback Callback function for generation statistics + * @return Error::Ok if successful, an error otherwise + */ + virtual runtime::Error generate( const std::string& prompt, - int32_t seq_len, - std::function token_callback = {}, - std::function - stats_callback = {}, - bool echo = true, - bool warming = false) = 0; - - // Stop the generation. + const GenerationConfig& config, + std::function token_callback, + std::function stats_callback) = 0; + + /** + * Stop the generation process. + */ virtual void stop() = 0; }; diff --git a/extension/llm/runner/test/CMakeLists.txt b/extension/llm/runner/test/CMakeLists.txt new file mode 100644 index 00000000000..b17a318a080 --- /dev/null +++ b/extension/llm/runner/test/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +cmake_minimum_required(VERSION 3.19) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) 
+ +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) + +set(_test_srcs generation_config_test.cpp) + +et_cxx_test( + generation_config_test + SOURCES + ${_test_srcs} + EXTRA_LIBS + executorch +) diff --git a/extension/llm/runner/test/TARGETS b/extension/llm/runner/test/TARGETS new file mode 100644 index 00000000000..97de7abe9b1 --- /dev/null +++ b/extension/llm/runner/test/TARGETS @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/llm/runner/test/generation_config_test.cpp b/extension/llm/runner/test/generation_config_test.cpp new file mode 100644 index 00000000000..061f982c684 --- /dev/null +++ b/extension/llm/runner/test/generation_config_test.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +using namespace ::testing; +using executorch::extension::llm::GenerationConfig; + +class GenerationConfigTest : public Test {}; + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensBothDefault) { + // Test when both seq_len and max_new_tokens are -1 (default) + GenerationConfig config; + // Default values: seq_len = -1, max_new_tokens = -1 + + // max_context_len = 100, num_prompt_tokens = 20 + // Expected: max_context_len - num_prompt_tokens = 100 - 20 = 80 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 80); + + // max_context_len = 50, num_prompt_tokens = 30 + // Expected: max_context_len - num_prompt_tokens = 50 - 30 = 20 + EXPECT_EQ(config.resolve_max_new_tokens(50, 30), 20); + + // Edge case: num_prompt_tokens equals max_context_len + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(40, 40), 0); + + // Edge case: num_prompt_tokens exceeds max_context_len + // Expected: 0 (no tokens left, and we ensure non-negative result) + EXPECT_EQ(config.resolve_max_new_tokens(30, 50), 0); +} + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensOnlyMaxNewTokens) { + // Test when only max_new_tokens is specified (seq_len = -1) + GenerationConfig config; + config.seq_len = -1; + config.max_new_tokens = 25; + + // max_context_len = 100, num_prompt_tokens = 20 + // Available tokens: 100 - 20 = 80 + // Expected: min(max_new_tokens, available) = min(25, 80) = 25 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 25); + + // max_context_len = 50, num_prompt_tokens = 40 + // Available tokens: 50 - 40 = 10 + // Expected: min(max_new_tokens, available) = min(25, 10) = 10 + EXPECT_EQ(config.resolve_max_new_tokens(50, 40), 10); + + // Edge case: num_prompt_tokens equals max_context_len + // Available tokens: 0 + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(40, 40), 0); +} + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensOnlySeqLen) { + // Test when only seq_len is specified (max_new_tokens = -1) + GenerationConfig config; + config.seq_len = 50; + config.max_new_tokens = -1; + + // max_context_len = 100, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 100) = 
50 + // Expected: effective_seq_len - num_prompt_tokens = 50 - 20 = 30 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 30); + + // max_context_len = 40, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 40) = 40 + // Expected: effective_seq_len - num_prompt_tokens = 40 - 20 = 20 + EXPECT_EQ(config.resolve_max_new_tokens(40, 20), 20); + + // Edge case: num_prompt_tokens equals effective seq_len + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(100, 50), 0); + + // Edge case: num_prompt_tokens exceeds effective seq_len + // Expected: 0 (no tokens left, and we ensure non-negative result) + EXPECT_EQ(config.resolve_max_new_tokens(100, 60), 0); +} + +TEST_F(GenerationConfigTest, TestResolveMaxNewTokensBothSpecified) { + // Test when both seq_len and max_new_tokens are specified + GenerationConfig config; + config.seq_len = 50; + config.max_new_tokens = 25; + + // max_context_len = 100, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 100) = 50 + // Available tokens: effective_seq_len - num_prompt_tokens = 50 - 20 = 30 + // Expected: min(max_new_tokens, available) = min(25, 30) = 25 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 25); + + // max_context_len = 40, num_prompt_tokens = 20 + // Effective seq_len: min(seq_len, max_context_len) = min(50, 40) = 40 + // Available tokens: effective_seq_len - num_prompt_tokens = 40 - 20 = 20 + // Expected: min(max_new_tokens, available) = min(25, 20) = 20 + EXPECT_EQ(config.resolve_max_new_tokens(40, 20), 20); + + // Edge case: num_prompt_tokens equals effective seq_len + // Available tokens: 0 + // Expected: 0 (no tokens left) + EXPECT_EQ(config.resolve_max_new_tokens(40, 40), 0); + + // Edge case: max_new_tokens is very small + config.max_new_tokens = 5; + // Available tokens: 50 - 20 = 30 + // Expected: min(max_new_tokens, available) = min(5, 30) = 5 + EXPECT_EQ(config.resolve_max_new_tokens(100, 20), 5); +} diff --git a/extension/llm/runner/test/targets.bzl b/extension/llm/runner/test/targets.bzl new file mode 100644 index 00000000000..9cdaad990bb --- /dev/null +++ b/extension/llm/runner/test/targets.bzl @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_test( + name = "generation_config_test", + srcs = ["generation_config_test.cpp"], + deps = [ + "//executorch/extension/llm/runner:irunner", + "//executorch/extension/llm/runner:stats", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 7a546574e37..8705dfeb842 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -21,18 +21,8 @@ namespace llm { // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. 
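The hunks above replace IRunner's old flat argument list (prompt, seq_len, callbacks, echo, warming) with a single GenerationConfig. As a minimal sketch of how a caller might drive the new interface; the concrete runner instance and the exact callback parameter types are assumptions for illustration, not taken from this patch:

```cpp
#include <iostream>
#include <string>

#include <executorch/extension/llm/runner/irunner.h>

using executorch::extension::llm::GenerationConfig;
using executorch::extension::llm::IRunner;

// Sketch only: assumes `runner` is some concrete IRunner implementation and
// that the token/stats callbacks accept the argument types shown here.
void run_prompt(IRunner& runner, const std::string& prompt) {
  GenerationConfig config;
  config.max_new_tokens = 128;  // cap on newly generated tokens
  config.seq_len = -1;          // defer to the model's max_context_len metadata
  config.temperature = 0.8f;    // sampling temperature is now a per-request knob
  config.echo = true;           // stream the prompt back before new tokens

  if (runner.load() != executorch::runtime::Error::Ok) {
    return;  // a real caller would surface the error
  }
  runner.generate(
      prompt,
      config,
      [](const std::string& piece) { std::cout << piece; },  // token callback
      {});  // stats callback left empty
}
```

Grouping the knobs in a config struct also means future options can be added without touching every generate() call site.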
-TextDecoderRunner::TextDecoderRunner( - Module* module, - bool use_kv_cache, - int32_t vocab_size, - float temperature) - : module_(module), - sampler_(std::make_unique( - vocab_size, - temperature, - kTopp, - static_cast(std::time(nullptr)))), - use_kv_cache_(use_kv_cache) {} +TextDecoderRunner::TextDecoderRunner(Module* module, bool use_kv_cache) + : module_(module), use_kv_cache_(use_kv_cache) {} // This function is functional, meaning it shouldn't modify any state of the // input. It should be safe to call multiple times with the same inputs. The diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index ca4d127e516..b0db48ee75e 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -22,11 +22,7 @@ namespace llm { class ET_EXPERIMENTAL TextDecoderRunner { public: - TextDecoderRunner( - Module* module, - bool use_kv_cache, - int32_t vocab_size, - float temperature); + TextDecoderRunner(Module* module, bool use_kv_cache); virtual ~TextDecoderRunner() = default; @@ -64,10 +60,13 @@ class ET_EXPERIMENTAL TextDecoderRunner { /** * Sample the next token from the logits tensor. * @param logits_tensor The logits tensor. + * @param temperature The temperature parameter used to control randomness in + * sampling. * @return The next token. */ inline int32_t logits_to_token( - const executorch::aten::Tensor& logits_tensor) { + const executorch::aten::Tensor& logits_tensor, + const float temperature = 0.0f) { int32_t result = 0; ET_SWITCH_THREE_TYPES( Float, @@ -82,15 +81,14 @@ class ET_EXPERIMENTAL TextDecoderRunner { // vocab_size], get the last logits, sample and return. Else the model // outputs the last logit, directly sample and return. auto* logits = logits_tensor.mutable_data_ptr(); + ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); if (logits_tensor.dim() == 3) { auto num_tokens = logits_tensor.size(1); - auto vocab_size = logits_tensor.size(2); - auto* logits_last = logits; - logits_last += (num_tokens - 1) * vocab_size; - result = sampler_->sample(logits_last); - } else { - result = sampler_->sample(logits); + logits += (num_tokens - 1) * vocab_size; } + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + Sampler sampler(vocab_size, temperature); + result = sampler.sample(logits); }); return result; } @@ -98,7 +96,6 @@ class ET_EXPERIMENTAL TextDecoderRunner { protected: // TODO: use shared_ptr for module Module* module_; - std::unique_ptr sampler_; bool use_kv_cache_; bool should_stop_{false}; }; diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index e8bf891f8ec..1b928de1717 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -38,16 +38,20 @@ class ET_EXPERIMENTAL TextTokenGenerator { * prefill. * @param start_pos the start position of the new tokens, based on how many * prompt tokens is prefilled. - * @param seq_len the total sequence length, including the prompt tokens, next - * token from prefill and new tokens. + * @param max_new_tokens Maximum number of new tokens to generate. + * @param temperature controls the randomness of predictions by scaling the + * logits before applying softmax. A higher temperature results in more + * random predictions, while a lower temperature results in more deterministic + * predictions. * @param token_callback what to do after a token is generated. * @return how many tokens are generated. 
*/ inline ::executorch::runtime::Result generate( std::vector tokens, int64_t start_pos, - int32_t seq_len, - std::function token_callback) { + int32_t max_new_tokens, + float temperature = 0.0f, + const std::function& token_callback = {}) { ET_CHECK_MSG( !tokens.empty(), "Token generation loop shouldn't take empty tokens"); int64_t pos = start_pos; // position in the sequence @@ -78,7 +82,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { should_stop_ = false; // Generate our tokens - while (pos < seq_len - 1) { + while (pos < start_pos + max_new_tokens) { // Run the model auto logits_res = text_decoder_runner_->step(tokens_managed, start_pos_managed); @@ -89,7 +93,8 @@ class ET_EXPERIMENTAL TextTokenGenerator { prev_token = cur_token; stats_->on_sampling_begin(); - cur_token = text_decoder_runner_->logits_to_token(logits_tensor); + cur_token = + text_decoder_runner_->logits_to_token(logits_tensor, temperature); stats_->on_sampling_end(); pos++; diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index f7b48433cfb..0cb2463d163 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -68,9 +68,9 @@ ET_EXPERIMENTAL void inline safe_printf(const char* piece) { ET_EXPERIMENTAL long inline time_in_ms() { // return time in milliseconds, for benchmarking the model speed struct timespec time; - // The `timespec_get` function is only available on Android API levels - // 29 or later. -#if defined(__ANDROID_API__) && __ANDROID_API__ < 29 + // The `timespec_get` function is for windows time access. Some AOSP OS does + // not have timespec_get support. +#if defined(__ANDROID_API__) clock_gettime(CLOCK_REALTIME, &time); #else timespec_get(&time, TIME_UTC); diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index 71167bf9cf4..295ee78e4b0 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit 71167bf9cf4bed861eb9547d1d77e993fd1004f1 +Subproject commit 295ee78e4b0d99d4527bbe81bc3156341366de11 diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 400a2c45049..6c534b8d560 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -37,6 +37,9 @@ namespace executorch { namespace extension { +using ET_RUNTIME_NAMESPACE::MethodMeta; +using ET_RUNTIME_NAMESPACE::Program; + namespace { runtime::Result> load_file( const std::string& file_path, @@ -113,7 +116,7 @@ Module::Module( } Module::Module( - std::shared_ptr program, + std::shared_ptr program, std::unique_ptr memory_allocator, std::unique_ptr temp_allocator, std::unique_ptr event_tracer, @@ -131,7 +134,7 @@ Module::Module( runtime::runtime_init(); } -runtime::Error Module::load(const runtime::Program::Verification verification) { +runtime::Error Module::load(const Program::Verification verification) { if (!is_loaded()) { // Load the program if (!data_loader_) { @@ -156,10 +159,10 @@ runtime::Error Module::load(const runtime::Program::Verification verification) { } // else: either the map itself was provided or we have no data map, either // way no work to do. 
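To make the new token-budget semantics above concrete: the generator's loop bound changes from `pos < seq_len - 1` to `pos < start_pos + max_new_tokens`, with the budget coming from `GenerationConfig::resolve_max_new_tokens`. A small self-contained sketch (the numbers are arbitrary; only the header-only GenerationConfig from this patch is used):

```cpp
#include <cstdint>
#include <cstdio>

#include <executorch/extension/llm/runner/irunner.h>

int main() {
  executorch::extension::llm::GenerationConfig config;
  config.seq_len = 128;        // caller asks for at most 128 total tokens
  config.max_new_tokens = -1;  // no explicit cap on new tokens

  const int32_t max_context_len = 2048;  // e.g. read from .pte metadata
  const int32_t num_prompt_tokens = 20;

  // Budget the runner would hand to the token generator:
  // min(seq_len, max_context_len) - num_prompt_tokens = 128 - 20 = 108.
  const int32_t max_new_tokens =
      config.resolve_max_new_tokens(max_context_len, num_prompt_tokens);
  std::printf("max_new_tokens = %d\n", static_cast<int>(max_new_tokens));

  // The generation loop then runs while pos < start_pos + max_new_tokens,
  // i.e. up to position 20 + 108 = 128 in this example.
  const int32_t start_pos = num_prompt_tokens;
  std::printf("loop bound = %d\n", static_cast<int>(start_pos + max_new_tokens));
  return 0;
}
```

The per-request temperature rides the same path: the caller's `config.temperature` is forwarded into `generate()`, and `logits_to_token()` constructs a Sampler with it at each step instead of fixing the temperature when the decoder runner is built.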
- auto program = ET_UNWRAP_UNIQUE( - runtime::Program::load(data_loader_.get(), verification)); - program_ = std::shared_ptr( - program.release(), [](runtime::Program* pointer) { delete pointer; }); + auto program = + ET_UNWRAP_UNIQUE(Program::load(data_loader_.get(), verification)); + program_ = std::shared_ptr( + program.release(), [](Program* pointer) { delete pointer; }); } return runtime::Error::Ok; } @@ -224,7 +227,7 @@ runtime::Error Module::load_method( return runtime::Error::Ok; } -runtime::Result Module::method_meta( +runtime::Result Module::method_meta( const std::string& method_name) { ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); return methods_.at(method_name).method->method_meta(); diff --git a/extension/module/module.h b/extension/module/module.h index 45d2cc1d14b..73c7328ee0a 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -19,6 +19,11 @@ namespace executorch { namespace extension { +using ET_RUNTIME_NAMESPACE::Method; +using ET_RUNTIME_NAMESPACE::MethodMeta; +using ET_RUNTIME_NAMESPACE::NamedDataMap; +using ET_RUNTIME_NAMESPACE::Program; + /** * A facade class for loading programs and executing methods within them. */ @@ -95,7 +100,7 @@ class Module { * @param[in] data_map_loader A DataLoader used for loading external weights. */ explicit Module( - std::shared_ptr program, + std::shared_ptr program, std::unique_ptr memory_allocator = nullptr, std::unique_ptr temp_allocator = nullptr, std::unique_ptr event_tracer = nullptr, @@ -116,8 +121,8 @@ class Module { */ ET_NODISCARD runtime::Error load( - const runtime::Program::Verification verification = - runtime::Program::Verification::Minimal); + const Program::Verification verification = + Program::Verification::Minimal); /** * Checks if the program is loaded. @@ -134,7 +139,7 @@ class Module { * * @returns Shared pointer to the program or nullptr if it's not yet loaded. */ - inline std::shared_ptr program() const { + inline std::shared_ptr program() const { return program_; } @@ -224,8 +229,7 @@ class Module { * @returns A method metadata, or an error if the program or method failed to * load. 
*/ - runtime::Result method_meta( - const std::string& method_name); + runtime::Result method_meta(const std::string& method_name); /** * Execute a specific method with the given input values and retrieve the @@ -473,20 +477,20 @@ class Module { std::vector> planned_spans; std::unique_ptr planned_memory; std::unique_ptr memory_manager; - std::unique_ptr method; + std::unique_ptr method; std::vector inputs; }; std::string file_path_; std::string data_map_path_; LoadMode load_mode_{LoadMode::MmapUseMlock}; - std::shared_ptr program_; + std::shared_ptr program_; std::unique_ptr data_loader_; std::unique_ptr memory_allocator_; std::unique_ptr temp_allocator_; std::unique_ptr event_tracer_; std::unique_ptr data_map_loader_; - std::unique_ptr data_map_; + std::unique_ptr data_map_; protected: std::unordered_map methods_; diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 09a610a1fca..d8019ce9c4e 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -25,7 +25,7 @@ def define_common_targets(): "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/data_loader:file_data_loader", "//executorch/extension/data_loader:mmap_data_loader", - "//executorch/extension/flat_tensor:flat_tensor_data_map", + "//executorch/extension/flat_tensor:flat_tensor_data_map" + aten_suffix, ], exported_deps = [ "//executorch/runtime/executor:program" + aten_suffix, diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index a998e591f30..69952c5d173 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -84,27 +84,27 @@ void et_pal_emit_log_message( } namespace py = pybind11; -using executorch::bundled_program::verify_method_outputs; +using executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs; +using ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface; +using ::executorch::ET_RUNTIME_NAMESPACE::get_backend_class; +using ::executorch::ET_RUNTIME_NAMESPACE::get_backend_name; +using ::executorch::ET_RUNTIME_NAMESPACE::get_num_registered_backends; +using ::executorch::ET_RUNTIME_NAMESPACE::get_registered_kernels; +using ::executorch::ET_RUNTIME_NAMESPACE::Kernel; +using ::executorch::ET_RUNTIME_NAMESPACE::Method; +using ::executorch::ET_RUNTIME_NAMESPACE::Program; using ::executorch::extension::BufferDataLoader; using ::executorch::extension::MallocMemoryAllocator; using ::executorch::extension::MmapDataLoader; using ::executorch::runtime::ArrayRef; -using ::executorch::runtime::BackendInterface; using ::executorch::runtime::DataLoader; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerDebugLogLevel; -using ::executorch::runtime::get_backend_class; -using ::executorch::runtime::get_backend_name; -using ::executorch::runtime::get_num_registered_backends; -using ::executorch::runtime::get_registered_kernels; using ::executorch::runtime::HierarchicalAllocator; -using ::executorch::runtime::Kernel; using ::executorch::runtime::MemoryAllocator; using ::executorch::runtime::MemoryManager; -using ::executorch::runtime::Method; using ::executorch::runtime::prof_result_t; -using ::executorch::runtime::Program; using ::executorch::runtime::Result; using ::executorch::runtime::Span; using ::executorch::runtime::Tag; @@ -826,7 +826,7 @@ struct PyModule final { const std::string method_name, size_t testset_idx) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = 
executorch::bundled_program::load_bundled_input( + Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( module_->get_method(method_name), bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, @@ -842,14 +842,14 @@ struct PyModule final { double atol = 1e-8) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); auto& method = module_->get_method(method_name); - Error status = executorch::bundled_program::load_bundled_input( + Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input( method, bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); py::list outputs = plan_execute(method_name); - status = executorch::bundled_program::verify_method_outputs( + status = executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs( method, bundled_program_ptr, testset_idx, rtol, atol); THROW_IF_ERROR( status, diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index 64ea14f08ff..7aede1c29a9 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -161,7 +161,7 @@ def _load_for_executorch( Args: path: File path to the ExecuTorch program as a string. enable_etdump: If true, enables an ETDump which can store profiling information. - See documentation at https://pytorch.org/executorch/stable/etdump.html + See documentation at https://pytorch.org/executorch/main/etdump for how to use it. debug_buffer_size: If non-zero, enables a debug buffer which can store intermediate results of each instruction in the ExecuTorch program. @@ -192,7 +192,7 @@ def _load_for_executorch_from_bundled_program( ) -> ExecuTorchModule: """Same as _load_for_executorch, but takes a bundled program instead of a file path. - See https://pytorch.org/executorch/stable/bundled-io.html for documentation. + See https://pytorch.org/executorch/main/bundled-io for documentation. .. warning:: diff --git a/extension/runner_util/inputs.cpp b/extension/runner_util/inputs.cpp index 11cd176b5d1..842ba25532f 100644 --- a/extension/runner_util/inputs.cpp +++ b/extension/runner_util/inputs.cpp @@ -12,12 +12,12 @@ #include #include +using executorch::ET_RUNTIME_NAMESPACE::Method; +using executorch::ET_RUNTIME_NAMESPACE::MethodMeta; +using executorch::ET_RUNTIME_NAMESPACE::TensorInfo; using executorch::runtime::Error; -using executorch::runtime::Method; -using executorch::runtime::MethodMeta; using executorch::runtime::Result; using executorch::runtime::Tag; -using executorch::runtime::TensorInfo; namespace executorch { namespace extension { diff --git a/extension/runner_util/inputs.h b/extension/runner_util/inputs.h index 73722c0d7bf..214b76d67e3 100644 --- a/extension/runner_util/inputs.h +++ b/extension/runner_util/inputs.h @@ -15,6 +15,9 @@ namespace executorch { namespace extension { +using ::executorch::ET_RUNTIME_NAMESPACE::Method; +using ::executorch::ET_RUNTIME_NAMESPACE::TensorInfo; + /** * RAII helper that frees a set of buffers when destroyed. Movable. */ @@ -80,7 +83,7 @@ struct PrepareInputTensorsOptions { * @returns An error on failure. */ executorch::runtime::Result prepare_input_tensors( - executorch::runtime::Method& method, + Method& method, PrepareInputTensorsOptions options = {}); namespace internal { @@ -89,8 +92,8 @@ namespace internal { * fills it with ones, and sets the input at `input_index`. 
*/ executorch::runtime::Error fill_and_set_input( - executorch::runtime::Method& method, - executorch::runtime::TensorInfo& tensor_meta, + Method& method, + TensorInfo& tensor_meta, size_t input_index, void* data_ptr); } // namespace internal diff --git a/extension/runner_util/inputs_aten.cpp b/extension/runner_util/inputs_aten.cpp index 83d12dac42d..b89562a2f69 100644 --- a/extension/runner_util/inputs_aten.cpp +++ b/extension/runner_util/inputs_aten.cpp @@ -15,8 +15,8 @@ #include using executorch::runtime::Error; -using executorch::runtime::Method; -using executorch::runtime::TensorInfo; +using executorch::runtime::aten::Method; +using executorch::runtime::aten::TensorInfo; namespace executorch { namespace extension { diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index c1742fc599b..8a35e83a526 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -188,7 +188,7 @@ TensorPtr clone_tensor_ptr(const executorch::aten::Tensor& tensor) { runtime::Error resize_tensor_ptr( TensorPtr& tensor, const std::vector& sizes) { - return runtime::resize_tensor( + return ET_RUNTIME_NAMESPACE::resize_tensor( *tensor, executorch::aten::ArrayRef( sizes.data(), sizes.size())); diff --git a/install_executorch.py b/install_executorch.py index 1d3fe8af1fb..4c7b51ef239 100644 --- a/install_executorch.py +++ b/install_executorch.py @@ -120,8 +120,10 @@ def check_folder(folder: str, file: str) -> bool: if missing_submodules: logger.warning("Some required submodules are missing. Updating submodules...") try: - subprocess.check_call(["git", "submodule", "sync"]) - subprocess.check_call(["git", "submodule", "update", "--init"]) + subprocess.check_call(["git", "submodule", "sync", "--recursive"]) + subprocess.check_call( + ["git", "submodule", "update", "--init", "--recursive"] + ) except subprocess.CalledProcessError as e: logger.error(f"Error updating submodules: {e}") exit(1) @@ -130,13 +132,10 @@ def check_folder(folder: str, file: str) -> bool: for path, file in missing_submodules.items(): if not check_folder(path, file): logger.error(f"{file} not found in {path}.") - logger.error("Please run `git submodule update --init`.") + logger.error( + "Submodule update failed. Please run `git submodule update --init --recursive` manually." + ) exit(1) - # Go into tokenizers submodule and install its submodules - tokenizers_path = get_required_submodule_paths().get("tokenizers", None) - if tokenizers_path: - with pushd(tokenizers_path): - subprocess.check_call(["git", "submodule", "update", "--init"]) logger.info("All required submodules are present.") diff --git a/kernels/README.md b/kernels/README.md index e1c6d02afa8..58931beb984 100644 --- a/kernels/README.md +++ b/kernels/README.md @@ -351,9 +351,11 @@ Once you have your operator and corresponding tests in place, we can try it out. cmake . 
\ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -369,6 +371,7 @@ cmake --build cmake-out -j9 --target install ``` ./cmake-out/kernels/test/portable_kernels_test ./cmake-out/kernels/test/optimized_kernels_test +./cmake-out/kernels/test/quantized_kernels_test ``` #### Implementation restrictions diff --git a/kernels/aten/cpu/op__empty_dim_order.cpp b/kernels/aten/cpu/op__empty_dim_order.cpp index e75963a9c4e..654b29c778d 100644 --- a/kernels/aten/cpu/op__empty_dim_order.cpp +++ b/kernels/aten/cpu/op__empty_dim_order.cpp @@ -102,7 +102,7 @@ Tensor& _empty_dim_order_out( IntArrayRef size, OptionalIntArrayRef dim_order, Tensor& out) { - executorch::runtime::KernelRuntimeContext ctx{}; + KernelRuntimeContext ctx{}; return _empty_dim_order_out(ctx, size, dim_order, out); } diff --git a/kernels/aten/cpu/op__to_dim_order_copy.cpp b/kernels/aten/cpu/op__to_dim_order_copy.cpp index 10793d24db5..a8216c9a8e9 100644 --- a/kernels/aten/cpu/op__to_dim_order_copy.cpp +++ b/kernels/aten/cpu/op__to_dim_order_copy.cpp @@ -116,7 +116,7 @@ Tensor& _to_dim_order_copy_out( bool non_blocking, OptionalArrayRef dim_order, Tensor& out) { - executorch::runtime::KernelRuntimeContext ctx{}; + KernelRuntimeContext ctx{}; return _to_dim_order_copy_out(ctx, self, non_blocking, dim_order, out); } diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index a8fa6611478..28f1a215562 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -6,6 +6,8 @@ - op: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out +- op: _fft_c2r.out + - op: _fft_r2c.out - op: _linalg_det.result @@ -423,6 +425,8 @@ - op: var.out +- op: view_as_real_copy.out + - op: view_copy.out - op: where.self_out diff --git a/kernels/optimized/cpu/fft_utils.h b/kernels/optimized/cpu/fft_utils.h new file mode 100644 index 00000000000..2225e8ddfa7 --- /dev/null +++ b/kernels/optimized/cpu/fft_utils.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace torch::executor::native { + +// TODO: contents of this anonymous namespace are copy/pasted from +// PyTorch core (aten/src/ATen/native/mkl/SpectralOps.cpp). Small +// portions (the parts that don't depend on Tensor) could be reused; +// refactor to enable that once we can share headers from PyTorch +// core. +namespace { +inline pocketfft::stride_t stride_from_tensor(const Tensor& t) { + pocketfft::stride_t stride(t.strides().begin(), t.strides().end()); + for (auto& s : stride) { + s *= t.element_size(); + } + return stride; +} + +inline pocketfft::shape_t shape_from_tensor(const Tensor& t) { + return pocketfft::shape_t(t.sizes().begin(), t.sizes().end()); +} + +// NOTE: The reinterpret_cast in tensor_cdata is UB, but it's what +// PyTorch core does and I'm not aware of a portable way to do this +// that doesn't rely on UB. 
+template +inline std::complex* tensor_cdata(Tensor& t) { + return reinterpret_cast*>( + t.data_ptr>()); +} + +template +inline const std::complex* tensor_cdata(const Tensor& t) { + return reinterpret_cast*>( + t.const_data_ptr>()); +} + +// NOTE: in particular this is in ATen/native/SpectralOpsUtils.h and +// could be shared immediately. +enum class fft_norm_mode { + none, // No normalization + by_root_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +// NOTE: slight fork from upstream PyTorch to use ET_KERNEL_CHECK; +// upstream with TORCH_CHECK will be fine to use once we have code +// sharing. +template +std::optional +compute_fct(KernelRuntimeContext& ctx, int64_t size, int64_t normalization) { + constexpr auto one = static_cast(1); + switch (static_cast(normalization)) { + case fft_norm_mode::none: + return one; + case fft_norm_mode::by_n: + return one / static_cast(size); + case fft_norm_mode::by_root_n: + return one / std::sqrt(static_cast(size)); + } + ET_KERNEL_CHECK_MSG( + ctx, + false, + InvalidArgument, + std::nullopt, + "Unsupported normalization type: %" PRId64, + normalization); +} + +template +std::optional compute_fct( + KernelRuntimeContext& ctx, + const Tensor& t, + IntArrayRef dim, + int64_t normalization) { + if (static_cast(normalization) == fft_norm_mode::none) { + return static_cast(1); + } + const auto& sizes = t.sizes(); + int64_t n = 1; + for (auto idx : dim) { + n *= sizes[idx]; + } + return compute_fct(ctx, n, normalization); +} +} // namespace + +} // namespace torch::executor::native diff --git a/kernels/optimized/cpu/op_bmm.cpp b/kernels/optimized/cpu/op_bmm.cpp index 5e7fa1dd839..11697f9b0de 100644 --- a/kernels/optimized/cpu/op_bmm.cpp +++ b/kernels/optimized/cpu/op_bmm.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ -#include - #include +#include +#include // Performs a batch matrix-matrix product of matrices stored in input and mat2. @@ -136,33 +136,32 @@ Error resize_out_tensor(const Tensor& self, const Tensor& mat2, Tensor& out) { // bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) Tensor& opt_bmm_out( - KernelRuntimeContext& context, + KernelRuntimeContext& ctx, const Tensor& self, const Tensor& mat2, Tensor& out) { - (void)context; + (void)ctx; ET_KERNEL_CHECK( - context, + ctx, resize_out_tensor(self, mat2, out) == Error::Ok, InvalidArgument, out); ET_KERNEL_CHECK( - context, check_bmm_out_args(self, mat2, out), InvalidArgument, out); - -#define BMM_TENSOR(ctype, dtype) \ - case ScalarType::dtype: \ - bmm_kernel(self, mat2, out); \ - break; - - auto scalar_type = self.scalar_type(); - switch (scalar_type) { - ET_FORALL_REAL_TYPES_AND(Half, BMM_TENSOR) - default: - ET_CHECK_MSG( - false, "Unhandled dtype %" PRId8, static_cast(scalar_type)); + ctx, check_bmm_out_args(self, mat2, out), InvalidArgument, out); + + constexpr auto name = "bmm.out"; + auto self_type = self.scalar_type(); + + if (executorch::runtime::isComplexType(self_type)) { + ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, name, CTYPE, [&]() { + internal::bmm_out_impl(self, mat2, out); + }); + } else { + ET_SWITCH_REALH_TYPES(self_type, ctx, name, CTYPE, [&]() { + bmm_kernel(self, mat2, out); + }); } -#undef BMM_TENSOR return out; } diff --git a/kernels/optimized/cpu/op_fft_c2r.cpp b/kernels/optimized/cpu/op_fft_c2r.cpp new file mode 100644 index 00000000000..f595b5f7299 --- /dev/null +++ b/kernels/optimized/cpu/op_fft_c2r.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace torch::executor::native { +Tensor& opt_fft_c2r_out( + KernelRuntimeContext& ctx, + const Tensor& in, + IntArrayRef dim, + int64_t normalization, + int64_t last_dim_size, + Tensor& out) { + auto in_sizes = in.sizes(); + ET_KERNEL_CHECK(ctx, in.dim() <= kTensorDimensionLimit, InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, !dim.empty(), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, last_dim_size >= 1, InvalidArgument, out); + + // Determine the output size + std::array out_sizes_storage{}; + executorch::runtime::Span out_sizes( + out_sizes_storage.data(), in_sizes.size()); + std::copy(in_sizes.begin(), in_sizes.end(), out_sizes.begin()); + out_sizes[dim.back()] = last_dim_size; + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK_MSG( + ctx, + in.scalar_type() == executorch::runtime::toComplexType(out.scalar_type()), + InvalidArgument, + out, + "the input type for _fft_c2r must be the Complex type corresponding to the output type"); + + for (auto d : dim) { + ET_KERNEL_CHECK_MSG( + ctx, + d >= 0 && d < in.dim(), + InvalidArgument, + out, + "dims must be in bounds (got %" PRId64 ")", + d); + } + + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor( + out, + executorch::runtime::ArrayRef( + out_sizes.data(), out_sizes.size())) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor (last dim %d).", + out_sizes[dim.back()]); + + pocketfft::shape_t axes(dim.begin(), dim.end()); + auto out_shape = shape_from_tensor(out); + // TODO: if arbitrary strides are a possibility, we need to validate + // these, because pocketfft README says "Strides that lead to + // multiple accesses of the same memory address are not allowed." + auto in_stride = stride_from_tensor(in); + auto out_stride = stride_from_tensor(out); + // NOTE: as of this writing, upstream PyTorch only supports + // float/double, so we follow suit. + ET_SWITCH_FLOAT_TYPES(out.scalar_type(), ctx, "_fft_c2r.out", CTYPE_OUT, [&] { + auto fct = compute_fct(ctx, out, dim, normalization); + if (!fct) { + // Check failed, just bail out of the lambda. + return; + } + pocketfft::c2r( + out_shape, + in_stride, + out_stride, + axes, + false /* forward */, + tensor_cdata(in), + out.mutable_data_ptr(), + *fct); + }); + return out; +} +} // namespace torch::executor::native diff --git a/kernels/optimized/cpu/op_fft_r2c.cpp b/kernels/optimized/cpu/op_fft_r2c.cpp index 45d3d9acb42..750a7e8f0a2 100644 --- a/kernels/optimized/cpu/op_fft_r2c.cpp +++ b/kernels/optimized/cpu/op_fft_r2c.cpp @@ -6,99 +6,10 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include - -#include - -#include namespace torch::executor::native { - -// TODO: contents of this anonymous namespace are copy/pasted from -// PyTorch core (aten/src/ATen/native/mkl/SpectralOps.cpp). Small -// portions (the parts that don't depend on Tensor) could be reused; -// refactor to enable that once we can share headers from PyTorch -// core. 
-namespace { -pocketfft::stride_t stride_from_tensor(const Tensor& t) { - pocketfft::stride_t stride(t.strides().begin(), t.strides().end()); - for (auto& s : stride) { - s *= t.element_size(); - } - return stride; -} - -pocketfft::shape_t shape_from_tensor(const Tensor& t) { - return pocketfft::shape_t(t.sizes().begin(), t.sizes().end()); -} - -// NOTE: The reinterpret_cast in tensor_cdata is UB, but it's what -// PyTorch core does and I'm not aware of a portable way to do this -// that doesn't rely on UB. -template -inline std::complex* tensor_cdata(Tensor& t) { - return reinterpret_cast*>( - t.data_ptr>()); -} - -template -inline const std::complex* tensor_cdata(const Tensor& t) { - return reinterpret_cast*>( - t.const_data_ptr>()); -} - -// NOTE: in particular this is in ATen/native/SpectralOpsUtils.h and -// could be shared immediately. -enum class fft_norm_mode { - none, // No normalization - by_root_n, // Divide by sqrt(signal_size) - by_n, // Divide by signal_size -}; - -// NOTE: slight fork from upstream PyTorch to use ET_KERNEL_CHECK; -// upstream with TORCH_CHECK will be fine to use once we have code -// sharing. -template -std::optional -compute_fct(KernelRuntimeContext& ctx, int64_t size, int64_t normalization) { - constexpr auto one = static_cast(1); - switch (static_cast(normalization)) { - case fft_norm_mode::none: - return one; - case fft_norm_mode::by_n: - return one / static_cast(size); - case fft_norm_mode::by_root_n: - return one / std::sqrt(static_cast(size)); - } - ET_KERNEL_CHECK_MSG( - ctx, - false, - InvalidArgument, - std::nullopt, - "Unsupported normalization type: %" PRId64, - normalization); -} - -template -std::optional compute_fct( - KernelRuntimeContext& ctx, - const Tensor& t, - IntArrayRef dim, - int64_t normalization) { - if (static_cast(normalization) == fft_norm_mode::none) { - return static_cast(1); - } - const auto& sizes = t.sizes(); - int64_t n = 1; - for (auto idx : dim) { - n *= sizes[idx]; - } - return compute_fct(ctx, n, normalization); -} - -} // namespace - Tensor& opt_fft_r2c_out( KernelRuntimeContext& ctx, const Tensor& in, diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index c9da2584f08..7406cc21b53 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -15,6 +15,7 @@ _OPTIMIZED_ATEN_OPS = ( name = "op_bmm", deps = [ "//executorch/kernels/optimized:libblas", + "//executorch/kernels/portable/cpu/util:matmul_ops_util", ], ), op_target( @@ -34,13 +35,21 @@ _OPTIMIZED_ATEN_OPS = ( ], ), op_target(name = "op_exp"), + op_target( + name = "op_fft_c2r", + compiler_flags = [] if runtime.is_oss else [ + "-Wno-global-constructors", + "-Wno-shadow", + ], + deps = [":fft_utils"], + ), op_target( name = "op_fft_r2c", compiler_flags = [] if runtime.is_oss else [ "-Wno-global-constructors", "-Wno-shadow", ], - deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"], + deps = [":fft_utils"], ), op_target(name = "op_sigmoid"), op_target( @@ -142,6 +151,14 @@ def define_common_targets(): exported_deps = ["//executorch/runtime/core:core"], ) + runtime.cxx_library( + name = "fft_utils", + srcs = [], + exported_headers = ["fft_utils.h"], + visibility = ["//executorch/kernels/optimized/cpu/..."], + exported_deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"], + ) + runtime.cxx_library( name = "binary_ops", exported_headers = ["binary_ops.h"], diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml index 
864c3ed5780..42a065f63ed 100644 --- a/kernels/optimized/optimized.yaml +++ b/kernels/optimized/optimized.yaml @@ -2,6 +2,11 @@ # # This yaml file contains operators that have optimized kernels available. +- op: _fft_c2r.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_fft_c2r_out + - op: _fft_r2c.out kernels: - arg_meta: null diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp index 40ce86e8fdc..70fc3507f05 100644 --- a/kernels/portable/cpu/op__to_dim_order_copy.cpp +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -125,7 +125,7 @@ Tensor& _to_dim_order_copy_out( bool non_blocking, OptionalArrayRef dim_order, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return _to_dim_order_copy_out(context, self, non_blocking, dim_order, out); } diff --git a/kernels/portable/cpu/op_bmm.cpp b/kernels/portable/cpu/op_bmm.cpp index b9f9d4f2c94..a887cd3c926 100644 --- a/kernels/portable/cpu/op_bmm.cpp +++ b/kernels/portable/cpu/op_bmm.cpp @@ -7,7 +7,6 @@ */ #include -#include #include namespace torch { @@ -37,26 +36,19 @@ Tensor& bmm_out( InvalidArgument, out); - ET_SWITCH_REAL_TYPES_AND( - Half, in.scalar_type(), ctx, "bmm.out", CTYPE, [&]() { - const CTYPE* in_data = in.const_data_ptr(); - const CTYPE* mat2_data = mat2.const_data_ptr(); - CTYPE* out_data = out.mutable_data_ptr(); + constexpr auto name = "bmm.out"; - int64_t batch_size = in.size(0); - int64_t m = in.size(1); - int64_t n = in.size(2); - int64_t p = mat2.size(2); + auto in_type = in.scalar_type(); - for (int i = 0; i < batch_size; ++i) { - const CTYPE* in_data_offset = in_data + i * m * n; - const CTYPE* mat2_data_offset = mat2_data + i * n * p; - CTYPE* out_data_offset = out_data + i * m * p; - - vec_matmul( - out_data_offset, in_data_offset, mat2_data_offset, m, n, p); - } - }); + if (executorch::runtime::isComplexType(in_type)) { + ET_SWITCH_COMPLEXH_TYPES(in_type, ctx, name, CTYPE, [&]() { + internal::bmm_out_impl(in, mat2, out); + }); + } else { + ET_SWITCH_REALH_TYPES(in_type, ctx, name, CTYPE, [&]() { + internal::bmm_out_impl(in, mat2, out); + }); + } return out; } diff --git a/kernels/portable/cpu/op_view_as_real_copy.cpp b/kernels/portable/cpu/op_view_as_real_copy.cpp new file mode 100644 index 00000000000..4a2803eded0 --- /dev/null +++ b/kernels/portable/cpu/op_view_as_real_copy.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; + +namespace { + +template +inline void _to_impl(const Tensor& self, Tensor& out) { + auto self_data = self.mutable_data_ptr(); + auto out_data = out.mutable_data_ptr(); + + for (size_t i = 0, e = self.numel(); i < e; i++) { + auto val_in = self_data[i]; + out_data[2 * i] = static_cast(val_in.real_); + out_data[2 * i + 1] = static_cast(val_in.imag_); + } +} + +} // namespace + +// view_as_real_copy(Tensor self) -> Tensor +Tensor& view_as_real_copy_out( + KernelRuntimeContext& ctx, + const Tensor& self, + Tensor& out) { + (void)ctx; + + // Get the output shape + Tensor::SizesType expected_output_size[kTensorDimensionLimit]; + get_view_as_real_copy_out_target_size(self, expected_output_size); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor( + out, {expected_output_size, static_cast(out.dim())}) == + Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + // The input tensor must be complex type + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::isComplexType(self.scalar_type()), + InvalidArgument, + out, + "Input tensor must be complex type"); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + + constexpr auto op_name = "view_as_real_copy.out"; + + ET_SWITCH_COMPLEXH_TYPES(self.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + _to_impl(self, out); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index 93725d92dab..02b2910fc88 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -1018,5 +1018,14 @@ void get_unfold_copy_out_target_size( *out_ndim = self.dim() + 1; } +void get_view_as_real_copy_out_target_size( + const Tensor& self, + executorch::aten::SizesType* out_sizes) { + for (auto i : c10::irange(self.dim())) { + out_sizes[i] = self.size(i); + } + out_sizes[self.dim()] = 2; +} + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index edcc6eb0021..cef2b3d4ee1 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -247,5 +247,9 @@ void get_unfold_copy_out_target_size( executorch::aten::SizesType* out_sizes, size_t* out_ndim); +void get_view_as_real_copy_out_target_size( + const Tensor& self, + executorch::aten::SizesType* out_sizes); + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/matmul_ops_util.h b/kernels/portable/cpu/util/matmul_ops_util.h index d2991868e95..2d2799eaa59 100644 --- a/kernels/portable/cpu/util/matmul_ops_util.h +++ b/kernels/portable/cpu/util/matmul_ops_util.h @@ -45,5 +45,36 @@ void get_linear_out_target_size( Tensor::SizesType* out_sizes, size_t* out_ndim); +namespace internal { + +template +void bmm_out_impl(const Tensor& in, const Tensor& mat2, Tensor& out) { + const CTYPE* in_data = in.const_data_ptr(); + const CTYPE* mat2_data = mat2.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + int64_t batch_size = in.size(0); + int64_t m = in.size(1); + int64_t n = in.size(2); + int64_t p = mat2.size(2); + + for (int b = 0; b < batch_size; ++b) { + const CTYPE* 
in_data_offset = in_data + b * m * n; + const CTYPE* mat2_data_offset = mat2_data + b * n * p; + CTYPE* out_data_offset = out_data + b * m * p; + + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(p)) { + CTYPE sum = static_cast(0.0); + for (const auto k : c10::irange(n)) { + sum += in_data_offset[i * n + k] * mat2_data_offset[k * p + j]; + } + out_data_offset[i * p + j] = sum; + } + } + } +} + +} // namespace internal } // namespace executor } // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 5e45a210a70..ab04d3b26ac 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -957,6 +957,11 @@ - arg_meta: null kernel_name: torch::executor::var_out +- op: view_as_real_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::view_as_real_copy_out + - op: view_copy.out kernels: - arg_meta: null diff --git a/kernels/portable/test/op_gelu_test.cpp b/kernels/portable/test/op_gelu_test.cpp index 19e757b4bd0..2e5cad55c35 100644 --- a/kernels/portable/test/op_gelu_test.cpp +++ b/kernels/portable/test/op_gelu_test.cpp @@ -25,7 +25,7 @@ using torch::executor::testing::TensorFactory; // executorch/kernels/test/op_gelu_test.cpp instead. Tensor& op_gelu_out(const Tensor& self, string_view approximate, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::native::gelu_out(context, self, approximate, out); } diff --git a/kernels/prim_ops/et_view.cpp b/kernels/prim_ops/et_view.cpp index 44ac7470193..f32c43ee3a4 100644 --- a/kernels/prim_ops/et_view.cpp +++ b/kernels/prim_ops/et_view.cpp @@ -93,7 +93,14 @@ void et_view(KernelRuntimeContext& context, EValue** stack) { "Failed to resize output tensor."); // Do some checks - ET_KERNEL_CHECK(context, self.numel() == out.numel(), InvalidArgument, ); + ET_KERNEL_CHECK_MSG( + context, + self.numel() == out.numel(), + InvalidArgument, + , + "self.numel(): %" ET_PRIsize_t ", out.numel(): %" ET_PRIsize_t, + static_cast(self.numel()), + static_cast(out.numel())); // Update data ptr ET_KERNEL_CHECK_MSG( diff --git a/kernels/prim_ops/register_prim_ops.cpp b/kernels/prim_ops/register_prim_ops.cpp index 1d197b63584..62aead8978f 100644 --- a/kernels/prim_ops/register_prim_ops.cpp +++ b/kernels/prim_ops/register_prim_ops.cpp @@ -381,14 +381,13 @@ static Kernel prim_ops[] = { }; -executorch::runtime::Span kernel_span( - prim_ops, - prim_ops + sizeof(prim_ops) / sizeof(Kernel)); +executorch::runtime::Span + kernel_span(prim_ops, prim_ops + sizeof(prim_ops) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // operators in static initialization time. 
auto success_with_kernel_reg = - executorch::runtime::register_kernels(kernel_span); + executorch::ET_RUNTIME_NAMESPACE::register_kernels(kernel_span); } // namespace } // namespace function diff --git a/kernels/prim_ops/targets.bzl b/kernels/prim_ops/targets.bzl index c1af21a7e73..d2cff10a194 100644 --- a/kernels/prim_ops/targets.bzl +++ b/kernels/prim_ops/targets.bzl @@ -56,7 +56,7 @@ def define_common_targets(): ":et_copy_index" + aten_suffix, ":et_view" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/runtime/kernel:kernel_includes" + aten_suffix, ], ) diff --git a/kernels/quantized/cpu/op_quantize.cpp b/kernels/quantized/cpu/op_quantize.cpp index 5079109683f..632bddd58c4 100644 --- a/kernels/quantized/cpu/op_quantize.cpp +++ b/kernels/quantized/cpu/op_quantize.cpp @@ -22,6 +22,7 @@ namespace native { using Tensor = executorch::aten::Tensor; using Scalar = executorch::aten::Scalar; using ScalarType = executorch::aten::ScalarType; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; namespace { @@ -214,7 +215,7 @@ Tensor& quantize_per_tensor_tensor_args_out( int64_t quant_max, ScalarType dtype, Tensor& out) { - auto context = executorch::runtime::KernelRuntimeContext(); + auto context = KernelRuntimeContext(); auto& res = quantize_per_tensor_tensor_args_out( context, input, scale, zero_point, quant_min, quant_max, dtype, out); ET_CHECK(context.failure_state() == Error::Ok); diff --git a/kernels/quantized/targets.bzl b/kernels/quantized/targets.bzl index a2533cb003a..7bd8f6852a7 100644 --- a/kernels/quantized/targets.bzl +++ b/kernels/quantized/targets.bzl @@ -61,6 +61,10 @@ def define_common_targets(): name = "all_quantized_ops", ops_schema_yaml_target = ":quantized.yaml", define_static_targets = True, + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], ) # On Windows we can only compile these two ops currently, so adding a diff --git a/kernels/quantized/test/op_add_test.cpp b/kernels/quantized/test/op_add_test.cpp index 17dd1cfb3fc..3f258827973 100644 --- a/kernels/quantized/test/op_add_test.cpp +++ b/kernels/quantized/test/op_add_test.cpp @@ -24,7 +24,7 @@ using executorch::aten::optional; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::add_out; using torch::executor::native::dequantize_per_tensor_out; using torch::executor::native::quantize_per_tensor_out; diff --git a/kernels/quantized/test/op_embedding2b_test.cpp b/kernels/quantized/test/op_embedding2b_test.cpp index bf48fa4227b..a350b77ec0d 100644 --- a/kernels/quantized/test/op_embedding2b_test.cpp +++ b/kernels/quantized/test/op_embedding2b_test.cpp @@ -21,7 +21,7 @@ using executorch::aten::ArrayRef; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_embedding_2bit_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/quantized/test/op_embedding4b_test.cpp b/kernels/quantized/test/op_embedding4b_test.cpp index 9f205be80e3..6ab10376b88 100644 --- a/kernels/quantized/test/op_embedding4b_test.cpp +++ 
b/kernels/quantized/test/op_embedding4b_test.cpp @@ -21,7 +21,7 @@ using executorch::aten::ArrayRef; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_embedding_4bit_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/quantized/test/op_embedding_test.cpp b/kernels/quantized/test/op_embedding_test.cpp index 252aca41314..6c949bd6e69 100644 --- a/kernels/quantized/test/op_embedding_test.cpp +++ b/kernels/quantized/test/op_embedding_test.cpp @@ -24,7 +24,7 @@ using executorch::aten::optional; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::dequantize_per_tensor_out; using torch::executor::native::embedding_out; using torch::executor::native::quantize_per_tensor_out; diff --git a/kernels/quantized/test/op_mixed_linear_test.cpp b/kernels/quantized/test/op_mixed_linear_test.cpp index 6b86b199f60..833fc766ffd 100644 --- a/kernels/quantized/test/op_mixed_linear_test.cpp +++ b/kernels/quantized/test/op_mixed_linear_test.cpp @@ -20,7 +20,7 @@ using namespace ::testing; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_mixed_linear_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/quantized/test/op_mixed_mm_test.cpp b/kernels/quantized/test/op_mixed_mm_test.cpp index e20ac96d610..4d81089fa91 100644 --- a/kernels/quantized/test/op_mixed_mm_test.cpp +++ b/kernels/quantized/test/op_mixed_mm_test.cpp @@ -20,7 +20,7 @@ using namespace ::testing; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::native::quantized_mixed_mm_out; using torch::executor::testing::TensorFactory; diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 2d497dfc124..deb61410b10 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -242,6 +242,7 @@ set(all_test_sources "op_upsample_bilinear2d_test.cpp" "op_upsample_nearest2d_test.cpp" "op_var_test.cpp" + "op_view_as_real_copy_test.cpp" "op_view_copy_test.cpp" "op_where_test.cpp" "op_zeros_test.cpp" @@ -276,6 +277,7 @@ set(_optimized_kernels_test_sources "op_div_test.cpp" "op_elu_test.cpp" "op_exp_test.cpp" + "op_fft_c2r_test.cpp" "op_fft_r2c_test.cpp" "op_gelu_test.cpp" "op_le_test.cpp" diff --git a/kernels/test/TestUtil.h b/kernels/test/TestUtil.h index aa220f5bfd5..7ec20c11bef 100644 --- a/kernels/test/TestUtil.h +++ b/kernels/test/TestUtil.h @@ -116,6 +116,6 @@ class OperatorTest : public ::testing::Test { } protected: - executorch::runtime::KernelRuntimeContext context_; + ::torch::executor::KernelRuntimeContext context_; bool expect_failure_; }; diff --git a/kernels/test/custom_kernel_example/op_relu.cpp b/kernels/test/custom_kernel_example/op_relu.cpp index 2cc3eefe0a8..074ebe6b900 100644 --- a/kernels/test/custom_kernel_example/op_relu.cpp +++ b/kernels/test/custom_kernel_example/op_relu.cpp @@ -17,8 +17,8 @@ 
namespace native { using executorch::aten::ScalarType; using executorch::aten::Tensor; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using executorch::runtime::Error; -using executorch::runtime::KernelRuntimeContext; using executorch::runtime::resize_tensor; using executorch::runtime::tensors_have_same_shape_and_dtype; diff --git a/kernels/test/op_abs_test.cpp b/kernels/test/op_abs_test.cpp index 0d022d0a839..eb4a808b321 100644 --- a/kernels/test/op_abs_test.cpp +++ b/kernels/test/op_abs_test.cpp @@ -44,12 +44,7 @@ class OpAbsTest : public OperatorTest { TensorFactory tf; constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); TensorFactory tf_out; - using REAL_CTYPE = - typename executorch::runtime::ScalarTypeToCppType::type; - Tensor in = tf.make( - {1, 2}, - {CTYPE{REAL_CTYPE(3), REAL_CTYPE(4)}, - CTYPE{REAL_CTYPE(5), REAL_CTYPE(12)}}); + Tensor in = tf.make({1, 2}, {CTYPE(3, 4), CTYPE(5, 12)}); Tensor out = tf_out.zeros({1, 2}); Tensor expected = tf_out.make({1, 2}, {5, 13}); Tensor ret = op_abs_out(in, out); diff --git a/kernels/test/op_atan2_test.cpp b/kernels/test/op_atan2_test.cpp index e69ea0e90c8..436826e2b6d 100644 --- a/kernels/test/op_atan2_test.cpp +++ b/kernels/test/op_atan2_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_atan2_out(const Tensor& self, const Tensor& other, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::atan2_outf(context, self, other, out); } diff --git a/kernels/test/op_bmm_test.cpp b/kernels/test/op_bmm_test.cpp index 88671467f46..70a5f37946d 100644 --- a/kernels/test/op_bmm_test.cpp +++ b/kernels/test/op_bmm_test.cpp @@ -43,6 +43,61 @@ class OpBmmOutTest : public OperatorTest { EXPECT_TENSOR_EQ(out, expected); } + + template + void test_complex_dtype() { + TensorFactory tf; + Tensor x = tf.make( + {2, 2, 3}, + {CTYPE(1, 1), + CTYPE(2, 2), + CTYPE(3, 3), + CTYPE(4, 4), + CTYPE(5, 5), + CTYPE(6, 6), + CTYPE(7, 7), + CTYPE(8, 8), + CTYPE(9, 9), + CTYPE(10, 10), + CTYPE(11, 11), + CTYPE(12, 12)}); + Tensor y = tf.make( + {2, 3, 2}, + {CTYPE(2, 1), + CTYPE(4, 2), + CTYPE(6, 3), + CTYPE(8, 4), + CTYPE(10, 5), + CTYPE(12, 6), + CTYPE(14, 7), + CTYPE(16, 8), + CTYPE(18, 9), + CTYPE(20, 10), + CTYPE(22, 11), + CTYPE(24, 12)}); + Tensor out = tf.make( + {2, 2, 2}, + {CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0), + CTYPE(0, 0)}); + Tensor expected = tf.make( + {2, 2, 2}, + {CTYPE(22, 66), + CTYPE(28, 84), + CTYPE(49, 147), + CTYPE(64, 192), + CTYPE(220, 660), + CTYPE(244, 732), + CTYPE(301, 903), + CTYPE(334, 1002)}); + op_bmm_out(x, y, out); + EXPECT_TENSOR_CLOSE(out, expected); + } }; TEST_F(OpBmmOutTest, OutputDim) { @@ -132,7 +187,7 @@ TEST_F(OpBmmOutTest, OutputDimFloat) { /// A generic smoke test that works for any dtype that supports ones() and /// zeros(). -TEST_F(OpBmmOutTest, AllDtypesSupported) { +TEST_F(OpBmmOutTest, AllRealDtypesSupported) { #define TEST_ENTRY(ctype, dtype) test_dtype(); ET_FORALL_REAL_TYPES(TEST_ENTRY); #undef TEST_ENTRY @@ -141,6 +196,16 @@ TEST_F(OpBmmOutTest, AllDtypesSupported) { // for those types. 
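// A hand check of where the hard-coded expectations in test_complex_dtype()
// above come from (a sketch of the first entry only, assuming the naive
// batched-matmul loop added earlier in this patch): out[0][0][0] is the dot
// product of x's first row and y's first column in batch 0,
//   (1+1i)(2+1i) + (2+2i)(6+3i) + (3+3i)(10+5i)
//     = (1+3i) + (6+18i) + (15+45i)
//     = 22 + 66i,
// which matches the expected CTYPE(22, 66); the remaining entries follow the
// same pattern.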
} +TEST_F(OpBmmOutTest, AllComplexDtypesSupported) { +#define TEST_ENTRY(ctype, dtype) test_complex_dtype(); + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + ET_FORALL_COMPLEX_TYPES(TEST_ENTRY); + } else { + ET_FORALL_COMPLEXH_TYPES(TEST_ENTRY); + } +#undef TEST_ENTRY +} + TEST_F(OpBmmOutTest, EmptyInputWithEmptyOutTensorPasses) { TensorFactory tf; diff --git a/kernels/test/op_cdist_forward_test.cpp b/kernels/test/op_cdist_forward_test.cpp index 32465ca439b..9ddab4c3c49 100644 --- a/kernels/test/op_cdist_forward_test.cpp +++ b/kernels/test/op_cdist_forward_test.cpp @@ -21,6 +21,7 @@ using executorch::aten::ArrayRef; using executorch::aten::optional; using executorch::aten::ScalarType; using executorch::aten::Tensor; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; Tensor& op_cdist_forward_out( @@ -29,7 +30,7 @@ Tensor& op_cdist_forward_out( double p, optional compute_mode, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + KernelRuntimeContext context{}; return torch::executor::aten::_cdist_forward_outf( context, x1, x2, p, compute_mode, out); } diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index a1003e892e0..8a021c70303 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -260,7 +260,7 @@ class OpClampTensorOutTest : public OperatorTest { const optional& min, const optional& max, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::clamp_outf(context, self, min, max, out); } }; diff --git a/kernels/test/op_diagonal_copy_test.cpp b/kernels/test/op_diagonal_copy_test.cpp index cc0bd02e1a5..080b0d70645 100644 --- a/kernels/test/op_diagonal_copy_test.cpp +++ b/kernels/test/op_diagonal_copy_test.cpp @@ -27,7 +27,7 @@ Tensor& op_diagonal_copy_out( int64_t dim1, int64_t dim2, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::diagonal_copy_outf( context, input, offset, dim1, dim2, out); } @@ -54,31 +54,22 @@ class OpDiagonalCopyOutTest : public ::testing::Test { template void run_2d_complex_dtype() { TensorFactory tf; - constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); - using REAL_CTYPE = - typename executorch::runtime::ScalarTypeToCppType::type; Tensor input = tf.make( {3, 4}, - {CTYPE{REAL_CTYPE(1), REAL_CTYPE(1)}, - CTYPE{REAL_CTYPE(2), REAL_CTYPE(2)}, - CTYPE{REAL_CTYPE(3), REAL_CTYPE(3)}, - CTYPE{REAL_CTYPE(4), REAL_CTYPE(4)}, - CTYPE{REAL_CTYPE(5), REAL_CTYPE(5)}, - CTYPE{REAL_CTYPE(6), REAL_CTYPE(6)}, - CTYPE{REAL_CTYPE(7), REAL_CTYPE(7)}, - CTYPE{REAL_CTYPE(8), REAL_CTYPE(8)}, - CTYPE{REAL_CTYPE(9), REAL_CTYPE(9)}, - CTYPE{REAL_CTYPE(10), REAL_CTYPE(10)}, - CTYPE{REAL_CTYPE(11), REAL_CTYPE(11)}, - CTYPE{REAL_CTYPE(12), REAL_CTYPE(12)}}); - Tensor out = tf.make( - {2}, - {CTYPE{REAL_CTYPE(0), REAL_CTYPE(0)}, - CTYPE{REAL_CTYPE(0), REAL_CTYPE(0)}}); - Tensor out_expected = tf.make( - {2}, - {CTYPE{REAL_CTYPE(5), REAL_CTYPE(5)}, - CTYPE{REAL_CTYPE(10), REAL_CTYPE(10)}}); + {CTYPE(1, 1), + CTYPE(2, 2), + CTYPE(3, 3), + CTYPE(4, 4), + CTYPE(5, 5), + CTYPE(6, 6), + CTYPE(7, 7), + CTYPE(8, 8), + CTYPE(9, 9), + CTYPE(10, 10), + CTYPE(11, 11), + CTYPE(12, 12)}); + Tensor out = tf.make({2}, {CTYPE(0, 0), CTYPE(0, 0)}); + Tensor out_expected = tf.make({2}, {CTYPE(5, 5), CTYPE(10, 
10)}); op_diagonal_copy_out(input, 1, 1, 0, out); EXPECT_TENSOR_CLOSE(out, out_expected); } diff --git a/kernels/test/op_fft_c2r_test.cpp b/kernels/test/op_fft_c2r_test.cpp new file mode 100644 index 00000000000..58c8a216e42 --- /dev/null +++ b/kernels/test/op_fft_c2r_test.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using executorch::aten::IntArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::testing::TensorFactory; + +class OpFftC2rOutTest : public OperatorTest { + protected: + Tensor& op_fft_c2r_out( + const Tensor& in, + IntArrayRef dim, + int64_t normalization, + int64_t last_dim_size, + Tensor& out) { + return torch::executor::aten::_fft_c2r_outf( + context_, in, dim, normalization, last_dim_size, out); + } + + template < + class CTYPE_OUT, + executorch::aten::ScalarType DTYPE_OUT, + bool expect_failure = false> + void test_dtype(int64_t norm, int64_t dim = 0) { + TensorFactory tf_out; + constexpr auto DTYPE_IN = executorch::runtime::toComplexType(DTYPE_OUT); + TensorFactory tf_in; + + using CTYPE_IN = + typename executorch::runtime::ScalarTypeToCppType::type; + + std::vector input_data = { + CTYPE_IN{24, 4}, + CTYPE_IN{4, -8}, + CTYPE_IN{0, 4}, + + CTYPE_IN{8, -16}, + CTYPE_IN{-4, 0}, + CTYPE_IN{0, 32}, + + CTYPE_IN{12, 0}, + CTYPE_IN{0, 4}, + CTYPE_IN{-8, 4}, + + CTYPE_IN{0, 8}, + CTYPE_IN{-4, 8}, + CTYPE_IN{8, 0}, + }; + + Tensor in = tf_in.make({4, 3}, input_data); + Tensor out = tf_out.full({4, 3}, 0); + + int64_t last_dim_size = + (dim >= 0 && dim < out.dim()) ? 
out.sizes()[dim] : 0; + op_fft_c2r_out(in, {dim}, norm, last_dim_size, out); + + double norm_factor = 1; + if (norm == 1) { + norm_factor = 2; + } else if (norm == 2) { + norm_factor = 4; + } + std::vector expected_data = { + 52., -4., -8., 44., 4., -56., 20., 12., -8., -20., 4., 72.}; + for (auto& elem : expected_data) { + elem /= norm_factor; + } + Tensor expected = tf_out.make({4, 3}, expected_data); + + if (!expect_failure) { + EXPECT_TENSOR_CLOSE(out, expected); + } + } + + template + void test_dtype_multiple_axes() { + TensorFactory tf_out; + constexpr auto DTYPE_IN = executorch::runtime::toComplexType(DTYPE_OUT); + TensorFactory tf_in; + + using CTYPE_IN = + typename executorch::runtime::ScalarTypeToCppType::type; + + std::vector input_data = { + CTYPE_IN{16, 4}, + CTYPE_IN{4, -8}, + CTYPE_IN{0, 4}, + + CTYPE_IN{8, -16}, + CTYPE_IN{-4, 0}, + CTYPE_IN{0, 36}, + + CTYPE_IN{32, 0}, + CTYPE_IN{0, 4}, + CTYPE_IN{-8, 4}, + + CTYPE_IN{0, 8}, + CTYPE_IN{-4, 8}, + CTYPE_IN{8, 0}, + }; + + Tensor in = tf_in.make({4, 3}, input_data); + Tensor out = tf_out.full({4, 4}, 0); + + int64_t last_dim_size = out.sizes()[0]; + std::array dim = {0, 1}; + op_fft_c2r_out(in, dim, 1, last_dim_size, out); + + std::vector expected_data = { + 12., + 12., + 16., + 16., + 1., + 15., + -11., + 3., + 12., + 20., + 0., + 8., + -1., + -15., + 3., + -27.}; + Tensor expected = tf_out.make({4, 4}, expected_data); + EXPECT_TENSOR_CLOSE(out, expected); + } +}; + +TEST_F(OpFftC2rOutTest, AllDtypesSupported) { +#define TEST_ENTRY(ctype, dtype) \ + test_dtype(0); \ + test_dtype(1); \ + test_dtype(2); + ET_FORALL_FLOAT_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpFftC2rOutTest, MultipleDims) { +#define TEST_ENTRY(ctype, dtype) \ + test_dtype_multiple_axes(); + ET_FORALL_FLOAT_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpFftC2rOutTest, InvalidNorm) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen MKL path does not validate norm"; + return; + } + auto invalid_norm = [this](int64_t norm) { + test_dtype(norm); + }; + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(3)); + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(4)); + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(-1)); + ET_EXPECT_KERNEL_FAILURE(context_, invalid_norm(9999999)); +} + +TEST_F(OpFftC2rOutTest, InvalidDim) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen fails UBSAN"; + return; + } + auto negative_dim = [this]() { + test_dtype(0, -1); + test_dtype(0, 3); + test_dtype(0, 9001); + }; + ET_EXPECT_KERNEL_FAILURE(context_, negative_dim()); +} diff --git a/kernels/test/op_flip_test.cpp b/kernels/test/op_flip_test.cpp index f240dfd4ad3..be06e397be2 100644 --- a/kernels/test/op_flip_test.cpp +++ b/kernels/test/op_flip_test.cpp @@ -22,7 +22,7 @@ using executorch::aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_flip_out(const Tensor& input, IntArrayRef dims, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::flip_outf(context, input, dims, out); } diff --git a/kernels/test/op_ge_test.cpp b/kernels/test/op_ge_test.cpp index 4b21644a5c5..a79502b266e 100644 --- a/kernels/test/op_ge_test.cpp +++ b/kernels/test/op_ge_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using 
executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpGeTensorOutTest : public OperatorTest { diff --git a/kernels/test/op_gt_test.cpp b/kernels/test/op_gt_test.cpp index 29a2fb0e8b8..96c0e95f950 100644 --- a/kernels/test/op_gt_test.cpp +++ b/kernels/test/op_gt_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpGtScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp index 49ef5235d0f..bcd40d24d89 100644 --- a/kernels/test/op_le_test.cpp +++ b/kernels/test/op_le_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpLeScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_lt_test.cpp b/kernels/test/op_lt_test.cpp index 51ccb310e4a..eee12c50521 100644 --- a/kernels/test/op_lt_test.cpp +++ b/kernels/test/op_lt_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpLtScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_maximum_test.cpp b/kernels/test/op_maximum_test.cpp index 9c701e208eb..faa18fa56cd 100644 --- a/kernels/test/op_maximum_test.cpp +++ b/kernels/test/op_maximum_test.cpp @@ -21,7 +21,7 @@ using executorch::aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_maximum_out(const Tensor& self, const Tensor& other, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::maximum_outf(context, self, other, out); } diff --git a/kernels/test/op_native_batch_norm_test.cpp b/kernels/test/op_native_batch_norm_test.cpp index 67e46b27508..bf05a87312d 100644 --- a/kernels/test/op_native_batch_norm_test.cpp +++ b/kernels/test/op_native_batch_norm_test.cpp @@ -173,7 +173,7 @@ class OpNativeBatchNormLegitOutTest : public OperatorTest { executorch::aten::Tensor& out0, executorch::aten::Tensor& out1, executorch::aten::Tensor& out2) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::_native_batch_norm_legit_outf( context, input, diff --git a/kernels/test/op_native_group_norm_test.cpp b/kernels/test/op_native_group_norm_test.cpp index ea742e97231..7452350ad29 100644 --- a/kernels/test/op_native_group_norm_test.cpp +++ b/kernels/test/op_native_group_norm_test.cpp @@ -32,7 +32,7 @@ ::std::tuple op_native_group_norm_out( Tensor& out0, Tensor& out1, Tensor& out2) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::native_group_norm_outf( context, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2); } diff --git a/kernels/test/op_ne_test.cpp 
b/kernels/test/op_ne_test.cpp index 6cb0217ec0f..fe4e6c3621c 100644 --- a/kernels/test/op_ne_test.cpp +++ b/kernels/test/op_ne_test.cpp @@ -18,7 +18,7 @@ using namespace ::testing; using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpNeTest : public OperatorTest { diff --git a/kernels/test/op_pdist_forward_test.cpp b/kernels/test/op_pdist_forward_test.cpp index e6c0d472517..2b28591f7fc 100644 --- a/kernels/test/op_pdist_forward_test.cpp +++ b/kernels/test/op_pdist_forward_test.cpp @@ -23,7 +23,7 @@ using executorch::aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_pdist_forward_out(const Tensor& input, double p, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::_pdist_forward_outf(context, input, p, out); } diff --git a/kernels/test/op_prod_test.cpp b/kernels/test/op_prod_test.cpp index f9cf53ded57..11a7e3fae4f 100644 --- a/kernels/test/op_prod_test.cpp +++ b/kernels/test/op_prod_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::TensorFactory; Tensor& op_prod_out(const Tensor& self, optional dtype, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::prod_outf(context, self, dtype, out); } @@ -33,7 +33,7 @@ Tensor& op_prod_int_out( bool keepdim, optional dtype, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::prod_outf( context, self, dim, keepdim, dtype, out); } diff --git a/kernels/test/op_reflection_pad1d_test.cpp b/kernels/test/op_reflection_pad1d_test.cpp index 5f3b2a1c273..aebf057326a 100644 --- a/kernels/test/op_reflection_pad1d_test.cpp +++ b/kernels/test/op_reflection_pad1d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad1d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad1d_outf( context, input, padding, out); } diff --git a/kernels/test/op_reflection_pad2d_test.cpp b/kernels/test/op_reflection_pad2d_test.cpp index 8696b5dff7b..01e0619b9f1 100644 --- a/kernels/test/op_reflection_pad2d_test.cpp +++ b/kernels/test/op_reflection_pad2d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad2d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad2d_outf( context, input, padding, out); } diff --git a/kernels/test/op_reflection_pad3d_test.cpp b/kernels/test/op_reflection_pad3d_test.cpp index 7d5cc84c6bc..55ed906a958 100644 --- a/kernels/test/op_reflection_pad3d_test.cpp +++ b/kernels/test/op_reflection_pad3d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad3d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad3d_outf( context, input, padding, out); } diff --git 
a/kernels/test/op_replication_pad1d_test.cpp b/kernels/test/op_replication_pad1d_test.cpp index 9a6d3b2285e..f8a3fc0a48b 100644 --- a/kernels/test/op_replication_pad1d_test.cpp +++ b/kernels/test/op_replication_pad1d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad1d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad1d_outf( context, input, padding, out); } diff --git a/kernels/test/op_replication_pad2d_test.cpp b/kernels/test/op_replication_pad2d_test.cpp index 00bc76ac093..7f62f5c9b6e 100644 --- a/kernels/test/op_replication_pad2d_test.cpp +++ b/kernels/test/op_replication_pad2d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad2d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad2d_outf( context, input, padding, out); } diff --git a/kernels/test/op_replication_pad3d_test.cpp b/kernels/test/op_replication_pad3d_test.cpp index 010870298d9..5b931fee3f9 100644 --- a/kernels/test/op_replication_pad3d_test.cpp +++ b/kernels/test/op_replication_pad3d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad3d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad3d_outf( context, input, padding, out); } diff --git a/kernels/test/op_roll_test.cpp b/kernels/test/op_roll_test.cpp index fc5baaad4a7..4407e395db6 100644 --- a/kernels/test/op_roll_test.cpp +++ b/kernels/test/op_roll_test.cpp @@ -26,7 +26,7 @@ Tensor& op_roll_out( ArrayRef shifts, ArrayRef dims, Tensor& out) { - executorch::runtime::KernelRuntimeContext context{}; + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; return torch::executor::aten::roll_outf(context, input, shifts, dims, out); } diff --git a/kernels/test/op_topk_test.cpp b/kernels/test/op_topk_test.cpp index 46098a81b68..bdd185daaae 100644 --- a/kernels/test/op_topk_test.cpp +++ b/kernels/test/op_topk_test.cpp @@ -106,7 +106,8 @@ std::tuple op_topk_values( Tensor& values, Tensor& indices) { TempMemoryAllocator allocator = TempMemoryAllocator(); - executorch::runtime::KernelRuntimeContext context(nullptr, &allocator); + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context( + nullptr, &allocator); return torch::executor::aten::topk_outf( context, input, k, dim, largest, sorted, values, indices); } diff --git a/kernels/test/op_view_as_real_copy_test.cpp b/kernels/test/op_view_as_real_copy_test.cpp new file mode 100644 index 00000000000..8e959c3db8c --- /dev/null +++ b/kernels/test/op_view_as_real_copy_test.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include + +#include + +using namespace ::testing; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpViewAsRealTest : public OperatorTest { + protected: + Tensor& view_as_real_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::aten::view_as_real_copy_outf(context_, self, out); + } + + template + void run_complex_smoke_test() { + TensorFactory tf; + constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); + TensorFactory tf_out; + + Tensor in = tf.make( + {2, 2}, + {CTYPE(3, 4), CTYPE(-1.7, 7.4), CTYPE(5, -12), CTYPE(8.3, 0.1)}); + Tensor out = tf_out.zeros({2, 2, 2}); + Tensor expected = + tf_out.make({2, 2, 2}, {3, 4, -1.7, 7.4, 5, -12, 8.3, 0.1}); + Tensor ret = view_as_real_copy_out(in, out); + + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); + } + + // Tests on tensors with 0 size + template + void test_empty_input() { + TensorFactory tf; + constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); + TensorFactory tf_out; + + Tensor in = tf.make(/*sizes=*/{3, 0, 4}, /*data=*/{}); + Tensor out = tf_out.zeros({3, 0, 4, 2}); + Tensor expected = tf_out.make(/*sizes=*/{3, 0, 4, 2}, /*data=*/{}); + Tensor ret = view_as_real_copy_out(in, out); + + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); + } + + // Tests on 0-dim input tensors + template + void zero_dim_input() { + TensorFactory tf; + constexpr auto REAL_DTYPE = executorch::runtime::toRealValueType(DTYPE); + TensorFactory tf_out; + + Tensor in = tf.make(/*sizes=*/{}, {CTYPE(0, 0)}); + Tensor out = tf_out.zeros({2}); + Tensor expected = tf_out.zeros(/*sizes=*/{2}); + Tensor ret = view_as_real_copy_out(in, out); + + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); + } +}; + +TEST_F(OpViewAsRealTest, ComplexSmokeTest) { +#define RUN_SMOKE_TEST(ctype, dtype) \ + run_complex_smoke_test(); \ + test_empty_input(); \ + zero_dim_input(); + ET_FORALL_COMPLEXH_TYPES(RUN_SMOKE_TEST); +#undef RUN_SMOKE_TEST +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 05e678c6229..b9e1d3d6dac 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -223,6 +223,7 @@ def define_common_targets(): _common_op_test("op_exp_test", ["aten", "portable", "optimized"]) _common_op_test("op_expand_copy_test", ["aten", "portable"]) _common_op_test("op_expm1_test", ["aten", "portable"]) + _common_op_test("op_fft_c2r_test", ["aten", "optimized"]) _common_op_test("op_fft_r2c_test", ["aten", "optimized"]) _common_op_test("op_fill_test", ["aten", "portable"]) _common_op_test("op_flip_test", ["aten", "portable"]) @@ -331,6 +332,7 @@ def define_common_targets(): _common_op_test("op_upsample_bilinear2d_test", ["aten", "portable"]) _common_op_test("op_upsample_nearest2d_test", ["aten", "portable"]) _common_op_test("op_var_test", ["aten", "portable"]) + _common_op_test("op_view_as_real_copy_test", ["aten", "portable"]) _common_op_test("op_view_copy_test", ["aten", "portable"]) _common_op_test("op_where_test", ["aten", "portable"]) _common_op_test("op_zeros_test", ["aten", "portable"]) diff --git a/pytest.ini b/pytest.ini index cd647c43a1c..8c661aa9ee4 100644 --- a/pytest.ini +++ b/pytest.ini @@ -63,8 +63,6 @@ addopts = --ignore=exir/backend/test/demos --ignore=exir/backend/test/test_backends.py --ignore=exir/backend/test/test_backends_lifted.py - --ignore=exir/backend/test/test_compatibility.py - 
--ignore=exir/backend/test/test_lowered_backend_module.py --ignore=exir/backend/test/test_partitioner.py --ignore=exir/tests/test_common.py --ignore=exir/tests/test_memory_format_ops_pass_aten.py diff --git a/runtime/COMPATIBILITY.md b/runtime/COMPATIBILITY.md index 7d9fd47c590..583dab172cc 100644 --- a/runtime/COMPATIBILITY.md +++ b/runtime/COMPATIBILITY.md @@ -1,7 +1,7 @@ # Runtime Compatibility Policy This document describes the compatibility guarantees between the [PTE file -format](https://pytorch.org/executorch/stable/pte-file-format.html) and the +format](https://pytorch.org/executorch/main/pte-file-format) and the ExecuTorch runtime. > [!IMPORTANT] diff --git a/runtime/backend/backend_execution_context.h b/runtime/backend/backend_execution_context.h index d2790b158ef..7ee41d8e5b1 100644 --- a/runtime/backend/backend_execution_context.h +++ b/runtime/backend/backend_execution_context.h @@ -12,7 +12,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * BackendExecutionContext will be used to inject run time context. @@ -68,13 +68,13 @@ class BackendExecutionContext final { const char* method_name_ = nullptr; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::BackendExecutionContext; +using ::executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; } // namespace executor } // namespace torch diff --git a/runtime/backend/backend_init_context.h b/runtime/backend/backend_init_context.h index de1661c3af0..71c5182f401 100644 --- a/runtime/backend/backend_init_context.h +++ b/runtime/backend/backend_init_context.h @@ -7,12 +7,12 @@ */ #pragma once +#include #include #include namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { /** * BackendInitContext will be used to inject runtime info for to initialize * delegate. @@ -70,13 +70,13 @@ class BackendInitContext final { const NamedDataMap* named_data_map_ = nullptr; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::BackendInitContext; +using ::executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; } // namespace executor } // namespace torch diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp index 4fb1eadfa87..ffeb133fbf2 100644 --- a/runtime/backend/interface.cpp +++ b/runtime/backend/interface.cpp @@ -9,7 +9,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { // Pure-virtual dtors still need an implementation. 
BackendInterface::~BackendInterface() {} @@ -66,5 +66,5 @@ Result get_backend_name(size_t index) { return registered_backends[index].name; } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/backend/interface.h b/runtime/backend/interface.h index 0a3c069a201..95705d48f92 100644 --- a/runtime/backend/interface.h +++ b/runtime/backend/interface.h @@ -22,7 +22,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { struct SizedBuffer { void* buffer; @@ -150,19 +150,20 @@ size_t get_num_registered_backends(); */ Result get_backend_name(size_t index); -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::Backend; -using ::executorch::runtime::CompileSpec; -using ::executorch::runtime::DelegateHandle; -using ::executorch::runtime::get_backend_class; -using ::executorch::runtime::register_backend; -using ::executorch::runtime::SizedBuffer; -using PyTorchBackendInterface = ::executorch::runtime::BackendInterface; +using ::executorch::ET_RUNTIME_NAMESPACE::Backend; +using ::executorch::ET_RUNTIME_NAMESPACE::CompileSpec; +using ::executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; +using ::executorch::ET_RUNTIME_NAMESPACE::get_backend_class; +using ::executorch::ET_RUNTIME_NAMESPACE::register_backend; +using ::executorch::ET_RUNTIME_NAMESPACE::SizedBuffer; +using PyTorchBackendInterface = + ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface; } // namespace executor } // namespace torch diff --git a/runtime/core/event_tracer.h b/runtime/core/event_tracer.h index 77d7fc64102..5bcdd0cfb1f 100644 --- a/runtime/core/event_tracer.h +++ b/runtime/core/event_tracer.h @@ -313,8 +313,11 @@ class EventTracer { * @param[in] evalue The value to be logged. * @param[in] evalue_type Indicates what type of output this is logging e.g. * an intermediate output, program output etc. + * @return A Result indicating the status of the logging operation. + * - True if the evalue output was successfully logged. + * - An error code if an error occurs during logging. */ - virtual void log_evalue( + virtual Result log_evalue( const EValue& evalue, LoggedEValueType evalue_type) = 0; @@ -439,6 +442,12 @@ class EventTracer { DelegateDebugIntId delegate_debug_index, const double& output) = 0; + /** + * Set the filter of event tracer for delegation intermediate outputs. + */ + virtual void set_delegation_intermediate_output_filter( + EventTracerFilterBase* event_tracer_filter) = 0; + /** * Helper function to set the chain id ands debug handle. Users have two * options, the first is that they can directly pass in the chain id and debug @@ -513,12 +522,6 @@ class EventTracer { event_tracer_profiling_level_ = profiling_level; } - /** - * Set the filter of event tracer for delegation intermediate outputs. - */ - void set_delegation_intermediate_output_filter( - EventTracerFilterBase* event_tracer_filter); - /** * Return the current level of event tracer profiling. 
*/ diff --git a/runtime/core/event_tracer_hooks.h b/runtime/core/event_tracer_hooks.h index 40754160c41..cd74b447ca8 100644 --- a/runtime/core/event_tracer_hooks.h +++ b/runtime/core/event_tracer_hooks.h @@ -30,7 +30,7 @@ */ namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /** @@ -305,7 +305,7 @@ inline void event_tracer_set_bundled_input_index( } } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { @@ -313,18 +313,27 @@ namespace executor { namespace internal { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::internal::event_tracer_begin_profiling_event; -using ::executorch::runtime::internal::event_tracer_create_event_block; -using ::executorch::runtime::internal::event_tracer_end_profiling_event; -using ::executorch::runtime::internal::event_tracer_log_evalue; -using ::executorch::runtime::internal::event_tracer_log_evalue_output; -using ::executorch::runtime::internal::event_tracer_set_bundled_input_index; -using ::executorch::runtime::internal::event_tracer_track_allocation; -using ::executorch::runtime::internal::event_tracer_track_allocator; -using ::executorch::runtime::internal::EventTracerProfileInstructionScope; -using ::executorch::runtime::internal::EventTracerProfileMethodScope; -using ::executorch::runtime::internal::EventTracerProfileOpScope; -using ::executorch::runtime::internal::EventTracerProfileScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_begin_profiling_event; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_create_event_block; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_end_profiling_event; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::event_tracer_log_evalue; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_log_evalue_output; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_set_bundled_input_index; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_track_allocation; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + event_tracer_track_allocator; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + EventTracerProfileInstructionScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal:: + EventTracerProfileMethodScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::EventTracerProfileOpScope; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::EventTracerProfileScope; } // namespace internal } // namespace executor diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 704bb868abd..10075ab5920 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -47,6 +47,21 @@ #endif +/** + * This hack is for separating out ATen mode vs non-ATen mode. In ATen mode, + * we use the ATen types directly. In non-ATen mode, we use the portable types. + * To avoid duplicate symbols and/or duplicate operator registration, when a + * user depends on both the ATen mode and non-ATen mode versions of the + * ExecuTorch library. 
+ */ +#ifndef ET_RUNTIME_NAMESPACE +#if defined(USE_ATEN_LIB) +#define ET_RUNTIME_NAMESPACE runtime::aten +#else +#define ET_RUNTIME_NAMESPACE runtime +#endif +#endif + namespace executorch { namespace aten { diff --git a/runtime/core/exec_aten/testing_util/tensor_factory.h b/runtime/core/exec_aten/testing_util/tensor_factory.h index 367db09285a..1e29b220251 100644 --- a/runtime/core/exec_aten/testing_util/tensor_factory.h +++ b/runtime/core/exec_aten/testing_util/tensor_factory.h @@ -133,7 +133,7 @@ inline bool check_dim_order( size_t gauss_sum = 0; std::vector count(dim_order.size(), 0); for (int i = 0; i < dim_order.size(); i++) { - if (dim_order[i] < 0 || dim_order[i] >= sizes.size()) { + if (dim_order[i] >= sizes.size()) { return false; } gauss_sum += static_cast(dim_order[i]) + 1; diff --git a/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp b/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp index ed8cc00f4ef..feb00f79b8f 100644 --- a/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp +++ b/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp @@ -26,8 +26,8 @@ using executorch::aten::SizesType; using executorch::aten::StridesType; using executorch::aten::Tensor; using executorch::aten::TensorList; +using executorch::ET_RUNTIME_NAMESPACE::resize_tensor; using executorch::runtime::Error; -using executorch::runtime::resize_tensor; using executorch::runtime::TensorShapeDynamism; using executorch::runtime::testing::TensorFactory; using executorch::runtime::testing::TensorListFactory; diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index d07052c2ec2..6f81146e925 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -921,55 +921,7 @@ struct promote_types { } \ }() -#define ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...) 
\ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Half, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits1x8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits2x4, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Bits16, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...) \ +#define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ @@ -979,12 +931,73 @@ struct promote_types { ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) \ + ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_UINT_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt32, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt64, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, ...) 
\ + ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_BITS_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits1x8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits2x4, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits4x2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits8, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bits16, CTYPE_ALIAS, __VA_ARGS__) + +#define ET_INTERNAL_SWITCH_CASE_ALL_TYPES(CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Half, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_BITS_TYPES(CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ @@ -1008,29 +1021,11 @@ struct promote_types { ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Char, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Short, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Int, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Long, CTYPE_ALIAS, __VA_ARGS__) - #define ET_INTERNAL_SWITCH_CASE_INT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Double, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::Float, CTYPE_ALIAS, __VA_ARGS__) - #define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND(ADDITIONAL, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ ET_INTERNAL_SWITCH_CASE( \ @@ -1050,32 +1045,6 @@ struct promote_types { ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt8, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QInt32, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt4x2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::QUInt2x4, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_COMPLEX_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) - -#define ET_INTERNAL_SWITCH_CASE_COMPLEXH_TYPES(CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexHalf, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexFloat, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::ComplexDouble, CTYPE_ALIAS, __VA_ARGS__) - #define ET_INTERNAL_SWITCH_CASE_SCALAR_OBJ_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::Bool, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1204,26 +1173,15 @@ struct promote_types { ET_SWITCH_REAL_TYPES_AND3( \ Half, Bool, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) -#define ET_SWITCH_REALHBBF16_AND_UINT_TYPES( \ - TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH( \ - TYPE, \ - CONTEXT, \ - NAME, \ - ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ - Half, Bool, BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::UInt16, \ - CTYPE_ALIAS, \ - __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::UInt32, \ - CTYPE_ALIAS, \ - __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ - ::executorch::aten::ScalarType::UInt64, \ - CTYPE_ALIAS, \ - __VA_ARGS__)) +#define ET_SWITCH_REALHBBF16_AND_UINT_TYPES( \ + TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + Half, Bool, BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE_UINT_TYPES(CTYPE_ALIAS, __VA_ARGS__)) #define ET_SWITCH_INT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH( \ diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index 4e5a0cebb07..b0b79882361 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -396,8 +396,7 @@ #scalar_tensor " could not be extracted: wrong type or out of range"); namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { // // Utility functions for checking tensor attributes // @@ -446,10 +445,10 @@ inline bool tensor_can_cast_to( executorch::aten::Tensor a, executorch::aten::ScalarType dtype) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::canCast(a.scalar_type(), dtype), + ::torch::executor::canCast(a.scalar_type(), dtype), "Tensor of dtype %s cannot cast to dtype %s", - torch::executor::toString(a.scalar_type()), - torch::executor::toString(dtype)); + ::torch::executor::toString(a.scalar_type()), + ::torch::executor::toString(dtype)); return true; } @@ -458,7 +457,7 @@ inline bool tensor_is_bool_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( t.scalar_type() == executorch::aten::ScalarType::Bool, "Expected to find bool type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -469,8 +468,8 @@ inline bool tensor_is_type( ET_CHECK_OR_RETURN_FALSE( t.scalar_type() == dtype, "Expected to find %s type, but tensor has type %s", - torch::executor::toString(dtype), - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(dtype), + ::torch::executor::toString(t.scalar_type())); return true; } @@ -482,9 +481,9 @@ inline bool tensor_is_type( ET_LOG_MSG_AND_RETURN_IF_FALSE( t.scalar_type() == dtype || t.scalar_type() == dtype2, "Expected to find %s or %s type, but tensor has type %s", - torch::executor::toString(dtype), - torch::executor::toString(dtype2), - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(dtype), + ::torch::executor::toString(dtype2), + ::torch::executor::toString(t.scalar_type())); return true; } @@ -498,10 +497,10 @@ inline bool tensor_is_type( t.scalar_type() == dtype || t.scalar_type() == dtype2 || t.scalar_type() == dtype3, "Expected to find %s, %s, or %s type, but tensor has type %s", - torch::executor::toString(dtype), - torch::executor::toString(dtype2), - torch::executor::toString(dtype3), - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(dtype), + ::torch::executor::toString(dtype2), + ::torch::executor::toString(dtype3), + ::torch::executor::toString(t.scalar_type())); return true; } @@ -510,36 +509,36 @@ inline bool tensor_is_integral_type( executorch::aten::Tensor t, bool includeBool = false) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isIntegralType(t.scalar_type(), includeBool), + ::torch::executor::isIntegralType(t.scalar_type(), includeBool), "Expected to find a integral type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_floating_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isFloatingType(t.scalar_type()), + ::torch::executor::isFloatingType(t.scalar_type()), "Expected to find a floating type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_real_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - 
torch::executor::isRealType(t.scalar_type()), + ::torch::executor::isRealType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_realh_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isRealHType(t.scalar_type()), + ::torch::executor::isRealHType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -548,16 +547,16 @@ inline bool tensor_is_realhbf16_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( executorch::runtime::isRealHBF16Type(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_realhb_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isRealHBType(t.scalar_type()), + ::torch::executor::isRealHBType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -566,25 +565,25 @@ inline bool tensor_is_realhbbf16_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( executorch::runtime::isRealHBBF16Type(t.scalar_type()), "Expected to find a real type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_complex_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isComplexType(t.scalar_type()), + ::torch::executor::isComplexType(t.scalar_type()), "Expected to find a complex type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } inline bool tensor_is_bits_type(executorch::aten::Tensor t) { ET_CHECK_OR_RETURN_FALSE( - torch::executor::isBitsType(t.scalar_type()), + ::torch::executor::isBitsType(t.scalar_type()), "Expected to find a bits type, but tensor has type %s", - torch::executor::toString(t.scalar_type())); + ::torch::executor::toString(t.scalar_type())); return true; } @@ -595,8 +594,8 @@ inline bool tensors_have_same_dtype( ET_CHECK_OR_RETURN_FALSE( a.scalar_type() == b.scalar_type(), ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s}", - torch::executor::toString(a.scalar_type()), - torch::executor::toString(b.scalar_type())); + ::torch::executor::toString(a.scalar_type()), + ::torch::executor::toString(b.scalar_type())); return true; } @@ -607,9 +606,9 @@ inline bool tensors_have_same_dtype( ET_CHECK_OR_RETURN_FALSE( a.scalar_type() == b.scalar_type() && b.scalar_type() == c.scalar_type(), ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s, %s}", - torch::executor::toString(a.scalar_type()), - torch::executor::toString(b.scalar_type()), - torch::executor::toString(c.scalar_type())); + ::torch::executor::toString(a.scalar_type()), + ::torch::executor::toString(b.scalar_type()), + ::torch::executor::toString(c.scalar_type())); return true; } @@ -1349,60 +1348,61 @@ inline size_t calculate_linear_index( return index; } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new 
`::executorch` namespaces. -using ::executorch::runtime::calculate_linear_index; -using ::executorch::runtime::coordinateToIndex; -using ::executorch::runtime::dim_is_valid; -using ::executorch::runtime::extract_scalar_tensor; -using ::executorch::runtime::get_dim_order; -using ::executorch::runtime::getLeadingDims; -using ::executorch::runtime::getTrailingDims; -using ::executorch::runtime::indexToCoordinate; +using ::executorch::ET_RUNTIME_NAMESPACE::calculate_linear_index; +using ::executorch::ET_RUNTIME_NAMESPACE::coordinateToIndex; +using ::executorch::ET_RUNTIME_NAMESPACE::dim_is_valid; +using ::executorch::ET_RUNTIME_NAMESPACE::extract_scalar_tensor; +using ::executorch::ET_RUNTIME_NAMESPACE::get_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::getLeadingDims; +using ::executorch::ET_RUNTIME_NAMESPACE::getTrailingDims; +using ::executorch::ET_RUNTIME_NAMESPACE::indexToCoordinate; +using ::executorch::ET_RUNTIME_NAMESPACE::nonempty_size; +using ::executorch::ET_RUNTIME_NAMESPACE::nonzero_dim; +using ::executorch::ET_RUNTIME_NAMESPACE::resize; +using ::executorch::ET_RUNTIME_NAMESPACE::resize_tensor; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_can_cast_to; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_dim_has_index; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_dim; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_expected_size; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_non_empty_dim; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_rank_greater_or_equal_to; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_rank_smaller_or_equal_to; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_has_valid_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_bits_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_bool_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_complex_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_contiguous; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_default_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE:: + tensor_is_default_or_channels_last_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_floating_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_integral_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_rank; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_real_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_realh_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_realhb_type; +using ::executorch::ET_RUNTIME_NAMESPACE::tensor_is_scalar; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dim_order; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dtype; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_rank; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape_and_dtype; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_size_at_dims; +using ::executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_strides; using ::executorch::runtime::kTensorDimensionLimit; -using ::executorch::runtime::nonempty_size; -using ::executorch::runtime::nonzero_dim; -using ::executorch::runtime::resize; -using ::executorch::runtime::resize_tensor; -using ::executorch::runtime::tensor_can_cast_to; -using ::executorch::runtime::tensor_dim_has_index; -using ::executorch::runtime::tensor_has_dim; -using ::executorch::runtime::tensor_has_expected_size; -using ::executorch::runtime::tensor_has_non_empty_dim; -using 
::executorch::runtime::tensor_has_rank_greater_or_equal_to; -using ::executorch::runtime::tensor_has_rank_smaller_or_equal_to; -using ::executorch::runtime::tensor_has_valid_dim_order; -using ::executorch::runtime::tensor_is_bits_type; -using ::executorch::runtime::tensor_is_bool_type; -using ::executorch::runtime::tensor_is_complex_type; -using ::executorch::runtime::tensor_is_contiguous; -using ::executorch::runtime::tensor_is_default_dim_order; -using ::executorch::runtime::tensor_is_default_or_channels_last_dim_order; -using ::executorch::runtime::tensor_is_floating_type; -using ::executorch::runtime::tensor_is_integral_type; -using ::executorch::runtime::tensor_is_rank; -using ::executorch::runtime::tensor_is_real_type; -using ::executorch::runtime::tensor_is_realh_type; -using ::executorch::runtime::tensor_is_realhb_type; -using ::executorch::runtime::tensor_is_scalar; -using ::executorch::runtime::tensors_have_same_dim_order; -using ::executorch::runtime::tensors_have_same_dtype; -using ::executorch::runtime::tensors_have_same_rank; -using ::executorch::runtime::tensors_have_same_shape; -using ::executorch::runtime::tensors_have_same_shape_and_dtype; -using ::executorch::runtime::tensors_have_same_size_at_dims; -using ::executorch::runtime::tensors_have_same_strides; namespace internal { -using ::executorch::runtime::internal::copy_tensor_data; -using ::executorch::runtime::internal::reset_data_ptr; -using ::executorch::runtime::internal::resize_tensor_impl; -using ::executorch::runtime::internal::set_tensor_data; -using ::executorch::runtime::internal::share_tensor_data; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::copy_tensor_data; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::reset_data_ptr; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::resize_tensor_impl; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::set_tensor_data; +using ::executorch::ET_RUNTIME_NAMESPACE::internal::share_tensor_data; } // namespace internal } // namespace executor } // namespace torch diff --git a/runtime/core/exec_aten/util/tensor_util_aten.cpp b/runtime/core/exec_aten/util/tensor_util_aten.cpp index 4df273d4dbb..ddfd0560a69 100644 --- a/runtime/core/exec_aten/util/tensor_util_aten.cpp +++ b/runtime/core/exec_aten/util/tensor_util_aten.cpp @@ -12,7 +12,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Implementation for ATen tensor util, should only be included in * `_aten` target and only be used in ATen mode. 
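The alias block above is what keeps existing call sites building while the definitions move under executorch::ET_RUNTIME_NAMESPACE. A minimal sketch of both spellings, assuming the default (non-ATen) build; check_tensor is a hypothetical helper, and the include path follows the file's location in this diff.

#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

// Hypothetical helper: `t` is any executorch::aten::Tensor obtained elsewhere.
bool check_tensor(const executorch::aten::Tensor& t) {
  // New spelling, resolved through the ET_RUNTIME_NAMESPACE macro.
  bool ok = executorch::ET_RUNTIME_NAMESPACE::tensor_is_rank(t, 3) &&
      executorch::ET_RUNTIME_NAMESPACE::tensor_is_floating_type(t);

  // Deprecated spelling, still available via the using-aliases kept above.
  bool legacy_ok = torch::executor::tensor_is_rank(t, 3);

  return ok && legacy_ok;
}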
Explicitly taking @@ -214,6 +214,5 @@ Error resize_tensor_impl( } } // namespace internal - -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/core/exec_aten/util/test/tensor_util_test.cpp b/runtime/core/exec_aten/util/test/tensor_util_test.cpp index 7d30b0bbdbe..cdc391adf20 100644 --- a/runtime/core/exec_aten/util/test/tensor_util_test.cpp +++ b/runtime/core/exec_aten/util/test/tensor_util_test.cpp @@ -17,7 +17,7 @@ using namespace ::testing; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::extract_scalar_tensor; +using executorch::ET_RUNTIME_NAMESPACE::extract_scalar_tensor; using executorch::runtime::testing::TensorFactory; class TensorUtilTest : public ::testing::Test { @@ -148,13 +148,13 @@ TEST_F(TensorUtilTest, GetLeadingDimsSmokeTest) { Tensor t = tf_int_.ones({2, 3, 4}); // getLeadingDims(t, 1) => t.size(0) - EXPECT_EQ(executorch::runtime::getLeadingDims(t, 1), 2); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 1), 2); // getLeadingDims(t, 2) => t.size(0) * t.size(1) - EXPECT_EQ(executorch::runtime::getLeadingDims(t, 2), 6); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 2), 6); // getLeadingDims(t, 3) => t.size(0) * t.size(1) * t.size(2) - EXPECT_EQ(executorch::runtime::getLeadingDims(t, 3), 24); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 3), 24); } TEST_F(TensorUtilTest, GetLeadingDimsInputOutOfBoundDies) { @@ -162,9 +162,9 @@ TEST_F(TensorUtilTest, GetLeadingDimsInputOutOfBoundDies) { Tensor t = tf_int_.ones({2, 3, 4}); // dim needs to be in the range [0, t.dim()] - ET_EXPECT_DEATH(executorch::runtime::getLeadingDims(t, -2), ""); - ET_EXPECT_DEATH(executorch::runtime::getLeadingDims(t, -1), ""); - ET_EXPECT_DEATH(executorch::runtime::getLeadingDims(t, 4), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -2), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -1), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 4), ""); } TEST_F(TensorUtilTest, GetTrailingDimsSmokeTest) { @@ -172,13 +172,13 @@ TEST_F(TensorUtilTest, GetTrailingDimsSmokeTest) { Tensor t = tf_int_.ones({2, 3, 4}); // getTrailingDims(t, 1) => t.size(2) - EXPECT_EQ(executorch::runtime::getTrailingDims(t, 1), 4); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 1), 4); // getTrailingDims(t, 0) => t.size(1) * t.size(2) - EXPECT_EQ(executorch::runtime::getTrailingDims(t, 0), 12); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 0), 12); // getTrailingDims(t, -1) => t.size(0) * t.size(1) * t.size(2) - EXPECT_EQ(executorch::runtime::getTrailingDims(t, -1), 24); + EXPECT_EQ(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, -1), 24); } TEST_F(TensorUtilTest, GetTrailingDimsInputOutOfBoundDies) { @@ -186,9 +186,9 @@ TEST_F(TensorUtilTest, GetTrailingDimsInputOutOfBoundDies) { Tensor t = tf_int_.ones({2, 3, 4}); // dim needs to be in the range [-1, t.dim() - 1) - ET_EXPECT_DEATH(executorch::runtime::getTrailingDims(t, -2), ""); - ET_EXPECT_DEATH(executorch::runtime::getTrailingDims(t, 3), ""); - ET_EXPECT_DEATH(executorch::runtime::getTrailingDims(t, 4), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, -2), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 3), ""); + ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 4), ""); } TEST_F(TensorUtilTest, ContiguousCheckSupported) { @@ -421,7 
+421,7 @@ TEST_F(TensorUtilTest, BoolTensorNotScalarFails) { // TEST_F(TensorUtilTest, TensorIsRankTest) { - using executorch::runtime::tensor_is_rank; + using executorch::ET_RUNTIME_NAMESPACE::tensor_is_rank; Tensor a = tf_float_.ones({2, 3, 5}); EXPECT_TRUE(tensor_is_rank(a, 3)); @@ -430,7 +430,7 @@ TEST_F(TensorUtilTest, TensorIsRankTest) { } TEST_F(TensorUtilTest, TensorHasDimTest) { - using executorch::runtime::tensor_has_dim; + using executorch::ET_RUNTIME_NAMESPACE::tensor_has_dim; Tensor a = tf_float_.ones({2, 3, 5}); EXPECT_TRUE(tensor_has_dim(a, 2)); @@ -445,7 +445,7 @@ TEST_F(TensorUtilTest, TensorHasDimTest) { } TEST_F(TensorUtilTest, TensorsHaveSameDtypeTest) { - using executorch::runtime::tensors_have_same_dtype; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_dtype; Tensor a = tf_float_.ones({2, 3}); Tensor b = tf_float_.ones({2, 3}); Tensor c = tf_float_.ones({3, 3}); @@ -458,7 +458,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameDtypeTest) { } TEST_F(TensorUtilTest, TensorsHaveSameSizeAtDimTest) { - using executorch::runtime::tensors_have_same_size_at_dims; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_size_at_dims; Tensor a = tf_float_.ones({2, 3, 4, 5}); Tensor b = tf_float_.ones({5, 4, 3, 2}); @@ -470,7 +470,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameSizeAtDimTest) { } TEST_F(TensorUtilTest, TensorsHaveSameShapeTest) { - using executorch::runtime::tensors_have_same_shape; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape; Tensor a = tf_float_.ones({2, 3}); Tensor b = tf_int_.ones({2, 3}); Tensor c = tf_byte_.ones({2, 3}); @@ -493,7 +493,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameShapeTest) { } TEST_F(TensorUtilTest, TensorsHaveSameShapeAndDtypeTest) { - using executorch::runtime::tensors_have_same_shape_and_dtype; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_shape_and_dtype; Tensor a = tf_float_.ones({2, 3}); Tensor b = tf_float_.ones({2, 3}); Tensor c = tf_float_.ones({2, 3}); @@ -515,7 +515,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameShapeAndDtypeTest) { } TEST_F(TensorUtilTest, TensorsHaveSameStridesTest) { - using executorch::runtime::tensors_have_same_strides; + using executorch::ET_RUNTIME_NAMESPACE::tensors_have_same_strides; Tensor a = tf_float_.full_channels_last({4, 5, 2, 3}, 1); Tensor b = tf_float_.full_channels_last({4, 5, 2, 3}, 2); Tensor c = tf_float_.full_channels_last({4, 5, 2, 3}, 3); @@ -530,7 +530,7 @@ TEST_F(TensorUtilTest, TensorsHaveSameStridesTest) { } TEST_F(TensorUtilTest, TensorIsContiguous) { - using executorch::runtime::tensor_is_contiguous; + using executorch::ET_RUNTIME_NAMESPACE::tensor_is_contiguous; // Note that the strides.size() == 0 case is not tested, since Tensor a = tf_float_.full_channels_last({4, 5, 2, 3}, 1); Tensor b = tf_float_.ones({4, 5, 2, 3}); @@ -547,7 +547,7 @@ TEST_F(TensorUtilTest, ResizeZeroDimTensor) { Tensor a = tf_float_.ones({}); EXPECT_EQ( - executorch::runtime::resize_tensor(a, {}), + executorch::ET_RUNTIME_NAMESPACE::resize_tensor(a, {}), executorch::runtime::Error::Ok); EXPECT_EQ(a.dim(), 0); } diff --git a/runtime/core/named_data_map.h b/runtime/core/named_data_map.h index e79c7035989..14179d22795 100644 --- a/runtime/core/named_data_map.h +++ b/runtime/core/named_data_map.h @@ -22,8 +22,7 @@ #include namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { /** * Interface to access and retrieve data via name. * See executorch/extension/flat_tensor/ for an example. 
@@ -37,8 +36,8 @@ class ET_EXPERIMENTAL NamedDataMap { * @param key The name of the tensor. * @return Result containing TensorLayout with tensor metadata. */ - ET_NODISCARD virtual Result - get_metadata(const char* key) const = 0; + ET_NODISCARD virtual Result get_metadata( + const char* key) const = 0; /** * Get data by key. * @@ -78,7 +77,7 @@ class ET_EXPERIMENTAL NamedDataMap { ET_NODISCARD virtual Result get_key(size_t index) const = 0; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch #ifdef __GNUC__ diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h index 919eb6c8567..1429eda2acb 100644 --- a/runtime/core/portable_type/c10/c10/macros/Macros.h +++ b/runtime/core/portable_type/c10/c10/macros/Macros.h @@ -241,7 +241,7 @@ using namespace c10::xpu; #ifdef __HIPCC__ // Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. // We do this #include here so that C10_HOST_DEVICE and friends will Just Work. -// See https://github.com/ROCm-Developer-Tools/HIP/issues/441 +// See https://github.com/ROCm/hip/issues/441 #include #endif diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index d9d72b5be3f..4555d42a567 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -25,6 +25,9 @@ def define_common_targets(): "util/Half-inl.h", "util/TypeSafeSignMath.h", "util/bit_cast.h", + "util/complex.h", + "util/complex_math.h", + "util/complex_utils.h", "util/floating_point_utils.h", "util/irange.h", ], @@ -36,6 +39,7 @@ def define_common_targets(): ]), visibility = [ "//executorch/...", + "@EXECUTORCH_CLIENTS", ], deps = select({ "DEFAULT": [], diff --git a/runtime/core/portable_type/c10/c10/util/complex.h b/runtime/core/portable_type/c10/c10/util/complex.h new file mode 100644 index 00000000000..b63710d9458 --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/complex.h @@ -0,0 +1,668 @@ +#pragma once + +#include + +#include +#include + +#if defined(__CUDACC__) || defined(__HIPCC__) +#include +#endif + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif +#if C10_CLANG_HAS_WARNING("-Wfloat-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wfloat-conversion") +#endif + +namespace c10 { + +// c10::complex is an implementation of complex numbers that aims +// to work on all devices supported by PyTorch +// +// Most of the APIs duplicates std::complex +// Reference: https://en.cppreference.com/w/cpp/numeric/complex +// +// [NOTE: Complex Operator Unification] +// Operators currently use a mix of std::complex, thrust::complex, and +// c10::complex internally. The end state is that all operators will use +// c10::complex internally. Until then, there may be some hacks to support all +// variants. 
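Because the header aims to duplicate the std::complex API, a short usage sketch may be helpful. This is illustrative only; it assumes the header is reachable as <c10/util/complex.h> (consistent with the targets.bzl entry above), and main is just a stand-in driver.

#include <complex>

#include <c10/util/complex.h>

int main() {
  c10::complex<float> a(1.0f, 2.0f);  // (re, im), same shape as std::complex
  c10::complex<float> b(3.0f, -1.0f);

  c10::complex<float> sum = a + b;   // (4, 1)
  c10::complex<float> prod = a * b;  // (1*3 - 2*(-1)) + (1*(-1) + 2*3)i = (5, 5)

  // Casts to and from std::complex are explicit.
  std::complex<float> s = static_cast<std::complex<float>>(prod);
  c10::complex<float> back(s);

  return (sum.real() == 4.0f && s.imag() == 5.0f && back.real() == 5.0f) ? 0 : 1;
}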
+// +// +// [Note on Constructors] +// +// The APIs of constructors are mostly copied from C++ standard: +// https://en.cppreference.com/w/cpp/numeric/complex/complex +// +// Since C++14, all constructors are constexpr in std::complex +// +// There are three types of constructors: +// - initializing from real and imag: +// `constexpr complex( const T& re = T(), const T& im = T() );` +// - implicitly-declared copy constructor +// - converting constructors +// +// Converting constructors: +// - std::complex defines converting constructor between float/double/long +// double, +// while we define converting constructor between float/double. +// - For these converting constructors, upcasting is implicit, downcasting is +// explicit. +// - We also define explicit casting from std::complex/thrust::complex +// - Note that the conversion from thrust is not constexpr, because +// thrust does not define them as constexpr ???? +// +// +// [Operator =] +// +// The APIs of operator = are mostly copied from C++ standard: +// https://en.cppreference.com/w/cpp/numeric/complex/operator%3D +// +// Since C++20, all operator= are constexpr. Although we are not building with +// C++20, we also obey this behavior. +// +// There are three types of assign operator: +// - Assign a real value from the same scalar type +// - In std, this is templated as complex& operator=(const T& x) +// with specialization `complex& operator=(T x)` for float/double/long +// double Since we only support float and double, on will use `complex& +// operator=(T x)` +// - Copy assignment operator and converting assignment operator +// - There is no specialization of converting assignment operators, which type +// is +// convertible is solely dependent on whether the scalar type is convertible +// +// In addition to the standard assignment, we also provide assignment operators +// with std and thrust +// +// +// [Casting operators] +// +// std::complex does not have casting operators. We define casting operators +// casting to std::complex and thrust::complex +// +// +// [Operator ""] +// +// std::complex has custom literals `i`, `if` and `il` defined in namespace +// `std::literals::complex_literals`. We define our own custom literals in the +// namespace `c10::complex_literals`. Our custom literals does not follow the +// same behavior as in std::complex, instead, we define _if, _id to construct +// float/double complex literals. +// +// +// [real() and imag()] +// +// In C++20, there are two overload of these functions, one it to return the +// real/imag, another is to set real/imag, they are both constexpr. We follow +// this design. +// +// +// [Operator +=,-=,*=,/=] +// +// Since C++20, these operators become constexpr. In our implementation, they +// are also constexpr. +// +// There are two types of such operators: operating with a real number, or +// operating with another complex number. For the operating with a real number, +// the generic template form has argument type `const T &`, while the overload +// for float/double/long double has `T`. We will follow the same type as +// float/double/long double in std. +// +// [Unary operator +-] +// +// Since C++20, they are constexpr. 
We also make them expr +// +// [Binary operators +-*/] +// +// Each operator has three versions (taking + as example): +// - complex + complex +// - complex + real +// - real + complex +// +// [Operator ==, !=] +// +// Each operator has three versions (taking == as example): +// - complex == complex +// - complex == real +// - real == complex +// +// Some of them are removed on C++20, but we decide to keep them +// +// [Operator <<, >>] +// +// These are implemented by casting to std::complex +// +// +// +// TODO(@zasdfgbnm): c10::complex is not currently supported, +// because: +// - lots of members and functions of c10::Half are not constexpr +// - thrust::complex only support float and double + +template +struct alignas(sizeof(T) * 2) complex { + using value_type = T; + + T real_ = T(0); + T imag_ = T(0); + + constexpr complex() = default; + C10_HOST_DEVICE constexpr complex(const T& re, const T& im = T()) + : real_(re), imag_(im) {} + template + explicit constexpr complex(const std::complex& other) + : complex(other.real(), other.imag()) {} +#if defined(__CUDACC__) || defined(__HIPCC__) + template + explicit C10_HOST_DEVICE complex(const thrust::complex& other) + : real_(other.real()), imag_(other.imag()) {} +// NOTE can not be implemented as follow due to ROCm bug: +// explicit C10_HOST_DEVICE complex(const thrust::complex &other): +// complex(other.real(), other.imag()) {} +#endif + + // Use SFINAE to specialize casting constructor for c10::complex and + // c10::complex + template + C10_HOST_DEVICE explicit constexpr complex( + const std::enable_if_t, complex>& other) + : real_(other.real_), imag_(other.imag_) {} + template + C10_HOST_DEVICE constexpr complex( + const std::enable_if_t, complex>& other) + : real_(other.real_), imag_(other.imag_) {} + + constexpr complex& operator=(T re) { + real_ = re; + imag_ = 0; + return *this; + } + + constexpr complex& operator+=(T re) { + real_ += re; + return *this; + } + + constexpr complex& operator-=(T re) { + real_ -= re; + return *this; + } + + constexpr complex& operator*=(T re) { + real_ *= re; + imag_ *= re; + return *this; + } + + constexpr complex& operator/=(T re) { + real_ /= re; + imag_ /= re; + return *this; + } + + template + constexpr complex& operator=(const complex& rhs) { + real_ = rhs.real(); + imag_ = rhs.imag(); + return *this; + } + + template + constexpr complex& operator+=(const complex& rhs) { + real_ += rhs.real(); + imag_ += rhs.imag(); + return *this; + } + + template + constexpr complex& operator-=(const complex& rhs) { + real_ -= rhs.real(); + imag_ -= rhs.imag(); + return *this; + } + + template + constexpr complex& operator*=(const complex& rhs) { + // (a + bi) * (c + di) = (a*c - b*d) + (a * d + b * c) i + T a = real_; + T b = imag_; + U c = rhs.real(); + U d = rhs.imag(); + real_ = a * c - b * d; + imag_ = a * d + b * c; + return *this; + } + +#ifdef __APPLE__ +#define FORCE_INLINE_APPLE __attribute__((always_inline)) +#else +#define FORCE_INLINE_APPLE +#endif + template + constexpr FORCE_INLINE_APPLE complex& operator/=(const complex& rhs) + __ubsan_ignore_float_divide_by_zero__ { + // (a + bi) / (c + di) = (ac + bd)/(c^2 + d^2) + (bc - ad)/(c^2 + d^2) i + // the calculation below follows numpy's complex division + T a = real_; + T b = imag_; + U c = rhs.real(); + U d = rhs.imag(); + +#if defined(__GNUC__) && !defined(__clang__) + // std::abs is already constexpr by gcc + auto abs_c = std::abs(c); + auto abs_d = std::abs(d); +#else + auto abs_c = c < 0 ? -c : c; + auto abs_d = d < 0 ? 
-d : d; +#endif + + if (abs_c >= abs_d) { + if (abs_c == U(0) && abs_d == U(0)) { + /* divide by zeros should yield a complex inf or nan */ + real_ = a / abs_c; + imag_ = b / abs_d; + } else { + auto rat = d / c; + auto scl = U(1.0) / (c + d * rat); + real_ = (a + b * rat) * scl; + imag_ = (b - a * rat) * scl; + } + } else { + auto rat = c / d; + auto scl = U(1.0) / (d + c * rat); + real_ = (a * rat + b) * scl; + imag_ = (b * rat - a) * scl; + } + return *this; + } +#undef FORCE_INLINE_APPLE + + template + constexpr complex& operator=(const std::complex& rhs) { + real_ = rhs.real(); + imag_ = rhs.imag(); + return *this; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + template + C10_HOST_DEVICE complex& operator=(const thrust::complex& rhs) { + real_ = rhs.real(); + imag_ = rhs.imag(); + return *this; + } +#endif + + template + explicit constexpr operator std::complex() const { + return std::complex(std::complex(real(), imag())); + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + template + C10_HOST_DEVICE explicit operator thrust::complex() const { + return static_cast>(thrust::complex(real(), imag())); + } +#endif + + // consistent with NumPy behavior + explicit constexpr operator bool() const { + return real() || imag(); + } + + C10_HOST_DEVICE constexpr T real() const { + return real_; + } + constexpr void real(T value) { + real_ = value; + } + C10_HOST_DEVICE constexpr T imag() const { + return imag_; + } + constexpr void imag(T value) { + imag_ = value; + } +}; + +namespace complex_literals { + +constexpr complex operator""_if(long double imag) { + return complex(0.0f, static_cast(imag)); +} + +constexpr complex operator""_id(long double imag) { + return complex(0.0, static_cast(imag)); +} + +constexpr complex operator""_if(unsigned long long imag) { + return complex(0.0f, static_cast(imag)); +} + +constexpr complex operator""_id(unsigned long long imag) { + return complex(0.0, static_cast(imag)); +} + +} // namespace complex_literals + +template +constexpr complex operator+(const complex& val) { + return val; +} + +template +constexpr complex operator-(const complex& val) { + return complex(-val.real(), -val.imag()); +} + +template +constexpr complex operator+(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result += rhs; +} + +template +constexpr complex operator+(const complex& lhs, const T& rhs) { + complex result = lhs; + return result += rhs; +} + +template +constexpr complex operator+(const T& lhs, const complex& rhs) { + return complex(lhs + rhs.real(), rhs.imag()); +} + +template +constexpr complex operator-(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result -= rhs; +} + +template +constexpr complex operator-(const complex& lhs, const T& rhs) { + complex result = lhs; + return result -= rhs; +} + +template +constexpr complex operator-(const T& lhs, const complex& rhs) { + complex result = -rhs; + return result += lhs; +} + +template +constexpr complex operator*(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result *= rhs; +} + +template +constexpr complex operator*(const complex& lhs, const T& rhs) { + complex result = lhs; + return result *= rhs; +} + +template +constexpr complex operator*(const T& lhs, const complex& rhs) { + complex result = rhs; + return result *= lhs; +} + +template +constexpr complex operator/(const complex& lhs, const complex& rhs) { + complex result = lhs; + return result /= rhs; +} + +template +constexpr complex operator/(const complex& lhs, const T& rhs) 
{ + complex result = lhs; + return result /= rhs; +} + +template +constexpr complex operator/(const T& lhs, const complex& rhs) { + complex result(lhs, T()); + return result /= rhs; +} + +// Define operators between integral scalars and c10::complex. std::complex does +// not support this when T is a floating-point number. This is useful because it +// saves a lot of "static_cast" when operate a complex and an integer. This +// makes the code both less verbose and potentially more efficient. +#define COMPLEX_INTEGER_OP_TEMPLATE_CONDITION \ + typename std::enable_if_t< \ + std::is_floating_point_v && std::is_integral_v, \ + int> = 0 + +template +constexpr c10::complex operator+(const c10::complex& a, const iT& b) { + return a + static_cast(b); +} + +template +constexpr c10::complex operator+(const iT& a, const c10::complex& b) { + return static_cast(a) + b; +} + +template +constexpr c10::complex operator-(const c10::complex& a, const iT& b) { + return a - static_cast(b); +} + +template +constexpr c10::complex operator-(const iT& a, const c10::complex& b) { + return static_cast(a) - b; +} + +template +constexpr c10::complex operator*(const c10::complex& a, const iT& b) { + return a * static_cast(b); +} + +template +constexpr c10::complex operator*(const iT& a, const c10::complex& b) { + return static_cast(a) * b; +} + +template +constexpr c10::complex operator/(const c10::complex& a, const iT& b) { + return a / static_cast(b); +} + +template +constexpr c10::complex operator/(const iT& a, const c10::complex& b) { + return static_cast(a) / b; +} + +#undef COMPLEX_INTEGER_OP_TEMPLATE_CONDITION + +template +constexpr bool operator==(const complex& lhs, const complex& rhs) { + return (lhs.real() == rhs.real()) && (lhs.imag() == rhs.imag()); +} + +template +constexpr bool operator==(const complex& lhs, const T& rhs) { + return (lhs.real() == rhs) && (lhs.imag() == T()); +} + +template +constexpr bool operator==(const T& lhs, const complex& rhs) { + return (lhs == rhs.real()) && (T() == rhs.imag()); +} + +template +constexpr bool operator!=(const complex& lhs, const complex& rhs) { + return !(lhs == rhs); +} + +template +constexpr bool operator!=(const complex& lhs, const T& rhs) { + return !(lhs == rhs); +} + +template +constexpr bool operator!=(const T& lhs, const complex& rhs) { + return !(lhs == rhs); +} + +template +std::basic_ostream& operator<<( + std::basic_ostream& os, + const complex& x) { + return (os << static_cast>(x)); +} + +template +std::basic_istream& operator>>( + std::basic_istream& is, + complex& x) { + std::complex tmp; + is >> tmp; + x = tmp; + return is; +} + +} // namespace c10 + +// std functions +// +// The implementation of these functions also follow the design of C++20 + +namespace std { + +template +constexpr T real(const c10::complex& z) { + return z.real(); +} + +template +constexpr T imag(const c10::complex& z) { + return z.imag(); +} + +template +C10_HOST_DEVICE T abs(const c10::complex& z) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return thrust::abs(static_cast>(z)); +#else + return std::abs(static_cast>(z)); +#endif +} + +#if defined(USE_ROCM) +#define ROCm_Bug(x) +#else +#define ROCm_Bug(x) x +#endif + +template +C10_HOST_DEVICE T arg(const c10::complex& z) { + return ROCm_Bug(std)::atan2(std::imag(z), std::real(z)); +} + +#undef ROCm_Bug + +template +constexpr T norm(const c10::complex& z) { + return z.real() * z.real() + z.imag() * z.imag(); +} + +// For std::conj, there are other versions of it: +// constexpr std::complex conj( float z ); +// 
template< class DoubleOrInteger > +// constexpr std::complex conj( DoubleOrInteger z ); +// constexpr std::complex conj( long double z ); +// These are not implemented +// TODO(@zasdfgbnm): implement them as c10::conj +template +constexpr c10::complex conj(const c10::complex& z) { + return c10::complex(z.real(), -z.imag()); +} + +// Thrust does not have complex --> complex version of thrust::proj, +// so this function is not implemented at c10 right now. +// TODO(@zasdfgbnm): implement it by ourselves + +// There is no c10 version of std::polar, because std::polar always +// returns std::complex. Use c10::polar instead; + +} // namespace std + +namespace c10 { + +template +C10_HOST_DEVICE complex polar(const T& r, const T& theta = T()) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::polar(r, theta)); +#else + // std::polar() requires r >= 0, so spell out the explicit implementation to + // avoid a branch. + return complex(r * std::cos(theta), r * std::sin(theta)); +#endif +} + +template <> +struct alignas(4) complex { + Half real_; + Half imag_; + + // Constructors + complex() = default; + // Half constructor is not constexpr so the following constructor can't + // be constexpr + C10_HOST_DEVICE explicit inline complex(const Half& real, const Half& imag) + : real_(real), imag_(imag) {} + C10_HOST_DEVICE inline complex(const c10::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + // Conversion operator + inline C10_HOST_DEVICE operator c10::complex() const { + return {real_, imag_}; + } + + constexpr C10_HOST_DEVICE Half real() const { + return real_; + } + constexpr C10_HOST_DEVICE Half imag() const { + return imag_; + } + + C10_HOST_DEVICE complex& operator+=(const complex& other) { + real_ = static_cast(real_) + static_cast(other.real_); + imag_ = static_cast(imag_) + static_cast(other.imag_); + return *this; + } + + C10_HOST_DEVICE complex& operator-=(const complex& other) { + real_ = static_cast(real_) - static_cast(other.real_); + imag_ = static_cast(imag_) - static_cast(other.imag_); + return *this; + } + + C10_HOST_DEVICE complex& operator*=(const complex& other) { + auto a = static_cast(real_); + auto b = static_cast(imag_); + auto c = static_cast(other.real()); + auto d = static_cast(other.imag()); + real_ = a * c - b * d; + imag_ = a * d + b * c; + return *this; + } +}; + +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() + +#define C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H +// math functions are included in a separate file +#include // IWYU pragma: keep +// utilities for complex types +#include // IWYU pragma: keep +#undef C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H diff --git a/runtime/core/portable_type/c10/c10/util/complex_math.h b/runtime/core/portable_type/c10/c10/util/complex_math.h new file mode 100644 index 00000000000..2b591026c94 --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/complex_math.h @@ -0,0 +1,406 @@ +#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H) +#error \ + "c10/util/complex_math.h is not meant to be individually included. Include c10/util/complex.h instead." 
+#endif + +namespace c10_complex_math { + +// Exponential functions + +template +C10_HOST_DEVICE inline c10::complex exp(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::exp(static_cast>(x))); +#else + return static_cast>( + std::exp(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::log(static_cast>(x))); +#else + return static_cast>( + std::log(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log10(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::log10(static_cast>(x))); +#else + return static_cast>( + std::log10(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log2(const c10::complex& x) { + const c10::complex log2 = c10::complex(::log(2.0), 0.0); + return c10_complex_math::log(x) / log2; +} + +// Power functions +// +#if defined(_LIBCPP_VERSION) || \ + (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)) +namespace _detail { +C10_API c10::complex sqrt(const c10::complex& in); +C10_API c10::complex sqrt(const c10::complex& in); +C10_API c10::complex acos(const c10::complex& in); +C10_API c10::complex acos(const c10::complex& in); +} // namespace _detail +#endif + +template +C10_HOST_DEVICE inline c10::complex sqrt(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sqrt(static_cast>(x))); +#elif !( \ + defined(_LIBCPP_VERSION) || \ + (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX))) + return static_cast>( + std::sqrt(static_cast>(x))); +#else + return _detail::sqrt(x); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::pow( + static_cast>(x), static_cast>(y))); +#else + return static_cast>(std::pow( + static_cast>(x), static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const T& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(static_cast>(x), y)); +#else + return static_cast>( + std::pow(static_cast>(x), y)); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const T& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(x, static_cast>(y))); +#else + return static_cast>( + std::pow(x, static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>(thrust::pow( + static_cast>(x), static_cast>(y))); +#else + return static_cast>(std::pow( + static_cast>(x), static_cast>(y))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const U& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(static_cast>(x), y)); +#else + return static_cast>( + std::pow(static_cast>(x), y)); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex pow( + const T& x, + const c10::complex& y) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::pow(x, static_cast>(y))); +#else + return static_cast>( + std::pow(x, static_cast>(y))); +#endif +} + +// Trigonometric functions + 
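As a quick host-side illustration of the wrappers defined above (they forward to std:: on CPU and to thrust:: under CUDA/HIP), the sketch below checks an exp/log round trip and the log2(z) = log(z)/log(2) identity. complex_math_smoke_test is a hypothetical helper and the include path is assumed as in the previous example.

#include <cmath>

#include <c10/util/complex.h>  // also pulls in complex_math.h (see the end of complex.h)

bool complex_math_smoke_test() {
  c10::complex<double> z(0.5, 1.25);

  // exp and log are inverses of each other (up to rounding) for this z.
  c10::complex<double> roundtrip =
      c10_complex_math::exp(c10_complex_math::log(z));

  // log2(z) is implemented above as log(z) / log(2).
  c10::complex<double> l2 = c10_complex_math::log2(z);
  c10::complex<double> expected = c10_complex_math::log(z) / std::log(2.0);

  auto close = [](c10::complex<double> x, c10::complex<double> y) {
    return std::abs(x - y) < 1e-12;
  };
  return close(roundtrip, z) && close(l2, expected);
}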
+template +C10_HOST_DEVICE inline c10::complex sin(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sin(static_cast>(x))); +#else + return static_cast>( + std::sin(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex cos(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::cos(static_cast>(x))); +#else + return static_cast>( + std::cos(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex tan(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::tan(static_cast>(x))); +#else + return static_cast>( + std::tan(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex asin(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::asin(static_cast>(x))); +#else + return static_cast>( + std::asin(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex acos(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::acos(static_cast>(x))); +#elif !defined(_LIBCPP_VERSION) + return static_cast>( + std::acos(static_cast>(x))); +#else + return _detail::acos(x); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex atan(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::atan(static_cast>(x))); +#else + return static_cast>( + std::atan(static_cast>(x))); +#endif +} + +// Hyperbolic functions + +template +C10_HOST_DEVICE inline c10::complex sinh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::sinh(static_cast>(x))); +#else + return static_cast>( + std::sinh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex cosh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::cosh(static_cast>(x))); +#else + return static_cast>( + std::cosh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex tanh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::tanh(static_cast>(x))); +#else + return static_cast>( + std::tanh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex asinh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::asinh(static_cast>(x))); +#else + return static_cast>( + std::asinh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex acosh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::acosh(static_cast>(x))); +#else + return static_cast>( + std::acosh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex atanh(const c10::complex& x) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return static_cast>( + thrust::atanh(static_cast>(x))); +#else + return static_cast>( + std::atanh(static_cast>(x))); +#endif +} + +template +C10_HOST_DEVICE inline c10::complex log1p(const c10::complex& z) { +#if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \ + defined(__HIPCC__) + // For Mac, the new implementation yielded a high relative error. Falling back + // to the old version for now. 
+ // See https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 + // For CUDA we also use this one, as thrust::log(thrust::complex) takes + // *forever* to compile + + // log1p(z) = log(1 + z) + // Let's define 1 + z = r * e ^ (i * a), then we have + // log(r * e ^ (i * a)) = log(r) + i * a + // With z = x + iy, the term r can be written as + // r = ((1 + x) ^ 2 + y ^ 2) ^ 0.5 + // = (1 + x ^ 2 + 2 * x + y ^ 2) ^ 0.5 + // So, log(r) is + // log(r) = 0.5 * log(1 + x ^ 2 + 2 * x + y ^ 2) + // = 0.5 * log1p(x * (x + 2) + y ^ 2) + // we need to use the expression only on certain condition to avoid overflow + // and underflow from `(x * (x + 2) + y ^ 2)` + T x = z.real(); + T y = z.imag(); + T zabs = std::abs(z); + T theta = std::atan2(y, x + T(1)); + if (zabs < 0.5) { + T r = x * (T(2) + x) + y * y; + if (r == 0) { // handle underflow + return {x, theta}; + } + return {T(0.5) * std::log1p(r), theta}; + } else { + T z0 = std::hypot(x + 1, y); + return {std::log(z0), theta}; + } +#else + // CPU path + // Based on https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 + c10::complex u = z + T(1); + if (u == T(1)) { + return z; + } else { + auto log_u = log(u); + if (u - T(1) == z) { + return log_u; + } + return log_u * (z / (u - T(1))); + } +#endif +} + +template +C10_HOST_DEVICE inline c10::complex expm1(const c10::complex& z) { + // expm1(z) = exp(z) - 1 + // Define z = x + i * y + // f = e ^ (x + i * y) - 1 + // = e ^ x * e ^ (i * y) - 1 + // = (e ^ x * cos(y) - 1) + i * (e ^ x * sin(y)) + // = (e ^ x - 1) * cos(y) - (1 - cos(y)) + i * e ^ x * sin(y) + // = expm1(x) * cos(y) - 2 * sin(y / 2) ^ 2 + i * e ^ x * sin(y) + T x = z.real(); + T y = z.imag(); + T a = std::sin(y / 2); + T er = std::expm1(x) * std::cos(y) - T(2) * a * a; + T ei = std::exp(x) * std::sin(y); + return {er, ei}; +} + +} // namespace c10_complex_math + +using c10_complex_math::acos; +using c10_complex_math::acosh; +using c10_complex_math::asin; +using c10_complex_math::asinh; +using c10_complex_math::atan; +using c10_complex_math::atanh; +using c10_complex_math::cos; +using c10_complex_math::cosh; +using c10_complex_math::exp; +using c10_complex_math::expm1; +using c10_complex_math::log; +using c10_complex_math::log10; +using c10_complex_math::log1p; +using c10_complex_math::log2; +using c10_complex_math::pow; +using c10_complex_math::sin; +using c10_complex_math::sinh; +using c10_complex_math::sqrt; +using c10_complex_math::tan; +using c10_complex_math::tanh; + +namespace std { + +using c10_complex_math::acos; +using c10_complex_math::acosh; +using c10_complex_math::asin; +using c10_complex_math::asinh; +using c10_complex_math::atan; +using c10_complex_math::atanh; +using c10_complex_math::cos; +using c10_complex_math::cosh; +using c10_complex_math::exp; +using c10_complex_math::expm1; +using c10_complex_math::log; +using c10_complex_math::log10; +using c10_complex_math::log1p; +using c10_complex_math::log2; +using c10_complex_math::pow; +using c10_complex_math::sin; +using c10_complex_math::sinh; +using c10_complex_math::sqrt; +using c10_complex_math::tan; +using c10_complex_math::tanh; + +} // namespace std diff --git a/runtime/core/portable_type/c10/c10/util/complex_utils.h b/runtime/core/portable_type/c10/c10/util/complex_utils.h new file mode 100644 index 00000000000..1ca105f1d0a --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/complex_utils.h @@ -0,0 +1,46 @@ +#if !defined(C10_INTERNAL_INCLUDE_COMPLEX_REMAINING_H) +#error \ + "c10/util/complex_utils.h is not meant to be individually 
included. Include c10/util/complex.h instead." +#endif + +#include + +namespace c10 { + +template +struct is_complex : public std::false_type {}; + +template +struct is_complex> : public std::true_type {}; + +template +struct is_complex> : public std::true_type {}; + +// Extract double from std::complex; is identity otherwise +// TODO: Write in more idiomatic C++17 +template +struct scalar_value_type { + using type = T; +}; +template +struct scalar_value_type> { + using type = T; +}; +template +struct scalar_value_type> { + using type = T; +}; + +} // namespace c10 + +namespace std { + +template +class numeric_limits> : public numeric_limits {}; + +template +bool isnan(const c10::complex& v) { + return std::isnan(v.real()) || std::isnan(v.imag()); +} + +} // namespace std diff --git a/runtime/core/portable_type/complex.h b/runtime/core/portable_type/complex.h index e89a19e54d7..faf13a0432f 100644 --- a/runtime/core/portable_type/complex.h +++ b/runtime/core/portable_type/complex.h @@ -8,39 +8,14 @@ #pragma once -#include +#include -namespace executorch { -namespace runtime { -namespace etensor { +namespace executorch::runtime::etensor { +using c10::complex; +} // namespace executorch::runtime::etensor -/** - * An implementation of complex numbers, compatible with c10/util/complex.h from - * pytorch core. - */ -template -struct alignas(sizeof(T) * 2) complex { - T real_ = T(0); - T imag_ = T(0); -}; - -/** - * Specialization for Half, which is not a primitive C numeric type. - */ -template <> -struct alignas(4) complex { - Half real_; - Half imag_; -}; - -} // namespace etensor -} // namespace runtime -} // namespace executorch - -namespace torch { -namespace executor { +namespace torch::executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
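The traits in complex_utils.h and the new portable_type alias above are small but load-bearing for kernel code; here is a minimal sketch of what they provide, assuming the include paths used elsewhere in this diff (has_nan is a hypothetical helper).

#include <type_traits>

#include <c10/util/complex.h>
#include <executorch/runtime/core/portable_type/complex.h>

// is_complex<T> detects complex element types; scalar_value_type<T> unwraps them.
static_assert(c10::is_complex<c10::complex<float>>::value, "");
static_assert(!c10::is_complex<float>::value, "");
static_assert(
    std::is_same<c10::scalar_value_type<c10::complex<double>>::type, double>::value,
    "");

// After the portable_type change above, etensor::complex is just c10::complex.
static_assert(
    std::is_same<executorch::runtime::etensor::complex<float>,
                 c10::complex<float>>::value,
    "");

// std::isnan is overloaded for c10::complex in complex_utils.h.
bool has_nan(const c10::complex<double>& z) {
  return std::isnan(z);
}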
using ::executorch::runtime::etensor::complex; -} // namespace executor -} // namespace torch +} // namespace torch::executor diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index 3195e727d96..d3e02b1afb5 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -95,9 +95,9 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], exported_deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + ":core", ":tag", + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, ], ) @@ -119,6 +119,37 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "named_data_map" + aten_suffix, + exported_headers = [ + "named_data_map.h", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + ":tensor_layout" + aten_suffix, + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + ], + ) + + + runtime.cxx_library( + name = "tensor_layout" + aten_suffix, + srcs = ["tensor_layout.cpp"], + exported_headers = ["tensor_layout.h"], + deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", + ], + exported_deps = [ + ":core", + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, + ], + visibility = ["//executorch/..."], + ) + runtime.cxx_library( name = "tag", srcs = ["tag.cpp"], @@ -133,31 +164,3 @@ def define_common_targets(): "//executorch/...", ], ) - - runtime.cxx_library( - name = "named_data_map", - exported_headers = [ - "named_data_map.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - ":tensor_layout", - ], - ) - - runtime.cxx_library( - name = "tensor_layout", - srcs = ["tensor_layout.cpp"], - exported_headers = ["tensor_layout.h"], - deps = [ - "//executorch/runtime/core/portable_type/c10/c10:c10", - ], - exported_deps = [ - ":core", - "//executorch/runtime/core/exec_aten:lib", - ], - visibility = ["//executorch/..."], - ) diff --git a/runtime/core/tensor_layout.cpp b/runtime/core/tensor_layout.cpp index 2b862e6dc14..d33f79f27c4 100644 --- a/runtime/core/tensor_layout.cpp +++ b/runtime/core/tensor_layout.cpp @@ -13,7 +13,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace { Result calculate_nbytes( @@ -51,5 +51,5 @@ Result TensorLayout::create( } return TensorLayout(sizes, dim_order, scalar_type, nbytes.get()); } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/core/tensor_layout.h b/runtime/core/tensor_layout.h index c2c3833f528..42131e6506e 100644 --- a/runtime/core/tensor_layout.h +++ b/runtime/core/tensor_layout.h @@ -14,7 +14,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Describes the layout of a tensor. 
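A hedged sketch of using TensorLayout::create as wired up above. The Span element types are elided in this hunk, so int32_t sizes and uint8_t dim order are assumed here (matching their use elsewhere in the runtime), as is the usual nbytes() accessor; layout_nbytes_example is a hypothetical helper.

#include <cstdint>

#include <executorch/runtime/core/tensor_layout.h>

executorch::runtime::Result<size_t> layout_nbytes_example() {
  static const int32_t sizes[] = {2, 3, 4};
  static const uint8_t dim_order[] = {0, 1, 2};

  auto layout = executorch::ET_RUNTIME_NAMESPACE::TensorLayout::create(
      executorch::runtime::Span<const int32_t>(sizes, 3),
      executorch::runtime::Span<const uint8_t>(dim_order, 3),
      executorch::aten::ScalarType::Float);
  if (!layout.ok()) {
    return layout.error();
  }
  // 2 * 3 * 4 elements at 4 bytes each for Float -> 96 bytes.
  return layout->nbytes();
}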
@@ -89,5 +89,5 @@ class ET_EXPERIMENTAL TensorLayout final { const size_t nbytes_; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/core/test/event_tracer_test.cpp b/runtime/core/test/event_tracer_test.cpp index 224e87cc2b1..1c9e1a446b9 100644 --- a/runtime/core/test/event_tracer_test.cpp +++ b/runtime/core/test/event_tracer_test.cpp @@ -28,6 +28,7 @@ using executorch::runtime::EValue; using executorch::runtime::EventTracer; using executorch::runtime::EventTracerDebugLogLevel; using executorch::runtime::EventTracerEntry; +using executorch::runtime::EventTracerFilterBase; using executorch::runtime::kUnsetChainId; using executorch::runtime::kUnsetDebugHandle; using executorch::runtime::kUnsetDelegateDebugIntId; @@ -90,6 +91,11 @@ class DummyEventTracer : public EventTracer { (void)metadata_len; } + void set_delegation_intermediate_output_filter( + EventTracerFilterBase* event_tracer_filter) override { + (void)event_tracer_filter; + } + void log_profiling_delegate( const char* name, DelegateDebugIntId delegate_debug_id, @@ -155,9 +161,11 @@ class DummyEventTracer : public EventTracer { return true; } - void log_evalue(const EValue& evalue, LoggedEValueType evalue_type) override { + Result log_evalue(const EValue& evalue, LoggedEValueType evalue_type) + override { logged_evalue_ = evalue; logged_evalue_type_ = evalue_type; + return true; } EValue logged_evalue() { diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 41d44522a22..f09af8ac2e7 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -32,9 +32,8 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { -using deserialization::NamedData; using internal::PlatformMemoryAllocator; /** @@ -1594,6 +1593,37 @@ EValue& Method::mutable_input(size_t i) { return mutable_value(get_input_index(i)); } +Result Method::get_attribute( + executorch::aten::string_view name) { + auto flatbuffer_values = serialization_plan_->values(); + size_t counter = 0; + + for (size_t i = 0; i < flatbuffer_values->size(); ++i) { + auto serialization_value = flatbuffer_values->Get(i); + if (serialization_value->val_type() == + executorch_flatbuffer::KernelTypes::Tensor) { + const auto s_tensor = static_cast( + serialization_value->val()); + if (s_tensor->extra_tensor_info() != nullptr && + s_tensor->extra_tensor_info()->fully_qualified_name() != nullptr && + strcmp( + s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(), + name.data()) == 0) { + if (!this->values_[counter].isTensor()) { + ET_LOG( + Error, + "Attribute tensor not at the expected location. The .pte is likely malformed. Please file a bug report on https://github.com/pytorch/executorch/issues"); + return Error::Internal; + } + return this->values_[counter].toTensor(); + } + } + ++counter; + } + + return Error::NotFound; +} + size_t Method::outputs_size() const { const auto* outputs = serialization_plan_->outputs(); return outputs == nullptr ? 0 : outputs->size(); @@ -1640,5 +1670,5 @@ Method::~Method() { } // All other fields are trivially destructible. 
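A usage sketch for the Method::get_attribute API added above. The method object is assumed to be loaded elsewhere, "model.linear.weight" is a placeholder fully qualified name, and log_attribute_rank is a hypothetical helper; the error handling mirrors the NotFound/Internal codes used in the implementation.

#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/platform/log.h>

using executorch::ET_RUNTIME_NAMESPACE::Method;
using executorch::runtime::Error;

void log_attribute_rank(Method& method) {
  auto attr = method.get_attribute("model.linear.weight");
  if (attr.ok()) {
    const executorch::aten::Tensor& weight = attr.get();
    ET_LOG(Info, "attribute rank: %d", static_cast<int>(weight.dim()));
  } else if (attr.error() == Error::NotFound) {
    ET_LOG(Info, "no attribute tensor with that name");
  }
}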
} -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/method.h b/runtime/executor/method.h index 0ca2df440ad..0cf7164c98e 100644 --- a/runtime/executor/method.h +++ b/runtime/executor/method.h @@ -32,7 +32,7 @@ struct EValue; } // namespace executorch_flatbuffer namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { // Forward declare NamedData. This is a public header and must not include // internal data types. @@ -192,6 +192,18 @@ class Method final { */ ET_NODISCARD Error get_inputs(EValue* input_evalues, size_t length); + /** + * + * Retrieves the attribute tensor associated with the given name. + * + * @param[in] name The name of the attribute tensor to retrieve. + * + * @returns Result containing the attribute tensor on success, non-Ok on + * failure. + */ + ET_NODISCARD Result get_attribute( + executorch::aten::string_view name); + /** * Execute the method. * @@ -394,14 +406,14 @@ class Method final { void log_outputs(); }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::Method; +using ::executorch::ET_RUNTIME_NAMESPACE::Method; } // namespace executor } // namespace torch diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index 8f84fea940f..e810d195370 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -16,7 +16,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace { Result get_tag( @@ -69,9 +69,11 @@ TensorInfo::TensorInfo( Span sizes, Span dim_order, executorch::aten::ScalarType scalar_type, - const bool is_memory_planned) + const bool is_memory_planned, + executorch::aten::string_view name) : sizes_(sizes), dim_order_(dim_order), + name_(name), scalar_type_(scalar_type), is_memory_planned_(is_memory_planned), nbytes_(calculate_nbytes(sizes_, scalar_type_)) {} @@ -96,6 +98,10 @@ size_t TensorInfo::nbytes() const { return nbytes_; } +executorch::aten::string_view TensorInfo::name() const { + return name_; +} + MethodMeta::MethodMeta(const executorch_flatbuffer::ExecutionPlan* s_plan) : s_plan_(s_plan) {} @@ -149,8 +155,9 @@ Result MethodMeta::input_tensor_meta(size_t index) const { tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), static_cast(tensor_value->scalar_type()), tensor_value->allocation_info() != nullptr || - tensor_value->data_buffer_idx() != - 0); // Count constant returns as memory planned. + tensor_value->data_buffer_idx() != 0 /* is_memory_planned */, + executorch::aten::string_view{nullptr, 0}); // Count constant returns as + // memory planned. } size_t MethodMeta::num_outputs() const { @@ -200,8 +207,60 @@ Result MethodMeta::output_tensor_meta(size_t index) const { tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), static_cast(tensor_value->scalar_type()), tensor_value->allocation_info() != nullptr || - tensor_value->data_buffer_idx() != - 0); // Count constant returns as memory planned. + tensor_value->data_buffer_idx() != 0 /* is_memory_planned */, + executorch::aten::string_view{nullptr, 0}); // Count constant returns as + // memory planned. 
+} + +size_t MethodMeta::num_attributes() const { + size_t counter = 0; + auto values = s_plan_->values(); + for (size_t i = 0; i < values->size(); ++i) { + auto value = values->Get(i); + if (value->val_type() == executorch_flatbuffer::KernelTypes::Tensor) { + auto tensor_value = value->val_as_Tensor(); + if (tensor_value->extra_tensor_info() != nullptr && + tensor_value->extra_tensor_info()->fully_qualified_name()->c_str() != + nullptr) { + ++counter; + } + } + } + return counter; +} + +Result MethodMeta::attribute_tensor_meta(size_t index) const { + size_t counter = 0; + auto values = s_plan_->values(); + for (size_t i = 0; i < values->size(); ++i) { + auto value = values->Get(i); + if (value->val_type() == executorch_flatbuffer::KernelTypes::Tensor) { + auto tensor_value = value->val_as_Tensor(); + if (tensor_value->extra_tensor_info() != nullptr && + tensor_value->extra_tensor_info()->fully_qualified_name()->c_str() != + nullptr) { + if (counter == index) { + auto t_name = + tensor_value->extra_tensor_info()->fully_qualified_name(); + // Count constant returns as memory planned + return TensorInfo( + Span( + tensor_value->sizes()->data(), tensor_value->sizes()->size()), + Span( + tensor_value->dim_order()->data(), + tensor_value->dim_order()->size()), + static_cast( + tensor_value->scalar_type()), + tensor_value->allocation_info() != nullptr || + tensor_value->data_buffer_idx() != 0 /* is_memory_planned */, + executorch::aten::string_view{t_name->c_str(), t_name->size()}); + } + ++counter; + } + } + } + ET_LOG(Error, "No attribute tensor found at index %zu", index); + return Error::InvalidArgument; } size_t MethodMeta::num_memory_planned_buffers() const { @@ -279,6 +338,5 @@ size_t MethodMeta::num_instructions() const { } return num_instructions; } - -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h index d9bb64d68a7..ec910f9f6e4 100644 --- a/runtime/executor/method_meta.h +++ b/runtime/executor/method_meta.h @@ -20,7 +20,7 @@ struct ExecutionPlan; } // namespace executorch_flatbuffer namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Metadata about a specific tensor of an ExecuTorch Program. @@ -62,6 +62,12 @@ class TensorInfo final { */ size_t nbytes() const; + /** + * Returns the fully qualified name of the Tensor might be empty if the tensor + * is nameless. + */ + executorch::aten::string_view name() const; + private: // Let MethodMeta create TensorInfo. friend class MethodMeta; @@ -70,7 +76,8 @@ class TensorInfo final { Span sizes, Span dim_order, executorch::aten::ScalarType scalar_type, - const bool is_memory_planned); + const bool is_memory_planned, + executorch::aten::string_view name); /** * The sizes of the tensor. @@ -88,6 +95,9 @@ class TensorInfo final { */ Span dim_order_; + /// The fully qualified name of the Tensor. + executorch::aten::string_view name_; + /// The scalar type of the tensor. executorch::aten::ScalarType scalar_type_; @@ -170,6 +180,21 @@ class MethodMeta final { */ Result output_tensor_meta(size_t index) const; + /** + * Get the number of attribute tensors in this method. + * + * @returns The number of attribute tensors. + */ + size_t num_attributes() const; + + /** + * Get metadata about the specified attribute tensor. + * + * @param[in] index The index of the attribute tensor to look up. + * @returns The metadata on success, or an error on failure. 
+ */ + Result attribute_tensor_meta(size_t index) const; + /** * Get the number of memory-planned buffers this method requires. * @@ -240,14 +265,14 @@ class MethodMeta final { const executorch_flatbuffer::ExecutionPlan* s_plan_; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::MethodMeta; -using ::executorch::runtime::TensorInfo; +using ::executorch::ET_RUNTIME_NAMESPACE::MethodMeta; +using ::executorch::ET_RUNTIME_NAMESPACE::TensorInfo; } // namespace executor } // namespace torch diff --git a/runtime/executor/platform_memory_allocator.h b/runtime/executor/platform_memory_allocator.h index 09195a460ac..7ab58bf0f3d 100644 --- a/runtime/executor/platform_memory_allocator.h +++ b/runtime/executor/platform_memory_allocator.h @@ -17,7 +17,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /** @@ -107,5 +107,5 @@ class PlatformMemoryAllocator final : public MemoryAllocator { }; } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp index 14e0b83d8aa..238c806b1d6 100644 --- a/runtime/executor/program.cpp +++ b/runtime/executor/program.cpp @@ -28,8 +28,7 @@ #endif namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { namespace { /** @@ -535,5 +534,5 @@ Error Program::load_mutable_subsegment_into( segment_base_offset_ + segment->offset() + offset, size, info, buffer); } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/program.h b/runtime/executor/program.h index 0932e51619f..9670fd7c79f 100644 --- a/runtime/executor/program.h +++ b/runtime/executor/program.h @@ -36,8 +36,7 @@ struct Program; } // namespace executorch_flatbuffer namespace executorch { -namespace runtime { - +namespace ET_RUNTIME_NAMESPACE { namespace testing { // Provides test access to private Program methods. class ProgramTestFriend; @@ -313,14 +312,14 @@ class Program final { std::optional pte_data_map_; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
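As a companion to the MethodMeta declarations above, the following is a short, hedged sketch of enumerating attribute tensors through the new metadata API. The method name "forward" and the logging are assumptions for illustration; only num_attributes(), attribute_tensor_meta(), name(), nbytes(), and is_memory_planned() come from this diff.

// Sketch only; assumes the default build where ET_RUNTIME_NAMESPACE
// resolves to `runtime` and a Program that has already been loaded.
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>

void log_attributes(const executorch::runtime::Program& program) {
  auto meta = program.method_meta("forward");  // assumed method name
  if (!meta.ok()) {
    return;
  }
  for (size_t i = 0; i < meta->num_attributes(); ++i) {
    auto info = meta->attribute_tensor_meta(i);
    if (!info.ok()) {
      continue;
    }
    // name() is the fully qualified buffer name recorded at export time.
    auto name = info->name();
    ET_LOG(
        Info,
        "attribute %zu: %.*s, %zu bytes, memory planned: %d",
        i,
        static_cast<int>(name.size()),
        name.data(),
        info->nbytes(),
        static_cast<int>(info->is_memory_planned()));
  }
}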
-using ::executorch::runtime::Program; +using ::executorch::ET_RUNTIME_NAMESPACE::Program; } // namespace executor } // namespace torch diff --git a/runtime/executor/pte_data_map.cpp b/runtime/executor/pte_data_map.cpp index 5829395028a..fd064cb8256 100644 --- a/runtime/executor/pte_data_map.cpp +++ b/runtime/executor/pte_data_map.cpp @@ -10,7 +10,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /* static */ executorch::runtime::Result PteDataMap::create( @@ -83,5 +83,5 @@ ET_NODISCARD executorch::runtime::Result PteDataMap::get_key( } } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/pte_data_map.h b/runtime/executor/pte_data_map.h index 01c15555786..b26c0ac42f9 100644 --- a/runtime/executor/pte_data_map.h +++ b/runtime/executor/pte_data_map.h @@ -46,7 +46,7 @@ using FlatbufferDataSegment = flatbuffers:: #endif namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace internal { /** @@ -147,5 +147,5 @@ class PteDataMap final : public NamedDataMap { }; } // namespace internal -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index cfb6c607359..649b2c13cc1 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -42,30 +42,33 @@ def define_common_targets(): ], ) - runtime.cxx_library( - name = "pte_data_map", - srcs = [ - "pte_data_map.cpp", - ], - exported_headers = [ - "pte_data_map.h", - ], - visibility = [ - "//executorch/runtime/executor/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core:named_data_map", - ], - deps = [ - "//executorch/schema:program", - ], - exported_preprocessor_flags = [] if runtime.is_oss else ["-DEXECUTORCH_INTERNAL_FLATBUFFERS=1"], - ) for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" + + runtime.cxx_library( + name = "pte_data_map" + aten_suffix, + srcs = [ + "pte_data_map.cpp", + ], + exported_headers = [ + "pte_data_map.h", + ], + visibility = [ + "//executorch/runtime/executor/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core:named_data_map" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, + ], + deps = [ + "//executorch/schema:program", + ], + exported_preprocessor_flags = [] if runtime.is_oss else ["-DEXECUTORCH_INTERNAL_FLATBUFFERS=1"], + ) + runtime.cxx_library( name = "program" + aten_suffix, exported_deps = [ @@ -103,17 +106,17 @@ def define_common_targets(): preprocessor_flags = _program_preprocessor_flags(), exported_deps = [ ":memory_manager", - ":pte_data_map", - "//executorch/runtime/backend:interface", + ":pte_data_map" + aten_suffix, + "//executorch/runtime/backend:interface" + aten_suffix, "//executorch/runtime/core:core", - "//executorch/runtime/core:named_data_map", + "//executorch/runtime/core:named_data_map" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/runtime/core:event_tracer" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, - 
"//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/runtime/platform:platform", "//executorch/schema:extended_header", ], diff --git a/runtime/executor/tensor_parser.h b/runtime/executor/tensor_parser.h index 1fae84cfb05..e2b5ff8d6ea 100644 --- a/runtime/executor/tensor_parser.h +++ b/runtime/executor/tensor_parser.h @@ -21,7 +21,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace deserialization { /// Data structure to hold key and data buffer for external data used @@ -142,7 +142,7 @@ ET_NODISCARD Result getTensorDataPtr( Span external_constants = {}); } // namespace deserialization -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { @@ -150,10 +150,11 @@ namespace executor { namespace deserialization { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::deserialization::getTensorDataPtr; -using ::executorch::runtime::deserialization::parseListOptionalType; -using ::executorch::runtime::deserialization::parseTensor; -using ::executorch::runtime::deserialization::parseTensorList; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization::getTensorDataPtr; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization:: + parseListOptionalType; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization::parseTensor; +using ::executorch::ET_RUNTIME_NAMESPACE::deserialization::parseTensorList; } // namespace deserialization } // namespace executor } // namespace torch diff --git a/runtime/executor/tensor_parser_aten.cpp b/runtime/executor/tensor_parser_aten.cpp index d1a2f712853..2d454d15be5 100644 --- a/runtime/executor/tensor_parser_aten.cpp +++ b/runtime/executor/tensor_parser_aten.cpp @@ -19,7 +19,9 @@ #include // @donotremove @manual=//caffe2/aten:ATen-core namespace executorch { +// This file is only used in ATen mode, so we use the runtime_aten namespace. namespace runtime { +namespace aten { namespace deserialization { namespace { @@ -126,5 +128,6 @@ Result parseTensor( } } // namespace deserialization +} // namespace aten } // namespace runtime } // namespace executorch diff --git a/runtime/executor/tensor_parser_exec_aten.cpp b/runtime/executor/tensor_parser_exec_aten.cpp index 14ba5e0d42c..aa27bbf929d 100644 --- a/runtime/executor/tensor_parser_exec_aten.cpp +++ b/runtime/executor/tensor_parser_exec_aten.cpp @@ -16,11 +16,10 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace deserialization { using executorch::aten::ScalarType; -using executorch::runtime::TensorLayout; // Provides access to private Program methods. 
class TensorParser final { public: @@ -256,5 +255,5 @@ ET_NODISCARD Result getTensorDataPtr( } } // namespace deserialization -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp index 787af8b506b..e1f09d557ac 100644 --- a/runtime/executor/tensor_parser_portable.cpp +++ b/runtime/executor/tensor_parser_portable.cpp @@ -18,13 +18,13 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace deserialization { using executorch::runtime::Span; -using torch::executor::ScalarType; -using torch::executor::Tensor; -using torch::executor::TensorImpl; +using ::torch::executor::ScalarType; +using ::torch::executor::Tensor; +using ::torch::executor::TensorImpl; Result parseTensor( const Program* program, @@ -176,5 +176,5 @@ Result parseTensor( } } // namespace deserialization -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/executor/test/CMakeLists.txt b/runtime/executor/test/CMakeLists.txt index 2de32c9176a..512f832858f 100644 --- a/runtime/executor/test/CMakeLists.txt +++ b/runtime/executor/test/CMakeLists.txt @@ -27,9 +27,10 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" COMMAND python3 -m test.models.export_program --modules - "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain" + "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null COMMAND python3 -m test.models.export_program --modules "ModuleLinear" @@ -51,6 +52,7 @@ add_custom_target( "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" ) set(test_env @@ -64,6 +66,7 @@ set(test_env "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "ET_MODULE_MULTI_ENTRY_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "ET_MODULE_SIMPLE_TRAIN_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" + "ET_MODULE_STATEFUL_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" ) et_cxx_test( @@ -152,3 +155,23 @@ target_include_directories( PRIVATE "${CMAKE_INSTALL_PREFIX}/schema/include" "${EXECUTORCH_ROOT}/third-party/flatbuffers/include" ) + +list(TRANSFORM _test_backend_compiler_lib__srcs PREPEND "${EXECUTORCH_ROOT}/") +add_library( + test_backend_compiler_lib + STATIC + ${_test_backend_compiler_lib__srcs} +) + +target_link_libraries( + test_backend_compiler_lib + PUBLIC + executorch_core +) + +target_link_options_shared_lib(test_backend_compiler_lib) + +install( + TARGETS test_backend_compiler_lib + DESTINATION lib +) diff --git a/runtime/executor/test/executor_test.cpp b/runtime/executor/test/executor_test.cpp index 328b23a8df3..e2a44429941 100644 --- a/runtime/executor/test/executor_test.cpp +++ b/runtime/executor/test/executor_test.cpp @@ -22,14 +22,14 @@ using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::SizesType; using executorch::aten::Tensor; +using 
executorch::ET_RUNTIME_NAMESPACE::get_op_function_from_registry; +using executorch::ET_RUNTIME_NAMESPACE::Kernel; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; +using executorch::ET_RUNTIME_NAMESPACE::OpFunction; +using executorch::ET_RUNTIME_NAMESPACE::register_kernel; +using executorch::ET_RUNTIME_NAMESPACE::registry_has_op_function; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::get_op_function_from_registry; -using executorch::runtime::Kernel; -using executorch::runtime::KernelRuntimeContext; -using executorch::runtime::OpFunction; -using executorch::runtime::register_kernel; -using executorch::runtime::registry_has_op_function; using executorch::runtime::Result; using executorch::runtime::testing::TensorFactory; diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp index e9f09c38a59..211800d5dff 100644 --- a/runtime/executor/test/method_meta_test.cpp +++ b/runtime/executor/test/method_meta_test.cpp @@ -26,26 +26,34 @@ using torch::executor::util::FileDataLoader; class MethodMetaTest : public ::testing::Test { protected: - void SetUp() override { - // Create a loader for the serialized ModuleAdd program. - const char* path = std::getenv("ET_MODULE_ADD_PATH"); + void load_program(const char* path, const char* module_name) { + // Create a loader for the serialized program. Result loader = FileDataLoader::from(path); ASSERT_EQ(loader.error(), Error::Ok); - loader_ = std::make_unique(std::move(loader.get())); + loaders_.insert( + {module_name, + std::make_unique(std::move(loader.get()))}); // Use it to load the program. Result program = Program::load( - loader_.get(), Program::Verification::InternalConsistency); + loaders_[module_name].get(), + Program::Verification::InternalConsistency); ASSERT_EQ(program.error(), Error::Ok); - program_ = std::make_unique(std::move(program.get())); + programs_.insert( + {module_name, std::make_unique(std::move(program.get()))}); + } + + void SetUp() override { + load_program(std::getenv("ET_MODULE_ADD_PATH"), "add"); + load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful"); } private: // Must outlive program_, but tests shouldn't need to touch it. 
- std::unique_ptr loader_; + std::unordered_map> loaders_; protected: - std::unique_ptr program_; + std::unordered_map> programs_; }; namespace { @@ -67,7 +75,7 @@ void check_tensor(const TensorInfo& tensor_info) { } // namespace TEST_F(MethodMetaTest, MethodMetaApi) { - Result method_meta = program_->method_meta("forward"); + Result method_meta = programs_["add"]->method_meta("forward"); ASSERT_EQ(method_meta.error(), Error::Ok); // Appropriate amount of inputs @@ -97,11 +105,12 @@ TEST_F(MethodMetaTest, MethodMetaApi) { // Missing method fails EXPECT_EQ( - program_->method_meta("not_a_method").error(), Error::InvalidArgument); + programs_["add"]->method_meta("not_a_method").error(), + Error::InvalidArgument); } TEST_F(MethodMetaTest, TensorInfoApi) { - Result method_meta = program_->method_meta("forward"); + Result method_meta = programs_["add"]->method_meta("forward"); ASSERT_EQ(method_meta.error(), Error::Ok); // Input 1 @@ -138,3 +147,19 @@ TEST_F(MethodMetaTest, TensorInfoApi) { EXPECT_EQ( method_meta->output_tensor_meta(-1).error(), Error::InvalidArgument); } + +TEST_F(MethodMetaTest, MethodMetaAttribute) { + Result method_meta = + programs_["stateful"]->method_meta("forward"); + ASSERT_EQ(method_meta.error(), Error::Ok); + + ASSERT_EQ(method_meta->num_attributes(), 1); + auto state = method_meta->attribute_tensor_meta(0); + ASSERT_TRUE(state.ok()); + + ASSERT_EQ(state->name(), "state"); + ASSERT_FALSE(state->is_memory_planned()); + + auto bad_access = method_meta->attribute_tensor_meta(1); + ASSERT_EQ(bad_access.error(), Error::InvalidArgument); +} diff --git a/runtime/executor/test/method_test.cpp b/runtime/executor/test/method_test.cpp index 0c6a2db94b7..5324ff5916d 100644 --- a/runtime/executor/test/method_test.cpp +++ b/runtime/executor/test/method_test.cpp @@ -79,6 +79,7 @@ class MethodTest : public ::testing::Test { load_program( std::getenv("ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH"), "cat"); load_program(std::getenv("ET_MODULE_LINEAR_PATH"), "linear"); + load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful"); load_program( std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), "linear_constant_buffer"); @@ -339,6 +340,31 @@ TEST_F(MethodTest, ProgramDataSeparationTest) { ASSERT_EQ(err, Error::Ok); } +TEST_F(MethodTest, MethodGetAttributeTest) { + ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); + Result method = + programs_["stateful"]->load_method("forward", &mmm.get()); + ASSERT_EQ(method.error(), Error::Ok); + + auto res = method->get_attribute("state"); + ASSERT_TRUE(res.ok()); + // expect data to be empty + EXPECT_EQ(res->const_data_ptr(), nullptr); + + int32_t data = 0; + res->set_data(&data); + + // expect data to be set + EXPECT_EQ(res->const_data_ptr(), &data); + + // Can execute the method. 
+ Error err = method->execute(); + ASSERT_EQ(err, Error::Ok); + + // Expect the state to be incremented + EXPECT_EQ(res->const_data_ptr()[0], 1); +} + /* * TODO(T161163608): Test is disabled due to a resize bug in tensor_index_out of * the portable op lib diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index dd5262b5ac6..75ea2674aa7 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -70,7 +70,7 @@ def define_common_targets(is_fbcode = False): "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/runtime/platform:platform", ], ) @@ -122,6 +122,7 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_LINEAR_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", "ET_MODULE_MULTI_ENTRY_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleMultipleEntry.pte])", "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", + "ET_MODULE_STATEFUL_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleStateful.pte])", "ET_MODULE_LINEAR_PROGRAM_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.pte])", "ET_MODULE_LINEAR_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.ptd])", } @@ -233,9 +234,9 @@ def define_common_targets(is_fbcode = False): # Uses an fbcode target path because the authoring/export tools # intentionally don't work in xplat (since they're host-only # tools). 
- "ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH": "$(location fbcode//executorch/test/models:exported_delegated_programs[ModuleAddMul-nosegments-da1024.pte])", - "ET_MODULE_ADD_MUL_NOSEGMENTS_PATH": "$(location fbcode//executorch/test/models:exported_delegated_programs[ModuleAddMul-nosegments.pte])", - "ET_MODULE_ADD_MUL_PATH": "$(location fbcode//executorch/test/models:exported_delegated_programs[ModuleAddMul.pte])", + "ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH": "$(location fbcode//executorch/test/models:exported_delegated_add_mul[ModuleAddMul-nosegments-da1024.pte])", + "ET_MODULE_ADD_MUL_NOSEGMENTS_PATH": "$(location fbcode//executorch/test/models:exported_delegated_add_mul[ModuleAddMul-nosegments.pte])", + "ET_MODULE_ADD_MUL_PATH": "$(location fbcode//executorch/test/models:exported_delegated_add_mul[ModuleAddMul.pte])", }, ) diff --git a/runtime/executor/test/test_backend_compiler_lib.cpp b/runtime/executor/test/test_backend_compiler_lib.cpp index 9eea6384d6f..ce631eb4f57 100644 --- a/runtime/executor/test/test_backend_compiler_lib.cpp +++ b/runtime/executor/test/test_backend_compiler_lib.cpp @@ -13,13 +13,13 @@ #include #include /* strtol */ +using executorch::ET_RUNTIME_NAMESPACE::Backend; +using executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInterface; +using executorch::ET_RUNTIME_NAMESPACE::CompileSpec; +using executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::BackendInterface; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; diff --git a/runtime/executor/test/test_backend_with_delegate_mapping.cpp b/runtime/executor/test/test_backend_with_delegate_mapping.cpp index e6d84aca189..a0b79b09c6d 100644 --- a/runtime/executor/test/test_backend_with_delegate_mapping.cpp +++ b/runtime/executor/test/test_backend_with_delegate_mapping.cpp @@ -14,13 +14,13 @@ #include /* strtol */ #include +using executorch::ET_RUNTIME_NAMESPACE::Backend; +using executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendInterface; +using executorch::ET_RUNTIME_NAMESPACE::CompileSpec; +using executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::BackendInterface; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; diff --git a/runtime/kernel/kernel_runtime_context.h b/runtime/kernel/kernel_runtime_context.h index ad269f5dd4b..6facecc7632 100644 --- a/runtime/kernel/kernel_runtime_context.h +++ b/runtime/kernel/kernel_runtime_context.h @@ -15,7 +15,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { /** * Runtime state and functionality for kernel implementations. 
@@ -107,7 +107,7 @@ class KernelRuntimeContext { Error failure_state_ = Error::Ok; }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch // TODO(T197294990): Remove these deprecated aliases once all users have moved @@ -115,15 +115,15 @@ class KernelRuntimeContext { namespace torch { namespace executor { /// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. -using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; /// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. -using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; +using RuntimeContext = ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; } // namespace executor } // namespace torch namespace executorch { namespace aten { /// DEPRECATED: Use ::executorch::runtime::KernelRuntimeContext instead. -using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; +using RuntimeContext = ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; } // namespace aten } // namespace executorch // DEPRECATED: The executorch::aten:: namespace is deprecated. Use diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp index 85705e5b3fd..d7e7b298c10 100644 --- a/runtime/kernel/operator_registry.cpp +++ b/runtime/kernel/operator_registry.cpp @@ -15,7 +15,7 @@ #include namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { namespace { @@ -258,5 +258,5 @@ Span get_registered_kernels() { return {registered_kernels, num_registered_kernels}; } -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch diff --git a/runtime/kernel/operator_registry.h b/runtime/kernel/operator_registry.h index a3cdcd66cee..f7a62208dd8 100644 --- a/runtime/kernel/operator_registry.h +++ b/runtime/kernel/operator_registry.h @@ -40,7 +40,7 @@ } namespace executorch { -namespace runtime { +namespace ET_RUNTIME_NAMESPACE { class KernelRuntimeContext; // Forward declaration using OpFunction = void (*)(KernelRuntimeContext&, EValue**); @@ -258,38 +258,41 @@ ET_NODISCARD inline Error register_kernel(const Kernel& kernel) { return register_kernels({&kernel, 1}); }; -} // namespace runtime +} // namespace ET_RUNTIME_NAMESPACE } // namespace executorch namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. 
-using ::executorch::runtime::Kernel; -using ::executorch::runtime::KernelKey; -using ::executorch::runtime::KernelRuntimeContext; -using ::executorch::runtime::OpFunction; -using ::executorch::runtime::TensorMeta; -using KernelRuntimeContext = ::executorch::runtime::KernelRuntimeContext; +using ::executorch::ET_RUNTIME_NAMESPACE::Kernel; +using ::executorch::ET_RUNTIME_NAMESPACE::KernelKey; +using ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; +using ::executorch::ET_RUNTIME_NAMESPACE::OpFunction; +using ::executorch::ET_RUNTIME_NAMESPACE::TensorMeta; +using KernelRuntimeContext = + ::executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; inline ::executorch::runtime::Error register_kernels(ArrayRef kernels) { - return ::executorch::runtime::register_kernels( + return ::executorch::ET_RUNTIME_NAMESPACE::register_kernels( {kernels.data(), kernels.size()}); } inline OpFunction getOpsFn( const char* name, ArrayRef meta_list = {}) { - auto result = ::executorch::runtime::get_op_function_from_registry( - name, {meta_list.data(), meta_list.size()}); + auto result = + ::executorch::ET_RUNTIME_NAMESPACE::get_op_function_from_registry( + name, {meta_list.data(), meta_list.size()}); ET_CHECK(result.ok()); // get_op_function_from_registry() logs details. return *result; } inline bool hasOpsFn(const char* name, ArrayRef meta_list = {}) { - return ::executorch::runtime::registry_has_op_function( + return ::executorch::ET_RUNTIME_NAMESPACE::registry_has_op_function( name, {meta_list.data(), meta_list.size()}); } inline ArrayRef get_kernels() { - Span kernels = ::executorch::runtime::get_registered_kernels(); + Span kernels = + ::executorch::ET_RUNTIME_NAMESPACE::get_registered_kernels(); return ArrayRef(kernels.data(), kernels.size()); } } // namespace executor diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index b6aa9d7a95e..8a945f19881 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -21,21 +21,6 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - runtime.cxx_library( - name = "operator_registry", - srcs = ["operator_registry.cpp"], - exported_headers = ["operator_registry.h"], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/core:evalue", - ], - preprocessor_flags = _operator_registry_preprocessor_flags(), - ) - runtime.cxx_library( name = "operator_registry_MAX_NUM_KERNELS_TEST_ONLY", srcs = ["operator_registry.cpp"], @@ -68,6 +53,21 @@ def define_common_targets(): for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" + runtime.cxx_library( + name = "operator_registry" + aten_suffix, + srcs = ["operator_registry.cpp"], + exported_headers = ["operator_registry.h"], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core:evalue" + aten_suffix, + ], + preprocessor_flags = _operator_registry_preprocessor_flags(), + ) + runtime.cxx_library( name = "kernel_runtime_context" + aten_suffix, exported_headers = [ diff --git a/runtime/kernel/test/kernel_runtime_context_test.cpp b/runtime/kernel/test/kernel_runtime_context_test.cpp index 50bd860fb9c..2c3b536b0d4 100644 --- a/runtime/kernel/test/kernel_runtime_context_test.cpp +++ b/runtime/kernel/test/kernel_runtime_context_test.cpp @@ -13,8 +13,8 @@ #include using namespace ::testing; +using executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext; using executorch::runtime::Error; -using executorch::runtime::KernelRuntimeContext; using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; diff --git a/runtime/kernel/test/targets.bzl b/runtime/kernel/test/targets.bzl index bd66fc05b6f..4b3ed0f3139 100644 --- a/runtime/kernel/test/targets.bzl +++ b/runtime/kernel/test/targets.bzl @@ -101,3 +101,16 @@ def define_common_targets(): ":specialized_kernel_generated_lib", ], ) + + if aten_mode: + # Make sure we can depend on both generated_lib and generated_lib_aten + # in the same binary. + runtime.cxx_test( + name = "test_generated_lib_and_aten", + srcs = ["test_generated_lib_and_aten.cpp"], + deps = [ + "//executorch/kernels/portable:generated_lib", + "//executorch/kernels/portable:generated_lib_aten", + "//executorch/runtime/kernel:operator_registry_aten", + ], + ) diff --git a/runtime/kernel/test/test_generated_lib_and_aten.cpp b/runtime/kernel/test/test_generated_lib_and_aten.cpp new file mode 100644 index 00000000000..f9bfebc4a80 --- /dev/null +++ b/runtime/kernel/test/test_generated_lib_and_aten.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace ::testing; +using executorch::aten::ScalarType; +using executorch::runtime::Error; +using executorch::runtime::EValue; + +class GeneratedLibAndAtenTest : public ::testing::Test { + public: + void SetUp() override { + executorch::runtime::runtime_init(); + } +}; + +TEST_F(GeneratedLibAndAtenTest, GetKernelsFromATenRegistry) { + // Check if the kernel exists in the ATen registry + bool has_kernel = + executorch::runtime::aten::registry_has_op_function("aten::add.out"); + EXPECT_TRUE(has_kernel) + << "Kernel 'aten::add.out' not found in ATen registry"; + + // Get the kernel from the ATen registry + auto result = + executorch::runtime::aten::get_op_function_from_registry("aten::add.out"); + EXPECT_EQ(result.error(), Error::Ok) + << "Failed to get kernel from ATen registry"; + EXPECT_NE(*result, nullptr) << "Kernel function from ATen registry is null"; +} diff --git a/scripts/build_android_library.sh b/scripts/build_android_library.sh index 8a385ad6876..cbde7ae3d43 100755 --- a/scripts/build_android_library.sh +++ b/scripts/build_android_library.sh @@ -12,11 +12,6 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then fi which "${PYTHON_EXECUTABLE}" -copy_src() { - cp -r extension/android/build.gradle extension/android/settings.gradle extension/android/gradlew extension/android/gradle extension/android/gradlew.bat extension/android/gradle.properties "${BUILD_AAR_DIR}" - cp -r extension/android/executorch_android "${BUILD_AAR_DIR}/executorch_android" -} - build_android_native_library() { ANDROID_ABI="$1" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" @@ -70,11 +65,6 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" - # Update tokenizers submodule - pushd extension/llm/tokenizers - echo "Update tokenizers submodule" - git submodule update --init - popd cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -93,54 +83,52 @@ build_android_native_library() { cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" # Copy artifacts to ABI specific directory - mkdir -p "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}" - cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + local SO_STAGE_DIR="cmake-out-android-so/${ANDROID_ABI}" + mkdir -p ${SO_STAGE_DIR} + cp "${CMAKE_OUT}"/extension/android/*.so "${SO_STAGE_DIR}/libexecutorch.so" # Copy QNN related so library if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then - cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - 
cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${SO_STAGE_DIR} + cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${SO_STAGE_DIR} fi # Copy MTK related so library if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ -n "$NEURON_USDK_ADAPTER_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then - cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ - cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ - cp "${NEURON_USDK_ADAPTER_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ + cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${SO_STAGE_DIR} + cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${SO_STAGE_DIR} + cp "${NEURON_USDK_ADAPTER_LIB}" ${SO_STAGE_DIR} fi } build_aar() { - pushd "${BUILD_AAR_DIR}" - # Rename libexecutorch_jni.so to libexecutorch.so for soname consistency - # between Java and JNI - find . -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; if [ "$EXECUTORCH_CMAKE_BUILD_TYPE" == "Release" ]; then - find . -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; + find cmake-out-android-so -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; fi + pushd extension/android/ ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build - cp executorch_android/build/outputs/aar/executorch_android-debug.aar executorch.aar + # Use java unit test as sanity check + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest popd + if [ ! -z $BUILD_AAR_DIR ]; then + cp extension/android/executorch_android/build/outputs/aar/executorch_android-debug.aar "${BUILD_AAR_DIR}/executorch.aar" + fi } main() { - if [[ -z "${BUILD_AAR_DIR:-}" ]]; then - BUILD_AAR_DIR="$(mktemp -d)" - fi - export BUILD_AAR_DIR if [ -z "$ANDROID_ABIS" ]; then ANDROID_ABIS=("arm64-v8a" "x86_64") fi export ANDROID_ABIS - copy_src + mkdir -p cmake-out-android-so/ for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do build_android_native_library ${ANDROID_ABI} done diff --git a/scripts/check_urls.sh b/scripts/check_urls.sh new file mode 100755 index 00000000000..ad6c1440ebe --- /dev/null +++ b/scripts/check_urls.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +status=0 +green='\e[1;32m'; red='\e[1;31m'; cyan='\e[1;36m'; yellow='\e[1;33m'; reset='\e[0m' +user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36" +last_filepath= + +while IFS=: read -r filepath url; do + if [ "$filepath" != "$last_filepath" ]; then + printf '\n%s:\n' "$filepath" + last_filepath=$filepath + fi + code=$(curl -gsLm30 --retry 3 --retry-delay 3 --retry-connrefused -o /dev/null -w "%{http_code}" -I "$url") || code=000 + if [ "$code" -lt 200 ] || [ "$code" -ge 400 ]; then + code=$(curl -gsLm30 --retry 3 --retry-delay 3 --retry-connrefused -o /dev/null -w "%{http_code}" -r 0-0 -A "$user_agent" "$url") || code=000 + fi + if [ "$code" -lt 200 ] || [ "$code" -ge 400 ]; then + request_id=$(curl -sS -H 'Accept: application/json' "https://check-host.net/check-http?host=$url&max_nodes=1&node=us3.node.check-host.net" \ + | jq -r .request_id) + for _ in {1..3}; do + code=$(curl -sS -H 'Accept: application/json' "https://check-host.net/check-result/$request_id" \ + | jq -r -e '.[][0][3]') || code=000 + [[ "$code" =~ ^[0-9]+$ ]] || code=000 + sleep 3 + done + fi + if [ "$code" -lt 200 ] || [ "$code" -ge 400 ]; then + printf "${red}%s${reset} ${yellow}%s${reset}\n" "$code" "$url" >&2 + status=1 + else + printf "${green}%s${reset} ${cyan}%s${reset}\n" "$code" "$url" + fi +done < <( + git --no-pager grep --no-color -I -P -o \ + '(?\")]*[\{\}\$])[^[:space:]<>\")\[\]\(]+' \ + -- '*' \ + ':(exclude).*' \ + ':(exclude)**/.*' \ + ':(exclude)**/*.lock' \ + ':(exclude)**/*.svg' \ + ':(exclude)**/*.xml' \ + ':(exclude)**/*.gradle*' \ + ':(exclude)**/*gradle*' \ + ':(exclude)**/third-party/**' \ + | sed -E 's/[^/[:alnum:]]+$//' \ + | grep -Ev '://(0\.0\.0\.0|127\.0\.0\.1|localhost)([:/])' \ + | grep -Ev 'fwdproxy:8080' \ + || true +) + +exit $status diff --git a/scripts/check_xrefs.sh b/scripts/check_xrefs.sh new file mode 100755 index 00000000000..69e083a8a67 --- /dev/null +++ b/scripts/check_xrefs.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +status=0 +green='\e[1;32m'; red='\e[1;31m'; cyan='\e[1;36m'; yellow='\e[1;33m'; reset='\e[0m' +last_filepath= + +while IFS=: read -r filepath link; do + if [ "$filepath" != "$last_filepath" ]; then + printf '\n%s:\n' "$filepath" + last_filepath=$filepath + fi + if [ -e "$(dirname "$filepath")/${link%%#*}" ]; then + printf " ${green}OK${reset} ${cyan}%s${reset}\n" "$link" + else + printf "${red}FAIL${reset} ${yellow}%s${reset}\n" "$link" >&2 + status=1 + fi +done < <( + git --no-pager grep --no-color -I -o -E \ + '\[[^]]+\]\([^[:space:])]*/[^[:space:])]*\)|href="[^"]*/[^"]*"|src="[^"]*/[^"]*"' \ + -- '*' \ + ':(exclude).*' \ + ':(exclude)**/.*' \ + ':(exclude)**/*.lock' \ + ':(exclude)**/*.svg' \ + ':(exclude)**/*.xml' \ + ':(exclude)**/third-party/**' \ + | grep -Ev 'https?://' \ + | sed -E \ + -e 's#([^:]+):\[[^]]+\]\(([^)]+)\)#\1:\2#' \ + -e 's#([^:]+):href="([^"]+)"#\1:\2#' \ + -e 's#([^:]+):src="([^"]+)"#\1:\2#' \ + -e 's/[[:punct:]]*$//' \ + | grep -Ev '\{\{' \ + || true +) + +exit $status diff --git a/scripts/run_android_emulator.sh b/scripts/run_android_emulator.sh index fe73ec8a1d7..29c2425cd0e 100755 --- a/scripts/run_android_emulator.sh +++ b/scripts/run_android_emulator.sh @@ -18,17 +18,14 @@ $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; d echo "List all running emulators" $ADB_PATH devices -adb uninstall com.example.executorchllamademo || true -adb uninstall com.example.executorchllamademo.test || true -adb install -t app-debug.apk -adb install -t app-debug-androidTest.apk - -adb shell mkdir -p /data/local/tmp/llama -adb push model.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -adb shell am instrument -w -r com.example.executorchllamademo.test/androidx.test.runner.AndroidJUnitRunner - adb uninstall org.pytorch.executorch.test || true adb install -t android-test-debug-androidTest.apk -adb shell am instrument -w -r org.pytorch.executorch.test/androidx.test.runner.AndroidJUnitRunner +adb logcat -c +adb shell am instrument -w -r \ + org.pytorch.executorch.test/androidx.test.runner.AndroidJUnitRunner >result.txt 2>&1 +adb logcat -d > logcat.txt +cat logcat.txt +grep -q FAILURES result.txt && cat result.txt +grep -q FAILURES result.txt && exit -1 +exit 0 diff --git a/scripts/test_ios.sh b/scripts/test_ios.sh index 385c85f3dfe..245f7b06f7a 100755 --- a/scripts/test_ios.sh +++ b/scripts/test_ios.sh @@ -15,7 +15,7 @@ set -e OUTPUT="${1:-executorch}" EXIT_STATUS=0 -APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" +APP_PATH="executorch-examples/apple/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" @@ -44,26 +44,17 @@ say() { echo -e "\033[1m\n\t** $1 **\n\033[0m" } -say "Cloning the Code" - -pushd . 
> /dev/null -git clone -b viable/strict https://github.com/pytorch/executorch.git "$OUTPUT" -cd "$OUTPUT" - -say "Updating the Submodules" - -git submodule update --init - say "Activating a Virtual Environment" -python3 -m venv .venv -source .venv/bin/activate +python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip say "Installing Requirements" -pip install --upgrade cmake pip setuptools wheel zstd +./install_executorch.sh -./install_executorch.sh --pybind coreml mps xnnpack +say "Cloning the Demo App" + +git clone --depth 1 https://github.com/pytorch-labs/executorch-examples.git say "Installing CoreML Backend Requirements" @@ -88,11 +79,6 @@ say "Downloading Labels" curl https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt \ -o "$APP_PATH/Resources/Models/MobileNet/imagenet_classes.txt" -say "Building Frameworks" - -./scripts/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack -mv cmake-out "$APP_PATH/Frameworks" - say "Creating Simulator" xcrun simctl create "$SIMULATOR_NAME" "iPhone 15" diff --git a/setup.py b/setup.py index 44fb9a712a3..2c5f5578bcf 100644 --- a/setup.py +++ b/setup.py @@ -606,8 +606,8 @@ def run(self): # be found in the pip package. This is the subset of headers that are # essential for building custom ops extensions. # TODO: Use cmake to gather the headers instead of hard-coding them here. - # For example: https://discourse.cmake.org/t/installing-headers-the-modern- - # way-regurgitated-and-revisited/3238/3 + # For example: + # https://discourse.cmake.org/t/installing-headers-the-modern-way-regurgitated-and-revisited/3238/3 for include_dir in [ "runtime/core/", "runtime/kernel/", @@ -718,6 +718,7 @@ def run(self): # enabled. TODO(dbort): Remove this override once this option is # managed by cmake itself. 
"-DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF", + "-DEXECUTORCH_BUILD_TESTS=ON", ] build_args = [f"-j{self.parallel}"] diff --git a/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl b/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl index 5cb801489ed..bd011748786 100644 --- a/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl +++ b/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl @@ -1,2 +1,2 @@ -def get_qnn_library_verision(): +def get_qnn_library_version(): return "2.28" diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl index a6d6d59e0c2..e1cebaa1140 100644 --- a/shim_et/xplat/executorch/codegen/codegen.bzl +++ b/shim_et/xplat/executorch/codegen/codegen.bzl @@ -688,7 +688,7 @@ def executorch_generated_lib( "ovr_config//os:windows": [], }) + compiler_flags, deps = [ - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry" + aten_suffix, "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/codegen:macros", diff --git a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl index 61eeaf7c179..1616304c3ea 100644 --- a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl @@ -21,9 +21,9 @@ PORTABLE_MODULE_DEPS = [ ] + get_all_cpu_backend_targets() ATEN_MODULE_DEPS = [ - "//executorch/runtime/kernel:operator_registry", + "//executorch/runtime/kernel:operator_registry_aten", "//executorch/runtime/executor:program_aten", - "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten:lib_aten", "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index a1ffdc1eed3..d0c39bcf17f 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -372,7 +372,6 @@ ATEN_OPS = ( name = "op_bmm", deps = [ "//executorch/kernels/portable/cpu/util:matmul_ops_util", - ":vec_ops", ], ), op_target( @@ -1269,6 +1268,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:reduce_util", ], ), + op_target( + name = "op_view_as_real_copy", + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + ], + ), op_target( name = "op_view_copy", deps = [ diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index d3dcc229100..a8124d62dd4 100644 --- a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -70,6 +70,7 @@ def export( skip_type_promotion: bool = False, export_joint_graph: bool = False, external_constants: bool = False, + export_state_names: bool = False, ) -> "ExportedModule": """ Creates a new ExportedModule for the specified module class. 
@@ -148,7 +149,9 @@ def return_wrapper(): for method in methods: method_name_to_dynamic_shapes[method] = trace_dynamic_shapes - memory_planning_pass = MemoryPlanningPass() + memory_planning_pass = MemoryPlanningPass( + alloc_mutable_buffers=not export_state_names + ) if hasattr(eager_module, "get_memory_planning_pass"): memory_planning_pass = eager_module.get_memory_planning_pass() # type: ignore[operator] @@ -208,6 +211,7 @@ def __init__(self, method): memory_planning_pass=memory_planning_pass, to_out_var_pass=ToOutVarPass(ignore_to_out_var_failure), external_constants=external_constants, + emit_mutable_buffer_names=export_state_names, ) ) diff --git a/test/models/export_delegated_program.py b/test/models/export_delegated_program.py index 4f4429aca88..44ae8df544f 100644 --- a/test/models/export_delegated_program.py +++ b/test/models/export_delegated_program.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import argparse import inspect import os @@ -19,6 +21,7 @@ from executorch.exir.backend.test.backend_with_compiler_demo import ( BackendWithCompilerDemo, ) +from executorch.exir.program import ExecutorchProgramManager from torch import nn from torch.export import export @@ -111,10 +114,10 @@ def export_module_to_program( *, backend_id: str, extract_delegate_segments: bool, - constant_tensor_alignemnt: Optional[int] = None, + constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, method: str = "forward", -) -> bytes: +) -> ExecutorchProgramManager: eager_module = module_class().eval() inputs = () if hasattr(eager_module, "get_random_inputs"): @@ -135,7 +138,7 @@ def forward(self, *args, **kwargs): edge_config = EdgeCompileConfig(_check_ir_validity=False) et_config = exir.ExecutorchBackendConfig( extract_delegate_segments=extract_delegate_segments, - constant_tensor_alignment=constant_tensor_alignemnt, + constant_tensor_alignment=constant_tensor_alignment, delegate_alignment=delegate_alignment, ) @@ -170,7 +173,7 @@ def forward(self, *args, **kwargs): export(composite_module, args=inputs, strict=True) ).to_executorch(config=et_config) - return executorch_program.buffer + return executorch_program def main() -> None: @@ -199,6 +202,14 @@ def main() -> None: help="ID of the backend to use for delegation; " + f"one of {known_backend_ids}", ) + parser.add_argument( + "--inline_delegate_segments", + action="store_true", + help="Store delegate data inside the flatbuffer.", + ) + parser.add_argument( + "--delegate_alignment", type=int, default=None, help="Delegate alignment." + ) parser.add_argument( "--outdir", type=str, @@ -219,25 +230,22 @@ def main() -> None: # Export and write to the output files. os.makedirs(args.outdir, exist_ok=True) + suffix = "" for module_name, module_class in module_names_to_classes.items(): - for extract_delegate_segments in (True, False): - suffix = "" if extract_delegate_segments else "-nosegments" - # Create files with the default alignment, and a large alignment. - # This alignment should be so large that it's extremely unlikely for - # the data to accidentally be aligned to it in the default case. 
- for delegate_alignment in (None, 1024): - suffix += f"-da{delegate_alignment}" if delegate_alignment else "" - outfile = os.path.join(args.outdir, f"{module_name}{suffix}.pte") - with open(outfile, "wb") as fp: - fp.write( - export_module_to_program( - module_class, - backend_id=args.backend_id, - extract_delegate_segments=extract_delegate_segments, - delegate_alignment=delegate_alignment, - ) - ) - print(f"Exported {module_name} and wrote program data to {outfile}") + if args.inline_delegate_segments: + suffix += "-nosegments" + if args.delegate_alignment is not None: + suffix += f"-da{args.delegate_alignment}" + outfile = os.path.join(args.outdir, f"{module_name}{suffix}.pte") + executorch_program = export_module_to_program( + module_class, + backend_id=args.backend_id, + extract_delegate_segments=not args.inline_delegate_segments, + delegate_alignment=args.delegate_alignment, + ) + with open(outfile, "wb") as fp: + fp.write(executorch_program.buffer) + print(f"Exported {module_name} and wrote program data to {outfile}") if __name__ == "__main__": diff --git a/test/models/export_program.py b/test/models/export_program.py index ccf8a965eb2..5ed9cba4f8e 100644 --- a/test/models/export_program.py +++ b/test/models/export_program.py @@ -183,6 +183,23 @@ def export_joint(): return True +class ModuleStateful(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("state", torch.zeros(1, dtype=torch.int32)) + + def forward(self, x): + self.state.add_(1) + return x + self.state + + def get_random_inputs(self): + return (torch.ones(1),) + + @staticmethod + def export_state_names(): + return True + + # # Main logic. # @@ -201,8 +218,11 @@ def export_module_to_program( # pyre-ignore[16]: pyre doesn't know about get_export_kwargs. export_kwargs = module_class.get_export_kwargs() export_joint = False + export_state_names = False if hasattr(module_class, "export_joint"): export_joint = module_class.export_joint() # pyre-ignore + if hasattr(module_class, "export_state_names"): + export_state_names = module_class.export_state_names() if hasattr(module_class, "get_method_names_to_export"): # pyre-ignore[16]: pyre doesn't know about get_export_kwargs. methods = module_class.get_method_names_to_export() @@ -214,6 +234,7 @@ def export_module_to_program( skip_type_promotion=skip_type_promotion, export_joint_graph=export_joint, external_constants=external_constants, + export_state_names=export_state_names, **export_kwargs, ) return module.executorch_program diff --git a/test/models/targets.bzl b/test/models/targets.bzl index 6d5b6753f3f..6538302c507 100644 --- a/test/models/targets.bzl +++ b/test/models/targets.bzl @@ -67,6 +67,7 @@ def define_common_targets(): "ModuleIndex", "ModuleDynamicCatUnallocatedIO", "ModuleSimpleTrain", + "ModuleStateful", ] # Generates Executorch .pte program files for various modules at build time. @@ -150,7 +151,7 @@ def define_common_targets(): visibility = [], # Private ) - # Class names of nn.Modules for :exported_delegated_programs to export. + # Class names of nn.Modules available in export_delegated_program.py. DELEGATED_MODULES_TO_EXPORT = [ "ModuleAddMul", "ModuleAddLarge", @@ -160,23 +161,23 @@ def define_common_targets(): # Name of the backend to use when exporting delegated programs. BACKEND_ID = "StubBackend" - # Generates Executorch .pte program files for various modules at build time. + # Generates Executorch .pte program files for the AddMul module at build time. 
# To use one, depend on a target like - # ":exported_delegated_programs[ModuleAdd.pte]" or - # ":exported_delegated_programs[ModuleAdd-nosegments.pte]" (which does not + # ":exported_delegated_add_mul[ModuleAdd.pte]" or + # ":exported_delegated_add_mul[ModuleAdd-nosegments.pte]" (which does not # extract the delegate data blobs into segments). runtime.genrule( - name = "exported_delegated_programs", - cmd = "$(exe :export_delegated_program)" + - " --modules " + ",".join(DELEGATED_MODULES_TO_EXPORT) + - " --backend_id " + BACKEND_ID + - " --outdir $OUT", + name = "exported_delegated_add_mul", + cmd = "$(exe :export_delegated_program) --modules ModuleAddMul --backend_id " + BACKEND_ID + " --outdir $OUT" + + " && $(exe :export_delegated_program) --modules ModuleAddMul --backend_id " + BACKEND_ID + " --inline_delegate_segments --outdir $OUT" + + # Create files with a large alignment as well as the default. + # This alignment should be so large that it's extremely unlikely for + # the data to accidentally be aligned to it in the default case. + " && $(exe :export_delegated_program) --modules ModuleAddMul --backend_id " + BACKEND_ID + " --inline_delegate_segments --delegate_alignment 1024 --outdir $OUT", outs = { - fname + seg_suffix + da_suffix + ".pte": [fname + seg_suffix + da_suffix + ".pte"] - for fname in DELEGATED_MODULES_TO_EXPORT - for seg_suffix in ["", "-nosegments"] - # "da" = delegate alignment - for da_suffix in ["", "-da1024"] + "ModuleAddMul.pte": ["ModuleAddMul.pte"], + "ModuleAddMul-nosegments.pte": ["ModuleAddMul-nosegments.pte"], + "ModuleAddMul-nosegments-da1024.pte": ["ModuleAddMul-nosegments-da1024.pte"], }, default_outs = ["."], visibility = [ @@ -188,7 +189,7 @@ def define_common_targets(): runtime.genrule( name = "exported_xnnp_delegated_programs", cmd = "$(exe :export_delegated_program)" + - " --modules " + ",".join(DELEGATED_MODULES_TO_EXPORT) + + " --modules ModuleAddLarge,ModuleSubLarge" + " --backend_id " + "XnnpackBackend" + " --outdir $OUT", outs = { diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index ff2ed048257..422cd579d04 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -40,6 +40,7 @@ build_executorch() { -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ diff --git a/test/utils/DeathTest.h b/test/utils/DeathTest.h index 2ba9bd61bd9..be83593adf1 100644 --- a/test/utils/DeathTest.h +++ b/test/utils/DeathTest.h @@ -15,6 +15,10 @@ #include +#ifndef ET_BUILD_MODE_COV +#define ET_BUILD_MODE_COV 0 +#endif // ET_BUILD_MODE_COV + #if ET_BUILD_MODE_COV /** diff --git a/third-party/ao b/third-party/ao index 923242e22b5..7fa9c69dc09 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 923242e22b5fb67646473605ab959b90cc450abc +Subproject commit 7fa9c69dc0999023add31d000d4750e0ac2cd799 diff --git a/tools/cmake/cmake_deps.toml b/tools/cmake/cmake_deps.toml index ee810c2bfd5..9913a02c4d5 100644 --- a/tools/cmake/cmake_deps.toml +++ b/tools/cmake/cmake_deps.toml @@ -150,6 +150,20 @@ deps = [ "optimized_cpublas", "portable_kernels", ] + +[targets.test_backend_compiler_lib] +buck_targets = [ + "//runtime/executor/test:test_backend_compiler_lib", +] +filters = [ + ".cpp$", +] +excludes = [ +] +deps = [ + "executorch", + "executorch_core", +] # 
---------------------------------- core end ---------------------------------- # ---------------------------------- extension start ---------------------------------- [targets.extension_data_loader] diff --git a/util/collect_env.py b/util/collect_env.py index 7d35c0636ce..ec44c9d6149 100644 --- a/util/collect_env.py +++ b/util/collect_env.py @@ -220,8 +220,7 @@ def get_cudnn_version(run_lambda): cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) elif get_platform() == "darwin": # CUDA libraries and drivers can be found in /usr/local/cuda/. See - # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install - # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # https://docs.nvidia.com/cuda/archive/10.1/cuda-installation-guide-mac-os-x/index.html#3.2-Install # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*" else: diff --git a/util/python_profiler.py b/util/python_profiler.py index 8993beb9429..c62b0ffafe0 100644 --- a/util/python_profiler.py +++ b/util/python_profiler.py @@ -44,7 +44,9 @@ def _from_pstat_to_static_html(stats: Stats, html_filename: str): html_filename: Output filename in which populated template is rendered """ RESTR = r'(?