Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi committed Feb 5, 2025
2 parents fbebe68 + 4d702e2 commit ef955e7
Showing 77 changed files with 1,435 additions and 558 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/linux-cpu-x64-build.yml
@@ -10,6 +10,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"
@@ -84,10 +85,6 @@ jobs:
python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
python3 -m pip install --user --no-index --no-deps --find-links build/cpu/wheel onnxruntime_genai
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Verify Build Artifacts
if: always()
continue-on-error: true
5 changes: 1 addition & 4 deletions .github/workflows/linux-cpu-x64-nightly-build.yml
@@ -12,6 +12,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ort_dir: "onnxruntime-linux-x64-1.18.0"
ort_zip: "onnxruntime-linux-x64-1.18.0.tgz"
ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.18.0/onnxruntime-linux-x64-1.18.0.tgz"
@@ -55,10 +56,6 @@ jobs:
python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl --no-deps
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Run the python tests
run: |
python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e
5 changes: 1 addition & 4 deletions .github/workflows/linux-gpu-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux
ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"
@@ -109,10 +110,6 @@ jobs:
bash -c " \
/usr/bin/cmake --build --preset linux_gcc_cuda_release"
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Install the onnxruntime-genai Python wheel and run python test
run: |
echo "Installing the onnxruntime-genai Python wheel and running the Python tests"
2 changes: 1 addition & 1 deletion .github/workflows/mac-cpu-arm64-build.yml
@@ -10,6 +10,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
jobs:
@@ -86,7 +87,6 @@ jobs:
- name: Run the python tests
run: |
source genai-macos-venv/bin/activate
export HF_TOKEN="12345"
export ORTGENAI_LOG_ORT_LIB=1
python3 -m pip install requests
python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models
5 changes: 1 addition & 4 deletions .github/workflows/win-cpu-x64-build.yml
@@ -11,6 +11,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
binaryDir: 'build/cpu/win-x64'
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
@@ -91,10 +92,6 @@ jobs:
python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Use Dummy HuggingFace Token
run: |
Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
5 changes: 1 addition & 4 deletions .github/workflows/win-cuda-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
cuda_dir: "${{ github.workspace }}\\cuda_sdk"
@@ -80,10 +81,6 @@ jobs:
python -m pip install -r test\python\cuda\ort\requirements.txt
python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Use Dummy HuggingFace Token
run: |
Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
6 changes: 6 additions & 0 deletions .pipelines/nuget-publishing.yml
@@ -48,6 +48,11 @@ parameters:
type: boolean
default: false

- name: enable_win_qnn
displayName: 'Whether QNN nuget package should be built.'
type: boolean
default: false

- name: ort_version
displayName: 'OnnxRuntime version'
type: string
@@ -122,6 +127,7 @@ stages:
ort_cuda_version: ${{ parameters.ort_cuda_version }}
ort_dml_version: ${{ parameters.ort_dml_version }}
build_config: ${{ parameters.build_config }}
enable_win_qnn: ${{ parameters.enable_win_qnn }}

- ${{ if eq(parameters.enable_post_packaging_validation, true) }}:
- template: stages/nuget-validation-stage.yml
24 changes: 21 additions & 3 deletions .pipelines/stages/jobs/nuget-packaging-job.yml
@@ -47,6 +47,11 @@ parameters:
type: boolean
default: false

- name: enable_win_qnn
displayName: 'Whether QNN nuget package should be built.'
type: boolean
default: false

- name: ort_version
type: string

@@ -55,7 +60,7 @@ parameters:
default: 'release'

jobs:
- job: nuget_${{ parameters.ep }}_packaging
- job: nuget_${{ parameters.ep }}_packaging_dep_qnn_${{ parameters.enable_win_qnn }}
pool: 'onnxruntime-Win-CPU-2022'
variables:
- name: ep
@@ -81,20 +86,24 @@ jobs:
value: 'onnxruntime-genai-${{ parameters.ep }}'

- name: genai_nuget_package_name
${{ if eq(parameters.ep, 'cpu') }}:
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, false)) }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI'
${{ if eq(parameters.ep, 'cuda') }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI.Cuda'
${{ if eq(parameters.ep, 'directml') }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI.DirectML'
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, true)) }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI.QNN'

- name: ort_nuget_package_name
${{ if eq(parameters.ep, 'cpu') }}:
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, false)) }}:
value: 'Microsoft.ML.OnnxRuntime'
${{ if eq(parameters.ep, 'cuda') }}:
value: 'Microsoft.ML.OnnxRuntime.Gpu'
${{ if eq(parameters.ep, 'directml') }}:
value: 'Microsoft.ML.OnnxRuntime.DirectML'
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, true)) }}:
value: 'Microsoft.ML.OnnxRuntime.QNN'

steps:
- ${{ if and(eq(parameters.enable_win_cpu, true), eq(parameters.ep, 'cpu')) }}:
@@ -181,6 +190,15 @@

- task: NuGetAuthenticate@1

- powershell: |
dotnet --info
dotnet workload install android
dotnet workload install ios
dotnet workload install maccatalyst
dotnet workload install macos
displayName: 'Install dependencies'
workingDirectory: '$(Build.Repository.LocalPath)\src\csharp'
- powershell: |
dotnet --info
dotnet build Microsoft.ML.OnnxRuntimeGenAI.csproj -p:Configuration="$(buildConfig)" -p:IncludeMobileTargets=true --verbosity normal
4 changes: 2 additions & 2 deletions .pipelines/stages/jobs/nuget-validation-job.yml
@@ -100,9 +100,9 @@ jobs:

- name: cuda_docker_image
${{ if eq(parameters.cuda_version, '11.8') }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
${{ else }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1

workspace:
clean: all
4 changes: 2 additions & 2 deletions .pipelines/stages/jobs/py-validation-job.yml
@@ -109,9 +109,9 @@ jobs:

- name: cuda_docker_image
${{ if eq(parameters.cuda_version, '11.8') }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
${{ else }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1

steps:
- checkout: self
3 changes: 3 additions & 0 deletions .pipelines/stages/jobs/steps/python-validation-step.yml
@@ -35,6 +35,9 @@ steps:
python -m pip install -r test/python/directml/torch/requirements.txt
python -m pip install -r test/python/directml/ort/requirements.txt
}
elseif ("$(arch)" -eq "arm64") {
python -m pip install onnxruntime-qnn
}
else {
python -m pip install -r test/python/cpu/torch/requirements.txt
python -m pip install -r test/python/cpu/ort/requirements.txt
18 changes: 18 additions & 0 deletions .pipelines/stages/nuget-packaging-stage.yml
@@ -44,6 +44,11 @@ parameters:
type: boolean
default: true

- name: enable_win_qnn
displayName: 'Whether QNN nuget package should be built.'
type: boolean
default: true

- name: ort_version
type: string
- name: ort_cuda_version
@@ -85,3 +90,16 @@ stages:
build_config: ${{ parameters.build_config }}
enable_win_dml: ${{ parameters.enable_win_dml }}
enable_win_arm64: ${{ parameters.enable_win_arm64 }}
- ${{ if eq(parameters.enable_win_qnn, true) }}:
- template: jobs/nuget-packaging-job.yml
parameters:
ep: 'cpu'
ort_version: ${{ parameters.ort_version }}
build_config: ${{ parameters.build_config }}
enable_linux_cpu: false
enable_win_cpu: false
enable_win_arm64: true
enable_macos_cpu: false
enable_android: false
enable_apple_framework: false
enable_win_qnn: true
2 changes: 0 additions & 2 deletions CMakeLists.txt
@@ -41,8 +41,6 @@ include(cmake/check_cuda.cmake)
include(cmake/check_rocm.cmake)
# Checking if DML is supported
include(cmake/check_dml.cmake)
# Checking if WebGpu is supported
include(cmake/check_webgpu.cmake)

include(cmake/cxx_standard.cmake)

32 changes: 28 additions & 4 deletions README.md
@@ -14,13 +14,13 @@ It implements the generative AI loop for ONNX models, including pre and post pro
See documentation at https://onnxruntime.ai/docs/genai.

|Support matrix|Supported now|Under development|On the roadmap|
|-|-|-|-|
|Model architectures| Gemma <br/> Llama * <br/> Mistral + <br/>Phi (language + vision)<br/>Qwen <br/>Nemotron <br/>|Whisper|Stable diffusion|
| -------------- | ------------- | ----------------- | -------------- |
| Model architectures | Gemma <br/> Llama * <br/> Mistral + <br/> Phi (language + vision) <br/> Qwen <br/> Nemotron <br/> Granite <br/> AMD OLMo | Whisper | Stable diffusion |
|API| Python <br/>C# <br/>C/C++ <br/> Java ^ |Objective-C||
|Platform| Linux <br/> Windows <br/> Mac ^ <br/> Android ^ | | iOS |
|Architecture| x86 <br/> x64 <br/> Arm64 ~ | | |
|Hardware Acceleration|CUDA<br/>DirectML<br/>|QNN <br/> OpenVINO <br/> ROCm ||
|Features|| Interactive decoding <br/> Customization (fine-tuning)| Speculative decoding |
|Features|MultiLoRA <br/> Continuous decoding (session continuation)^ | Constrained decoding | Speculative decoding |

\* The Llama model architecture supports similar model families such as CodeLlama, Vicuna, Yi, and more.

@@ -32,7 +32,7 @@ See documentation at https://onnxruntime.ai/docs/genai.

## Installation

See https://onnxruntime.ai/docs/genai/howto/install
See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install) or [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html)

## Sample code for Phi-3 in Python

@@ -143,6 +143,30 @@ See https://onnxruntime.ai/docs/genai/howto/install
del generator
```

### Choosing the Right Examples: Release vs. Main Branch

Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to match the examples to the version you are using. The majority of the steps remain the same; only the package installation and the model example file change.
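
Before following either path below, it can help to confirm exactly which release you have installed. Here is a minimal sketch using Python's standard `importlib.metadata`; the distribution names listed are assumptions, so adjust them to whichever ONNX Runtime GenAI package flavor you installed:

```python
# Print the installed ONNX Runtime GenAI version so you know which tag to check out.
from importlib.metadata import PackageNotFoundError, version

for dist in ("onnxruntime-genai", "onnxruntime-genai-cuda", "onnxruntime-genai-directml"):
    try:
        print(dist, version(dist))
    except PackageNotFoundError:
        pass  # this particular package flavor is not installed
```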

### Stable version
Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). For example, if you installed version 0.5.2 of ONNX Runtime GenAI, the instructions would look like this:

```bash
# Clone the repo
git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
# Checkout the branch for the version you are using
git checkout v0.5.2
cd examples
```

### Nightly version (Main Branch)
Build the package from source using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html), then navigate to the folder that contains the examples.

```bash
# Clone the repo
git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
cd examples
```

## Roadmap

See the [Discussions](https://github.com/microsoft/onnxruntime-genai/discussions) to request new features and up-vote existing requests.
21 changes: 21 additions & 0 deletions benchmark/python/benchmark_e2e.py
@@ -82,6 +82,14 @@ def generate_prompt(model, tokenizer, prompt_length, use_graph_capture) -> str:
generator.generate_next_token()
return tokenizer.decode(generator.get_sequence(0))

# Use prompt length to get pre-defined prompt
def get_prompt_by_length(prompt_length):
json_path = "prompts.json"
with open(json_path) as prompts_file:
content = prompts_file.read()
data = json.loads(content)
return data[f"{prompt_length}"]
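# Note: the lookup above expects prompts.json to map stringified prompt lengths to prompt text.
# Illustrative layout (hypothetical values, not the repository's actual prompt set):
#   {
#     "16": "Briefly describe the water cycle.",
#     "1024": "Write a detailed essay on the history of aviation ..."
#   }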

def get_target_pip_package_version(target_pip_package_name_list):
# get package name and version
import pkg_resources
@@ -231,6 +239,18 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
# use random tokens instead of generating a prompt using the model and then tokenizing it
tokens = np.random.randint(100, size=(batch_size, prompt_length))
prompt = [tokenizer.decode(tokens[0])] * batch_size
elif args.use_prompt_set:
prompt = [get_prompt_by_length(prompt_length)] * batch_size
tokens = tokenizer.encode_batch(prompt)

if tokens.shape[1] > max_length:
# Shorten the inputs from (batch_size, tokenized_length) to (batch_size, max_length)
tokens = tokens[:, :max_length]
elif tokens.shape[1] < max_length:
# Lengthen the inputs from (batch_size, tokenized_length) to (batch_size, max_length)
# by repeatedly prepending the first token of each sequence
tokens_first_col = tokens[:, :1]
for _ in range(max_length - tokens.shape[1]):
tokens = np.hstack((tokens_first_col, tokens))
else:
prompt = [generate_prompt(model, tokenizer, prompt_length, args.use_graph_capture)] * batch_size
tokens = tokenizer.encode_batch(prompt)
@@ -416,6 +436,7 @@ def str2strlist(value):
parser.add_argument('-mn', '--model_name', type=str, default='model_name', help='Model name defined by users')
parser.add_argument('-pr', '--precision', type=str, default='fp16', help='Model precision for metrics info')
parser.add_argument('--use_random_tokens', action='store_true', help='Use random tokens instead of generating a prompt')
parser.add_argument('--use_prompt_set', action='store_true', help='Use pre-generated prompt set instead of generating a prompt')
args = parser.parse_args()

# check max_lengths