Merge branch 'main' into kvaishnavi/whisper
kunal-vaishnavi committed Feb 5, 2025
2 parents fbebe68 + 4d702e2 commit ef955e7
Showing 77 changed files with 1,435 additions and 558 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/linux-cpu-x64-build.yml
@@ -10,6 +10,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"
@@ -84,10 +85,6 @@ jobs:
python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
python3 -m pip install --user --no-index --no-deps --find-links build/cpu/wheel onnxruntime_genai
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Verify Build Artifacts
if: always()
continue-on-error: true
5 changes: 1 addition & 4 deletions .github/workflows/linux-cpu-x64-nightly-build.yml
@@ -12,6 +12,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ort_dir: "onnxruntime-linux-x64-1.18.0"
ort_zip: "onnxruntime-linux-x64-1.18.0.tgz"
ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.18.0/onnxruntime-linux-x64-1.18.0.tgz"
@@ -55,10 +56,6 @@ jobs:
python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl --no-deps
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Run the python tests
run: |
python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e
5 changes: 1 addition & 4 deletions .github/workflows/linux-gpu-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux
ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"
@@ -109,10 +110,6 @@ jobs:
bash -c " \
/usr/bin/cmake --build --preset linux_gcc_cuda_release"
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Install the onnxruntime-genai Python wheel and run python test
run: |
echo "Installing the onnxruntime-genai Python wheel and running the Python tests"
2 changes: 1 addition & 1 deletion .github/workflows/mac-cpu-arm64-build.yml
@@ -10,6 +10,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
jobs:
@@ -86,7 +87,6 @@ jobs:
- name: Run the python tests
run: |
source genai-macos-venv/bin/activate
export HF_TOKEN="12345"
export ORTGENAI_LOG_ORT_LIB=1
python3 -m pip install requests
python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models
5 changes: 1 addition & 4 deletions .github/workflows/win-cpu-x64-build.yml
@@ -11,6 +11,7 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
binaryDir: 'build/cpu/win-x64'
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
@@ -91,10 +92,6 @@ jobs:
python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Use Dummy HuggingFace Token
run: |
Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
5 changes: 1 addition & 4 deletions .github/workflows/win-cuda-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
cuda_dir: "${{ github.workspace }}\\cuda_sdk"
@@ -80,10 +81,6 @@ jobs:
python -m pip install -r test\python\cuda\ort\requirements.txt
python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Use Dummy HuggingFace Token
run: |
Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
6 changes: 6 additions & 0 deletions .pipelines/nuget-publishing.yml
@@ -48,6 +48,11 @@ parameters:
type: boolean
default: false

- name: enable_win_qnn
displayName: 'Whether QNN nuget package should be built.'
type: boolean
default: false

- name: ort_version
displayName: 'OnnxRuntime version'
type: string
@@ -122,6 +127,7 @@ stages:
ort_cuda_version: ${{ parameters.ort_cuda_version }}
ort_dml_version: ${{ parameters.ort_dml_version }}
build_config: ${{ parameters.build_config }}
enable_win_qnn: ${{ parameters.enable_win_qnn }}

- ${{ if eq(parameters.enable_post_packaging_validation, true) }}:
- template: stages/nuget-validation-stage.yml
24 changes: 21 additions & 3 deletions .pipelines/stages/jobs/nuget-packaging-job.yml
@@ -47,6 +47,11 @@ parameters:
type: boolean
default: false

- name: enable_win_qnn
displayName: 'Whether QNN nuget package should be built.'
type: boolean
default: false

- name: ort_version
type: string

@@ -55,7 +60,7 @@ parameters:
default: 'release'

jobs:
- job: nuget_${{ parameters.ep }}_packaging
- job: nuget_${{ parameters.ep }}_packaging_dep_qnn_${{ parameters.enable_win_qnn }}
pool: 'onnxruntime-Win-CPU-2022'
variables:
- name: ep
@@ -81,20 +86,24 @@ jobs:
value: 'onnxruntime-genai-${{ parameters.ep }}'

- name: genai_nuget_package_name
${{ if eq(parameters.ep, 'cpu') }}:
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, false)) }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI'
${{ if eq(parameters.ep, 'cuda') }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI.Cuda'
${{ if eq(parameters.ep, 'directml') }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI.DirectML'
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, true)) }}:
value: 'Microsoft.ML.OnnxRuntimeGenAI.QNN'

- name: ort_nuget_package_name
${{ if eq(parameters.ep, 'cpu') }}:
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, false)) }}:
value: 'Microsoft.ML.OnnxRuntime'
${{ if eq(parameters.ep, 'cuda') }}:
value: 'Microsoft.ML.OnnxRuntime.Gpu'
${{ if eq(parameters.ep, 'directml') }}:
value: 'Microsoft.ML.OnnxRuntime.DirectML'
${{ if and(eq(parameters.ep, 'cpu'), eq(parameters.enable_win_qnn, true)) }}:
value: 'Microsoft.ML.OnnxRuntime.QNN'

steps:
- ${{ if and(eq(parameters.enable_win_cpu, true), eq(parameters.ep, 'cpu')) }}:
@@ -181,6 +190,15 @@

- task: NuGetAuthenticate@1

- powershell: |
dotnet --info
dotnet workload install android
dotnet workload install ios
dotnet workload install maccatalyst
dotnet workload install macos
displayName: 'Install dependencies'
workingDirectory: '$(Build.Repository.LocalPath)\src\csharp'
- powershell: |
dotnet --info
dotnet build Microsoft.ML.OnnxRuntimeGenAI.csproj -p:Configuration="$(buildConfig)" -p:IncludeMobileTargets=true --verbosity normal
4 changes: 2 additions & 2 deletions .pipelines/stages/jobs/nuget-validation-job.yml
@@ -100,9 +100,9 @@ jobs:

- name: cuda_docker_image
${{ if eq(parameters.cuda_version, '11.8') }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
${{ else }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1

workspace:
clean: all
4 changes: 2 additions & 2 deletions .pipelines/stages/jobs/py-validation-job.yml
@@ -109,9 +109,9 @@ jobs:

- name: cuda_docker_image
${{ if eq(parameters.cuda_version, '11.8') }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
${{ else }}:
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1

steps:
- checkout: self
3 changes: 3 additions & 0 deletions .pipelines/stages/jobs/steps/python-validation-step.yml
@@ -35,6 +35,9 @@ steps:
python -m pip install -r test/python/directml/torch/requirements.txt
python -m pip install -r test/python/directml/ort/requirements.txt
}
elseif ("$(arch)" -eq "arm64") {
python -m pip install onnxruntime-qnn
}
else {
python -m pip install -r test/python/cpu/torch/requirements.txt
python -m pip install -r test/python/cpu/ort/requirements.txt
18 changes: 18 additions & 0 deletions .pipelines/stages/nuget-packaging-stage.yml
@@ -44,6 +44,11 @@ parameters:
type: boolean
default: true

- name: enable_win_qnn
displayName: 'Whether QNN nuget package should be built.'
type: boolean
default: true

- name: ort_version
type: string
- name: ort_cuda_version
@@ -85,3 +90,16 @@ stages:
build_config: ${{ parameters.build_config }}
enable_win_dml: ${{ parameters.enable_win_dml }}
enable_win_arm64: ${{ parameters.enable_win_arm64 }}
- ${{ if eq(parameters.enable_win_qnn, true) }}:
- template: jobs/nuget-packaging-job.yml
parameters:
ep: 'cpu'
ort_version: ${{ parameters.ort_version }}
build_config: ${{ parameters.build_config }}
enable_linux_cpu: false
enable_win_cpu: false
enable_win_arm64: true
enable_macos_cpu: false
enable_android: false
enable_apple_framework: false
enable_win_qnn: true
2 changes: 0 additions & 2 deletions CMakeLists.txt
@@ -41,8 +41,6 @@ include(cmake/check_cuda.cmake)
include(cmake/check_rocm.cmake)
# Checking if DML is supported
include(cmake/check_dml.cmake)
# Checking if WebGpu is supported
include(cmake/check_webgpu.cmake)

include(cmake/cxx_standard.cmake)

32 changes: 28 additions & 4 deletions README.md
@@ -14,13 +14,13 @@ It implements the generative AI loop for ONNX models, including pre and post pro
See documentation at https://onnxruntime.ai/docs/genai.

|Support matrix|Supported now|Under development|On the roadmap|
|-|-|-|-|
|Model architectures| Gemma <br/> Llama * <br/> Mistral + <br/>Phi (language + vision)<br/>Qwen <br/>Nemotron <br/>|Whisper|Stable diffusion|
| -------------- | ------------- | ----------------- | -------------- |
| Model architectures | Gemma <br/> Llama * <br/> Mistral + <br/> Phi (language + vision) <br/> Qwen <br/> Nemotron <br/> Granite <br/> AMD OLMo | Whisper | Stable diffusion |
|API| Python <br/>C# <br/>C/C++ <br/> Java ^ |Objective-C||
|Platform| Linux <br/> Windows <br/> Mac ^ <br/> Android ^ | | iOS |
|Architecture| x86 <br/> x64 <br/> Arm64 ~ | | |
|Hardware Acceleration|CUDA<br/>DirectML<br/>|QNN <br/> OpenVINO <br/> ROCm ||
|Features|| Interactive decoding <br/> Customization (fine-tuning)| Speculative decoding |
|Features|MultiLoRA <br/> Continuous decoding (session continuation)^ | Constrained decoding | Speculative decoding |

\* The Llama model architecture supports similar model families such as CodeLlama, Vicuna, Yi, and more.

@@ -32,7 +32,7 @@ See documentation at https://onnxruntime.ai/docs/genai.

## Installation

See https://onnxruntime.ai/docs/genai/howto/install
See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install) or [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html)

## Sample code for Phi-3 in Python

@@ -143,6 +143,30 @@ See https://onnxruntime.ai/docs/genai/howto/install
del generator
```

### Choosing the Right Examples: Release vs. Main Branch

Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to match the examples to the version you are using. The majority of the steps remain the same; only the package installation and the model example file change.
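
Before following either path below, it can help to confirm exactly which release you have installed. Here is a minimal sketch using Python's standard `importlib.metadata`; the distribution names listed are assumptions, so adjust them to whichever ONNX Runtime GenAI package flavor you installed:

```python
# Print the installed ONNX Runtime GenAI version so you know which tag to check out.
from importlib.metadata import PackageNotFoundError, version

for dist in ("onnxruntime-genai", "onnxruntime-genai-cuda", "onnxruntime-genai-directml"):
    try:
        print(dist, version(dist))
    except PackageNotFoundError:
        pass  # this particular package flavor is not installed
```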

### Stable version
Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). For example, if you installed version 0.5.2 of ONNX Runtime GenAI, the instructions would look like this:

```bash
# Clone the repo
git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
# Checkout the branch for the version you are using
git checkout v0.5.2
cd examples
```

### Nightly version (Main Branch)
Build the package from source using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html), then navigate to the folder that contains the examples.

```bash
# Clone the repo
git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
cd examples
```

## Roadmap

See the [Discussions](https://github.com/microsoft/onnxruntime-genai/discussions) to request new features and up-vote existing requests.
21 changes: 21 additions & 0 deletions benchmark/python/benchmark_e2e.py
@@ -82,6 +82,14 @@ def generate_prompt(model, tokenizer, prompt_length, use_graph_capture) -> str:
generator.generate_next_token()
return tokenizer.decode(generator.get_sequence(0))

# Use prompt length to get pre-defined prompt
def get_prompt_by_length(prompt_length):
json_path = "prompts.json"
with open(json_path) as prompts_file:
content = prompts_file.read()
data = json.loads(content)
return data[f"{prompt_length}"]
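# Note: the lookup above expects prompts.json to map stringified prompt lengths to prompt text.
# Illustrative layout (hypothetical values, not the repository's actual prompt set):
#   {
#     "16": "Briefly describe the water cycle.",
#     "1024": "Write a detailed essay on the history of aviation ..."
#   }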

def get_target_pip_package_version(target_pip_package_name_list):
# get package name and version
import pkg_resources
@@ -231,6 +239,18 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
# use random tokens instead of generating a prompt using the model and then tokenizing it
tokens = np.random.randint(100, size=(batch_size, prompt_length))
prompt = [tokenizer.decode(tokens[0])] * batch_size
elif args.use_prompt_set:
prompt = [get_prompt_by_length(prompt_length)] * batch_size
tokens = tokenizer.encode_batch(prompt)

if tokens.shape[1] > max_length:
# Shorten the inputs from (batch_size, tokenized_length) to (batch_size, max_length)
tokens = tokens[:, :max_length]
elif tokens.shape[1] < max_length:
# Lengthen the inputs from (batch_size, tokenized_length) to (batch_size, max_length)
# by repeatedly prepending the first token of each sequence
tokens_first_col = tokens[:, :1]
for _ in range(max_length - tokens.shape[1]):
tokens = np.hstack((tokens_first_col, tokens))
else:
prompt = [generate_prompt(model, tokenizer, prompt_length, args.use_graph_capture)] * batch_size
tokens = tokenizer.encode_batch(prompt)
@@ -416,6 +436,7 @@ def str2strlist(value):
parser.add_argument('-mn', '--model_name', type=str, default='model_name', help='Model name defined by users')
parser.add_argument('-pr', '--precision', type=str, default='fp16', help='Model precision for metrics info')
parser.add_argument('--use_random_tokens', action='store_true', help='Use random tokens instead of generating a prompt')
parser.add_argument('--use_prompt_set', action='store_true', help='Use pre-generated prompt set instead of generating a prompt')
args = parser.parse_args()

# check max_lengths