
Commit

Merge branch 'microsoft:main' into add_graph_optimization_level_session_option
benHeid authored Feb 21, 2025
2 parents 25ed3bd + dee4160 commit f292fe1
Showing 16 changed files with 248 additions and 68 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/linux-cpu-x64-build.yml
@@ -84,10 +84,6 @@ jobs:
python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
python3 -m pip install --user --no-index --no-deps --find-links build/cpu/wheel onnxruntime_genai
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Verify Build Artifacts
if: always()
continue-on-error: true
4 changes: 0 additions & 4 deletions .github/workflows/linux-cpu-x64-nightly-build.yml
@@ -55,10 +55,6 @@ jobs:
python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl --no-deps
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Run the python tests
run: |
python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e
4 changes: 0 additions & 4 deletions .github/workflows/linux-gpu-x64-build.yml
@@ -109,10 +109,6 @@ jobs:
bash -c " \
/usr/bin/cmake --build --preset linux_gcc_cuda_release"
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Install the onnxruntime-genai Python wheel and run python test
run: |
echo "Installing the onnxruntime-genai Python wheel and running the Python tests"
4 changes: 0 additions & 4 deletions .github/workflows/win-cpu-x64-build.yml
@@ -91,10 +91,6 @@ jobs:
python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Use Dummy HuggingFace Token
run: |
Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
4 changes: 0 additions & 4 deletions .github/workflows/win-cuda-x64-build.yml
@@ -80,10 +80,6 @@ jobs:
python -m pip install -r test\python\cuda\ort\requirements.txt
python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Use Dummy HuggingFace Token
run: |
Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
7 changes: 6 additions & 1 deletion .github/workflows/win-directml-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
@@ -80,7 +81,7 @@ jobs:
- name: Configure CMake
run: |
cmake --preset windows_x64_directml_release -DTEST_PHI2=False
cmake --preset windows_x64_directml_release -DTEST_PHI2=True
- name: Build with CMake
run: |
@@ -93,6 +94,10 @@
python -m pip install -r test\python\directml\ort\requirements.txt
python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
- name: Verify Build Artifacts
if: always()
continue-on-error: true
2 changes: 1 addition & 1 deletion VERSION_INFO
@@ -1 +1 @@
0.6.0-dev
0.7.0-dev
6 changes: 3 additions & 3 deletions examples/csharp/HelloPhi/HelloPhi.csproj
@@ -10,9 +10,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
</ItemGroup>

<ItemGroup>
6 changes: 3 additions & 3 deletions examples/csharp/HelloPhi3V/HelloPhi3V.csproj
@@ -9,9 +9,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
<PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
</ItemGroup>

</Project>
5 changes: 3 additions & 2 deletions src/python/py/models/builder.py
@@ -295,7 +295,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
# Quantization-specific variables (INT4, INT8, etc.)
self.quant_attrs = {
"int4": {
"accuracy_level": int(extra_options.get("int4_accuracy_level", 0)), # Default is 0 for non-QDQ formats, default is 4 for QDQ formats
"accuracy_level": int(extra_options.get("int4_accuracy_level", 4 if self.ep == "cpu" else 0)), # Default is 0 for non-QDQ formats, default is 4 for QDQ formats
"block_size": int(extra_options.get("int4_block_size", 32)),
"is_symmetric": extra_options.get("int4_is_symmetric", True),
"op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
@@ -3324,6 +3324,7 @@ def get_args():
3 is bf16.
2 is fp16.
1 is fp32.
Default is 4 for the CPU EP and 0 for non-CPU EPs.
int4_block_size = 16/32/64/128/256: Specify the block_size for int4 quantization.
int4_is_symmetric = Quantize the weights symmetrically. Default is true.
If true, quantization is done to int4. If false, quantization is done to uint4.
@@ -3354,7 +3355,7 @@ def get_args():
If enabled, all nodes being placed on the CUDA EP is the prerequisite for the CUDA graph to be used correctly.
It is not guaranteed that CUDA graph be enabled as it depends on the model and the graph structure.
use_8bits_moe = Use 8-bit quantization for MoE layers. Default is false.
If true, the QMoE op will use 4-bit quantization. If false, the QMoE op will use 8-bits quantization.
If true, the QMoE op will use 8-bit quantization. If false, the QMoE op will use 4-bit quantization.
use_qdq = Use the QDQ decomposition for ops.
Use this option when you want to use quantize-dequantize ops. For example, you will have a quantized MatMul op instead of the MatMulNBits op.
adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
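As a side note on the option documented above, the new per-EP default for int4_accuracy_level can be illustrated with a minimal, standalone Python sketch. The helper name resolve_int4_accuracy_level is hypothetical and only mirrors the one-line change to builder.py shown above; it is not part of this commit.

    # Hypothetical helper mirroring the changed default in builder.py:
    # accuracy level 4 on the CPU EP, 0 (unspecified) for other EPs,
    # unless int4_accuracy_level is passed explicitly via extra_options.
    def resolve_int4_accuracy_level(extra_options, ep):
        return int(extra_options.get("int4_accuracy_level", 4 if ep == "cpu" else 0))

    print(resolve_int4_accuracy_level({}, "cpu"))                            # 4
    print(resolve_int4_accuracy_level({}, "cuda"))                           # 0
    print(resolve_int4_accuracy_level({"int4_accuracy_level": "1"}, "cpu"))  # 1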
73 changes: 70 additions & 3 deletions test/c_api_tests.cpp
@@ -19,6 +19,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
@@ -153,6 +155,7 @@ TEST(CAPITests, MaxLength) {
generator->AppendTokens(input_ids_0.data(), input_ids_0.size());
EXPECT_THROW(generator->AppendTokens(input_ids_1.data(), input_ids_1.size()), std::runtime_error);

#if !USE_DML
// Batch size 3 case
std::vector<int32_t> input_ids_2{1, 2, 3, 5, 8, 13, 21, 34, 55, 89,
0, 0, 0, 52, 104, 52, 53, 54, 55, 56,
@@ -163,10 +166,12 @@

generator = OgaGenerator::Create(*model, *params);
EXPECT_THROW(generator->AppendTokens(input_ids_2.data(), input_ids_2.size()), std::runtime_error);
#endif
}

// DML doesn't support batch_size > 1
TEST(CAPITests, EndToEndPhiBatch) {
#if TEST_PHI2
#if TEST_PHI2 && !USE_DML
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

@@ -196,6 +201,65 @@ TEST(CAPITests, EndToEndPhiBatch) {
auto out_string = tokenizer->Decode(generator->GetSequenceData(i), generator->GetSequenceCount(i));
std::cout << "Decoded string:" << out_string << std::endl;
}

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 50256, 50256, 50256, 50256, 50256, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278, 6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417, 6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974,
49, 1381, 389, 7427, 17252, 0, 50256, 50256, 50256, 50256, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256};

for (size_t i = 0; i < 3; i++) {
const auto sequence_length = generator->GetSequenceCount(i);
const auto* sequence_data = generator->GetSequenceData(i);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[i * 40];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
}
#endif
}

TEST(CAPITests, EndToEndPhi) {
#if TEST_PHI2
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

const char* input_strings[] = {
"This is a test."
};

auto input_sequences = OgaSequences::Create();
for (auto& string : input_strings)
tokenizer->Encode(string, *input_sequences);

auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 40);

auto generator = OgaGenerator::Create(*model, *params);
generator->AppendTokenSequences(*input_sequences);

while (!generator->IsDone()) {
generator->GenerateNextToken();
}

// Decode The Batch
auto out_string = tokenizer->Decode(generator->GetSequenceData(0), generator->GetSequenceCount(0));
std::cout << "Decoded string:" << out_string << std::endl;

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278,
6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417,
6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974, 82, 1039, 889, 263, 3684};

const auto sequence_length = generator->GetSequenceCount(0);
const auto* sequence_data = generator->GetSequenceData(0);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[0];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
#endif
}

@@ -448,7 +512,8 @@ TEST(CAPITests, SetTerminate) {
#endif
}

#if TEST_PHI2
// DML Doesn't support batch_size > 1
#if TEST_PHI2 && !USE_DML

struct Phi2Test {
Phi2Test() {
@@ -526,12 +591,14 @@ TEST(CAPITests, TopKTopPCAPI) {
test.Run();
}

#endif // TEST_PHI2
#endif // TEST_PHI2 && !USE_DML

#if TEST_PHI2
TEST(CAPITests, AdaptersTest) {
#ifdef USE_CUDA
using OutputType = Ort::Float16_t;
#elif defined(USE_DML)
using OutputType = Ort::Float16_t;
#else
using OutputType = float;
#endif
65 changes: 65 additions & 0 deletions test/model_tests.cpp
@@ -17,6 +17,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
@@ -271,5 +273,68 @@ Print all primes between 1 and n
std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
#endif
}
#endif

#if USE_DML && TEST_PHI2
TEST(ModelTests, TestApiDml) {

auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}

TEST(ModelTests, TestTopKDml) {
auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.top_k = 3;
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}
#endif