Add DML tests (#1219)
We previously had no tests for the DML EP; this change adds a few.
  • Loading branch information
aciddelgado authored Feb 19, 2025
1 parent fe3604a commit 208926a
Showing 5 changed files with 178 additions and 4 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/win-directml-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
@@ -80,7 +81,7 @@ jobs:
- name: Configure CMake
run: |
cmake --preset windows_x64_directml_release -DTEST_PHI2=False
cmake --preset windows_x64_directml_release -DTEST_PHI2=True
- name: Build with CMake
run: |
@@ -93,6 +94,10 @@ jobs:
python -m pip install -r test\python\directml\ort\requirements.txt
python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
- name: Verify Build Artifacts
if: always()
continue-on-error: true
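The configure step now passes -DTEST_PHI2=True and the workflow exports HF_TOKEN, so the Phi-2 model can be downloaded and the Phi-2-gated tests below are actually compiled and run in CI. A minimal sketch of the gating pattern those tests rely on, assuming TEST_PHI2 reaches the compiler as a definition (the CMake wiring is not shown in this diff, and the test name is hypothetical):

// Sketch only: how a TEST_PHI2 compile definition gates a test in c_api_tests.cpp.
// Assumes CMake forwards -DTEST_PHI2 to the compiler as a definition.
#if TEST_PHI2
TEST(CAPITests, SomePhi2OnlyTest) {          // hypothetical test name
  auto model = OgaModel::Create(PHI2_PATH);  // needs the downloaded phi-2 model, hence HF_TOKEN
  // ... exercise the model ...
}
#endif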
73 changes: 70 additions & 3 deletions test/c_api_tests.cpp
@@ -19,6 +19,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
@@ -153,6 +155,7 @@ TEST(CAPITests, MaxLength) {
generator->AppendTokens(input_ids_0.data(), input_ids_0.size());
EXPECT_THROW(generator->AppendTokens(input_ids_1.data(), input_ids_1.size()), std::runtime_error);

#if !USE_DML
// Batch size 3 case
std::vector<int32_t> input_ids_2{1, 2, 3, 5, 8, 13, 21, 34, 55, 89,
0, 0, 0, 52, 104, 52, 53, 54, 55, 56,
@@ -163,10 +166,12 @@

generator = OgaGenerator::Create(*model, *params);
EXPECT_THROW(generator->AppendTokens(input_ids_2.data(), input_ids_2.size()), std::runtime_error);
#endif
}
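The batch-size-3 portion of MaxLength is compiled out for DML builds because the DML EP only supports batch size 1. An alternative, sketched below and not what this commit does, is a runtime skip, which keeps the case visible in test listings even on DML builds:

// Sketch only (not in the commit): split the batch case into its own test and
// skip it at runtime on DML builds instead of compiling it out entirely.
TEST(CAPITests, MaxLengthBatch3) {  // hypothetical test name
#if USE_DML
  GTEST_SKIP() << "DML EP does not support batch_size > 1";
#endif
  // ... batch-size-3 body from MaxLength ...
}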

// DML doesn't support batch_size > 1
TEST(CAPITests, EndToEndPhiBatch) {
#if TEST_PHI2
#if TEST_PHI2 && !USE_DML
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

@@ -196,6 +201,65 @@ TEST(CAPITests, EndToEndPhiBatch) {
auto out_string = tokenizer->Decode(generator->GetSequenceData(i), generator->GetSequenceCount(i));
std::cout << "Decoded string:" << out_string << std::endl;
}

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 50256, 50256, 50256, 50256, 50256, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278, 6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417, 6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974,
49, 1381, 389, 7427, 17252, 0, 50256, 50256, 50256, 50256, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256};

for (size_t i = 0; i < 3; i++) {
const auto sequence_length = generator->GetSequenceCount(i);
const auto* sequence_data = generator->GetSequenceData(i);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[i * 40];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
}
#endif
}

TEST(CAPITests, EndToEndPhi) {
#if TEST_PHI2
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

const char* input_strings[] = {
"This is a test."
};

auto input_sequences = OgaSequences::Create();
for (auto& string : input_strings)
tokenizer->Encode(string, *input_sequences);

auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 40);

auto generator = OgaGenerator::Create(*model, *params);
generator->AppendTokenSequences(*input_sequences);

while (!generator->IsDone()) {
generator->GenerateNextToken();
}

// Decode the generated sequence
auto out_string = tokenizer->Decode(generator->GetSequenceData(0), generator->GetSequenceCount(0));
std::cout << "Decoded string:" << out_string << std::endl;

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278,
6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417,
6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974, 82, 1039, 889, 263, 3684};

const auto sequence_length = generator->GetSequenceCount(0);
const auto* sequence_data = generator->GetSequenceData(0);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[0];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
#endif
}
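Both end-to-end tests compare raw token ids with std::memcmp, so a failure only reports that the buffers differ. A small helper, hypothetical and not part of this commit, could decode the expected ids with the same tokenizer so a mismatch can be read as text:

// Hypothetical helper for c_api_tests.cpp (not in the commit): decode the
// hard-coded expected ids using the same Decode(data, count) overload the
// tests already call, so an expected/actual mismatch can be printed as text.
static void PrintExpectedTokens(OgaTokenizer& tokenizer,
                                const std::vector<int32_t>& expected) {
  auto expected_string = tokenizer.Decode(expected.data(), expected.size());
  std::cout << "Expected string:" << expected_string << std::endl;
}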

@@ -448,7 +512,8 @@ TEST(CAPITests, SetTerminate) {
#endif
}

#if TEST_PHI2
// DML doesn't support batch_size > 1
#if TEST_PHI2 && !USE_DML

struct Phi2Test {
Phi2Test() {
@@ -526,12 +591,14 @@ TEST(CAPITests, TopKTopPCAPI) {
test.Run();
}

#endif // TEST_PHI2
#endif // TEST_PHI2 && !USE_DML

#if TEST_PHI2
TEST(CAPITests, AdaptersTest) {
#ifdef USE_CUDA
using OutputType = Ort::Float16_t;
#elif defined(USE_DML)
using OutputType = Ort::Float16_t;
#else
using OutputType = float;
#endif
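The new DML branch selects the same Ort::Float16_t output type as CUDA. A sketch of collapsing the two branches, assuming no DML-specific output type is planned:

// Sketch only (the commit keeps the branches separate, perhaps to leave room
// for EP-specific types later): CUDA and DML both use fp16 adapter outputs.
#if defined(USE_CUDA) || defined(USE_DML)
using OutputType = Ort::Float16_t;
#else
using OutputType = float;
#endif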
65 changes: 65 additions & 0 deletions test/model_tests.cpp
@@ -17,6 +17,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
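The PHI2_PATH selection above is now duplicated verbatim between c_api_tests.cpp and model_tests.cpp. A sketch of hoisting it into a shared test header; the file name is hypothetical and this is not part of the commit:

// test/phi2_path.h -- hypothetical shared header, not part of this commit.
// Keeps the EP-specific Phi-2 model path selection in one place.
#pragma once
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
#endif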
@@ -271,5 +273,68 @@ Print all primes between 1 and n
std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
#endif
}
#endif

#if USE_DML && TEST_PHI2
TEST(ModelTests, TestApiDml) {

auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}

TEST(ModelTests, TestTopKDml) {
auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.top_k = 3;
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}
#endif
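TestApiDml and TestTopKDml are identical apart from the top_k search option. A sketch of a shared, file-local helper for model_tests.cpp; the helper name and its optional parameter are illustrative only, while every API call mirrors the tests above:

// Sketch only (not in the commit): both DML tests could funnel through one
// routine that differs only in how the search options are tweaked.
#include <optional>  // would move to the top of model_tests.cpp

static void RunDmlPhi2Prompt(std::optional<int> top_k) {
  auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

  auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
  auto tokenizer = model->CreateTokenizer();
  auto tokens = tokenizer->Encode(prompt);

  auto params = Generators::CreateGeneratorParams(*model);
  params->search.batch_size = 1;
  params->search.max_length = 128;
  if (top_k)
    params->search.top_k = *top_k;

  auto generator = Generators::CreateGenerator(*model, *params);
  generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
  while (!generator->IsDone()) {
    generator->GenerateNextToken();
  }

  auto result = generator->GetSequence(0);
  std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}

// TestApiDml would then reduce to RunDmlPhi2Prompt(std::nullopt) and
// TestTopKDml to RunDmlPhi2Prompt(3).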
2 changes: 2 additions & 0 deletions test/python/test_onnxruntime_genai.py
@@ -87,6 +87,8 @@ def main():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu")
if og.is_cuda_available():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda")
if og.is_dml_available():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml")

# Run ONNX Runtime GenAI tests
run_onnxruntime_genai_api_tests(os.path.abspath(args.cwd), log, os.path.abspath(args.test_models))
35 changes: 35 additions & 0 deletions test/python/test_onnxruntime_genai_api.py
@@ -247,6 +247,9 @@ def test_tokenizer_stream(device, phi2_for):
)
@pytest.mark.parametrize("device", devices)
def test_batching(device, phi2_for):
if device == "dml":
pytest.skip("EP DML does not support batching")

model = og.Model(phi2_for(device))
tokenizer = og.Tokenizer(model)

@@ -259,6 +262,32 @@ def test_batching(device, phi2_for):
params = og.GeneratorParams(model)
params.set_search_options(max_length=20, batch_size=len(prompts)) # To run faster

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode_batch(prompts))
while not generator.is_done():
generator.generate_next_token()
for i in range(len(prompts)):
print(tokenizer.decode(generator.get_sequence(i)))


# TODO: CUDA pipelines use python3.6 and do not have a way to download models since downloading models
# requires pytorch and hf transformers. This test should be re-enabled once the pipeline is updated.
@pytest.mark.skipif(
sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8,
reason="Python 3.8 is required for downloading models.",
)
@pytest.mark.parametrize("device", devices)
def test_e2e(device, phi2_for):
model = og.Model(phi2_for(device))
tokenizer = og.Tokenizer(model)

prompts = [
"This is a test.",
]

params = og.GeneratorParams(model)
params.set_search_options(max_length=20, batch_size=len(prompts)) # To run faster

if device == "dml":
params.try_graph_capture_with_max_batch_size(len(prompts))

@@ -654,6 +683,9 @@ def _export_adapter(adapter, adapter_file_name):
adapter_paths.append(adapter_file_name)

return adapter_model_path, adapter_paths

if device == "dml":
pytest.skip("EP DML does not support adapters")

model_path, adapter_paths = _prepare_adapter_model(test_data_path)
model = og.Model(model_path)
@@ -728,6 +760,9 @@ def _prepare_model(test_data_path):

return extra_inputs_model_path, valid

if device == "dml":
pytest.skip("EP DML does not support preset extra inputs")

model_path, valid_model = _prepare_model(test_data_path)
model = og.Model(model_path)
tokenizer = og.Tokenizer(model)
