Commit aafc01c

Merge branch 'main' into add-slm-engine-example

Avijit committed Feb 21, 2025
2 parents 361099c + 20f907e
Showing 31 changed files with 206 additions and 478 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/linux-cpu-x64-build.yml
@@ -84,10 +84,6 @@ jobs:
           python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
           python3 -m pip install --user --no-index --no-deps --find-links build/cpu/wheel onnxruntime_genai
-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
       - name: Verify Build Artifacts
         if: always()
         continue-on-error: true
4 changes: 0 additions & 4 deletions .github/workflows/linux-cpu-x64-nightly-build.yml
@@ -55,10 +55,6 @@ jobs:
           python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
           python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl --no-deps
-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
       - name: Run the python tests
         run: |
           python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e
4 changes: 0 additions & 4 deletions .github/workflows/linux-gpu-x64-build.yml
@@ -109,10 +109,6 @@ jobs:
bash -c " \
/usr/bin/cmake --build --preset linux_gcc_cuda_release"
- name: Use Dummy HuggingFace Token
run: |
echo "HF_TOKEN=12345" >> $GITHUB_ENV
- name: Install the onnxruntime-genai Python wheel and run python test
run: |
echo "Installing the onnxruntime-genai Python wheel and running the Python tests"
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/win-cpu-x64-build.yml
@@ -91,10 +91,6 @@ jobs:
           python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
           python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
4 changes: 0 additions & 4 deletions .github/workflows/win-cuda-x64-build.yml
@@ -80,10 +80,6 @@ jobs:
           python -m pip install -r test\python\cuda\ort\requirements.txt
           python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
2 changes: 1 addition & 1 deletion VERSION_INFO
@@ -1 +1 @@
-0.6.0-dev
+0.7.0-dev
7 changes: 0 additions & 7 deletions examples/c/README.md
@@ -11,13 +11,6 @@ git clone https://github.com/microsoft/onnxruntime-genai.git
 cd onnxruntime-genai/examples/c
 ```
 
-If they don't already exist, create folders called `include` and `lib`.
-
-```bash
-mkdir include
-mkdir lib
-```
-
 ## Phi-3.5 mini
 
 ### Download model
6 changes: 3 additions & 3 deletions examples/csharp/HelloPhi/HelloPhi.csproj
@@ -10,9 +10,9 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
   </ItemGroup>
 
   <ItemGroup>
128 changes: 92 additions & 36 deletions examples/csharp/HelloPhi/Program.cs
@@ -83,54 +83,109 @@ void PrintUsage()
if (interactive)
{
Console.WriteLine("Please enter option number:");
Console.WriteLine("1. Complete Output");
Console.WriteLine("2. Streaming Output");
Console.WriteLine("1. Complete Q&A");
Console.WriteLine("2. Streaming Q&A");
Console.WriteLine("3. Streaming Chat (not supported for DirectML and QNN currently)");
int.TryParse(Console.ReadLine(), out option);
}

do
int minLength = 50;
int maxLength = 500;

static string GetPrompt(bool interactive)
{
string prompt = "def is_prime(num):"; // Example prompt
if (interactive)
{
Console.WriteLine("Prompt:");
Console.WriteLine("Prompt: (Use quit() to exit)");
prompt = Console.ReadLine();
}
if (string.IsNullOrEmpty(prompt))
{
continue;
}
var sequences = tokenizer.Encode($"<|user|>{prompt}<|end|><|assistant|>");
return prompt;
}

using GeneratorParams generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("min_length", 50);
generatorParams.SetSearchOption("max_length", 200);
if (option == 1) // Complete Output
if (option == 1 || option == 2)
{
do
{
using var generator = new Generator(model, generatorParams);
generator.AppendTokenSequences(sequences);
var watch = System.Diagnostics.Stopwatch.StartNew();
while (!generator.IsDone())
string prompt = GetPrompt(interactive);
if (string.IsNullOrEmpty(prompt))
{
generator.GenerateNextToken();
continue;
}
if (string.Compare(prompt, "quit()", StringComparison.OrdinalIgnoreCase) == 0)
{
break;
}
var sequences = tokenizer.Encode($"<|user|>{prompt}<|end|><|assistant|>");

var outputSequence = generator.GetSequence(0);
var outputString = tokenizer.Decode(outputSequence);
watch.Stop();
var runTimeInSeconds = watch.Elapsed.TotalSeconds;
Console.WriteLine("Output:");
Console.WriteLine(outputString);
var totalTokens = outputSequence.Length;
Console.WriteLine($"Tokens: {totalTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
}
if (option == 1) // Complete Output
{
using GeneratorParams generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("min_length", minLength);
generatorParams.SetSearchOption("max_length", maxLength);
using var generator = new Generator(model, generatorParams);
generator.AppendTokenSequences(sequences);
var watch = System.Diagnostics.Stopwatch.StartNew();
while (!generator.IsDone())
{
generator.GenerateNextToken();
}

else if (option == 2) //Streaming Output
{
using var tokenizerStream = tokenizer.CreateStream();
using var generator = new Generator(model, generatorParams);
generator.AppendTokenSequences(sequences);
var outputSequence = generator.GetSequence(0);
var outputString = tokenizer.Decode(outputSequence);
watch.Stop();
var runTimeInSeconds = watch.Elapsed.TotalSeconds;
Console.WriteLine("Output:");
Console.WriteLine(outputString);
var totalTokens = outputSequence.Length;
Console.WriteLine($"Tokens: {totalTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
}

else if (option == 2) //Streaming Output
{
using GeneratorParams generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("min_length", minLength);
generatorParams.SetSearchOption("max_length", maxLength);
using var tokenizerStream = tokenizer.CreateStream();
using var generator = new Generator(model, generatorParams);
generator.AppendTokenSequences(sequences);
var watch = System.Diagnostics.Stopwatch.StartNew();
while (!generator.IsDone())
{
generator.GenerateNextToken();
Console.Write(tokenizerStream.Decode(generator.GetSequence(0)[^1]));
}
Console.WriteLine();
watch.Stop();
var runTimeInSeconds = watch.Elapsed.TotalSeconds;
var outputSequence = generator.GetSequence(0);
var totalTokens = outputSequence.Length;
Console.WriteLine($"Streaming Tokens: {totalTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
}
} while (interactive);
}

if (option == 3) // Streaming Chat
{
using GeneratorParams generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("min_length", minLength);
generatorParams.SetSearchOption("max_length", maxLength);
using var tokenizerStream = tokenizer.CreateStream();
using var generator = new Generator(model, generatorParams);
var prevTotalTokens = 0;
do{
string prompt = GetPrompt(interactive);
if (string.IsNullOrEmpty(prompt))
{
continue;
}
if (string.Compare(prompt, "quit()", StringComparison.OrdinalIgnoreCase) == 0)
{
break;
}
var sequences = tokenizer.Encode($"<|user|>{prompt}<|end|><|assistant|>");
var watch = System.Diagnostics.Stopwatch.StartNew();
generator.AppendTokenSequences(sequences);
while (!generator.IsDone())
{
generator.GenerateNextToken();
@@ -140,7 +195,8 @@ void PrintUsage()
watch.Stop();
var runTimeInSeconds = watch.Elapsed.TotalSeconds;
var outputSequence = generator.GetSequence(0);
var totalTokens = outputSequence.Length;
Console.WriteLine($"Streaming Tokens: {totalTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalTokens / runTimeInSeconds:0.00}");
}
} while (interactive);
var totalNewTokens = outputSequence.Length - prevTotalTokens;
prevTotalTokens = totalNewTokens;
Console.WriteLine($"Streaming Tokens: {totalNewTokens} Time: {runTimeInSeconds:0.00} Tokens per second: {totalNewTokens / runTimeInSeconds:0.00}");
} while (interactive);
}
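
For readers skimming the Program.cs diff above, the core pattern the updated example implements is a prompt → encode → generate → decode loop. Here is a minimal, self-contained sketch of that loop; the model path is hypothetical, and the tokenizer construction is assumed from the surrounding example rather than shown in this hunk:

```csharp
using System;
using Microsoft.ML.OnnxRuntimeGenAI;

using var model = new Model("models/phi-3");   // hypothetical model folder
using var tokenizer = new Tokenizer(model);    // assumed; constructed earlier in the real example
var sequences = tokenizer.Encode("<|user|>What is an ONNX model?<|end|><|assistant|>");

using var generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("min_length", 50);
generatorParams.SetSearchOption("max_length", 500);

using var tokenizerStream = tokenizer.CreateStream();
using var generator = new Generator(model, generatorParams);
generator.AppendTokenSequences(sequences);
while (!generator.IsDone())
{
    generator.GenerateNextToken();
    // Decode and print only the newest token, as the streaming options above do.
    Console.Write(tokenizerStream.Decode(generator.GetSequence(0)[^1]));
}
Console.WriteLine();
```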
2 changes: 1 addition & 1 deletion examples/csharp/HelloPhi/README.md
@@ -5,7 +5,7 @@
You can download a published model from Hugging Face. For example, this is Phi-3.5 mini optimized for CPU and mobile. You can find other models here:

 ```script
-huggingface-cli download microsoft/Phi-3.5-mini-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir models
+huggingface-cli download microsoft/Phi-3.5-mini-instruct-onnx --include cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4/* --local-dir models
 move models\cpu_and_mobile\cpu-int4-rtn-block-32-acc-level-4 models\phi-3
 ```
6 changes: 3 additions & 3 deletions examples/csharp/HelloPhi3V/HelloPhi3V.csproj
@@ -9,9 +9,9 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0-dev" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
   </ItemGroup>
 
 </Project>
31 changes: 13 additions & 18 deletions src/csharp/Adapters.cs
@@ -12,39 +12,34 @@ namespace Microsoft.ML.OnnxRuntimeGenAI
public class Adapters : SafeHandle
{
/// <summary>
/// Constructs an Adapters object with the given model.
/// Creates a container for adapters
/// used to load, unload and hold them.
/// Throws on error.
/// </summary>
/// <param name="model">Reference to a loaded model</param>
/// <exception cref="OnnxRuntimeGenAIException">
/// Thrown when the call to the GenAI native API fails.
/// </exception>
/// <returns>new Adapters object</returns>
public Adapters(Model model) : base(IntPtr.Zero, true)
{
Result.VerifySuccess(NativeMethods.OgaCreateAdapters(model.Handle, out handle));
}

/// <summary>
/// Loads the model adapter from the given adapter file path and adapter name.
/// Method that loads adapter data and assigns it a name that
/// it can be referred to. Throws on error.
/// </summary>
/// <param name="adapterPath">The path of the adapter.</param>
/// <param name="adapterName">A unique user supplied adapter identifier.</param>
/// <exception cref="OnnxRuntimeGenAIException">
/// Thrown when the call to the GenAI native API fails.
/// </exception>
public void LoadAdapter(string adapterFilePath, string adapterName)
/// <param name="adapterPath">file path to load</param>
/// <param name="adapterName">adapter name</param>
public void LoadAdapter(string adapterPath, string adapterName)
{
Result.VerifySuccess(NativeMethods.OgaLoadAdapter(handle,
StringUtils.ToUtf8(adapterFilePath), StringUtils.ToUtf8(adapterName)));
StringUtils.ToUtf8(adapterPath), StringUtils.ToUtf8(adapterName)));
}

/// <summary>
/// Unloads the adapter with the given identifier from the previously loaded adapters. If the
/// adapter is not found, or if it cannot be unloaded (when it is in use), an error is returned.
/// Unload the adapter that was loaded by the LoadAdapter method.
/// Throws on error.
/// </summary>
/// <param name="adapterName"></param>
/// <exception cref="OnnxRuntimeGenAIException">
/// Thrown when the call to the GenAI native API fails.
/// </exception>
public void UnloadAdapter(string adapterName)
{
Result.VerifySuccess(NativeMethods.OgaUnloadAdapter(handle, StringUtils.ToUtf8(adapterName)));
@@ -53,7 +48,7 @@ public void UnloadAdapter(string adapterName)
internal IntPtr Handle { get { return handle; } }

/// <summary>
/// Implement SafeHandle override.
/// Implement SafeHandle override
/// </summary>
public override bool IsInvalid => handle == IntPtr.Zero;

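
As a usage note for the Adapters API above, here is a minimal sketch of the load → activate → unload flow. The model and adapter paths are hypothetical, and the Generator.SetActiveAdapter call is an assumption based on the current C# bindings; it does not appear in this diff:

```csharp
using Microsoft.ML.OnnxRuntimeGenAI;

using var model = new Model("models/phi-3");                      // hypothetical path
using var adapters = new Adapters(model);                         // container that loads, holds, and unloads adapters
adapters.LoadAdapter("adapters/travel.onnx_adapter", "travel");   // hypothetical file; the name is caller-chosen

using var generatorParams = new GeneratorParams(model);
{
    using var generator = new Generator(model, generatorParams);
    generator.SetActiveAdapter(adapters, "travel");               // refer to the adapter by its load-time name
    // ... append token sequences and generate as usual ...
}

// Unload only after the generators using the adapter are disposed;
// unloading an adapter that is still in use raises an error.
adapters.UnloadAdapter("travel");
```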
44 changes: 4 additions & 40 deletions src/csharp/Config.cs
@@ -5,65 +5,29 @@

namespace Microsoft.ML.OnnxRuntimeGenAI
{
/// <summary>
/// Use Config to set the ORT execution providers (EPs) and their options. The EPs are applied based on
/// insertion order.
/// </summary>
public class Config : IDisposable
{
private IntPtr _configHandle;
private bool _disposed = false;

/// <summary>
/// Creates a Config from the given configuration directory.
/// </summary>
/// <param name="modelPath">The path to the configuration directory.</param>
/// <exception cref="OnnxRuntimeGenAIException">
/// Thrown when the call to the GenAI native API fails.
/// </exception>
public Config(string modelPath)
{
Result.VerifySuccess(NativeMethods.OgaCreateConfig(StringUtils.ToUtf8(modelPath), out _configHandle));
}

internal IntPtr Handle { get { return _configHandle; } }

/// <summary>
/// Clear the list of providers in the config.
/// </summary>
/// <exception cref="OnnxRuntimeGenAIException">
/// Thrown when the call to the GenAI native API fails.
/// </exception>
public void ClearProviders()
{
Result.VerifySuccess(NativeMethods.OgaConfigClearProviders(_configHandle));
}

/// <summary>
/// Add the provider at the end of the list of providers in the given config if it doesn't already
/// exist. If it already exists, does nothing.
/// </summary>
/// <param name="providerName">Name of the provider</param>
/// <exception cref="OnnxRuntimeGenAIException">
/// Thrown when the call to the GenAI native API fails.
/// </exception>
public void AppendProvider(string providerName)
public void AppendProvider(string provider)
{
Result.VerifySuccess(NativeMethods.OgaConfigAppendProvider(_configHandle, StringUtils.ToUtf8(providerName)));
Result.VerifySuccess(NativeMethods.OgaConfigAppendProvider(_configHandle, StringUtils.ToUtf8(provider)));
}

/// <summary>
/// Set a provider option.
/// </summary>
/// <param name="providerName">Name of the provider</param>
/// <param name="optionKey">Name of the option</param>
/// <param name="optionValue">Value of the option</param>
/// <exception cref="OnnxRuntimeGenAIException">
/// Thrown when the call to the GenAI native API fails.
/// </exception>
public void SetProviderOption(string providerName, string optionKey, string optionValue)
public void SetProviderOption(string provider, string option, string value)
{
Result.VerifySuccess(NativeMethods.OgaConfigSetProviderOption(_configHandle, StringUtils.ToUtf8(providerName), StringUtils.ToUtf8(optionKey), StringUtils.ToUtf8(optionValue)));
Result.VerifySuccess(NativeMethods.OgaConfigSetProviderOption(_configHandle, StringUtils.ToUtf8(provider), StringUtils.ToUtf8(option), StringUtils.ToUtf8(value)));
}

~Config()
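
A minimal sketch of how the Config API above is wired into model creation. The model path, the "cuda" provider name, its "device_id" option, and the Model(Config) constructor are assumptions based on the current bindings, not part of this diff:

```csharp
using Microsoft.ML.OnnxRuntimeGenAI;

// Point the config at the folder that holds the model's genai_config.json (hypothetical path).
using var config = new Config("models/phi-3");

// Providers are applied in insertion order, so start from a clean list.
config.ClearProviders();
config.AppendProvider("cuda");                       // assumed provider name
config.SetProviderOption("cuda", "device_id", "0");  // assumed option key/value

using var model = new Model(config);                 // assumed Model-from-Config constructor
```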
… (diff truncated: the remaining 18 of 31 changed files are not shown)
