Remove onnxruntime-genai-static library (non trivial change) #1264

Open · wants to merge 17 commits into base: main
26 changes: 5 additions & 21 deletions CMakeLists.txt
@@ -87,24 +87,19 @@ if(WIN32)
target_compile_definitions(onnxruntime-genai PRIVATE VERSION_PATCH=${VERSION_PATCH})
target_compile_definitions(onnxruntime-genai PRIVATE VERSION_SUFFIX=${VERSION_SUFFIX})
target_compile_definitions(onnxruntime-genai PRIVATE FILE_NAME=\"onnxruntime-genai.dll\")

add_library(onnxruntime-genai-static STATIC ${generator_srcs})
else()
add_library(onnxruntime-genai SHARED ${generator_srcs})
add_library(onnxruntime-genai-static STATIC ${generator_srcs})
endif()

target_include_directories(onnxruntime-genai PRIVATE ${ORT_HEADER_DIR})
target_include_directories(onnxruntime-genai-static PRIVATE ${ORT_HEADER_DIR})
target_include_directories(onnxruntime-genai PRIVATE ${onnxruntime_extensions_SOURCE_DIR}/include)
target_include_directories(onnxruntime-genai PRIVATE ${onnxruntime_extensions_SOURCE_DIR}/shared/api/)
target_include_directories(onnxruntime-genai-static PRIVATE ${onnxruntime_extensions_SOURCE_DIR}/include)
target_include_directories(onnxruntime-genai-static PUBLIC ${onnxruntime_extensions_SOURCE_DIR}/shared/api/)
target_link_libraries(onnxruntime-genai PRIVATE onnxruntime_extensions)
target_link_libraries(onnxruntime-genai-static PUBLIC onnxruntime_extensions)
target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR})
target_link_libraries(onnxruntime-genai PRIVATE Threads::Threads)
target_link_libraries(onnxruntime-genai-static PUBLIC Threads::Threads)

# The genai library itself is always embedded in the shared library
list(APPEND ortgenai_embed_libs "$<TARGET_FILE:onnxruntime-genai>")

# we keep the shared libraries disconnected on Android as they will come from separate AARs and we don't want to force
# the ORT version to match in both.
@@ -146,30 +141,20 @@ if(CMAKE_GENERATOR_TOOLSET MATCHES "Visual Studio")
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set_target_properties(onnxruntime-genai-static PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(onnxruntime-genai-static PRIVATE dl) # For dlopen & co
set_target_properties(onnxruntime-genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(onnxruntime-genai PRIVATE dl) # For dlopen & co
endif()


if(USE_DML)
list(APPEND ortgenai_embed_libs "${D3D12_LIB_DIR}/D3D12Core.dll")
target_include_directories(onnxruntime-genai PRIVATE $<TARGET_PROPERTY:${WIL_TARGET},INTERFACE_INCLUDE_DIRECTORIES>)
target_include_directories(onnxruntime-genai PRIVATE $<TARGET_PROPERTY:${DIRECTX_HEADERS_TARGET},INTERFACE_INCLUDE_DIRECTORIES>/directx)
target_include_directories(onnxruntime-genai PRIVATE $<TARGET_PROPERTY:${DIRECTX_HEADERS_TARGET},INTERFACE_INCLUDE_DIRECTORIES>)
target_include_directories(onnxruntime-genai-static PUBLIC $<TARGET_PROPERTY:${WIL_TARGET},INTERFACE_INCLUDE_DIRECTORIES>)
target_include_directories(onnxruntime-genai-static PUBLIC $<TARGET_PROPERTY:${DIRECTX_HEADERS_TARGET},INTERFACE_INCLUDE_DIRECTORIES>/directx)
target_include_directories(onnxruntime-genai-static PUBLIC $<TARGET_PROPERTY:${DIRECTX_HEADERS_TARGET},INTERFACE_INCLUDE_DIRECTORIES>)
target_include_directories(onnxruntime-genai PRIVATE ${DML_HEADER_DIR})
target_include_directories(onnxruntime-genai-static PRIVATE ${DML_HEADER_DIR})
target_include_directories(onnxruntime-genai PRIVATE ${D3D12_HEADER_DIR})
target_include_directories(onnxruntime-genai-static PRIVATE ${D3D12_HEADER_DIR})
target_link_directories(onnxruntime-genai PRIVATE ${DML_LIB_DIR})
target_link_directories(onnxruntime-genai PRIVATE ${D3D12_LIB_DIR})
target_link_directories(onnxruntime-genai-static PUBLIC ${DML_LIB_DIR})
target_link_directories(onnxruntime-genai-static PUBLIC ${D3D12_LIB_DIR})
target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib dxcore.lib dxguid.lib dxgi.lib)
target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib dxcore.lib dxguid.lib dxgi.lib)
target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib dxcore.lib dxguid.lib dxgi.lib)

get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps ABSOLUTE)
set(DXC_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.Direct3D.DXC.1.7.2308.12)
@@ -194,7 +179,6 @@ if(USE_DML)

add_dependencies(RESTORE_PACKAGES nuget)
add_dependencies(onnxruntime-genai RESTORE_PACKAGES)
add_dependencies(onnxruntime-genai-static RESTORE_PACKAGES)
endif()

if(ANDROID)
2 changes: 1 addition & 1 deletion benchmark/c/CMakeLists.txt
@@ -26,7 +26,7 @@ target_include_directories(model_benchmark PRIVATE
${CMAKE_SOURCE_DIR}/src # directory containing the ort_genai headers
)

target_link_libraries(model_benchmark PRIVATE onnxruntime-genai-static ${ONNXRUNTIME_LIB})
target_link_libraries(model_benchmark PRIVATE onnxruntime-genai ${ONNXRUNTIME_LIB})

target_link_directories(model_benchmark PRIVATE ${ORT_LIB_DIR})

18 changes: 16 additions & 2 deletions src/config.cpp
@@ -610,11 +610,19 @@ struct Search_Element : JSON::Element {
};

void SetSearchNumber(Config::Search& search, std::string_view name, double value) {
Search_Element(search).OnValue(name, value);
try {
Search_Element(search).OnValue(name, value);
} catch (...) {
JSON::TranslateException(name);
}
}

void SetSearchBool(Config::Search& search, std::string_view name, bool value) {
Search_Element(search).OnValue(name, value);
try {
Search_Element(search).OnValue(name, value);
} catch (...) {
JSON::TranslateException(name);
}
}

void ClearProviders(Config& config) {
@@ -712,6 +720,12 @@ void ParseConfig(const fs::path& filename, std::string_view json_overlay, Config
}
}

void OverlayConfig(Config& config, std::string_view json) {
Root_Element root{config};
RootObject_Element element{root};
JSON::Parse(element, json);
}

Config::Config(const fs::path& path, std::string_view json_overlay) : config_path{path} {
ParseConfig(path / "genai_config.json", json_overlay, *this);

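Note for reviewers: a minimal sketch of how the translated search-option errors now surface to callers. It assumes the Generators headers from this repo are on the include path; the option name below is deliberately invalid and purely illustrative.

#include <iostream>
#include <stdexcept>
#include "config.h"  // declares Generators::SetSearchNumber (see src/config.h above)

// Sketch: an unknown option no longer escapes as a bare unknown_value_error;
// JSON::TranslateException converts it into a std::runtime_error carrying the name.
void TrySetOption(Generators::Config::Search& search) {
  try {
    Generators::SetSearchNumber(search, "not_a_real_option", 1.0);  // hypothetical name
  } catch (const std::runtime_error& e) {
    std::cerr << e.what() << '\n';  // e.g. ' Unknown value "not_a_real_option"'
  }
}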
1 change: 1 addition & 0 deletions src/config.h
@@ -188,6 +188,7 @@ void SetSearchNumber(Config::Search& search, std::string_view name, double value
void SetSearchBool(Config::Search& search, std::string_view name, bool value);
void ClearProviders(Config& config);
void SetProviderOption(Config& config, std::string_view provider_name, std::string_view option_name, std::string_view option_value);
void OverlayConfig(Config& config, std::string_view json);
bool IsCudaGraphEnabled(Config::SessionOptions& session_options);

} // namespace Generators
78 changes: 57 additions & 21 deletions src/generators.cpp
@@ -128,37 +128,73 @@ struct GenaiInterfaceImpl : GenaiInterface {
void Sequences_RewindTo(Sequences* p_this, size_t new_length) override { return p_this->RewindTo(new_length); }
} g_genai;

DeviceInterface* GetCudaInterface() {
// Load the shared library onnxruntime-genai-cuda.dll
// This is a workaround to avoid linking the CUDA library to the generator library
// The CUDA library is only needed for the CUDA allocator
#if defined(_WIN32)
static std::unique_ptr<void, void (*)(void*)> cuda_library{LoadLibrary((CurrentModulePath() + "onnxruntime-genai-cuda.dll").c_str()),
[](void* h) { FreeLibrary(reinterpret_cast<HMODULE>(h)); }};
struct LibraryHandle {
LibraryHandle(const char* filename) {
auto path = CurrentModulePath() + filename;
handle_ = LoadLibrary(path.c_str());
if (!handle_)
throw std::runtime_error(std::string("Failed to load library: ") + path + " Error: " + std::to_string(GetLastError()));
};

~LibraryHandle() { FreeLibrary(handle_); }

FARPROC __stdcall GetSymbol(const char* name) { return ::GetProcAddress(handle_, name); }

operator HANDLE() { return handle_; }

private:
HMODULE handle_{};
};
#elif defined(__linux__) && !defined(__ANDROID__)
static std::unique_ptr<void, void (*)(void*)> cuda_library{dlopen((Ort::GetCurrentModuleDir() + "/libonnxruntime-genai-cuda.so").c_str(), RTLD_NOW | RTLD_DEEPBIND),
[](void* h) { dlclose(h); }};
struct LibraryHandle {
LibraryHandle(const char* filename) {
auto path = Ort::GetCurrentModuleDir() + "/" + filename;
handle_ = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
if (!handle_)
throw std::runtime_error(std::string("Failed to load library: ") + dlerror()); // dlerror() includes the path
}
~LibraryHandle() {
dlclose(handle_);
}

void* GetSymbol(const char* name) { return ::dlsym(handle_, name); }

operator void*() { return handle_; }

private:
void* handle_{};
};
#else
static std::unique_ptr<void, void (*)(void*)> cuda_library{nullptr, [](void* h) {}};
#endif
struct LibraryHandle {
LibraryHandle(const char* filename) {}
~LibraryHandle() {}

if (!cuda_library) {
throw std::runtime_error("Cuda interface not available.");
}
void* GetSymbol(const char* name) { return nullptr; }

operator bool() { return false; }
};
#endif

Generators::DeviceInterface* GetInterface(GenaiInterface * p_genai);
static DeviceInterface* cuda_interface{[] {
DeviceInterface* GetCudaInterface() {
try {
#if defined(_WIN32)
auto get_cuda_fn = reinterpret_cast<decltype(&GetInterface)>(GetProcAddress(reinterpret_cast<HMODULE>(cuda_library.get()), "GetInterface"));
static LibraryHandle library{"onnxruntime-genai-cuda.dll"};
#elif defined(__linux__) && !defined(__ANDROID__)
auto get_cuda_fn = reinterpret_cast<decltype(&GetInterface)>(dlsym(cuda_library.get(), "GetInterface"));
static LibraryHandle library{"libonnxruntime-genai-cuda.so"};
#else
auto get_cuda_fn = [](GenaiInterface*) { return nullptr; };
static LibraryHandle library{""};
#endif
return get_cuda_fn(&g_genai);
}()};
if (!library)
throw std::runtime_error("Shared library load failure (see first error)");

return cuda_interface;
Generators::DeviceInterface* GetInterface(GenaiInterface * p_genai);
static DeviceInterface* cuda_interface = reinterpret_cast<decltype(&GetInterface)>(library.GetSymbol("GetInterface"))(&g_genai);

return cuda_interface;
} catch (const std::exception& e) {
throw std::runtime_error("Cuda interface not available: " + std::string(e.what()));
}
}

std::string to_string(DeviceType device_type) {
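For context: the new code replaces the static unique_ptr plus manual LoadLibrary/dlopen dance with a small RAII wrapper. Reduced to a standalone POSIX-only sketch (class and variable names here are illustrative, not part of this PR):

#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Minimal RAII wrapper mirroring the LibraryHandle pattern above:
// load on construction, throw with dlerror() detail on failure, unload on destruction.
class ScopedLibrary {
 public:
  explicit ScopedLibrary(const std::string& path)
      : handle_{dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL)} {
    if (!handle_)
      throw std::runtime_error(std::string("Failed to load library: ") + dlerror());
  }
  ~ScopedLibrary() { dlclose(handle_); }
  ScopedLibrary(const ScopedLibrary&) = delete;
  ScopedLibrary& operator=(const ScopedLibrary&) = delete;

  void* GetSymbol(const char* name) const { return dlsym(handle_, name); }

 private:
  void* handle_{};
};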
18 changes: 14 additions & 4 deletions src/json.cpp
@@ -35,6 +35,18 @@ void Parse(Element& element, std::string_view document) {
JSON{element, document};
}

void TranslateException(std::string_view name) {
try {
throw;
} catch (const unknown_value_error&) {
throw std::runtime_error(" Unknown value \"" + std::string(name) + "\"");
} catch (const type_mismatch& e) {
throw std::runtime_error(std::string(name) + " - Expected a " + std::string(value_names[e.expected]) + " but saw a " + std::string(value_names[e.seen]));
} catch (...) {
throw;
}
}

JSON::JSON(Element& element, std::string_view document) : begin_{document.data()}, end_{document.data() + document.size()} {
try {
Parse_Value(element, {});
@@ -167,14 +179,12 @@ void JSON::Parse_Value(Element& element, std::string_view name) {
throw unknown_value_error{};
break;
}
} catch (const unknown_value_error&) {
throw std::runtime_error(" Unknown value \"" + std::string(name) + "\"");
} catch (const type_mismatch& e) {
throw std::runtime_error(std::string(name) + " - Expected a " + std::string(value_names[e.expected]) + " but saw a " + std::string(value_names[e.seen]));
} catch (const std::runtime_error& e) {
if (!name.empty())
throw std::runtime_error(std::string(name) + ":" + e.what());
throw;
} catch (...) {
TranslateException(name);
}

Parse_Whitespace();
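TranslateException depends on being called from inside a catch handler: the bare `throw;` re-raises the in-flight exception so it can be matched against the JSON error types. A self-contained illustration of that idiom (the exception type and option name are placeholders, not the ones in json.cpp):

#include <iostream>
#include <stdexcept>
#include <string>

struct low_level_error {};  // stand-in for unknown_value_error / type_mismatch

// Must only be called from within a catch handler: rethrows the active
// exception and converts known low-level types into a readable runtime_error.
void Translate(const std::string& name) {
  try {
    throw;  // re-raise the exception currently being handled
  } catch (const low_level_error&) {
    throw std::runtime_error("Unknown value \"" + name + "\"");
  }
  // any other exception type propagates unchanged
}

int main() {
  try {
    try {
      throw low_level_error{};
    } catch (...) {
      Translate("top_k");  // placeholder option name
    }
  } catch (const std::runtime_error& e) {
    std::cout << e.what() << '\n';  // prints: Unknown value "top_k"
  }
}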
1 change: 1 addition & 0 deletions src/json.h
@@ -35,4 +35,5 @@ struct Element {
};

void Parse(Element& element, std::string_view document);
void TranslateException(std::string_view name); // Translate JSON exceptions into std::runtime_error with a useful message
} // namespace JSON
5 changes: 4 additions & 1 deletion src/models/audio_processor.cpp
@@ -38,7 +38,10 @@ std::unique_ptr<OrtValue> ProcessMel(ort_extensions::OrtxObjectPtr<OrtxTensor>&

} // namespace

std::unique_ptr<Audios> LoadAudios(const std::span<const char* const>& audio_paths) {
std::unique_ptr<Audios> LoadAudios(std::span<const char* const> audio_paths) {
if (audio_paths.empty())
throw std::runtime_error("No audios provided");

for (const char* audio_path : audio_paths) {
if (!fs::path(audio_path).exists()) {
throw std::runtime_error("Audio path does not exist: " + std::string(audio_path));
2 changes: 1 addition & 1 deletion src/models/audio_processor.h
@@ -19,7 +19,7 @@ struct Audios {
size_t num_audios_{};
};

std::unique_ptr<Audios> LoadAudios(const std::span<const char* const>& audio_paths);
std::unique_ptr<Audios> LoadAudios(std::span<const char* const> audio_paths);

struct AudioProcessor {
AudioProcessor(Config& config, const SessionInfo& session_info);
18 changes: 18 additions & 0 deletions src/models/model.cpp
@@ -208,6 +208,24 @@ std::vector<int32_t> Tokenizer::EncodeBatch(std::span<const std::string> strings
return PadInputs(span_sequences, pad_token_id_);
}

std::shared_ptr<Tensor> Tokenizer::EncodeBatch(std::span<const char*> strings) const {
std::vector<std::vector<int32_t>> sequences;
std::vector<std::span<const int32_t>> span_sequences;
for (size_t i = 0; i < strings.size(); i++) {
sequences.emplace_back(Encode(strings[i]));
span_sequences.emplace_back(sequences.back());
}

auto encoded = PadInputs(span_sequences, pad_token_id_); // TODO: Pad directly into tensor vs copying?

auto tensor = std::make_shared<Tensor>();
auto shape = std::array<int64_t, 2>{static_cast<int64_t>(strings.size()), static_cast<int64_t>(encoded.size() / strings.size())};
tensor->ort_tensor_ = OrtValue::CreateTensor<int32_t>(Ort::Allocator::GetWithDefaultOptions(), shape);
std::copy(encoded.begin(), encoded.end(), tensor->ort_tensor_->GetTensorMutableData<int32_t>());

return tensor;
}

std::vector<std::string> Tokenizer::DecodeBatch(std::span<const int32_t> sequences, size_t count) const {
if (sequences.size() % count != 0)
throw std::runtime_error("DecodeBatch: sequences must be evenly divisible by the count");
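A usage sketch for the new EncodeBatch overload, assuming a Tokenizer already obtained from a loaded model; the prompt strings, include path, and helper name are illustrative.

#include <memory>
#include <span>
#include "models/model.h"  // Generators::Tokenizer / Generators::Tensor (src/models layout)

// Sketch: batch-encode raw C strings into a single padded [batch, max_len] int32 tensor.
std::shared_ptr<Generators::Tensor> EncodePrompts(const Generators::Tokenizer& tokenizer) {
  const char* prompts[] = {"Hello", "How are you today?"};
  return tokenizer.EncodeBatch(std::span<const char*>(prompts, 2));
}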
1 change: 1 addition & 0 deletions src/models/model.h
@@ -81,6 +81,7 @@ struct Tokenizer : std::enable_shared_from_this<Tokenizer>, LeakChecked<Tokenize
std::string Decode(std::span<const int32_t> tokens) const;

std::vector<int32_t> EncodeBatch(std::span<const std::string> strings) const;
std::shared_ptr<Tensor> EncodeBatch(std::span<const char*> strings) const;
std::vector<std::string> DecodeBatch(std::span<const int32_t> sequences, size_t count) const;

int32_t TokenToTokenId(const char* token) const;
33 changes: 33 additions & 0 deletions src/models/onnxruntime_inline.h
@@ -92,6 +92,39 @@ inline constexpr ONNXTensorElementDataType TypeToTensorType<Float16_t> = ONNX_TE
template <>
inline constexpr ONNXTensorElementDataType TypeToTensorType<BFloat16_t> = ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16;

inline size_t SizeOf(ONNXTensorElementDataType type) {
switch (type) {
case Ort::TypeToTensorType<uint8_t>:
return sizeof(uint8_t);
case Ort::TypeToTensorType<int8_t>:
return sizeof(int8_t);
case Ort::TypeToTensorType<uint16_t>:
return sizeof(uint16_t);
case Ort::TypeToTensorType<int16_t>:
return sizeof(int16_t);
case Ort::TypeToTensorType<uint32_t>:
return sizeof(uint32_t);
case Ort::TypeToTensorType<int32_t>:
return sizeof(int32_t);
case Ort::TypeToTensorType<uint64_t>:
return sizeof(uint64_t);
case Ort::TypeToTensorType<int64_t>:
return sizeof(int64_t);
case Ort::TypeToTensorType<bool>:
return sizeof(bool);
case Ort::TypeToTensorType<float>:
return sizeof(float);
case Ort::TypeToTensorType<double>:
return sizeof(double);
case Ort::TypeToTensorType<Ort::Float16_t>:
return sizeof(Ort::Float16_t);
case Ort::TypeToTensorType<Ort::BFloat16_t>:
return sizeof(Ort::BFloat16_t);
default:
throw std::runtime_error("Unsupported ONNXTensorElementDataType in SizeOf");
}
}

inline std::vector<std::string> GetAvailableProviders() {
int len;
char** providers;
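Ort::SizeOf is what the StaticBuffer and Whisper_State changes below use to turn an element type plus shape into a byte count. A short sketch of that computation (the standalone helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>
// ONNXTensorElementDataType comes from the ONNX Runtime C API headers;
// Ort::SizeOf is the addition in onnxruntime_inline.h above.

// Sketch: bytes needed for a dense tensor of the given element type and shape.
size_t TensorBytes(ONNXTensorElementDataType type, const std::vector<int64_t>& shape) {
  const int64_t elements =
      std::accumulate(shape.begin(), shape.end(), int64_t{1}, std::multiplies<int64_t>());
  return Ort::SizeOf(type) * static_cast<size_t>(elements);
}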
5 changes: 4 additions & 1 deletion src/models/prompt_image_processor.cpp
@@ -104,7 +104,10 @@ std::unique_ptr<OrtValue> ProcessImageSizes(ortc::Tensor<int64_t>* image_sizes,

} // namespace

std::unique_ptr<Images> LoadImages(const std::span<const char* const>& image_paths) {
std::unique_ptr<Images> LoadImages(std::span<const char* const> image_paths) {
if (image_paths.empty())
throw std::runtime_error("No images provided");

for (const char* image_path : image_paths) {
if (!fs::path(image_path).exists()) {
throw std::runtime_error("Image path does not exist: " + std::string(image_path));
2 changes: 1 addition & 1 deletion src/models/prompt_image_processor.h
@@ -15,7 +15,7 @@ struct Images {
size_t num_images_{};
};

std::unique_ptr<Images> LoadImages(const std::span<const char* const>& image_paths);
std::unique_ptr<Images> LoadImages(std::span<const char* const> image_paths);

struct ImageProcessor {
ImageProcessor(Config& config, const SessionInfo& session_info);
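With LoadImages/LoadAudios now taking the span by value and rejecting an empty span, a call site looks roughly like this (file names and helper name are illustrative):

#include <memory>
#include <span>
#include "models/prompt_image_processor.h"  // Generators::LoadImages (src/models layout)

// Sketch: build a span over C-string paths; an empty span now throws
// "No images provided" instead of being forwarded to the extensions image loader.
std::unique_ptr<Generators::Images> LoadTwoImages() {
  const char* paths[] = {"cat.png", "dog.png"};
  return Generators::LoadImages(std::span<const char* const>(paths, 2));
}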
2 changes: 1 addition & 1 deletion src/models/static_buffer.cpp
@@ -11,7 +11,7 @@ StaticBuffer::StaticBuffer(Ort::Allocator* allocator, size_t max_beam_batch_size

std::unique_ptr<OrtValue> StaticBuffer::CreateTensorOnStaticBuffer(std::span<const int64_t> shape,
ONNXTensorElementDataType type) {
size_t new_bytes = SizeOf(type) * GetNumElements(shape);
size_t new_bytes = Ort::SizeOf(type) * GetNumElements(shape);
if (buffer_ == nullptr) {
// Assuming the first dimension is the batch size
bytes_ = new_bytes * (max_beam_batch_size_ / shape[0]);
2 changes: 1 addition & 1 deletion src/models/whisper.cpp
@@ -141,7 +141,7 @@ DeviceSpan<float> Whisper_State::Run(int current_length, DeviceSpan<int32_t>& ne
case RunState::Decoder_First: {
auto src_shape_info = init_presents_[0]->GetTensorTypeAndShapeInfo();

const auto copy_data_size_all = src_shape_info->GetElementCount() * SizeOf(src_shape_info->GetElementType());
const auto copy_data_size_all = src_shape_info->GetElementCount() * Ort::SizeOf(src_shape_info->GetElementType());

#if 0 // USE_CUDA
const auto src_dims = src_shape_info->GetShape();