Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions xllm/core/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ cc_library(
rate_limiter.h
types.h
device_monitor.h
version_singleton.h
SRCS
etcd_client.cpp
global_flags.cpp
Expand Down
6 changes: 6 additions & 0 deletions xllm/core/common/global_flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,3 +389,9 @@ DEFINE_string(reasoning_parser,

// --- qwen3 reranker config ---
DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");

// --- constrained decoding config ---
DEFINE_bool(enable_constrained_decoding,
            false,
            "Whether to enable constrained decoding, which is used to ensure "
            "that the output meets specific format or structural requirements "
            "through pre-defined rules.");
2 changes: 2 additions & 0 deletions xllm/core/common/global_flags.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
DECLARE_string(reasoning_parser);

DECLARE_bool(enable_shm);

DECLARE_bool(enable_constrained_decoding);
4 changes: 4 additions & 0 deletions xllm/core/common/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,4 +287,8 @@ struct MMChatMessage {
std::vector<MMInputData> content;
};

// Number of token ids used to represent one recommendation ("rec") item.
inline constexpr int REC_TOKEN_SIZE = 3;

// Fixed-size tuple of token ids identifying a single recommendation item.
using RecTokenTriple = std::array<int32_t, REC_TOKEN_SIZE>;

} // namespace xllm
109 changes: 109 additions & 0 deletions xllm/core/common/version_singleton.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
Copyright 2024 The ScaleLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

#include <list>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <vector>

namespace xllm {
// A per-version singleton registry: at most one instance of T exists per
// version string, and the oldest versions can optionally be evicted so only
// the most recent ones are retained.
template <typename T>
class VersionSingleton {
 public:
  // Returns the instance registered under `version`, creating it with `args`
  // on first use. When `delete_old_versions` is true, only the most recently
  // created `reserved_version_size` versions are kept; older instances are
  // destroyed. Thread-safe: uses double-checked locking on a shared_mutex so
  // the common "already created" path only takes a shared (reader) lock.
  template <typename... Args>
  static T* GetInstance(const std::string& version,
                        bool delete_old_versions = true,
                        int reserved_version_size =
                            2,  // default retention of the last two versions
                        Args&&... args) {
    {
      // Fast path: shared lock for the lookup.
      std::shared_lock<std::shared_mutex> lock(instance_map_mutex_);
      auto it = instance_map_.find(version);
      if (it != instance_map_.end()) {
        return it->second.get();
      }
    }

    std::unique_lock<std::shared_mutex> lock(instance_map_mutex_);
    // Re-check under the exclusive lock: another thread may have created the
    // instance between releasing the shared lock and acquiring this one
    // (double-checked locking).
    auto it = instance_map_.find(version);
    if (it != instance_map_.end()) {
      return it->second.get();
    }

    auto holder = std::make_unique<T>(std::forward<Args>(args)...);
    T* instance = holder.get();
    instance_map_[version] = std::move(holder);
    instance_version_list_.push_front(version);
    if (delete_old_versions && reserved_version_size >= 0 &&
        instance_version_list_.size() >
            static_cast<size_t>(reserved_version_size)) {
      // Evict everything beyond the newest `reserved_version_size` entries.
      auto evict_it = instance_version_list_.begin();
      std::advance(evict_it, reserved_version_size);
      for (; evict_it != instance_version_list_.end(); ++evict_it) {
        instance_map_.erase(*evict_it);
      }
      instance_version_list_.resize(reserved_version_size);
    }
    return instance;
  }

  // Returns the version keys of all currently retained instances
  // (unspecified order).
  static std::vector<std::string> GetVersions() {
    // Read-only access: a shared lock is sufficient. (lock_guard<std::mutex>
    // would not compile against the shared_mutex member.)
    std::shared_lock<std::shared_mutex> lock(instance_map_mutex_);
    std::vector<std::string> versions;
    versions.reserve(instance_map_.size());
    for (const auto& pair : instance_map_) {
      versions.push_back(pair.first);
    }
    return versions;
  }

  // Destroys every retained instance and clears the version history.
  static void DestroyAllInstances() {
    std::unique_lock<std::shared_mutex> lock(instance_map_mutex_);
    instance_map_.clear();
    instance_version_list_.clear();
  }

  VersionSingleton(const VersionSingleton&) = delete;
  VersionSingleton& operator=(const VersionSingleton&) = delete;

 private:
  VersionSingleton() = default;
  ~VersionSingleton() = default;

  static std::unordered_map<std::string, std::unique_ptr<T>> instance_map_;
  static std::list<std::string> instance_version_list_;  // newest first
  static std::shared_mutex instance_map_mutex_;
};

template <typename T>
std::unordered_map<std::string, std::unique_ptr<T>>
    VersionSingleton<T>::instance_map_;
template <typename T>
std::list<std::string> VersionSingleton<T>::instance_version_list_;
template <typename T>
std::shared_mutex VersionSingleton<T>::instance_map_mutex_;

} // namespace xllm
31 changes: 31 additions & 0 deletions xllm/core/framework/hf_model_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ limitations under the License.
#include <filesystem>
#include <vector>

#include "core/common/version_singleton.h"
#include "core/framework/state_dict/rec_vocab_dict.h"
#include "core/framework/tokenizer/fast_tokenizer.h"
#include "core/framework/tokenizer/rec_tokenizer.h"
#include "core/framework/tokenizer/sentencepiece_tokenizer.h"
#include "core/framework/tokenizer/tiktoken_tokenizer.h"
#include "core/framework/tokenizer/tokenizer_factory.h"
Expand All @@ -50,6 +53,12 @@ HFModelLoader::HFModelLoader(const std::string& model_weights_path)
<< "Failed to find model weights files in " << model_weights_path;
// sort the model weights files by name
std::sort(model_weights_files_.begin(), model_weights_files_.end());

//@todo: 'false' will be replaced with generative recommendation judgment
if (false) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it doesn't work yet, this logic shouldn't be added for now.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only being sent out for review in advance; it will not be merged now, and it will be modified when the main generative-recommendation process code is merged.

CHECK(load_rec_vocab(model_weights_path))
<< "Failed to load rec content from " << model_weights_path;
}
}

std::unique_ptr<Tokenizer> HFModelLoader::tokenizer() const {
Expand All @@ -70,6 +79,28 @@ std::vector<std::unique_ptr<StateDict>>& HFModelLoader::get_state_dicts() {
return state_dicts_;
}

// Loads the recommendation vocab dict for this model version from the
// tokenizer's vocab file. Best-effort: a missing vocab-file setting is
// logged but does not fail model loading (still returns true).
bool HFModelLoader::load_rec_vocab(const std::string& model_weights_path) {
  if (tokenizer_args_.vocab_file().empty()) {
    LOG(ERROR) << "vocab file is not set";
    return true;
  }

  std::filesystem::path path = model_weights_path;
  // The model directory name doubles as the version key for the singleton.
  const std::string model_version = path.filename();
  const std::string vocab_full_path =
      path.append(tokenizer_args_.vocab_file()).string();

  LOG(INFO) << "model_version:" << model_version
            << ", vocab_full_path:" << vocab_full_path;

  // Fetch the per-version vocab dict once instead of looking it up twice.
  auto* vocab_dict = VersionSingleton<RecVocabDict>::GetInstance(model_version);
  CHECK(nullptr != vocab_dict) << "Failed to get vocab dict instance";
  CHECK(vocab_dict->initialize(vocab_full_path))
      << "Failed to initialize vocab dict from " << vocab_full_path;

  return true;
}

bool HFModelLoader::load_args(const std::string& model_weights_path) {
if (!load_model_args(model_weights_path)) {
LOG(ERROR) << "Failed to load model args from " << model_weights_path;
Expand Down
1 change: 1 addition & 0 deletions xllm/core/framework/hf_model_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class HFModelLoader : public ModelLoader {

private:
bool load_args(const std::string& model_weights_path);
bool load_rec_vocab(const std::string& model_weights_path);
bool load_model_args(const std::string& model_weights_path);
bool load_quant_args(const std::string& model_weights_path);
bool load_tokenizer_args(const std::string& model_weights_path);
Expand Down
2 changes: 2 additions & 0 deletions xllm/core/framework/state_dict/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ cc_library(
HDRS
state_dict.h
utils.h
rec_vocab_dict.h
SRCS
state_dict.cpp
utils.cpp
rec_vocab_dict.cpp
DEPS
rust_safetensors
torch
Expand Down
138 changes: 138 additions & 0 deletions xllm/core/framework/state_dict/rec_vocab_dict.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#include "rec_vocab_dict.h"

#include <algorithm>
#include <array>
#include <filesystem>
#include <fstream>

#include "common/global_flags.h"
#include "util/timer.h"

namespace xllm {

// Loads the binary vocab file and builds the item<->tokens maps (and, when
// constrained decoding is enabled, the prefix -> next-token map).
// Returns true on success; idempotent after the first successful load.
bool RecVocabDict::initialize(const std::string& vocab_file) {
  if (initialized_) {
    return true;
  }

  Timer timer;

  if (vocab_file.empty()) {
    LOG(ERROR) << "content data file is empty, file: " << vocab_file;
    return false;
  }
  if (!std::filesystem::exists(vocab_file)) {
    LOG(ERROR) << "fail to find content data file: " << vocab_file;
    return false;
  }
  std::ifstream ifs(vocab_file, std::ios::binary | std::ios::ate);
  if (!ifs.is_open()) {
    LOG(ERROR) << "fail to load content data file: " << vocab_file;
    return false;
  }

  const size_t file_size = ifs.tellg();
  ifs.seekg(0, std::ios::beg);

  // each line of content : 1 * int64_t(item id) + REC_TOKEN_SIZE *
  // int32_t(token id);
  const size_t itemid_size = sizeof(int64_t);
  const size_t tokens_size = REC_TOKEN_SIZE * sizeof(int32_t);
  const size_t line_size = tokens_size + itemid_size;

  // Reject trailing partial records up front. The previous post-loop
  // gcount() check missed the case where exactly an item id (and no token
  // bytes) remained at EOF, because that final read reports gcount() == 0.
  if (file_size % line_size != 0) {
    LOG(ERROR) << "possibly containing incomplete lines : " << vocab_file;
    return false;
  }
  const size_t total_lines = file_size / line_size;

  // 2 and 4 are only empirical values
  item_to_tokens_map_.reserve(total_lines);
  tokens_to_items_map_.reserve(total_lines / 2);
  prefix_tokens_to_next_tokens_map_.reserve(total_lines / 4);

  int64_t item_id = 0;
  RecTokenTriple tokens;

  while (ifs.read(reinterpret_cast<char*>(&item_id), itemid_size) &&
         ifs.read(reinterpret_cast<char*>(tokens.data()), tokens_size)) {
    if (FLAGS_enable_constrained_decoding) {
      // For each position i, map the prefix tokens[0..i) (empty prefix
      // included) to the set of tokens observed at position i.
      for (size_t i = 0; i < tokens.size(); i++) {
        std::vector<int32_t> prefix_tokens(tokens.begin(), tokens.begin() + i);
        prefix_tokens_to_next_tokens_map_[prefix_tokens].insert(tokens[i]);
      }
    }

    item_to_tokens_map_[item_id] = tokens;

    // Multiple items may share the same token triple.
    tokens_to_items_map_[tokens].emplace_back(item_id);
  }

  initialized_ = true;
  LOG(INFO) << "total line size:" << total_lines
            << ",parse tokens to item id map size: "
            << tokens_to_items_map_.size()
            << ", parse item to tokens map size:" << item_to_tokens_map_.size()
            << ", parse prefix tokens to next tokens map size:"
            << prefix_tokens_to_next_tokens_map_.size()
            << ", cost: " << timer.elapsed_seconds() << " seconds";

  return true;
}

bool RecVocabDict::get_items_by_tokens(const RecTokenTriple& rec_token_triple,
std::vector<int64_t>* item_ids) const {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: maybe use & here is better.

std::vector<int64_t>* item_ids -> const std::vector<int64_t>& item_ids

CHECK(!item_ids.empty());

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, the latter method is better, but the interface in front is the former method, so we used it

CHECK_EQ(initialized_, true);
CHECK_NE(item_ids, nullptr);

auto iter = tokens_to_items_map_.find(rec_token_triple);
if (iter == tokens_to_items_map_.end()) {
return false;
}

std::copy(
iter->second.begin(), iter->second.end(), std::back_inserter(*item_ids));

return true;
}

// Appends the token triple registered for `item_id` onto `token_ids`.
// Returns false when the item is unknown; `token_ids` must be non-null.
bool RecVocabDict::get_tokens_by_item(int64_t item_id,
                                      std::vector<int32_t>* token_ids) const {
  CHECK_EQ(initialized_, true);
  CHECK_NE(token_ids, nullptr);

  const auto found = item_to_tokens_map_.find(item_id);
  if (found == item_to_tokens_map_.end()) {
    return false;
  }

  token_ids->insert(
      token_ids->end(), found->second.begin(), found->second.end());

  return true;
}

// Returns the set of tokens that may follow the given strict prefix of a
// token triple (prefix length < REC_TOKEN_SIZE). Unknown prefixes yield a
// reference to a shared empty set.
const std::set<int32_t>& RecVocabDict::get_next_tokens_by_prefix_tokens(
    const Slice<int32_t>& prefix_token_ids) const {
  CHECK_EQ(initialized_, true);
  CHECK_LT(prefix_token_ids.size(), REC_TOKEN_SIZE);

  static std::set<int32_t> empty_set;

  std::vector<int32_t> prefix_key = prefix_token_ids;
  const auto found = prefix_tokens_to_next_tokens_map_.find(prefix_key);
  return found == prefix_tokens_to_next_tokens_map_.end() ? empty_set
                                                          : found->second;
}

} // namespace xllm
Loading