diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 4e5fd00a555..30355d2fc99 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -103,6 +103,7 @@ let vulkan-headers vulkan-loader shaderc + spirv-headers ]; in @@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: { ninja pkg-config git - spirv-headers ] ++ optionals useCuda [ cudaPackages.cuda_nvcc diff --git a/.gitignore b/.gitignore index 417e591db6d..11358c72855 100644 --- a/.gitignore +++ b/.gitignore @@ -110,6 +110,7 @@ uv.lock # Nix +flake.lock /result # Test binaries diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1486171b8c5..e5dea18aeb2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1570,6 +1570,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015": # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B res = "f2llmv2" + if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57": + # ref: https://huggingface.co/sarvamai/sarvam-30b + res = "sarvam-moe" if res is None: logger.warning("\n") @@ -11591,6 +11594,34 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM") +class SarvamMoEModel(BailingMoeV2Model): + model_arch = gguf.MODEL_ARCH.BAILINGMOE2 + # Sarvam-MoE shares the BailingMoeV2 architecture; only differences: + # - full rotary (no partial_rotary_factor) + # - expert bias is zero-mean normalized at load time + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if name.endswith(".expert_bias"): + # Sarvam normalizes expert bias to zero mean + inner = gen + + def gen(): + t = inner() + return t - t.mean() + return super().filter_tensors((name, gen)) + + @ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") class GroveMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.GROVEMOE diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 6e6cd057909..8d73b1f5546 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -155,6 +155,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", }, {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", }, {"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", }, + {"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/flake.lock b/flake.lock deleted file mode 100644 index d114f4422a3..00000000000 --- a/flake.lock +++ /dev/null @@ -1,58 +0,0 @@ -{ - "nodes": { - "flake-parts": { - "inputs": { - "nixpkgs-lib": "nixpkgs-lib" - }, - "locked": { - "lastModified": 1730504689, - "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=", - "owner": "hercules-ci", - "repo": "flake-parts", - "rev": "506278e768c2a08bec68eb62932193e341f55c90", - "type": "github" - }, - "original": { - "owner": "hercules-ci", - "repo": "flake-parts", - "type": "github" - } - }, - "nixpkgs": { - "locked": { - "lastModified": 1732014248, - "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367", - "type": "github" - }, - "original": { - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", - "type": "github" - } - }, - "nixpkgs-lib": { - "locked": { - "lastModified": 1730504152, - "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=", - "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz" - }, - "original": { - "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz" - } - }, - "root": { - "inputs": { - "flake-parts": "flake-parts", - "nixpkgs": "nixpkgs" - } - } - }, - "root": "root", - "version": 7 -} diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 163f222ef61..f43cf546ca0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -503,6 +503,14 @@ struct llm_tokenizer_bpe : llm_tokenizer { }; byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding break; + case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE: + // Sarvam uses SPM-style BPE (same shape as Gemma4): spaces replaced with U+2581 + // by the normalizer, BPE merges over the whole text on raw UTF-8. + regex_exprs = { + "[^\\n]+|[\\n]+", + }; + byte_encode = false; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -2005,6 +2013,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "gemma4") { pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4; escape_whitespaces = true; + } else if ( + tokenizer_pre == "sarvam-moe") { + pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE; + escape_whitespaces = true; + clean_spaces = false; } else if ( tokenizer_pre == "jina-v1-en" || tokenizer_pre == "jina-v2-code" || diff --git a/src/llama-vocab.h b/src/llama-vocab.h index dd38f45d3a2..8b040b912e2 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -59,6 +59,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, + LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, }; struct LLM_KV;