From a8935c9595244a9877f07a5f55dca081f9f8ab17 Mon Sep 17 00:00:00 2001
From: lzhang
Date: Fri, 18 Jul 2025 14:20:29 +0800
Subject: [PATCH] Fix MiniCPM-V model converter and clip to avoid hardcoded
 values.

---
 tools/mtmd/clip-impl.h                             |   2 +
 tools/mtmd/clip.cpp                                |  62 ++++++----
 .../minicpmv-convert-image-encoder-to-gguf.py      | 107 +++++++++++++-----
 tools/mtmd/legacy-models/minicpmv-surgery.py       |   2 +
 4 files changed, 123 insertions(+), 50 deletions(-)

diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 62c936ed00f77..133e13eea461d 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -44,6 +44,8 @@
 #define KEY_WIN_ATTN_PATTERN        "clip.vision.n_wa_pattern"
 #define KEY_ATTN_WINDOW_SIZE        "clip.vision.window_size"
 #define KEY_MINICPMV_VERSION        "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM      "clip.minicpmv_query_num"
+#define KEY_MINICPMV_PROJECTION_DIM "clip.minicpmv_projection_dim"
 
 // audio-specific
 #define KEY_A_NUM_MEL_BINS          "clip.audio.num_mel_bins"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9146c9e9c4481..2461a21a1d2b4 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -201,6 +201,8 @@ struct clip_hparams {
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
+    int32_t minicpmv_query_num = 0;      // MiniCPM-V resampler query count
+    int32_t minicpmv_projection_dim = 0; // MiniCPM-V projection dimension
 };
 
 struct clip_layer {
@@ -847,13 +849,19 @@ struct clip_graph {
         int n_embd = clip_n_mmproj_embd(ctx);
         const int d_head = 128;
         int n_head = n_embd/d_head;
+        // Use the value stored in the GGUF file if available, otherwise fall back to hardcoded values
         int num_query = 96;
-        if (ctx->model.hparams.minicpmv_version == 2) {
-            num_query = 96;
-        } else if (ctx->model.hparams.minicpmv_version == 3) {
-            num_query = 64;
-        } else if (ctx->model.hparams.minicpmv_version == 4) {
-            num_query = 64;
+        if (ctx->model.hparams.minicpmv_query_num > 0) {
+            num_query = ctx->model.hparams.minicpmv_query_num;
+        } else {
+            // Fall back to hardcoded values for legacy models
+            if (ctx->model.hparams.minicpmv_version == 2) {
+                num_query = 96;
+            } else if (ctx->model.hparams.minicpmv_version == 3) {
+                num_query = 64;
+            } else if (ctx->model.hparams.minicpmv_version == 4) {
+                num_query = 64;
+            }
         }
 
         ggml_tensor * Q = ggml_add(ctx0,
@@ -2110,6 +2118,8 @@ struct clip_model_loader {
             get_u32(KEY_PATCH_SIZE, hparams.patch_size);
             get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
             get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
+            get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
+            get_u32(KEY_MINICPMV_PROJECTION_DIM, hparams.minicpmv_projection_dim, false);
 
         } else if (is_audio) {
             get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
@@ -3517,14 +3527,20 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_MINICPMV:
             {
-                if (params.minicpmv_version == 2) {
-                    n_patches_sq = 96;
-                } else if (params.minicpmv_version == 3) {
-                    n_patches_sq = 64;
-                } else if (params.minicpmv_version == 4) {
-                    n_patches_sq = 64;
+                // Use the value stored in the GGUF file if available, otherwise fall back to hardcoded values
+                if (params.minicpmv_query_num > 0) {
+                    n_patches_sq = params.minicpmv_query_num;
                 } else {
-                    GGML_ABORT("Unknown minicpmv version");
+                    // Fall back to hardcoded values for legacy models
+                    if (params.minicpmv_version == 2) {
+                        n_patches_sq = 96;
+                    } else if (params.minicpmv_version == 3) {
+                        n_patches_sq = 64;
+                    } else if (params.minicpmv_version == 4) {
+                        n_patches_sq = 64;
+                    } else {
+                        GGML_ABORT("Unknown minicpmv version");
+                    }
                 }
             } break;
         case PROJECTOR_TYPE_QWEN2VL:
@@ -4059,14 +4075,20 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_MLP_NORM:
            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
-            if (hparams.minicpmv_version == 2) {
-                return 4096;
-            } else if (hparams.minicpmv_version == 3) {
-                return 3584;
-            } else if (hparams.minicpmv_version == 4) {
-                return 3584;
+            // Use the value stored in the GGUF file if available, otherwise fall back to hardcoded values
+            if (hparams.minicpmv_projection_dim > 0) {
+                return hparams.minicpmv_projection_dim;
+            } else {
+                // Fall back to hardcoded values for legacy models
+                if (hparams.minicpmv_version == 2) {
+                    return 4096;
+                } else if (hparams.minicpmv_version == 3) {
+                    return 3584;
+                } else if (hparams.minicpmv_version == 4) {
+                    return 3584;
+                }
+                GGML_ABORT("Unknown minicpmv version");
             }
-            GGML_ABORT("Unknown minicpmv version");
         case PROJECTOR_TYPE_GLM_EDGE:
             return ctx->model.mm_model_mlp_3_w->ne[1];
         case PROJECTOR_TYPE_QWEN2VL:
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
index cfe0961f9891a..daa2f39ae1e09 100644
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -517,6 +517,16 @@ def bytes_to_unicode():
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
+# Read config.json to get the actual model configuration
+config_path = os.path.join(dir_model, "config.json")
+model_config = {}
+if os.path.exists(config_path):
+    with open(config_path, "r", encoding="utf-8") as f:
+        model_config = json.load(f)
+    print(f"Loaded config from {config_path}")
+else:
+    print(f"Warning: config.json not found at {config_path}")
+
 if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
     vocab = None
     tokens = None
@@ -544,34 +554,59 @@ def bytes_to_unicode():
 # processor = CLIPProcessor.from_pretrained(dir_model)
 
 minicpmv_version = args.minicpmv_version
-emb_dim = 4096
-block_count = 26
-if minicpmv_version == 1:
-    emb_dim = 2304
-    block_count = 26
-elif minicpmv_version == 2:
-    emb_dim = 4096
-    block_count = 27
-elif minicpmv_version == 3:
-    emb_dim = 3584
-    block_count = 27
-elif minicpmv_version == 4:
-    emb_dim = 3584
-    block_count = 27
-
-default_vision_config = {
-    "hidden_size": 1152,
-    "image_size": 980,
-    "intermediate_size": 4304,
-    "model_type": "idefics2",
-    "num_attention_heads": 16,
-    "num_hidden_layers": 27,
-    "patch_size": 14,
-}
+
+# Use the actual config values instead of hardcoded ones
+if model_config:
+    # For the projector/resampler, use the main model's hidden_size
+    emb_dim = model_config.get("hidden_size", 1536)
+
+    # For the vision model, use the vision_config values
+    vision_config_dict = model_config.get("vision_config", {})
+    default_vision_config = {
+        "hidden_size": vision_config_dict.get("hidden_size", 1152),
+        "image_size": vision_config_dict.get("image_size", 980),
+        "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
+        "model_type": vision_config_dict.get("model_type", "siglip"),
+        "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
+        "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
+        "patch_size": vision_config_dict.get("patch_size", 14),
+    }
+    # Use the vision model's num_hidden_layers for block_count
+    block_count = vision_config_dict.get("num_hidden_layers", 27)
+
+    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
+    print(f"Vision config: {default_vision_config}")
+else:
+    # Fall back to the original hardcoded logic if config.json was not found
+    emb_dim = 4096
+    block_count = 26
+    if minicpmv_version == 1:
+        emb_dim = 2304
+        block_count = 26
+    elif minicpmv_version == 2:
+        emb_dim = 4096
+        block_count = 27
+    elif minicpmv_version == 3:
+        emb_dim = 3584
+        block_count = 27
+    elif minicpmv_version == 4:
+        emb_dim = 3584
+        block_count = 27
+
+    default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "model_type": "idefics2",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14,
+    }
+
 
 vision_config = Idefics2VisionConfig(**default_vision_config)
 model = Idefics2VisionTransformer(vision_config)
-if minicpmv_version == 3:
+if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
 elif minicpmv_version == 4:
@@ -626,16 +661,28 @@ def bytes_to_unicode():
 fout.add_description("two-tower CLIP model")
 
 if has_vision_encoder:
-    # vision_model hparams
-    fout.add_uint32("clip.vision.image_size", 448)
-    fout.add_uint32("clip.vision.patch_size", 14)
-    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
-    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
+    # vision_model hparams - use the actual config values
+    vision_image_size = model_config.get("image_size", 448) if model_config else 448
+    vision_patch_size = default_vision_config.get("patch_size", 14)
+    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
+    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
+    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
+
+    fout.add_uint32("clip.vision.image_size", vision_image_size)
+    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
+    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
+    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
     fout.add_uint32("clip.vision.projection_dim", 0)
-    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
+    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
     fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
 
+    # Add MiniCPM-V specific parameters
+    query_num = model_config.get("query_num", 0) if model_config else 0
+    resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
+    fout.add_uint32("clip.minicpmv_query_num", query_num)
+    fout.add_uint32("clip.minicpmv_projection_dim", resampler_emb_dim)
+
     if processor is not None:
         image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
         image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
diff --git a/tools/mtmd/legacy-models/minicpmv-surgery.py b/tools/mtmd/legacy-models/minicpmv-surgery.py
index ba82116582b1f..53526623cd7cb 100644
--- a/tools/mtmd/legacy-models/minicpmv-surgery.py
+++ b/tools/mtmd/legacy-models/minicpmv-surgery.py
@@ -16,6 +16,8 @@
 
 # store these tensors in a new dictionary and torch.save them
 projector = {name: checkpoint[name].float() for name in mm_tensors}
+if 'resampler.proj' in projector and hasattr(model.llm.config, 'scale_emb'):
+    projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
 torch.save(projector, f"{args.model}/minicpmv.projector")
 
 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
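
Note (illustrative, not part of the patch): the converter now derives emb_dim, block_count, and the
two new GGUF keys from config.json. The sketch below shows the fields it consumes, assuming a
MiniCPM-V style config.json; the numeric values are placeholders, not taken from any specific
checkpoint:

    # Minimal sketch of the config.json fields the updated converter reads.
    # The numbers here are illustrative placeholders.
    import json

    model_config = json.loads("""
    {
        "hidden_size": 1536,
        "query_num": 64,
        "vision_config": {
            "hidden_size": 1152,
            "image_size": 980,
            "intermediate_size": 4304,
            "model_type": "siglip",
            "num_attention_heads": 16,
            "num_hidden_layers": 27,
            "patch_size": 14
        }
    }
    """)

    emb_dim = model_config.get("hidden_size", 1536)   # written as clip.minicpmv_projection_dim
    query_num = model_config.get("query_num", 0)      # written as clip.minicpmv_query_num
    block_count = model_config.get("vision_config", {}).get("num_hidden_layers", 27)
    print(emb_dim, query_num, block_count)            # -> 1536 64 27

On the C++ side, clip.cpp prefers these GGUF keys when they are present and nonzero, and only falls
back to the per-version hardcoded constants for legacy GGUF files that lack them.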