Fix MinicpmV model converter and clip to avoid using hardcode. #14750

Status: Open · wants to merge 1 commit (base: master)
2 changes: 2 additions & 0 deletions tools/mtmd/clip-impl.h
@@ -44,6 +44,8 @@
 #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
 #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
+#define KEY_MINICPMV_PROJECTION_DIM "clip.minicpmv_projection_dim"

 // audio-specific
 #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
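For reference, the hardcoded per-version constants that these two new keys supersede (taken from the clip.cpp changes below) can be collected into one table. A minimal illustrative sketch in Python, not code from this PR:

    # Legacy MiniCPM-V defaults baked into clip.cpp, now overridable via GGUF metadata:
    #   clip.minicpmv_query_num      -> resampler query count
    #   clip.minicpmv_projection_dim -> projector output dimension
    LEGACY_MINICPMV_DEFAULTS = {
        # minicpmv_version: (query_num, projection_dim)
        2: (96, 4096),
        3: (64, 3584),
        4: (64, 3584),
    }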
62 changes: 42 additions & 20 deletions tools/mtmd/clip.cpp
@@ -201,6 +201,8 @@ struct clip_hparams {
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
+    int32_t minicpmv_query_num = 0; // MiniCPM-V query number
+    int32_t minicpmv_projection_dim = 0; // MiniCPM-V projection dimension
 };

 struct clip_layer {
@@ -847,13 +849,19 @@ struct clip_graph {
         int n_embd = clip_n_mmproj_embd(ctx);
         const int d_head = 128;
         int n_head = n_embd/d_head;
+        // Use actual config value if available, otherwise fall back to hardcoded values
         int num_query = 96;
-        if (ctx->model.hparams.minicpmv_version == 2) {
-            num_query = 96;
-        } else if (ctx->model.hparams.minicpmv_version == 3) {
-            num_query = 64;
-        } else if (ctx->model.hparams.minicpmv_version == 4) {
-            num_query = 64;
+        if (ctx->model.hparams.minicpmv_query_num > 0) {
+            num_query = ctx->model.hparams.minicpmv_query_num;
+        } else {
+            // Fallback to hardcoded values for legacy models
+            if (ctx->model.hparams.minicpmv_version == 2) {
+                num_query = 96;
+            } else if (ctx->model.hparams.minicpmv_version == 3) {
+                num_query = 64;
+            } else if (ctx->model.hparams.minicpmv_version == 4) {
+                num_query = 64;
+            }
         }

         ggml_tensor * Q = ggml_add(ctx0,
@@ -2110,6 +2118,8 @@ struct clip_model_loader {
            get_u32(KEY_PATCH_SIZE, hparams.patch_size);
            get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
            get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
+           get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
+           get_u32(KEY_MINICPMV_PROJECTION_DIM, hparams.minicpmv_projection_dim, false);

         } else if (is_audio) {
            get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
@@ -3517,14 +3527,20 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
            } break;
        case PROJECTOR_TYPE_MINICPMV:
            {
-               if (params.minicpmv_version == 2) {
-                   n_patches_sq = 96;
-               } else if (params.minicpmv_version == 3) {
-                   n_patches_sq = 64;
-               } else if (params.minicpmv_version == 4) {
-                   n_patches_sq = 64;
+               // Use actual config value if available, otherwise fall back to hardcoded values
+               if (params.minicpmv_query_num > 0) {
+                   n_patches_sq = params.minicpmv_query_num;
                } else {
-                   GGML_ABORT("Unknown minicpmv version");
+                   // Fallback to hardcoded values for legacy models
+                   if (params.minicpmv_version == 2) {
+                       n_patches_sq = 96;
+                   } else if (params.minicpmv_version == 3) {
+                       n_patches_sq = 64;
+                   } else if (params.minicpmv_version == 4) {
+                       n_patches_sq = 64;
+                   } else {
+                       GGML_ABORT("Unknown minicpmv version");
+                   }
                }
            } break;
        case PROJECTOR_TYPE_QWEN2VL:
@@ -4059,14 +4075,20 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_MLP_NORM:
            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
-           if (hparams.minicpmv_version == 2) {
-               return 4096;
-           } else if (hparams.minicpmv_version == 3) {
-               return 3584;
-           } else if (hparams.minicpmv_version == 4) {
-               return 3584;
+           // Use actual config value if available, otherwise fall back to hardcoded values
+           if (hparams.minicpmv_projection_dim > 0) {
+               return hparams.minicpmv_projection_dim;
+           } else {
+               // Fallback to hardcoded values for legacy models
+               if (hparams.minicpmv_version == 2) {
+                   return 4096;
+               } else if (hparams.minicpmv_version == 3) {
+                   return 3584;
+               } else if (hparams.minicpmv_version == 4) {
+                   return 3584;
+               }
+               GGML_ABORT("Unknown minicpmv version");
            }
-           GGML_ABORT("Unknown minicpmv version");
        case PROJECTOR_TYPE_GLM_EDGE:
            return ctx->model.mm_model_mlp_3_w->ne[1];
        case PROJECTOR_TYPE_QWEN2VL:
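Taken together, the three clip.cpp changes implement one rule: prefer the value stored in GGUF metadata, and only fall back to the per-version constants for legacy files. A minimal Python sketch of that decision logic (illustrative only, not code from this PR):

    def resolve_minicpmv_param(config_value: int, version: int, legacy_table: dict, default: int) -> int:
        # Prefer the value read from GGUF metadata (0 means "not set").
        if config_value > 0:
            return config_value
        # Otherwise fall back to the legacy per-version constant.
        return legacy_table.get(version, default)

    num_query = resolve_minicpmv_param(0, 3, {2: 96, 3: 64, 4: 64}, 96)  # -> 64

Note that the call sites differ slightly on unknown versions: clip_n_output_tokens and clip_n_mmproj_embd abort, while the graph-building code silently keeps its default of 96.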
107 changes: 77 additions & 30 deletions tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -517,6 +517,16 @@ def bytes_to_unicode():
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

+# Read config.json to get actual model configuration
+config_path = os.path.join(dir_model, "config.json")
+model_config = {}
+if os.path.exists(config_path):
+    with open(config_path, "r", encoding="utf-8") as f:
+        model_config = json.load(f)
+    print(f"Loaded config from {config_path}")
+else:
+    print(f"Warning: config.json not found at {config_path}")
+
 if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
     vocab = None
     tokens = None
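For context, the fields this block consumes correspond to a MiniCPM-V style config.json. An illustrative, trimmed example of what json.load would return here (field names match what the script reads; the values are examples, not from this PR):

    model_config = {
        "hidden_size": 3584,   # main model width; used for emb_dim / projection dim
        "image_size": 448,     # read later for clip.vision.image_size
        "query_num": 64,       # resampler query count, written as clip.minicpmv_query_num
        "vision_config": {
            "model_type": "siglip",
            "hidden_size": 1152,
            "image_size": 980,
            "intermediate_size": 4304,
            "num_attention_heads": 16,
            "num_hidden_layers": 27,
            "patch_size": 14,
        },
    }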
@@ -544,34 +554,59 @@ def bytes_to_unicode():
 # processor = CLIPProcessor.from_pretrained(dir_model)

 minicpmv_version = args.minicpmv_version
-emb_dim = 4096
-block_count = 26
-if minicpmv_version == 1:
-    emb_dim = 2304
-    block_count = 26
-elif minicpmv_version == 2:
-    emb_dim = 4096
-    block_count = 27
-elif minicpmv_version == 3:
-    emb_dim = 3584
-    block_count = 27
-elif minicpmv_version == 4:
-    emb_dim = 3584
-    block_count = 27

-default_vision_config = {
-    "hidden_size": 1152,
-    "image_size": 980,
-    "intermediate_size": 4304,
-    "model_type": "idefics2",
-    "num_attention_heads": 16,
-    "num_hidden_layers": 27,
-    "patch_size": 14,
-}
+# Use actual config values instead of hardcoded ones
+if model_config:
+    # For the projector/resampler, use the main model's hidden_size
+    emb_dim = model_config.get("hidden_size", 1536)
+
+    # For the vision model, use vision_config values
+    vision_config_dict = model_config.get("vision_config", {})
+    default_vision_config = {
+        "hidden_size": vision_config_dict.get("hidden_size", 1152),
+        "image_size": vision_config_dict.get("image_size", 980),
+        "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
+        "model_type": vision_config_dict.get("model_type", "siglip"),
+        "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
+        "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
+        "patch_size": vision_config_dict.get("patch_size", 14),
+    }
+
+    # Use vision model's num_hidden_layers for block_count
+    block_count = vision_config_dict.get("num_hidden_layers", 27)
+
+    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
+    print(f"Vision config: {default_vision_config}")
+else:
+    # Fallback to original hardcoded logic if config.json not found
+    emb_dim = 4096
+    block_count = 26
+    if minicpmv_version == 1:
+        emb_dim = 2304
+        block_count = 26
+    elif minicpmv_version == 2:
+        emb_dim = 4096
+        block_count = 27
+    elif minicpmv_version == 3:
+        emb_dim = 3584
+        block_count = 27
+    elif minicpmv_version == 4:
+        emb_dim = 3584
+        block_count = 27
+
+    default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "model_type": "idefics2",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14,
+    }

 vision_config = Idefics2VisionConfig(**default_vision_config)
 model = Idefics2VisionTransformer(vision_config)
-if minicpmv_version == 3:
+if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
 elif minicpmv_version == 4:
@@ -626,16 +661,28 @@ def bytes_to_unicode():
     fout.add_description("two-tower CLIP model")

 if has_vision_encoder:
-    # vision_model hparams
-    fout.add_uint32("clip.vision.image_size", 448)
-    fout.add_uint32("clip.vision.patch_size", 14)
-    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
-    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
+    # vision_model hparams - use actual config values
+    vision_image_size = model_config.get("image_size", 448) if model_config else 448
+    vision_patch_size = default_vision_config.get("patch_size", 14)
+    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
+    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
+    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
+
+    fout.add_uint32("clip.vision.image_size", vision_image_size)
+    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
+    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
+    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
     fout.add_uint32("clip.vision.projection_dim", 0)
-    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
+    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
     fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

+    # Add MiniCPM-V specific parameters
+    query_num = model_config.get("query_num", 0) if model_config else 0
+    resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
+    fout.add_uint32("clip.minicpmv_query_num", query_num)
+    fout.add_uint32("clip.minicpmv_projection_dim", resampler_emb_dim)
+
 if processor is not None:
     image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
     image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
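To see where the new keys end up, here is a minimal, self-contained sketch that writes just the MiniCPM-V metadata with the gguf-py GGUFWriter the converter already uses (the file name and values are illustrative; a value of 0 tells clip.cpp to use its legacy fallback):

    from gguf import GGUFWriter

    fout = GGUFWriter("minicpmv-demo.gguf", "clip")  # hypothetical output path
    fout.add_uint32("clip.minicpmv_query_num", 64)
    fout.add_uint32("clip.minicpmv_projection_dim", 3584)
    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.close()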
2 changes: 2 additions & 0 deletions tools/mtmd/legacy-models/minicpmv-surgery.py
@@ -16,6 +16,8 @@

 # store these tensors in a new dictionary and torch.save them
 projector = {name: checkpoint[name].float() for name in mm_tensors}
+if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True:
+    projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
 torch.save(projector, f"{args.model}/minicpmv.projector")

 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
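The division by scale_emb appears intended to pre-compensate for the embedding scaling MiniCPM applies to its token embeddings, so the projected image features land on the same scale (the PR itself does not state this). An equivalent guard written with getattr, shown only as an illustrative alternative:

    # Same effect as the added lines above, using getattr with a default:
    scale_emb = getattr(model.llm.config, 'scale_emb', None)
    if scale_emb is not None and 'resampler.proj' in projector:
        projector['resampler.proj'] = projector['resampler.proj'] / scale_emb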