@@ -517,6 +517,16 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
+# Read config.json to get actual model configuration
+config_path = os.path.join(dir_model, "config.json")
+model_config = {}
+if os.path.isfile(config_path):
+    with open(config_path, "r", encoding="utf-8") as f:
+        model_config = json.load(f)
+    print(f"Loaded config from {config_path}")
+else:
+    print(f"Warning: config.json not found at {config_path}")
+
 # If minicpmv_projector is not specified but the default path exists, use the default path
 if args.minicpmv_projector is None:
     default_projector_path = os.path.join(dir_model, "minicpmv.projector")
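For reference, this is the shape of `config.json` that the new lookup code expects: `hidden_size` (and optionally `query_num`) at the top level, plus the vision tower parameters nested under `vision_config`. The excerpt below is an illustrative sketch, not copied from any particular checkpoint — the `vision_config` values mirror the script's fallback defaults, and the top-level values follow the version-3 (MiniCPM-V 2.6) fallback entry:

```json
{
  "hidden_size": 3584,
  "image_size": 448,
  "query_num": 64,
  "vision_config": {
    "hidden_size": 1152,
    "image_size": 980,
    "intermediate_size": 4304,
    "model_type": "siglip",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "patch_size": 14
  }
}
```

With such a file present, the hunks below derive `emb_dim = 3584` and `block_count = 27` from the config rather than from the `--minicpmv_version` table.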
@@ -555,37 +565,62 @@ if args.use_f32:
 # processor = CLIPProcessor.from_pretrained(dir_model)
 
 minicpmv_version = args.minicpmv_version
-emb_dim = 4096
-block_count = 26
-if minicpmv_version == 1: # MiniCPM-V 2.0
-    emb_dim = 2304
-    block_count = 26
-elif minicpmv_version == 2: # MiniCPM-V 2.5
-    emb_dim = 4096
-    block_count = 27
-elif minicpmv_version == 3: # MiniCPM-V 2.6
-    emb_dim = 3584
-    block_count = 27
-elif minicpmv_version == 4: # MiniCPM-o 2.6
-    emb_dim = 3584
-    block_count = 27
-elif minicpmv_version == 5: # MiniCPM-V 4.0
-    emb_dim = 2560
-    block_count = 27
-
-default_vision_config = {
-    "hidden_size": 1152,
-    "image_size": 980,
-    "intermediate_size": 4304,
-    "model_type": "idefics2",
-    "num_attention_heads": 16,
-    "num_hidden_layers": 27,
-    "patch_size": 14,
+
+# Use actual config values instead of hardcoded ones
+if model_config:
+    # For the projector/resampler, use the main model's hidden_size
+    emb_dim = model_config.get("hidden_size", 1536)
+
+    # For the vision model, use vision_config values
+    vision_config_dict = model_config.get("vision_config", {})
+    default_vision_config = {
+        "hidden_size": vision_config_dict.get("hidden_size", 1152),
+        "image_size": vision_config_dict.get("image_size", 980),
+        "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
+        "model_type": vision_config_dict.get("model_type", "siglip"),
+        "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
+        "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
+        "patch_size": vision_config_dict.get("patch_size", 14),
 }
+    # Use the vision model's num_hidden_layers for block_count
+    block_count = vision_config_dict.get("num_hidden_layers", 27)
+
+    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
+    print(f"Vision config: {default_vision_config}")
+else:
+    # Fall back to the original hardcoded values if config.json was not found
+    emb_dim = 4096
+    block_count = 26
+    if minicpmv_version == 1:  # MiniCPM-V 2.0
+        emb_dim = 2304
+        block_count = 26
+    elif minicpmv_version == 2:  # MiniCPM-V 2.5
+        emb_dim = 4096
+        block_count = 27
+    elif minicpmv_version == 3:  # MiniCPM-V 2.6
+        emb_dim = 3584
+        block_count = 27
+    elif minicpmv_version == 4:  # MiniCPM-o 2.6
+        emb_dim = 3584
+        block_count = 27
+    elif minicpmv_version == 5:  # MiniCPM-V 4.0
+        emb_dim = 2560
+        block_count = 27
+
+    default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "model_type": "idefics2",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14,
+    }
+
 
 vision_config = Idefics2VisionConfig(**default_vision_config)
 model = Idefics2VisionTransformer(vision_config)
-if minicpmv_version == 3:
+if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
 elif minicpmv_version == 4:
@@ -644,16 +679,27 @@ else:
     fout.add_description("two-tower CLIP model")
 
 if has_vision_encoder:
-    # vision_model hparams
-    fout.add_uint32("clip.vision.image_size", 448)
-    fout.add_uint32("clip.vision.patch_size", 14)
-    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
-    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
+    # vision_model hparams - use actual config values
+    vision_image_size = model_config.get("image_size", 448) if model_config else 448
+    vision_patch_size = default_vision_config.get("patch_size", 14)
+    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
+    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
+    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
+
+    fout.add_uint32("clip.vision.image_size", vision_image_size)
+    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
+    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
+    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
     fout.add_uint32("clip.vision.projection_dim", 0)
-    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
+    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
     fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
 
+    # Add MiniCPM-V specific parameters
+    query_num = model_config.get("query_num", 0) if model_config else 0
+    resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
+    fout.add_uint32("clip.minicpmv_query_num", query_num)
+
     if processor is not None:
         image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
         image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
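Two notes on the net behavior: the version flag still selects the vision tower class (Idefics2 vs. SigLIP), with the widened condition in the second hunk also routing any config whose `vision_config.model_type` is `"siglip"` to the SigLIP path; and `resampler_emb_dim` is computed in the last hunk but not consumed within it — only `clip.minicpmv_query_num` is actually written there.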