|
|
@@ -1909,16 +1909,20 @@ struct clip_model_loader {
|
|
|
} break;
|
|
|
case PROJECTOR_TYPE_QWEN2VL:
|
|
|
{
|
|
|
- // max image size = sqrt(max_pixels)
|
|
|
- // https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
|
|
|
- hparams.image_size = 3584;
|
|
|
+ // max image size = sqrt(max_pixels) = 3584
|
|
|
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
|
|
|
+ // however, the model uses an unreasonable amount of memory past a size of 1024; we force it to 1024, otherwise it's unusable
|
|
|
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
|
|
|
+ hparams.image_size = 1024;
|
|
|
hparams.warmup_image_size = hparams.patch_size * 8;
|
|
|
} break;
|
|
|
case PROJECTOR_TYPE_QWEN25VL:
|
|
|
{
|
|
|
// max image size = sqrt(max_pixels)
|
|
|
// https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
|
|
|
- hparams.image_size = 3584;
|
|
|
+ // however, the model uses an unreasonable amount of memory past a size of 1024; we force it to 1024, otherwise it's unusable
|
|
|
+ // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
|
|
|
+ hparams.image_size = 1024;
|
|
|
hparams.warmup_image_size = hparams.patch_size * 8;
|
|
|
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
|
|
|
} break;
|