1
0
Эх сурвалжийг харах

mtmd: improve struct initialization (#16981)

Xuan-Son Nguyen 2 сар өмнө
parent
commit
2f0c2db43e
2 өөрчлөгдсөн 19 нэмэгдсэн , 17 устгасан
  1. 1 0
      tools/mtmd/clip.cpp
  2. 18 17
      tools/mtmd/mtmd.cpp

+ 1 - 0
tools/mtmd/clip.cpp

@@ -2761,6 +2761,7 @@ struct clip_model_loader {
                     {
                         // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                         // TODO: verify the image_min_tokens
+                        hparams.n_merge = 1; // the original pixtral does not use patch merging
                         hparams.rope_theta = 10000.0f;
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         hparams.set_limit_image_tokens(8, 1024);

+ 18 - 17
tools/mtmd/mtmd.cpp

@@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
 }
 
 mtmd_context_params mtmd_context_params_default() {
-    mtmd_context_params params;
-    params.use_gpu = true;
-    params.print_timings = true;
-    params.n_threads = 4;
-    params.verbosity = GGML_LOG_LEVEL_INFO;
-    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
-    params.media_marker = mtmd_default_marker();
-    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
-    params.image_min_tokens = -1;
-    params.image_max_tokens = -1;
+    mtmd_context_params params {
+        /* use_gpu           */ true,
+        /* print_timings     */ true,
+        /* n_threads         */ 4,
+        /* verbosity         */ GGML_LOG_LEVEL_INFO,
+        /* image_marker      */ MTMD_DEFAULT_IMAGE_MARKER,
+        /* media_marker      */ mtmd_default_marker(),
+        /* flash_attn_type   */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /* image_min_tokens  */ -1,
+        /* image_max_tokens  */ -1,
+    };
     return params;
 }
 
@@ -172,13 +173,13 @@ struct mtmd_context {
             throw std::runtime_error("media_marker must not be empty");
         }
 
-        clip_context_params ctx_clip_params;
-        ctx_clip_params.use_gpu          = ctx_params.use_gpu;
-        ctx_clip_params.verbosity        = ctx_params.verbosity;
-        ctx_clip_params.flash_attn_type  = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
-        // custom image token limits
-        ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
-        ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
+        clip_context_params ctx_clip_params {
+            /* use_gpu           */ ctx_params.use_gpu,
+            /* verbosity         */ ctx_params.verbosity,
+            /* flash_attn_type   */ CLIP_FLASH_ATTN_TYPE_AUTO,
+            /* image_min_tokens  */ ctx_params.image_min_tokens,
+            /* image_max_tokens  */ ctx_params.image_max_tokens,
+        };
 
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;