Browse Source

model : mtmd : make input norm optional in LFM2-VL (#18594)

Upcoming LFM2-VL releases will have a configurable input norm.
See https://github.com/huggingface/transformers/pull/43087 for details.
Tarek Dakhran 3 weeks ago
parent
commit
4974bf53cf
2 changed files with 17 additions and 4 deletions
  1. 8 0
      tools/mtmd/clip.cpp
  2. 9 4
      tools/mtmd/models/siglip.cpp

+ 8 - 0
tools/mtmd/clip.cpp

@@ -1552,6 +1552,14 @@ struct clip_model_loader {
                     model.projection = get_tensor(TN_MM_PROJECTOR);
                     model.projection = get_tensor(TN_MM_PROJECTOR);
                 } break;
                 } break;
             case PROJECTOR_TYPE_LFM2:
             case PROJECTOR_TYPE_LFM2:
+                {
+                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
+                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                } break;
             case PROJECTOR_TYPE_KIMIVL:
             case PROJECTOR_TYPE_KIMIVL:
                 {
                 {
                     model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
                     model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);

+ 9 - 4
tools/mtmd/models/siglip.cpp

@@ -50,10 +50,15 @@ ggml_cgraph * clip_graph_siglip::build() {
         const int scale_factor = model.hparams.n_merge;
         const int scale_factor = model.hparams.n_merge;
         cur = build_patch_merge_permute(cur, scale_factor);
         cur = build_patch_merge_permute(cur, scale_factor);
 
 
-        // projection
-        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
-        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
-        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+        // projection, in LFM2-VL input norm is optional
+        if (model.mm_input_norm_w) {
+            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        }
+
+        if (model.mm_input_norm_b) {
+            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+        }
 
 
         cur = build_ffn(cur,
         cur = build_ffn(cur,
             model.mm_1_w, model.mm_1_b,
             model.mm_1_w, model.mm_1_b,