2 weeks ago · e047f9ee9d
--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n-iswa.cpp
@@ -258,12 +258,12 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
 
				         res->add_input(std::move(inp));
			
 
				     } else {
			
 
				         // Vision embedding path: use padding token (ID=0) embedding
			
 
				+        // TODO: verify whether this matches the behavior of the transformers implementation
			
 
				         const int64_t embd_size = model.tok_embd_per_layer->ne[0];  // n_embd_altup * n_layer
			
 
				 
			
 
				-        // Extract and dequantize padding token embedding (column 0)
			
 
				-        ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
			
 
				-        ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
			
 
				-        inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
			
 
				+        // Extract and dequantize padding token embedding (row 0)
			
 
				+        ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
			
 
				+        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
			
 
				 
			
 
				         // Reshape to [n_embd_altup, n_layer, 1]
			
 
				         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
			
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3808,18 +3808,6 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
 
				     return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
			
 
				 }
			
 
				 
			
 
				-bool clip_is_mrope(const struct clip_ctx * ctx) {
			
 
				-    switch (ctx->proj_type()) {
			
 
				-        case PROJECTOR_TYPE_QWEN2VL:
			
 
				-        case PROJECTOR_TYPE_QWEN25VL:
			
 
				-        case PROJECTOR_TYPE_QWEN3VL:
			
 
				-        case PROJECTOR_TYPE_GLM4V:
			
 
				-            return true;
			
 
				-        default:
			
 
				-            return false;
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				 bool clip_is_llava(const struct clip_ctx * ctx) {
			
 
				     return ctx->model.hparams.has_llava_projector;
			
 
				 }
			
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -104,7 +104,6 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
 
				 
			
 
				 int clip_is_minicpmv(const struct clip_ctx * ctx);
			
 
				 bool clip_is_glm(const struct clip_ctx * ctx);
			
 
				-bool clip_is_mrope(const struct clip_ctx * ctx);
			
 
				 bool clip_is_llava(const struct clip_ctx * ctx);
			
 
				 // note for contributor: this clip_is_(model) pattern is deprecated
			
 
				 //                       do NOT add new functions like this
			
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -146,8 +146,6 @@ struct mtmd_context {
 
				     bool        tok_row_end_trail = false;
			
 
				     bool        ov_img_first      = false;
			
 
				 
			
 
				-    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
			
 
				-
			
 
				     // string template for slice image delimiters with row/col (idefics3)
			
 
				     std::string sli_img_start_tmpl;
			
 
				 
			
@@ -217,7 +215,6 @@ struct mtmd_context {
 
				 
			
 
				     void init_vision() {
			
 
				         GGML_ASSERT(ctx_v != nullptr);
			
 
				-        use_mrope = clip_is_mrope(ctx_v);
			
 
				 
			
 
				         projector_type proj = clip_get_projector_type(ctx_v);
			
 
				         int minicpmv_version = clip_is_minicpmv(ctx_v);
			
@@ -627,7 +624,7 @@ struct mtmd_tokenizer {
 
				                 }
			
 
				 
			
 
				                 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
			
 
				-                if (ctx->use_mrope) {
			
 
				+                if (mtmd_decode_use_mrope(ctx)) {
			
 
				                     // for Qwen2VL, we need this information for M-RoPE decoding positions
			
 
				                     image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
			
 
				                     image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
			
@@ -863,10 +860,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 
				 
			
 
				 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
			
 
				     switch (ctx->proj_type_v()) {
			
 
				-        case PROJECTOR_TYPE_QWEN2VL:
			
 
				-        case PROJECTOR_TYPE_QWEN25VL:
			
 
				-        case PROJECTOR_TYPE_QWEN3VL:
			
 
				-        case PROJECTOR_TYPE_YOUTUVL:
			
 
				+        case PROJECTOR_TYPE_GEMMA3:
			
 
				             return true;
			
 
				         default:
			
 
				             return false;
			
@@ -874,7 +868,15 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
 
				 }
			
 
				 
			
 
				 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
			
 
				-    return ctx->use_mrope;
			
 
				+    switch (ctx->proj_type_v()) {
			
 
				+        case PROJECTOR_TYPE_QWEN2VL:
			
 
				+        case PROJECTOR_TYPE_QWEN25VL:
			
 
				+        case PROJECTOR_TYPE_QWEN3VL:
			
 
				+        case PROJECTOR_TYPE_GLM4V:
			
 
				+            return true;
			
 
				+        default:
			
 
				+            return false;
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				 bool mtmd_support_vision(mtmd_context * ctx) {