3 ヶ月前 · ca71fb9b36
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -891,6 +891,9 @@ class TextModel(ModelBase):
 
				         if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
			
 
				             # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
			
 
				             res = "llada-moe"
			
 
				+        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
			
 
				+            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
			
 
				+            res = "granite-docling"
			
 
				 
			
 
				         if res is None:
			
 
				             logger.warning("\n")
			
@@ -1325,6 +1328,7 @@ class MmprojModel(ModelBase):
 
				         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
			
 
				 
			
 
				         # load preprocessor config
			
 
				+        self.preprocessor_config = {}
			
 
				         if not self.is_mistral_format:
			
 
				             with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
			
 
				                 self.preprocessor_config = json.load(f)
			
@@ -1347,7 +1351,8 @@ class MmprojModel(ModelBase):
 
				             self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
			
 
				 
			
 
				             # vision config
			
 
				-            self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
			
 
				+            self.image_size = self.find_vparam(["image_size"])
			
 
				+            self.gguf_writer.add_vision_image_size(self.image_size)
			
 
				             self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
			
 
				             self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
			
 
				             self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
			
@@ -2378,6 +2383,10 @@ class SmolVLMModel(MmprojModel):
 
				         self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
			
 
				         self.gguf_writer.add_vision_use_gelu(True)
			
 
				 
			
 
				+        # Add the preprocessor longest edge size
			
 
				+        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
			
 
				+        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
			
 
				+
			
 
				     def tensor_force_quant(self, name, new_name, bid, n_dims):
			
 
				         if ".embeddings." in name:
			
 
				             return gguf.GGMLQuantizationType.F32
			
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -140,6 +140,7 @@ models = [
 
				     {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
			
 
				     {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
			
 
				     {"name": "llada-moe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
			
 
				+    {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
			
 
				 ]
			
 
				 
			
 
				 # some models are known to be broken upstream, so we will skip them as exceptions
			
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -261,6 +261,7 @@ class Keys:
 
				 
			
 
				     class ClipVision:
			
 
				         IMAGE_SIZE          = "clip.vision.image_size"
			
 
				+        PREPROC_IMAGE_SIZE  = "clip.vision.preproc_image_size"
			
 
				         PATCH_SIZE          = "clip.vision.patch_size"
			
 
				         EMBEDDING_LENGTH    = "clip.vision.embedding_length"
			
 
				         FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"
			
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1037,6 +1037,9 @@ class GGUFWriter:
 
				     def add_vision_image_size(self, value: int) -> None:
			
 
				         self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
			
 
				 
			
 
				+    def add_vision_preproc_image_size(self, value: int) -> None:
			
 
				+        self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
			
 
				+
			
 
				     def add_vision_image_mean(self, values: Sequence[float]) -> None:
			
 
				         self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
			
 
				 
			
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -347,6 +347,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 
				             case LLAMA_VOCAB_PRE_TYPE_OLMO:
			
 
				             case LLAMA_VOCAB_PRE_TYPE_JAIS:
			
 
				             case LLAMA_VOCAB_PRE_TYPE_TRILLION:
			
 
				+            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
			
 
				                 regex_exprs = {
			
 
				                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
			
 
				                 };
			
@@ -1961,6 +1962,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
				                 tokenizer_pre == "trillion") {
			
 
				                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
			
 
				                 clean_spaces = false;
			
 
				+            } else if (
			
 
				+                tokenizer_pre == "granite-docling") {
			
 
				+                pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
			
 
				+                clean_spaces = false;
			
 
				             } else if (
			
 
				                 tokenizer_pre == "bailingmoe" ||
			
 
				                 tokenizer_pre == "llada-moe") {
			
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -8,46 +8,47 @@
 
				 
			
 
				 // pre-tokenization types
			
 
				 enum llama_vocab_pre_type {
			
 
				-    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
			
 
				-    LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_DEFAULT         = 0,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_LLAMA3          = 1,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM    = 2,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER  = 3,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_FALCON          = 4,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_MPT             = 5,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_STARCODER       = 6,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_GPT2            = 7,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_REFACT          = 8,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R       = 9,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_STABLELM2       = 10,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_QWEN2           = 11,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_OLMO            = 12,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_DBRX            = 13,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_SMAUG           = 14,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_PORO            = 15,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3        = 16,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4        = 17,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_VIKING          = 18,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_JAIS            = 19,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_TEKKEN          = 20,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_SMOLLM          = 21,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_CODESHELL       = 22,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_BLOOM           = 23,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH    = 24,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_EXAONE          = 25,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON       = 26,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_MINERVA         = 27,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM   = 28,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_GPT4O           = 29,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE        = 30,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_TRILLION        = 31,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE      = 32,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_LLAMA4          = 33,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL         = 34,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER      = 35,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN         = 36,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2         = 37,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE   = 38,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
			
 
				+    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
			
 
				 };
			
 
				 
			
 
				 struct LLM_KV;
			
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -31,6 +31,7 @@
 
				 
			
 
				 // vision-specific
			
 
				 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
			
 
				+#define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
			
 
				 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
			
 
				 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
			
 
				 #define KEY_IMAGE_STD           "clip.vision.image_std"
			
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -170,7 +170,9 @@ struct clip_hparams {
 
				     int32_t projection_dim;
			
 
				     int32_t n_head;
			
 
				     int32_t n_layer;
			
 
				-    int32_t proj_scale_factor = 0; // idefics3
			
 
				+    // idefics3
			
 
				+    int32_t preproc_image_size = 0;
			
 
				+    int32_t proj_scale_factor = 0;
			
 
				 
			
 
				     float image_mean[3];
			
 
				     float image_std[3];
			
@@ -2250,6 +2252,7 @@ struct clip_model_loader {
 
				 
			
 
				             if (is_vision) {
			
 
				                 get_u32(KEY_IMAGE_SIZE, hparams.image_size);
			
 
				+                get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
			
 
				                 get_u32(KEY_PATCH_SIZE, hparams.patch_size);
			
 
				                 get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
			
 
				                 get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
			
@@ -3551,10 +3554,51 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 
				         // res_imgs->data[0] = *res;
			
 
				         res_imgs->entries.push_back(std::move(img_f32));
			
 
				         return true;
			
 
				-    }
			
 
				-    else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
			
 
				+    } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
			
 
				+        // The refined size has two steps:
			
 
				+        // 1. Resize w/ aspect-ratio preserving such that the longer side is
			
 
				+        //      the preprocessor longest size
			
 
				+        // 2. Resize w/out preserving aspect ratio such that both sides are
			
 
				+        //      multiples of image_size (always rounding up)
			
 
				+        //
			
 
				+        // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
			
 
				+        const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
			
 
				+            original_size, params.image_size, params.preproc_image_size);
			
 
				+
			
 
				+        llava_uhd::slice_instructions instructions;
			
 
				+        instructions.overview_size = clip_image_size{params.image_size, params.image_size};
			
 
				+        instructions.refined_size = refined_size;
			
 
				+        instructions.grid_size = clip_image_size{
			
 
				+            static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
			
 
				+            static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
			
 
				+        };
			
 
				+        for (int y = 0; y < refined_size.height; y += params.image_size) {
			
 
				+            for (int x = 0; x < refined_size.width; x += params.image_size) {
			
 
				+                instructions.slices.push_back(llava_uhd::slice_coordinates{
			
 
				+                    /* x    */x,
			
 
				+                    /* y    */y,
			
 
				+                    /* size */clip_image_size{
			
 
				+                        std::min(params.image_size, refined_size.width - x),
			
 
				+                        std::min(params.image_size, refined_size.height - y)
			
 
				+                    }
			
 
				+                });
			
 
				+            }
			
 
				+        }
			
 
				+        auto imgs = llava_uhd::slice_image(img, instructions);
			
 
				+
			
 
				+        // cast and normalize to f32
			
 
				+        for (size_t i = 0; i < imgs.size(); ++i) {
			
 
				+            // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
			
 
				+            clip_image_f32_ptr res(clip_image_f32_init());
			
 
				+            normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
			
 
				+            res_imgs->entries.push_back(std::move(res));
			
 
				+        }
			
 
				+
			
 
				+        res_imgs->grid_x = instructions.grid_size.width;
			
 
				+        res_imgs->grid_y = instructions.grid_size.height;
			
 
				+        return true;
			
 
				+    } else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
			
 
				             || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
			
 
				-            || ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
			
 
				             || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
			
 
				     ) {
			
 
				         clip_image_u8 resized_image;
			
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -76,7 +76,7 @@ enum mtmd_slice_tmpl {
 
				     MTMD_SLICE_TMPL_MINICPMV_2_5,
			
 
				     MTMD_SLICE_TMPL_MINICPMV_2_6,
			
 
				     MTMD_SLICE_TMPL_LLAMA4,
			
 
				-    // TODO @ngxson : add support for idefics (SmolVLM)
			
 
				+    MTMD_SLICE_TMPL_IDEFICS3,
			
 
				 };
			
 
				 
			
 
				 const char * mtmd_default_marker() {
			
@@ -114,19 +114,22 @@ struct mtmd_context {
 
				     // for llava-uhd style models, we need special tokens in-between slices
			
 
				     // minicpmv calls them "slices", llama 4 calls them "tiles"
			
 
				     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
			
 
				-    llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
			
 
				-    llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
			
 
				-    llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
			
 
				-    llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
			
 
				-    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
			
 
				-    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
			
 
				-    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
			
 
				-    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
			
 
				+    std::vector<llama_token> tok_ov_img_start;  // overview image
			
 
				+    std::vector<llama_token> tok_ov_img_end;    // overview image
			
 
				+    std::vector<llama_token> tok_slices_start;  // start of all slices
			
 
				+    std::vector<llama_token> tok_slices_end;    // end of all slices
			
 
				+    std::vector<llama_token> tok_sli_img_start; // single slice start
			
 
				+    std::vector<llama_token> tok_sli_img_end;   // single slice end
			
 
				+    std::vector<llama_token> tok_sli_img_mid;   // between 2 slices
			
 
				+    std::vector<llama_token> tok_row_end;       // end of row
			
 
				     bool        tok_row_end_trail = false;
			
 
				     bool        ov_img_first      = false;
			
 
				 
			
 
				     bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
			
 
				 
			
 
				+    // string template for slice image delimiters with row/col (idefics3)
			
 
				+    std::string sli_img_start_tmpl;
			
 
				+
			
 
				     // for whisper, we pre-calculate the mel filter bank
			
 
				     whisper_preprocessor::whisper_filters w_filters;
			
 
				 
			
@@ -197,13 +200,13 @@ struct mtmd_context {
 
				             // minicpmv 2.5 format:
			
 
				             // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
			
 
				             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
			
 
				-            tok_ov_img_start  = lookup_token("<image>");
			
 
				-            tok_ov_img_end    = lookup_token("</image>");
			
 
				-            tok_slices_start  = lookup_token("<slice>");
			
 
				-            tok_slices_end    = lookup_token("</slice>");
			
 
				+            tok_ov_img_start  = {lookup_token("<image>")};
			
 
				+            tok_ov_img_end    = {lookup_token("</image>")};
			
 
				+            tok_slices_start  = {lookup_token("<slice>")};
			
 
				+            tok_slices_end    = {lookup_token("</slice>")};
			
 
				             tok_sli_img_start = tok_ov_img_start;
			
 
				             tok_sli_img_end   = tok_ov_img_end;
			
 
				-            tok_row_end       = lookup_token("\n");
			
 
				+            tok_row_end       = {lookup_token("\n")};
			
 
				             tok_row_end_trail = false; // no trailing end-of-row token
			
 
				             ov_img_first      = true;
			
 
				 
			
@@ -211,11 +214,11 @@ struct mtmd_context {
 
				             // minicpmv 2.6 format:
			
 
				             // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
			
 
				             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
			
 
				-            tok_ov_img_start  = lookup_token("<image>");
			
 
				-            tok_ov_img_end    = lookup_token("</image>");
			
 
				-            tok_sli_img_start = lookup_token("<slice>");
			
 
				-            tok_sli_img_end   = lookup_token("</slice>");
			
 
				-            tok_row_end       = lookup_token("\n");
			
 
				+            tok_ov_img_start  = {lookup_token("<image>")};
			
 
				+            tok_ov_img_end    = {lookup_token("</image>")};
			
 
				+            tok_sli_img_start = {lookup_token("<slice>")};
			
 
				+            tok_sli_img_end   = {lookup_token("</slice>")};
			
 
				+            tok_row_end       = {lookup_token("\n")};
			
 
				             tok_row_end_trail = false; // no trailing end-of-row token
			
 
				             ov_img_first      = true;
			
 
				 
			
@@ -230,9 +233,9 @@ struct mtmd_context {
 
				             // <|image|> (overview)           <-- overview image is last
			
 
				             // <|image_end|>
			
 
				             slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
			
 
				-            tok_ov_img_start  = lookup_token("<|image|>");
			
 
				-            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
			
 
				-            tok_row_end       = lookup_token("<|tile_y_separator|>");
			
 
				+            tok_ov_img_start  = {lookup_token("<|image|>")};
			
 
				+            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
			
 
				+            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
			
 
				             tok_row_end_trail = true; // add trailing end-of-row token
			
 
				             ov_img_first      = false; // overview image is last
			
 
				         }
			
@@ -245,8 +248,12 @@ struct mtmd_context {
 
				 
			
 
				         } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
			
 
				             // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
			
 
				-            img_beg = "<fake_token_around_image><global-img>";
			
 
				-            img_end = "<fake_token_around_image>";
			
 
				+            slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
			
 
				+            tok_ov_img_start   = {lookup_token("\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
			
 
				+            tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
			
 
				+            tok_row_end        = {lookup_token("\n")};
			
 
				+            img_beg            = "<fake_token_around_image>";
			
 
				+            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
			
 
				 
			
 
				         } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
			
 
				             // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
			
@@ -504,6 +511,7 @@ struct mtmd_tokenizer {
 
				                 ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
			
 
				                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
			
 
				                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
			
 
				+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
			
 
				             ) {
			
 
				                 const int n_col = batch_f32.grid_x;
			
 
				                 const int n_row = batch_f32.grid_y;
			
@@ -517,53 +525,45 @@ struct mtmd_tokenizer {
 
				 
			
 
				                 // add overview image (first)
			
 
				                 if (ctx->ov_img_first) {
			
 
				-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
			
 
				-                        add_text({ctx->tok_ov_img_start});
			
 
				-                    }
			
 
				+                    add_text(ctx->tok_ov_img_start);
			
 
				                     cur.entries.emplace_back(std::move(ov_chunk));
			
 
				-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
			
 
				-                        add_text({ctx->tok_ov_img_end});
			
 
				-                    }
			
 
				+                    add_text(ctx->tok_ov_img_end);
			
 
				                 }
			
 
				 
			
 
				                 // add slices (or tiles)
			
 
				                 if (!chunks.empty()) {
			
 
				                     GGML_ASSERT((int)chunks.size() == n_row * n_col);
			
 
				-                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
			
 
				-                        add_text({ctx->tok_slices_start});
			
 
				-                    }
			
 
				+                    add_text(ctx->tok_slices_start);
			
 
				                     for (int y = 0; y < n_row; y++) {
			
 
				                         for (int x = 0; x < n_col; x++) {
			
 
				                             const bool is_last_in_row = (x == n_col - 1);
			
 
				-                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
			
 
				-                                add_text({ctx->tok_sli_img_start});
			
 
				+                            if (!ctx->tok_sli_img_start.empty()) {
			
 
				+                                add_text(ctx->tok_sli_img_start);
			
 
				+                            } else if (!ctx->sli_img_start_tmpl.empty()) {
			
 
				+                                // If using a template to precede a slice image
			
 
				+                                const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
			
 
				+                                std::unique_ptr<char[]> buf(new char[sz]);
			
 
				+                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
			
 
				+                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
			
 
				                             }
			
 
				                             cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
			
 
				-                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
			
 
				-                                add_text({ctx->tok_sli_img_end});
			
 
				-                            }
			
 
				-                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
			
 
				-                                add_text({ctx->tok_sli_img_mid});
			
 
				+                            add_text(ctx->tok_sli_img_end);
			
 
				+                            if (!is_last_in_row) {
			
 
				+                                add_text(ctx->tok_sli_img_mid);
			
 
				                             }
			
 
				                         }
			
 
				-                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
			
 
				-                            add_text({ctx->tok_row_end});
			
 
				+                        if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
			
 
				+                            add_text(ctx->tok_row_end);
			
 
				                         }
			
 
				                     }
			
 
				-                    if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
			
 
				-                        add_text({ctx->tok_slices_end});
			
 
				-                    }
			
 
				+                    add_text(ctx->tok_slices_end);
			
 
				                 }
			
 
				 
			
 
				                 // add overview image (last)
			
 
				                 if (!ctx->ov_img_first) {
			
 
				-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
			
 
				-                        add_text({ctx->tok_ov_img_start});
			
 
				-                    }
			
 
				+                    add_text(ctx->tok_ov_img_start);
			
 
				                     cur.entries.emplace_back(std::move(ov_chunk));
			
 
				-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
			
 
				-                        add_text({ctx->tok_ov_img_end});
			
 
				-                    }
			
 
				+                    add_text(ctx->tok_ov_img_end);
			
 
				                 }
			
 
				 
			
 
				             } else {
			
@@ -780,7 +780,9 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
 
				     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
			
 
				     bool ok = false;
			
 
				 
			
 
				-    if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
			
 
				+    if (clip_is_llava(ctx_clip)
			
 
				+        || clip_is_minicpmv(ctx_clip)
			
 
				+        || clip_is_glm(ctx_clip)) {
			
 
				         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
			
 
				         const auto & entries = image_tokens->batch_f32.entries;
			
 
				         for (size_t i = 0; i < entries.size(); i++) {
			
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -69,6 +69,7 @@ add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
 
				 add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
			
 
				 add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
			
 
				 add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
			
 
				+add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
			
 
				 
			
 
				 add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
			
 
				 add_test_audio  "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"