
mtmd: some small clean up (#17909)

* clip: add support for fused qkv in build_vit

* use build_ffn whenever possible

* fix internvl

* mtmd-cli: move image to beginning

* test script: support custom args
Xuan-Son Nguyen, 1 month ago
parent commit c6b2c9310c
3 changed files with 126 additions and 81 deletions
  1. tools/mtmd/clip.cpp (+91 -61)
  2. tools/mtmd/mtmd-cli.cpp (+3 -1)
  3. tools/mtmd/tests.sh (+32 -19)

+ 91 - 61
tools/mtmd/clip.cpp

@@ -595,11 +595,12 @@ struct clip_graph {
             cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
             cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
 
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
             cur = build_ffn(cur,
@@ -667,16 +668,12 @@ struct clip_graph {
 
         // LlavaMultiModalProjector (always using GELU activation)
         {
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            if (model.mm_1_b) {
-                cur = ggml_add(ctx0, cur, model.mm_1_b);
-            }
-
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            if (model.mm_2_b) {
-                cur = ggml_add(ctx0, cur, model.mm_2_b);
-            }
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
         }
 
         // arrangement of the [IMG_BREAK] token
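
The hunks above, and several below, collapse the same hand-rolled projector MLP into the existing build_ffn helper. As a reading aid, here is a minimal sketch of what a no-gate FFN_GELU call is expected to reduce to, inferred from the removed lines (the helper's real implementation lives elsewhere in clip.cpp and may differ in detail):

    // Sketch only: assumed expansion of
    //   build_ffn(cur, up_w, up_b, nullptr, nullptr, down_w, down_b, FFN_GELU, il)
    // based on the hand-rolled code it replaces in these hunks.
    static ggml_tensor * ffn_gelu_sketch(ggml_context * ctx0, ggml_tensor * cur,
                                         ggml_tensor * up_w,   ggml_tensor * up_b,
                                         ggml_tensor * down_w, ggml_tensor * down_b) {
        cur = ggml_mul_mat(ctx0, up_w, cur);      // up projection
        if (up_b) {
            cur = ggml_add(ctx0, cur, up_b);      // bias is optional
        }
        cur = ggml_gelu(ctx0, cur);               // FFN_GELU activation
        cur = ggml_mul_mat(ctx0, down_w, cur);    // down projection
        if (down_b) {
            cur = ggml_add(ctx0, cur, down_b);    // bias is optional
        }
        return cur;
    }

Passing the bias tensors unconditionally then works even for bias-less projectors (e.g. the Voxtral hunk below), provided the helper skips null biases as sketched here.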
@@ -866,16 +863,12 @@ struct clip_graph {
         // multimodal projection
         ggml_tensor * embeddings = inpL;
         embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+        embeddings = build_ffn(embeddings,
+                            model.mm_0_w, model.mm_0_b,
+                            nullptr, nullptr,
+                            model.mm_1_w, model.mm_1_b,
+                            FFN_GELU,
+                            -1);
 
         if (use_window_attn) {
             window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
@@ -1253,11 +1246,12 @@ struct clip_graph {
             // projector LayerNorm uses pytorch's default eps = 1e-5
             // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
             cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_3_w, model.mm_3_b,
+                FFN_GELU,
+                -1);
         }
 
         // build the graph
@@ -1408,11 +1402,12 @@ struct clip_graph {
             cb(cur, "proj_inp_normed", -1);
 
             // projection mlp
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
             cb(cur, "proj_out", -1);
         }
 
@@ -1883,9 +1878,12 @@ struct clip_graph {
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
             // projector
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_gelu_erf(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU_ERF,
+                -1);
 
         } else {
             GGML_ABORT("%s: unknown projector type", __func__);
@@ -2070,34 +2068,66 @@ private:
 
             // self-attention
             {
-                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-                if (layer.q_b) {
-                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-                }
+                ggml_tensor * Qcur = nullptr;
+                ggml_tensor * Kcur = nullptr;
+                ggml_tensor * Vcur = nullptr;
+                if (layer.qkv_w != nullptr) {
+                    // fused qkv
+                    cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                    if (layer.qkv_b != nullptr) {
+                        cur = ggml_add(ctx0, cur, layer.qkv_b);
+                    }
 
-                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-                if (layer.k_b) {
-                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-                }
+                    Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ 0);
 
-                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-                if (layer.v_b) {
-                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-                }
+                    Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ ggml_row_size(cur->type, n_embd));
 
-                if (layer.q_norm) {
-                    Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-                    cb(Qcur, "Qcur_norm", il);
-                }
+                    Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                        /* nb1    */ ggml_row_size(cur->type, d_head),
+                        /* nb2    */ cur->nb[1],
+                        /* offset */ ggml_row_size(cur->type, 2 * n_embd));
 
-                if (layer.k_norm) {
-                    Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-                    cb(Kcur, "Kcur_norm", il);
-                }
+                    // TODO: q/k norm requires row size == n_embd, while here it's d_head
+                    // we can add support in the future if needed
+                    GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                } else {
+                    // separate q, k, v
+                    Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                    if (layer.q_b) {
+                        Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                    }
+
+                    Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                    if (layer.k_b) {
+                        Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                    }
+
+                    Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                    if (layer.v_b) {
+                        Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                    }
+
+                    if (layer.q_norm) {
+                        Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                        cb(Qcur, "Qcur_norm", il);
+                    }
+
+                    if (layer.k_norm) {
+                        Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                        cb(Kcur, "Kcur_norm", il);
+                    }
+
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                    Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
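
In the fused path above, a single projection produces a tensor whose first n_embd values per position are Q, the next n_embd are K, and the last n_embd are V; the three ggml_view_3d calls then slice it into [d_head, n_head, n_pos] heads by byte offset, without any copy. This is also why the GGML_ASSERT rejects q/k norm here: those norms operate on rows of length n_embd, while each view row is only d_head long. A standalone sketch of the layout, assuming n_embd == d_head * n_head as in this graph:

    // Sketch only: slicing a fused [3*n_embd, n_pos] QKV tensor with views.
    ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.qkv_w, cur); // rows: [Q | K | V]
    const size_t head_sz = ggml_row_size(qkv->type, d_head);  // bytes per head
    ggml_tensor * Q = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos,
                                   head_sz, qkv->nb[1], 0);
    ggml_tensor * K = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos,
                                   head_sz, qkv->nb[1],
                                   ggml_row_size(qkv->type, n_embd));
    ggml_tensor * V = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos,
                                   head_sz, qkv->nb[1],
                                   ggml_row_size(qkv->type, 2 * n_embd));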

+ 3 - 1
tools/mtmd/mtmd-cli.cpp

@@ -318,7 +318,9 @@ int main(int argc, char ** argv) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
             for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
+                // most models require the marker before each image
+                // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+                params.prompt = mtmd_default_marker() + params.prompt;
             }
         }
         common_chat_msg msg;
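
The effect: when the prompt contains no explicit media marker, one default marker per image is now prepended instead of appended, matching the expectation noted in the new comment that most models want the marker before each image. A self-contained illustration, not code from this commit (mtmd_default_marker() returns the default "<__media__>" string):

    // Illustration only: with two images and the prompt "compare these",
    // the new loop yields "<__media__><__media__>compare these";
    // the old code produced "compare these<__media__><__media__>".
    #include <string>
    #include "mtmd.h"

    std::string make_prompt(std::string prompt, size_t n_images) {
        for (size_t i = 0; i < n_images; i++) {
            prompt = mtmd_default_marker() + prompt;
        }
        return prompt;
    }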

+ 32 - 19
tools/mtmd/tests.sh

@@ -32,23 +32,32 @@ fi
 
 arr_prefix=()
 arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
 arr_file=()
 
 add_test_vision() {
     local hf=$1
-    local tmpl=${2:-""} # default to empty string if not provided
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[vision]")
     arr_hf+=("$hf")
-    arr_tmpl+=("$tmpl")
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-1.jpeg")
 }
 
 add_test_audio() {
     local hf=$1
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[audio] ")
     arr_hf+=("$hf")
-    arr_tmpl+=("") # no need for chat tmpl
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-2.mp3")
 }
 
@@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K"            "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M"         "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
 add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -79,7 +88,7 @@ add_test_audio  "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
 # to test the big models, run: ./tests.sh big
 if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
     add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
     # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
 
     add_test_audio  "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
     add_test_audio  "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
@@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
     bin="llama-mtmd-cli"
     prefix="${arr_prefix[$i]}"
     hf="${arr_hf[$i]}"
-    tmpl="${arr_tmpl[$i]}"
+    extra_args="${arr_extra_args[$i]}"
     inp_file="${arr_file[$i]}"
 
     echo "Running test with binary: $bin and HF model: $hf"
     echo ""
     echo ""
 
-    output=$(\
-        "$PROJ_ROOT/build/bin/$bin" \
-        -hf "$hf" \
-        --image $SCRIPT_DIR/$inp_file \
-        -p "what is the publisher name of the newspaper?" \
+    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+        -hf $(printf %q "$hf") \
+        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
         --temp 0 -n 128 \
-        ${tmpl:+--chat-template "$tmpl"} \
-        2>&1 | tee /dev/tty)
+        ${extra_args}"
+
+    # if extra_args does not contain -p, we add a default prompt
+    if ! [[ "$extra_args" =~ "-p" ]]; then
+        cmd+=" -p \"what is the publisher name of the newspaper?\""
+    fi
+
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty)
 
     echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
 
@@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
     if echo "$output" | grep -iq "new york" \
             || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
     then
-        result="$prefix \033[32mOK\033[0m:   $bin $hf"
+        result="$prefix \033[32mOK\033[0m:   $hf"
     else
-        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $hf"
     fi
     echo -e "$result"
     arr_res+=("$result")