Restore clip's cb() to its rightful glory - extract common debugging elements in llama (#17914)

* Extract common debugging functions; hook eval-callback and mtmd's MTMD_DEBUG_GRAPH up to the same functionality

* Move to common

* Remove unneeded header

* Unlink from common

* chore: update webui build output

* Cleanup; properly pass params to mtmd without depending on common; factorize debug.cpp to use common debug code.

* Revert change to webapp

* Post-merge adjust

* Apply suggestions from code review

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Apply code review changes

* Remove changes to server-context

* Remove mtmd.h include

* Remove utility functions from header

* Apply suggestions from code review

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Rename functions

* Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
Piotr Wilkin (ilintar) 1 week ago
commit d98b548120
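
For orientation, here is a minimal sketch of how a tool opts into the extracted debug callback after this change. The types and fields (base_callback_data, common_debug_cb_eval, common_params::cb_eval) are taken from the diff below; the helper name enable_tensor_debugging is illustrative only.

    #include "common.h"
    #include "debug.h"

    // Install the shared tensor-printing callback before common_init_from_params(),
    // mirroring what examples/eval-callback/eval-callback.cpp does after this change.
    // The helper name is hypothetical.
    static void enable_tensor_debugging(common_params & params, base_callback_data & cb_data) {
        params.cb_eval           = common_debug_cb_eval<false>; // <true> aborts on the first tensor that sums to NaN
        params.cb_eval_user_data = &cb_data;
        params.warmup            = false;                       // eval-callback also disables warmup
    }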

+ 2 - 0
common/CMakeLists.txt

@@ -60,6 +60,8 @@ add_library(${TARGET} STATIC
     common.h
     console.cpp
     console.h
+    debug.cpp
+    debug.h
     download.cpp
     download.h
     http.h

+ 165 - 0
common/debug.cpp

@@ -0,0 +1,165 @@
+#include "debug.h"
+
+#include "log.h"
+
+#include <cmath>
+#include <string>
+
+static std::string common_ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static float common_ggml_get_float_value(const uint8_t * data,
+                           ggml_type       type,
+                           const size_t *  nb,
+                           size_t          i0,
+                           size_t          i1,
+                           size_t          i2,
+                           size_t          i3) {
+    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+    float  v;
+    if (type == GGML_TYPE_F16) {
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
+    } else if (type == GGML_TYPE_F32) {
+        v = *(const float *) &data[i];
+    } else if (type == GGML_TYPE_I64) {
+        v = (float) *(const int64_t *) &data[i];
+    } else if (type == GGML_TYPE_I32) {
+        v = (float) *(const int32_t *) &data[i];
+    } else if (type == GGML_TYPE_I16) {
+        v = (float) *(const int16_t *) &data[i];
+    } else if (type == GGML_TYPE_I8) {
+        v = (float) *(const int8_t *) &data[i];
+    } else if (type == GGML_TYPE_BF16) {
+        v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+    return v;
+}
+
+template <bool abort>
+void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    GGML_ASSERT(n > 0);
+    float sum = 0;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    sum += v;
+                }
+            }
+        }
+    }
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        LOG_ERR("                                     [\n");
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2 * n) {
+                LOG_ERR("                                      ..., \n");
+                i2 = ne[2] - n;
+            }
+            LOG_ERR("                                      [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2 * n) {
+                    LOG_ERR("                                       ..., \n");
+                    i1 = ne[1] - n;
+                }
+                LOG_ERR("                                       [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2 * n) {
+                        LOG_ERR("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
+                    LOG_ERR("%12.4f", v);
+                    if (i0 < ne[0] - 1) {
+                        LOG_ERR(", ");
+                    }
+                }
+                LOG_ERR("],\n");
+            }
+            LOG_ERR("                                      ],\n");
+        }
+        LOG_ERR("                                     ]\n");
+        LOG_ERR("                                     sum = %f\n", sum);
+    }
+
+    if constexpr (abort) {
+        if (std::isnan(sum)) {
+            LOG_ERR("encountered NaN - aborting\n");
+            exit(0);
+        }
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (base_callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true;  // Always retrieve data
+    }
+
+    bool matches_filter = cb_data->tensor_filters.empty();
+
+    if (!matches_filter) {
+        for (const auto & filter : cb_data->tensor_filters) {
+            if (std::regex_search(t->name, filter)) {
+                matches_filter = true;
+                break;
+            }
+        }
+    }
+
+    char src1_str[128] = { 0 };
+    if (src1) {
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
+    }
+
+    if (matches_filter) {
+        LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
+                ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
+                common_ggml_ne_string(t).c_str());
+    }
+
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type) && matches_filter) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
+// Explicit template instantiations
+template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
+template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
+template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
+template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);

+ 43 - 0
common/debug.h

@@ -0,0 +1,43 @@
+#pragma once
+#include "common.h"
+#include <string>
+#include <vector>
+#include <regex>
+
+// common debug functions and structs
+
+// Print a tensor's detailed data
+// data - the tensor's data in byte format
+// type - the tensor's quantization type
+// ne   - the tensor dimensions array
+// nb   - the tensor strides array
+// n    - the number of rows/columns to fully print
+template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
+
+// Intended to be used as a callback for ggml_backend_sched_eval_callback;
+// prints tensors that are processed in the computation graph.
+// By default prints all tensors, but can be configured by creating a `base_callback_data` instance with
+// non-empty filter_patterns. See examples/debug/debug.cpp for possible usage patterns.
+// The template parameter determines whether execution aborts whenever a NaN is encountered
+// in a tensor (useful for stopping debug sessions on the first erroneous tensor).
+// The callback data will be passed as the third parameter (user_data)
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+struct base_callback_data {
+    std::vector<uint8_t>    data;
+    std::vector<std::regex> tensor_filters;
+
+    base_callback_data() = default;
+
+    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
+        for (const auto & pattern : filter_patterns) {
+            try {
+                std::string anchored_pattern = "^" + pattern;
+                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
+            } catch (const std::regex_error & e) {
+                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
+            }
+        }
+        params.cb_eval           = common_debug_cb_eval<false>;
+        params.cb_eval_user_data = this;
+    }
+};
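
A hedged sketch of the filtering constructor above in use. The pattern strings are invented for illustration; the surrounding setup follows examples/debug/debug.cpp as changed in this commit.

    #include "common.h"
    #include "debug.h"

    #include <string>
    #include <vector>

    // Hypothetical, trimmed-down setup showing the filtering constructor.
    int main() {
        common_params params;
        // ... fill params (model path, context size, ...) as usual (omitted) ...

        // Print only tensors whose names start with one of these patterns (illustrative).
        // The constructor anchors each pattern with '^', installs common_debug_cb_eval<false>,
        // points params.cb_eval_user_data at cb_data, and throws on an invalid regex.
        const std::vector<std::string> filters = { "ffn_out", "kq" };
        base_callback_data cb_data(params, filters);

        llama_backend_init();
        auto llama_init = common_init_from_params(params); // cb_data must outlive this context
        // ... tokenize and decode; matching tensors are dumped via LOG_ERR as the graph runs ...
        llama_backend_free();
        return 0;
    }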

+ 1 - 1
docs/backend/hexagon/CMakeUserPresets.json

@@ -1,4 +1,4 @@
-{
+{
   "version": 4,
   "configurePresets": [
     {

+ 3 - 189
examples/debug/debug.cpp

@@ -1,11 +1,9 @@
+#include "debug.h"
 #include "arg.h"
 #include "common.h"
 #include "log.h"
 #include "llama.h"
-#include "ggml.h"
 
-#include <cmath>
-#include <cstdint>
 #include <cstdlib>
 #include <string>
 #include <vector>
@@ -13,7 +11,7 @@
 #include <fstream>
 #include <regex>
 
-static void print_usage(int, char ** argv) {
+static void print_usage(int /*argc*/, char ** argv) {
     const std::string usage_template = R"(
         example usage:
 
@@ -35,28 +33,6 @@ static void print_usage(int, char ** argv) {
     LOG("%s\n", usage.c_str());
 }
 
-static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data);
-
-struct callback_data {
-    std::vector<uint8_t>    data;
-    std::vector<std::regex> tensor_filters;
-
-    callback_data() = default;
-
-    callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
-        for (const auto & pattern : filter_patterns) {
-            try {
-                std::string anchored_pattern = "^" + pattern;
-                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
-            } catch (const std::regex_error & e) {
-                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
-            }
-        }
-        params.cb_eval           = ggml_debug;
-        params.cb_eval_user_data = this;
-    }
-};
-
 static bool has_pooling(llama_context * ctx) {
     switch (llama_pooling_type(ctx)) {
         case LLAMA_POOLING_TYPE_NONE:
@@ -120,168 +96,6 @@ struct output_data {
     }
 };
 
-static std::string ggml_ne_string(const ggml_tensor * t) {
-    std::string str;
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        str += std::to_string(t->ne[i]);
-        if (i + 1 < GGML_MAX_DIMS) {
-            str += ", ";
-        }
-    }
-    return str;
-}
-
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.i = (uint32_t)h.bits << 16;
-    return u.f;
-}
-
-static float ggml_get_float_value(const uint8_t * data, ggml_type type,
-        const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
-    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
-    switch (type) {
-        case GGML_TYPE_F16:
-            return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
-        case GGML_TYPE_F32:
-            return *(const float *) &data[i];
-        case GGML_TYPE_I64:
-            return (float) *(const int64_t *) &data[i];
-        case GGML_TYPE_I32:
-            return (float) *(const int32_t *) &data[i];
-        case GGML_TYPE_I16:
-            return (float) *(const int16_t *) &data[i];
-        case GGML_TYPE_I8:
-            return (float) *(const int8_t *) &data[i];
-        case GGML_TYPE_BF16:
-            return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
-        default:
-            GGML_ABORT("fatal error");
-    }
-}
-
-static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
-    GGML_ASSERT(n > 0);
-    float sum    = 0;
-    float sum_sq = 0.0;
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    sum    += v;
-                    sum_sq += v * v;
-                }
-            }
-        }
-    }
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        LOG_DBG("                                     [\n");
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            if (i2 == n && ne[2] > 2*n) {
-                LOG_DBG("                                      ..., \n");
-                i2 = ne[2] - n;
-            }
-            LOG_DBG("                                      [\n");
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                if (i1 == n && ne[1] > 2*n) {
-                    LOG_DBG("                                       ..., \n");
-                    i1 = ne[1] - n;
-                }
-                LOG_DBG("                                       [");
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    if (i0 == n && ne[0] > 2*n) {
-                        LOG_DBG("..., ");
-                        i0 = ne[0] - n;
-                    }
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    LOG_DBG("%12.4f", v);
-                    if (i0 < ne[0] - 1) {
-                        LOG_DBG(", ");
-                    }
-                }
-                LOG_DBG("],\n");
-            }
-            LOG_DBG("                                      ],\n");
-        }
-        LOG_DBG("                                     ]\n");
-        LOG_DBG("                                     sum    = %f\n", sum);
-        LOG_DBG("                                     sum_sq = %f\n", sum_sq);
-    }
-
-    if (std::isnan(sum)) {
-        LOG_ERR("encountered NaN - aborting\n");
-        exit(0);
-    }
-}
-
-/**
- * GGML operations callback during the graph execution.
- *
- * @param t current tensor
- * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
- *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
- *            see ggml_backend_sched_eval_callback
- * @param user_data user data to pass at each call back
- * @return true to receive data or continue the graph, false otherwise
- */
-static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
-    auto * cb_data = (callback_data *) user_data;
-
-    const struct ggml_tensor * src0 = t->src[0];
-    const struct ggml_tensor * src1 = t->src[1];
-
-    if (ask) {
-        return true; // Always retrieve data
-    }
-
-    bool matches_filter = cb_data->tensor_filters.empty();
-
-    if (!matches_filter) {
-        for (const auto & filter : cb_data->tensor_filters) {
-            if (std::regex_search(t->name, filter)) {
-                matches_filter = true;
-                break;
-            }
-        }
-    }
-
-    char src1_str[128] = {0};
-    if (src1) {
-        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
-    }
-
-    if (matches_filter) {
-        LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-             t->name,
-             ggml_type_name(t->type),
-             ggml_op_desc(t),
-             src0->name,
-             ggml_ne_string(src0).c_str(),
-             src1 ? src1_str : "",
-             ggml_ne_string(t).c_str());
-    }
-
-    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
-
-    if (!is_host) {
-        auto n_bytes = ggml_nbytes(t);
-        cb_data->data.resize(n_bytes);
-        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
-    }
-
-    if (!ggml_is_quantized(t->type) && matches_filter) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
-    }
-
-    return true;
-}
-
-
 static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
     std::filesystem::create_directory(output_dir);
     auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
@@ -408,7 +222,7 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    callback_data cb_data(params, params.tensor_filter);
+    base_callback_data cb_data(params, params.tensor_filter);
 
     auto llama_init = common_init_from_params(params);
 

+ 4 - 157
examples/eval-callback/eval-callback.cpp

@@ -1,165 +1,12 @@
 #include "arg.h"
 #include "common.h"
+#include "debug.h"
 #include "log.h"
 #include "llama.h"
-#include "ggml.h"
-
-#include <cmath>
-#include <cstdio>
+#include "llama-cpp.h"
 #include <string>
 #include <vector>
 
-/**
- * This the arbitrary data which will be passed to each callback.
- * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
- */
-struct callback_data {
-    std::vector<uint8_t> data;
-};
-
-static std::string ggml_ne_string(const ggml_tensor * t) {
-    std::string str;
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        str += std::to_string(t->ne[i]);
-        if (i + 1 < GGML_MAX_DIMS) {
-            str += ", ";
-        }
-    }
-    return str;
-}
-
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.i = (uint32_t)h.bits << 16;
-    return u.f;
-}
-
-static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
-    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
-    float v;
-    if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
-    } else if (type == GGML_TYPE_F32) {
-        v = *(const float *) &data[i];
-    } else if (type == GGML_TYPE_I64) {
-        v = (float) *(const int64_t *) &data[i];
-    } else if (type == GGML_TYPE_I32) {
-        v = (float) *(const int32_t *) &data[i];
-    } else if (type == GGML_TYPE_I16) {
-        v = (float) *(const int16_t *) &data[i];
-    } else if (type == GGML_TYPE_I8) {
-        v = (float) *(const int8_t *) &data[i];
-    } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
-    } else {
-        GGML_ABORT("fatal error");
-    }
-    return v;
-}
-
-static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
-    GGML_ASSERT(n > 0);
-    float sum = 0;
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    sum += v;
-                }
-            }
-        }
-    }
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        LOG("                                     [\n");
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            if (i2 == n && ne[2] > 2*n) {
-                LOG("                                      ..., \n");
-                i2 = ne[2] - n;
-            }
-            LOG("                                      [\n");
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                if (i1 == n && ne[1] > 2*n) {
-                    LOG("                                       ..., \n");
-                    i1 = ne[1] - n;
-                }
-                LOG("                                       [");
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    if (i0 == n && ne[0] > 2*n) {
-                        LOG("..., ");
-                        i0 = ne[0] - n;
-                    }
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    LOG("%12.4f", v);
-                    if (i0 < ne[0] - 1) LOG(", ");
-                }
-                LOG("],\n");
-            }
-            LOG("                                      ],\n");
-        }
-        LOG("                                     ]\n");
-        LOG("                                     sum = %f\n", sum);
-    }
-
-    // TODO: make this abort configurable/optional?
-    if (std::isnan(sum)) {
-        LOG_ERR("encountered NaN - aborting\n");
-        exit(0);
-    }
-}
-
-/**
- * GGML operations callback during the graph execution.
- *
- * @param t current tensor
- * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
- *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
- *            see ggml_backend_sched_eval_callback
- * @param user_data user data to pass at each call back
- * @return true to receive data or continue the graph, false otherwise
- */
-static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
-    auto * cb_data = (callback_data *) user_data;
-
-    const struct ggml_tensor * src0 = t->src[0];
-    const struct ggml_tensor * src1 = t->src[1];
-
-    if (ask) {
-        return true; // Always retrieve data
-    }
-
-    char src1_str[128] = {0};
-    if (src1) {
-        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
-    }
-
-    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-         t->name, ggml_type_name(t->type), ggml_op_desc(t),
-         src0->name, ggml_ne_string(src0).c_str(),
-         src1 ? src1_str : "",
-         ggml_ne_string(t).c_str());
-
-
-    // copy the data from the GPU memory if needed
-    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
-
-    if (!is_host) {
-        auto n_bytes = ggml_nbytes(t);
-        cb_data->data.resize(n_bytes);
-        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
-    }
-
-    if (!ggml_is_quantized(t->type)) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
-    }
-
-    return true;
-}
-
 static bool run(llama_context * ctx, const common_params & params) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -182,7 +29,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 }
 
 int main(int argc, char ** argv) {
-    callback_data cb_data;
+    base_callback_data cb_data;
 
     common_params params;
 
@@ -197,7 +44,7 @@ int main(int argc, char ** argv) {
 
     // pass the callback to the backend scheduler
     // it will be executed for each node during the graph computation
-    params.cb_eval = ggml_debug;
+    params.cb_eval = common_debug_cb_eval<false>;
     params.cb_eval_user_data = &cb_data;
     params.warmup = false;
 

+ 0 - 4
tools/mtmd/clip-graph.h

@@ -32,10 +32,6 @@ struct clip_graph {
     const float kq_scale;
     const clip_flash_attn_type flash_attn_type;
 
-    // for debugging
-    const bool debug_graph;
-    std::vector<ggml_tensor *> & debug_print_tensors;
-
     ggml_context_ptr ctx0_ptr;
     ggml_context * ctx0;
     ggml_cgraph * gf;

+ 15 - 37
tools/mtmd/clip.cpp

@@ -152,18 +152,14 @@ struct clip_ctx {
     ggml_backend_t backend_cpu = nullptr;
     ggml_backend_buffer_ptr buf;
 
+
     int max_nodes = 8192;
     ggml_backend_sched_ptr sched;
     clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
     bool is_allocated = false;
 
-    // for debugging
-    bool debug_graph = false;
-    std::vector<ggml_tensor *> debug_print_tensors;
-
     clip_ctx(clip_context_params & ctx_params) {
         flash_attn_type = ctx_params.flash_attn_type;
-        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
         backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (!backend_cpu) {
             throw std::runtime_error("failed to initialize CPU backend");
@@ -204,6 +200,10 @@ struct clip_ctx {
         sched.reset(
             ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
         );
+
+        if (ctx_params.cb_eval != nullptr) {
+            ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
+        }
     }
 
     ~clip_ctx() {
@@ -239,9 +239,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
         n_mmproj_embd(clip_n_mmproj_embd(ctx)),
         eps(hparams.eps),
         kq_scale(1.0f / sqrtf((float)d_head)),
-        flash_attn_type(ctx->flash_attn_type),
-        debug_graph(ctx->debug_graph),
-        debug_print_tensors(ctx->debug_print_tensors) {
+        flash_attn_type(ctx->flash_attn_type) {
     struct ggml_init_params params = {
         /*.mem_size   =*/ ctx->buf_compute_meta.size(),
         /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
@@ -252,14 +250,11 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
     gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
 }
 
-void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
-    if (debug_graph) {
-        ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
-        std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
-        ggml_set_name(cur, cur_name.c_str());
-        ggml_set_output(cur);
-        ggml_build_forward_expand(gf, cur);
-        debug_print_tensors.push_back(cur);
+void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
+    if (il >= 0) {
+        ggml_format_name(cur, "%s-%d", name, il);
+    } else {
+        ggml_set_name(cur, name);
     }
 }
 
@@ -1519,8 +1514,8 @@ struct clip_model_loader {
                     model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
                     model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
                     model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
                 } break;
             case PROJECTOR_TYPE_QWEN2VL:
             case PROJECTOR_TYPE_QWEN25VL:
@@ -1761,8 +1756,8 @@ struct clip_model_loader {
                     model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
                     model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                     model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
-                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
-                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
@@ -3339,7 +3334,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
-    ctx->debug_print_tensors.clear();
     ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
     ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3709,18 +3703,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
-    // print debug nodes
-    if (ctx->debug_graph) {
-        LOG_INF("\n\n---\n\n");
-        LOG_INF("\n\nDebug graph:\n\n");
-        for (ggml_tensor * t : ctx->debug_print_tensors) {
-            std::vector<uint8_t> data(ggml_nbytes(t));
-            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
-            print_tensor_shape(t);
-            print_tensor_data(t, data.data(), 3);
-        }
-    }
-
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
@@ -3872,7 +3854,6 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
 //
 // API for debugging
 //
-
 void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
     clip_image_f32 img;
     img.nx = w;
@@ -3881,9 +3862,6 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
     for (int i = 0; i < h * w * 3; i++) {
         img.buf[i] = static_cast<float>(fill_value);
     }
-    bool cur_debug_graph = ctx->debug_graph;
-    ctx->debug_graph = true;
     clip_image_encode(ctx, 1, &img, nullptr);
-    ctx->debug_graph = cur_debug_graph;
     GGML_ASSERT(img.buf.empty() && "expected, always stop here");
 }

+ 3 - 0
tools/mtmd/clip.h

@@ -1,6 +1,7 @@
 #pragma once
 
 #include "ggml.h"
+#include "mtmd.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -37,6 +38,8 @@ struct clip_context_params {
     int image_min_tokens;
     int image_max_tokens;
     bool warmup;
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
 };
 
 struct clip_init_result {

+ 7 - 0
tools/mtmd/mtmd-cli.cpp

@@ -1,4 +1,5 @@
 #include "arg.h"
+#include "debug.h"
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
@@ -88,6 +89,8 @@ struct mtmd_cli_context {
     int n_threads    = 1;
     llama_pos n_past = 0;
 
+    base_callback_data cb_data;
+
     mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
         model = llama_init->model();
         lctx = llama_init->context();
@@ -139,6 +142,10 @@ struct mtmd_cli_context {
         mparams.warmup           = params.warmup;
         mparams.image_min_tokens = params.image_min_tokens;
         mparams.image_max_tokens = params.image_max_tokens;
+        if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
+            mparams.cb_eval_user_data = &cb_data;
+            mparams.cb_eval = common_debug_cb_eval<false>;
+        }
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);

+ 4 - 0
tools/mtmd/mtmd.cpp

@@ -111,6 +111,8 @@ mtmd_context_params mtmd_context_params_default() {
         /* warmup            */ true,
         /* image_min_tokens  */ -1,
         /* image_max_tokens  */ -1,
+        /* cb_eval           */ nullptr,
+        /* cb_eval_user_data */ nullptr,
     };
     return params;
 }
@@ -176,6 +178,8 @@ struct mtmd_context {
             /* image_min_tokens  */ ctx_params.image_min_tokens,
             /* image_max_tokens  */ ctx_params.image_max_tokens,
             /* warmup            */ ctx_params.warmup,
+            /* cb_eval           */ ctx_params.cb_eval,
+            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
         };
 
         auto res = clip_init(mmproj_fname, ctx_clip_params);

+ 12 - 8
tools/mtmd/mtmd.h

@@ -95,6 +95,10 @@ struct mtmd_context_params {
     // limit number of image tokens, only for vision models with dynamic resolution
     int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
+
+    // callback function passed over to mtmd proper
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
 };
 
 MTMD_API const char * mtmd_default_marker(void);
@@ -273,12 +277,12 @@ struct bitmap {
         ptr.reset(mtmd_bitmap_init(nx, ny, data));
     }
     ~bitmap() = default;
-    uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
-    uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
-    const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
-    size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
-    std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
-    void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
+    uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
+    uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
+    const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
+    size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
+    std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
+    void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
 };
 
 struct bitmaps {
@@ -302,8 +306,8 @@ struct input_chunks {
     input_chunks() = default;
     input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
     ~input_chunks() = default;
-    size_t size() { return mtmd_input_chunks_size(ptr.get()); }
-    const mtmd_input_chunk * operator[](size_t idx) {
+    size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
+    const mtmd_input_chunk * operator[](size_t idx) const {
         return mtmd_input_chunks_get(ptr.get(), idx);
     }
 };
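
Finally, a hedged sketch of how an mtmd caller gets the old MTMD_DEBUG_GRAPH behaviour back through the new cb_eval fields, modelled on the mtmd-cli.cpp hunk above. The wrapper name load_vision_ctx is hypothetical.

    #include "debug.h"
    #include "mtmd.h"

    #include <cstdlib>

    // The callback set here is forwarded by mtmd_init_from_file() into clip_init(), which
    // registers it with ggml_backend_sched_set_eval_callback() (see the clip.cpp hunk above).
    static mtmd_context * load_vision_ctx(const char * clip_path, const llama_model * model,
                                           base_callback_data & cb_data) {
        mtmd_context_params mparams = mtmd_context_params_default();
        if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
            mparams.cb_eval           = common_debug_cb_eval<false>;
            mparams.cb_eval_user_data = &cb_data; // cb_data must outlive the returned context
        }
        return mtmd_init_from_file(clip_path, model, mparams);
    }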