@@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
 }

 // siglip2 naflex
-ggml_tensor * clip_graph::resize_position_embeddings() {
+ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
     ggml_tensor * pos_embd = model.position_embeddings;
     const int height = img.ny / patch_size;
     const int width = img.nx / patch_size;
-    const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS;
+    const uint32_t mode = interpolation_mode;
     const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);

     GGML_ASSERT(pos_embd);
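With the interpolation mode now a parameter, a caller that wants the old hard-coded behavior would presumably pass the previous default explicitly, along these lines (a sketch, not part of this patch):

    ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS);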
@@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm(
         ? ggml_rms_norm(ctx0, cur, norm_eps)
         : ggml_norm(ctx0, cur, norm_eps);

-    if (mw || mb) {
-        cb(cur, "norm", il);
-    }
-
     if (mw) {
         cur = ggml_mul(ctx0, cur, mw);
-        if (mb) {
-            cb(cur, "norm_w", il);
-        }
+        cb(cur, "norm_w", il);
     }

     if (mb) {
         cur = ggml_add(ctx0, cur, mb);
+        cb(cur, "norm_b", il);
     }

     return cur;
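After this change the debug callback tags each affine step of the norm separately instead of emitting an intermediate "norm" name; with both weight and bias present the effective sequence is (sketch):

    cur = ggml_norm(ctx0, cur, norm_eps);
    cur = ggml_mul(ctx0, cur, mw);   // reported to cb() as "norm_w"
    cur = ggml_add(ctx0, cur, mb);   // reported to cb() as "norm_b"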
@@ -842,6 +837,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_llava>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_GLM4V:
+            {
+                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
+            } break;
         default:
             GGML_ABORT("missing cgraph builder");
     }
@@ -1155,6 +1154,14 @@ struct clip_model_loader {
                         LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                     }
                 } break;
+            case PROJECTOR_TYPE_GLM4V:
+                {
+                    hparams.rope_theta = 10000.0f;
+                    hparams.n_merge = 2; // default value for GLM4-V
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                    hparams.set_limit_image_tokens(8, 4096);
+                    hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
+                } break;
            case PROJECTOR_TYPE_LLAMA4:
                 {
                     hparams.rope_theta = 10000.0f;
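A quick sanity check on these numbers: set_limit_image_tokens(8, 4096) caps the encoder at 4096 image tokens, and the warmup budget of 46*46 = 2116 tokens stays well under that cap. With n_merge = 2 each output token covers a 2x2 patch block, so 4096 tokens correspond to a 128x128-patch image — about 1792x1792 px assuming a patch size of 14, which is not stated in this hunk.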
@@ -1282,6 +1289,9 @@ struct clip_model_loader {
         model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
         model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);

+        model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
+        model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
+
         model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);

         // layers
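The new optional norm_embd_w / norm_embd_b tensors suggest an embedding-level layer norm applied to the patch embeddings before the encoder stack; they are loaded with the non-required flag, so models that do not ship them (everything except GLM4V, presumably) are unaffected.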
@@ -1470,6 +1480,20 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                 } break;
+            case PROJECTOR_TYPE_GLM4V:
+                {
+                    model.projection = get_tensor(TN_MM_PROJECTOR);
+                    model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
+                    model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
+                    model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
+                    model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
+                    model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
+                    model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
+                    model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
+                    model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
+                    model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
+                    model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
+                } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
                     model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
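The tensor names sketch the shape of the GLM4V projector: a patch merger, an input projection, a post-projection norm, and a gated FFN. The real graph lives in clip_graph_glm4v, which is not part of this diff, so treat the following as a hypothetical sketch inferred from the names alone — ordering, activation, and the handling of the optional biases are all assumptions:

    // hypothetical GLM4V projector forward pass, inferred from tensor names only
    cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);   // merge 2x2 patch blocks
    cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
    cur = ggml_mul_mat(ctx0, model.projection, cur);          // input projection
    cur = ggml_norm(ctx0, cur, norm_eps);                     // post-projection norm
    cur = ggml_mul(ctx0, cur, model.mm_post_norm_w);
    cur = ggml_add(ctx0, cur, model.mm_post_norm_b);
    ggml_tensor * up   = ggml_mul_mat(ctx0, model.mm_ffn_up_w,   cur);  // optional biases omitted
    ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_ffn_gate_w, cur);
    cur = ggml_mul_mat(ctx0, model.mm_ffn_down_w, ggml_mul(ctx0, ggml_silu(ctx0, gate), up));

The down projection coming last is at least consistent with clip_n_mmproj_embd returning mm_ffn_down_w->ne[1] later in this patch.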
@@ -1498,8 +1522,8 @@ struct clip_model_loader {
                     // [IMG_BREAK] token embedding
                     model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
                     // for mistral small 3.1
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
-                    model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                    model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
                 } break;
             case PROJECTOR_TYPE_LIGHTONOCR:
                 {
@@ -1507,8 +1531,8 @@ struct clip_model_loader {
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
                     model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                     model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
-                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
-                    model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
+                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                    model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
                 {
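Both of these hunks follow from TN_MM_PATCH_MERGER becoming a format string: the GLM4V merger has a bias as well as a weight, so the name template now takes a "weight"/"bias" suffix, and the existing Pixtral and LightOnOCR call sites are updated to request the "weight" tensor explicitly.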
@@ -1873,6 +1897,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
         if (ctx_params.warmup) {
             loader.warmup(*ctx_vision);
         }
+
+        // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
     }

     if (loader.has_audio) {
@@ -2582,6 +2608,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                 clip_image_u8 resized;
@@ -2824,16 +2851,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
 int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;
     const int n_total = clip_n_output_tokens(ctx, img);
-    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->nx / (params.patch_size * 2);
+    const auto & proj = ctx->proj_type();
+    switch (proj) {
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
+            return (img->nx / params.patch_size) / 2;
+        default:
+            break;
     }
     return n_total;
 }

 int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;
-    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->ny / (params.patch_size * 2);
+    const auto & proj = ctx->proj_type();
+    switch (proj) {
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
+            return (img->ny / params.patch_size) / 2;
+        default:
+            break;
     }
     return 1;
 }
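Worked example: for a 448x448 preprocessed image with a patch size of 14 (an assumption here; the value comes from hparams), nx / patch_size = 32 patches per row, halved by the 2x2 merge to 16 output tokens per row, and 16 per column likewise, for 256 tokens total. Note that rewriting nx / (params.patch_size * 2) as (nx / params.patch_size) / 2 does not change the result: for non-negative integers, nested integer division composes, i.e. (nx / a) / b == nx / (a * b), so this is purely a readability refactor.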
@@ -2890,6 +2931,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 // dynamic size (2 conv, so double patch size)
                 int x_patch = img->nx / (params.patch_size * 2);
@@ -3137,6 +3179,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
             {
                 const int merge_ratio = hparams.n_merge;
                 const int pw = image_size_width / patch_size;
@@ -3363,7 +3406,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     // copy the embeddings to the location passed by the user
-    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+    if (vec != nullptr) {
+        ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+    }

     return true;
 }
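Passing vec == nullptr now means "run the graph but skip copying the embeddings back"; this is what the new clip_debug_encode helper at the end of this patch relies on when it calls clip_image_encode(ctx, 1, &img, nullptr).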
@@ -3411,6 +3456,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
+        case PROJECTOR_TYPE_GLM4V:
+            return ctx->model.mm_ffn_down_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -3427,10 +3474,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
 }

-bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
+bool clip_is_mrope(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
         || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
-        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
+        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
 }

 bool clip_is_llava(const struct clip_ctx * ctx) {
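The rename makes the predicate describe what callers actually care about — whether the encoder uses M-RoPE position handling — rather than one model family; with GLM4V added, "is qwen2vl" would have been a misnomer. Callers of clip_is_qwen2vl elsewhere (e.g. in mtmd) presumably get the matching rename in parts of the change not shown here.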
@@ -3491,3 +3539,22 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
     return &ctx->model.hparams;
 }
+
+//
+// API for debugging
+//
+
+void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
+    clip_image_f32 img;
+    img.nx = w;
+    img.ny = h;
+    img.buf.resize(h * w * 3);
+    for (int i = 0; i < h * w * 3; i++) {
+        img.buf[i] = static_cast<float>(fill_value);
+    }
+    bool cur_debug_graph = ctx->debug_graph;
+    ctx->debug_graph = true;
+    clip_image_encode(ctx, 1, &img, nullptr);
+    ctx->debug_graph = cur_debug_graph;
+    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
+}
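Usage: uncommenting the clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f) call added (commented out) in clip_init encodes a synthetic 336x336 image filled with 0.5 while debug_graph is forced on. Note that the final GGML_ASSERT is intentionally always false — img.buf was just filled, so it is not empty — which aborts the process right after the debug dump, as the "always stop here" message says.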