@@ -23,7 +23,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     }
 
     if (ubatch->embd) {
-        const int64_t n_embd   = embd->ne[0];
+        GGML_ASSERT(n_embd == embd->ne[0]);
+
         const int64_t n_tokens = ubatch->n_tokens;
 
         ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
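
Note: `set_input` no longer infers the embedding width from the tensor at runtime; the width is fixed when the input is constructed (see `std::make_unique<llm_graph_input_embd>(n_embd_inp)` in the `build_inp_embd` hunk below), so the `GGML_ASSERT` only checks consistency. A self-contained sketch of the assumed shape of the input class after this change (the constructor argument is visible in the patch; the member layout is an assumption):

```cpp
#include <cstdint>
struct ggml_tensor; // from ggml.h

// assumed shape of llm_graph_input_embd after this patch (sketch, not the actual header)
class llm_graph_input_embd_sketch {
public:
    explicit llm_graph_input_embd_sketch(int64_t n_embd) : n_embd(n_embd) {}

    ggml_tensor * tokens = nullptr; // I32 [n_tokens]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_tokens]

    // expected input width, fixed at graph-build time; lets set_input()
    // assert against embd->ne[0] instead of reading the width from it
    const int64_t n_embd;
};
```
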
@@ -33,8 +34,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     bool res = true;
 
-    res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
-    res &= (!embd   && !params.ubatch.embd)  || (embd   && embd->ne[1]   == params.ubatch.n_tokens);
+    res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
+    res &= (!params.ubatch.embd)  || (embd   && embd->ne[1]   == params.ubatch.n_tokens);
 
     return res;
 }
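
Because `build_inp_embd` below now creates both the token input and the embeddings input on every graph build, the old cross-check (`!tokens && !params.ubatch.token`, and likewise for `embd`) that tied graph reuse to the batch modality is dropped: only the token count of whichever input the new ubatch actually carries has to match. A hypothetical, self-contained restatement of the relaxed rule, for illustration only:

```cpp
#include <cassert>
#include <cstdint>

// invented helper: with both inputs always present and sized for
// `graph_tokens`, reuse depends only on the incoming token count,
// not on whether the batch carries token ids or raw embeddings
static bool reuse_ok(int64_t graph_tokens, int64_t batch_tokens) {
    return graph_tokens == batch_tokens;
}

int main() {
    assert( reuse_ok(32, 32)); // token batch OR embeddings batch of 32
    assert(!reuse_ok(32, 16)); // different token count -> build a new graph
    return 0;
}
```
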
@@ -634,7 +635,8 @@ int64_t llm_graph_result::get_max_nodes() const {
 }
 
 void llm_graph_result::reset() {
-    t_tokens      = nullptr;
+    t_inp_tokens  = nullptr;
+    t_inp_embd    = nullptr;
     t_logits      = nullptr;
     t_embd        = nullptr;
     t_embd_pooled = nullptr;
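
The result now tracks both raw inputs separately. Assumed matching declarations in `llm_graph_result` (names taken from this patch; comments are interpretation):

```cpp
#include <cstdint>
struct ggml_tensor; // from ggml.h

// assumed additions to llm_graph_result (sketch; names from this patch)
struct llm_graph_result_sketch {
    ggml_tensor * t_inp_tokens = nullptr; // token-id input, I32 [n_tokens]
    ggml_tensor * t_inp_embd   = nullptr; // selected input embeddings, F32 [n_embd, n_tokens]
};
```

Note that `t_inp_embd` is set in `build_inp_embd` below to the *selected* embeddings tensor, not to the raw `inp->embd` input.
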
@@ -1338,17 +1340,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-    const int64_t n_embd = hparams.n_embd_inp();
+    const int64_t n_embd_inp = hparams.n_embd_inp();
+    const int64_t n_embd     = hparams.n_embd;
+
+    assert(n_embd_inp >= n_embd);
+
+    auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
+
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+    cb(inp->tokens, "inp_tokens", -1);
+    ggml_set_input(inp->tokens);
+    res->t_inp_tokens = inp->tokens;
 
-    auto inp = std::make_unique<llm_graph_input_embd>();
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
+    cb(inp->embd, "inp_embd", -1);
+    ggml_set_input(inp->embd);
 
-    ggml_tensor * cur = nullptr;
+    // select one of the 2 inputs, based on the batch contents
+    // ref: https://github.com/ggml-org/llama.cpp/pull/18550
+    std::array<ggml_tensor *, 2> inps;
 
-    if (ubatch.token) {
-        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
-        //cb(inp->tokens, "inp_tokens", -1);
-        ggml_set_input(inp->tokens);
-        res->t_tokens = inp->tokens;
+    // token embeddings path (ubatch.token != nullptr)
+    {
+        auto & cur = inps[0];
 
         cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
 
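
The structural change in this hunk: instead of building either the token path or the embeddings path depending on `ubatch.token`, both inputs are now allocated on every graph build. That keeps the graph topology independent of the batch contents, which is what makes the relaxed `can_reuse` above sound; `ggml_build_forward_select` (see the PR referenced in the patch comment) is then, as used here, what activates exactly one of the candidate branches per batch. The token path itself is the usual embedding lookup; a self-contained sketch of its shape behavior with hypothetical sizes:

```cpp
#include "ggml.h"

int main() {
    ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(ip);

    // tok_embd: F32 [n_embd, n_vocab], inp->tokens: I32 [n_tokens] (sizes invented)
    ggml_tensor * tok_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 100);
    ggml_tensor * tokens   = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);

    // gathers one embedding vector per token id -> F32 [n_embd, n_tokens] = [8, 4]
    ggml_tensor * cur = ggml_get_rows(ctx, tok_embd, tokens);
    (void) cur;

    ggml_free(ctx);
    return 0;
}
```
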
@@ -1369,19 +1383,36 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 
             cur = ggml_add(ctx0, cur, inpL_delta);
         }
-    } else {
-        inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
-        ggml_set_input(inp->embd);
+
+        if (n_embd_inp != n_embd) {
+            cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0);
+        }
+    }
+
+    // vector embeddings path (ubatch.embd != nullptr)
+    {
+        auto & cur = inps[1];
 
         cur = inp->embd;
     }
 
+    assert(ggml_are_same_shape (inps[0], inps[1]));
+    assert(ggml_are_same_stride(inps[0], inps[1]));
+
+    ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
+
+    if (n_embd_inp != n_embd) {
+        cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
+    }
+
+    res->t_inp_embd = cur;
+
     // For Granite architecture
     if (hparams.f_embedding_scale != 0.0f) {
         cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
     }
 
-    cb(cur, "inp_embd", -1);
+    cb(cur, "embd", -1);
 
     res->add_input(std::move(inp));
 
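
Two details worth noting in this hunk. First, the `ggml_are_same_shape`/`ggml_are_same_stride` asserts are what makes the two branches interchangeable candidates for `ggml_build_forward_select`: the token path is zero-padded from `n_embd` up to `n_embd_inp` so it matches the raw-embeddings input, and after the selection a view narrows each token vector back to its first `n_embd` values. Second, the callback label moves from `inp_embd` to `embd`, since `inp_embd` now names the raw input tensor created above. A standalone sketch of the pad/view round trip with hypothetical sizes:

```cpp
#include "ggml.h"

int main() {
    ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(ip);

    const int64_t n_embd = 8, n_embd_inp = 12, n_tokens = 4; // invented sizes

    // token path result: [n_embd, n_tokens]; pad dim 0 with zeros up to n_embd_inp
    ggml_tensor * tok    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    ggml_tensor * padded = ggml_pad(ctx, tok, n_embd_inp - n_embd, 0, 0, 0);
    // padded->ne = [12, 4] -- same shape as the raw-embeddings input

    // after the select, only the first n_embd values of each token vector move on
    ggml_tensor * out = ggml_view_2d(ctx, padded, n_embd, n_tokens, padded->nb[1], 0);
    // out->ne = [8, 4]
    (void) out;

    ggml_free(ctx);
    return 0;
}
```
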
@@ -1480,7 +1511,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     //}
 
     const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
-    const auto n_enc  = !cross->v_embd.empty() ? cross->n_enc  : hparams.n_ctx_train;
+    const auto n_enc  = !cross->v_embd.empty() ? cross->n_enc  : hparams.n_ctx_train;
 
     cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
     ggml_set_input(cur);