3 weeks ago · af1e8e1a6c
--- a/src/models/cogvlm.cpp
+++ b/src/models/cogvlm.cpp
@@ -3,12 +3,14 @@
 
				 llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
			
 
				     llm_graph_context(params) {
			
 
				     const int64_t n_embd_head = hparams.n_embd_head_v;
			
 
				-    float         kq_scale    = 1.0f / sqrtf(float(n_embd_head));
			
 
				+    const float   kq_scale    = 1.0f / sqrtf(float(n_embd_head));
			
 
				 
			
 
				     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
			
 
				     GGML_ASSERT(n_embd_head == hparams.n_rot);
			
 
				 
			
 
				-    ggml_tensor *inpL, *cur;
			
 
				+    ggml_tensor * inpL;
			
 
				+    ggml_tensor * cur;
			
 
				+
			
 
				     inpL = build_inp_embd(model.tok_embd);
			
 
				 
			
 
				     ggml_tensor * inp_pos = build_inp_pos();
			
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
 
				         }
			
 
				 
			
 
				         ggml_tensor * inpSA = inpL;
			
 
				-        cur                 = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
			
 
				+        cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
			
 
				 
			
 
				         // build self attention
			
 
				         {
			
--- a/src/models/gemma-embedding.cpp
+++ b/src/models/gemma-embedding.cpp
@@ -1,7 +1,5 @@
 
				 #include "models.h"
			
 
				 
			
 
				-
			
 
				-
			
 
				 llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
			
 
				     llm_graph_context(params) {
			
 
				     const int64_t n_embd_head = hparams.n_embd_head_k;
			
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
 
				     inpL = build_inp_embd(model.tok_embd);
			
 
				 
			
 
				     // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
			
 
				-    if (ubatch.token) {
			
 
				-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
			
 
				-        cb(inpL, "inp_scaled", -1);
			
 
				-    }
			
 
				+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
			
 
				+    cb(inpL, "inp_scaled", -1);
			
 
				 
			
 
				     // inp_pos - contains the positions
			
 
				     ggml_tensor * inp_pos = build_inp_pos();
			
--- a/src/models/gemma3.cpp
+++ b/src/models/gemma3.cpp
@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
 
				     inpL = build_inp_embd(model.tok_embd);
			
 
				 
			
 
				     // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
			
 
				-    if (ubatch.token) {
			
 
				-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
			
 
				-        cb(inpL, "inp_scaled", -1);
			
 
				-    }
			
 
				+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
			
 
				+    cb(inpL, "inp_scaled", -1);
			
 
				+
			
 
				     // inp_pos - contains the positions
			
 
				     ggml_tensor * inp_pos = build_inp_pos();
			
 
				 
			
--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n-iswa.cpp
@@ -1,7 +1,5 @@
 
				 #include "models.h"
			
 
				 
			
 
				-
			
 
				-
			
 
				 llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
			
 
				     llm_graph_context(params),
			
 
				     model(model),
			
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
 
				     inpL = build_inp_embd(model.tok_embd);
			
 
				 
			
 
				     // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
			
 
				-    if (ubatch.token) {
			
 
				-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
			
 
				-        cb(inpL, "inp_scaled", -1);
			
 
				-    }
			
 
				+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
			
 
				+    cb(inpL, "inp_scaled", -1);
			
 
				+
			
 
				     // inp_pos - contains the positions
			
 
				     ggml_tensor * inp_pos = build_inp_pos();
			
 
				 
			
@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
 
				 // equivalent to get_per_layer_inputs() in python code
			
 
				 // output shape: [n_embd_altup, n_layer, n_tokens]
			
 
				 ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
			
 
				-    auto          inp = std::make_unique<llm_graph_input_embd>();
			
 
				+    auto inp = std::make_unique<llm_graph_input_embd>();
			
 
				     ggml_tensor * inp_per_layer;
			
 
				     if (ubatch.token) {
			
 
				         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);