4 mesi fa · fb15d649ed
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5122,6 +5122,15 @@ class Gemma3Model(TextModel):
 
				         return [(self.map_tensor_name(name), data_torch)]
			
 
				 
			
 
				 
			
 
				+@ModelBase.register("Gemma3TextModel")
			
 
				+class EmbeddingGemma(Gemma3Model):
			
 
				+    model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
			
 
				+
			
 
				+    def set_gguf_parameters(self):
			
 
				+        super().set_gguf_parameters()
			
 
				+        self._try_set_pooling_type()
			
 
				+
			
 
				+
			
 
				 @ModelBase.register("Gemma3ForConditionalGeneration")
			
 
				 class Gemma3VisionModel(MmprojModel):
			
 
				     def set_gguf_parameters(self):
			
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -340,6 +340,7 @@ class MODEL_ARCH(IntEnum):
 
				     GEMMA2           = auto()
			
 
				     GEMMA3           = auto()
			
 
				     GEMMA3N          = auto()
			
 
				+    GEMMA_EMBEDDING  = auto()
			
 
				     STARCODER2       = auto()
			
 
				     RWKV6            = auto()
			
 
				     RWKV6QWEN2       = auto()
			
@@ -674,6 +675,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
 
				     MODEL_ARCH.GEMMA2:           "gemma2",
			
 
				     MODEL_ARCH.GEMMA3:           "gemma3",
			
 
				     MODEL_ARCH.GEMMA3N:          "gemma3n",
			
 
				+    MODEL_ARCH.GEMMA_EMBEDDING:  "gemma-embedding",
			
 
				     MODEL_ARCH.STARCODER2:       "starcoder2",
			
 
				     MODEL_ARCH.RWKV6:            "rwkv6",
			
 
				     MODEL_ARCH.RWKV6QWEN2:       "rwkv6qwen2",
			
@@ -1719,6 +1721,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
 
				         MODEL_TENSOR.LAUREL_R,
			
 
				         MODEL_TENSOR.LAUREL_POST_NORM,
			
 
				     ],
			
 
				+    MODEL_ARCH.GEMMA_EMBEDDING: [
			
 
				+        MODEL_TENSOR.TOKEN_EMBD,
			
 
				+        MODEL_TENSOR.OUTPUT,
			
 
				+        MODEL_TENSOR.OUTPUT_NORM,
			
 
				+        MODEL_TENSOR.ATTN_Q,
			
 
				+        MODEL_TENSOR.ATTN_Q_NORM,
			
 
				+        MODEL_TENSOR.ATTN_K,
			
 
				+        MODEL_TENSOR.ATTN_K_NORM,
			
 
				+        MODEL_TENSOR.ATTN_V,
			
 
				+        MODEL_TENSOR.ATTN_OUT,
			
 
				+        MODEL_TENSOR.FFN_GATE,
			
 
				+        MODEL_TENSOR.FFN_DOWN,
			
 
				+        MODEL_TENSOR.FFN_UP,
			
 
				+        MODEL_TENSOR.ATTN_NORM,
			
 
				+        MODEL_TENSOR.ATTN_POST_NORM,
			
 
				+        MODEL_TENSOR.FFN_PRE_NORM,
			
 
				+        MODEL_TENSOR.FFN_POST_NORM,
			
 
				+    ],
			
 
				     MODEL_ARCH.STARCODER2: [
			
 
				         MODEL_TENSOR.TOKEN_EMBD,
			
 
				         MODEL_TENSOR.OUTPUT_NORM,
			
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -14,6 +14,7 @@ class TensorNameMap:
 
				             "transformer.word_embeddings",               # falcon
			
 
				             "word_embeddings",                           # bloom
			
 
				             "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
			
 
				+            "embed_tokens",                              # embeddinggemma
			
 
				             "tok_embeddings",                            # llama-pth
			
 
				             "embeddings.word_embeddings",                # bert nomic-bert
			
 
				             "language_model.embedding.word_embeddings",  # persimmon
			
@@ -141,6 +142,7 @@ class TensorNameMap:
 
				             "rwkv.blocks.{bid}.ln1",                                # rwkv6
			
 
				             "model.layers.{bid}.ln1",                               # rwkv7
			
 
				             "model.layers.{bid}.input_layernorm",                   # llama4
			
 
				+            "layers.{bid}.input_layernorm",                         # embeddinggemma
			
 
				             "transformer_encoder.{bid}.attention_norm",             # neobert
			
 
				             "model.layers.{bid}.operator_norm",                     # lfm2
			
 
				             "model.transformer.blocks.{bid}.attn_norm",             # llada
			
@@ -179,6 +181,7 @@ class TensorNameMap:
 
				         # Attention query
			
 
				         MODEL_TENSOR.ATTN_Q: (
			
 
				             "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
			
 
				+            "layers.{bid}.self_attn.q_proj",                             # embeddinggemma
			
 
				             "model.layers.{bid}.self_attn.q_proj_no_perm",               # llama-custom
			
 
				             "layers.{bid}.attention.wq",                                 # llama-pth
			
 
				             "encoder.layer.{bid}.attention.self.query",                  # bert
			
@@ -197,6 +200,7 @@ class TensorNameMap:
 
				         # Attention key
			
 
				         MODEL_TENSOR.ATTN_K: (
			
 
				             "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo2 phimoe
			
 
				+            "layers.{bid}.self_attn.k_proj",                           # embeddinggemma
			
 
				             "model.layers.{bid}.self_attn.k_proj_no_perm",             # llama-custom
			
 
				             "layers.{bid}.attention.wk",                               # llama-pth
			
 
				             "encoder.layer.{bid}.attention.self.key",                  # bert
			
@@ -216,6 +220,7 @@ class TensorNameMap:
 
				         # Attention value
			
 
				         MODEL_TENSOR.ATTN_V: (
			
 
				             "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
			
 
				+            "layers.{bid}.self_attn.v_proj",                             # embeddinggemma
			
 
				             "layers.{bid}.attention.wv",                                 # llama-pth
			
 
				             "encoder.layer.{bid}.attention.self.value",                  # bert
			
 
				             "transformer.layer.{bid}.attention.v_lin",                   # distillbert
			
@@ -239,6 +244,7 @@ class TensorNameMap:
 
				             "transformer.h.{bid}.self_attention.dense",                     # falcon
			
 
				             "h.{bid}.self_attention.dense",                                 # bloom
			
 
				             "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo2 phimoe
			
 
				+            "layers.{bid}.self_attn.o_proj",                                # embeddinggemma
			
 
				             "model.layers.{bid}.self_attn.out_proj",                        # lfm2
			
 
				             "model.layers.{bid}.self_attn.linear_attn",                     # deci
			
 
				             "layers.{bid}.attention.wo",                                    # llama-pth
			
@@ -277,6 +283,7 @@ class TensorNameMap:
 
				 
			
 
				         MODEL_TENSOR.ATTN_POST_NORM: (
			
 
				             "model.layers.{bid}.post_attention_layernorm",       # gemma2 olmo2    # ge
			
 
				+            "layers.{bid}.post_attention_layernorm",             # embeddinggemma
			
 
				             "model.layers.{bid}.post_self_attn_layernorm",       # glm-4-0414
			
 
				             "model.layers.layers.{bid}.post_mixer_norm.weight",  # plamo2
			
 
				         ),
			
@@ -320,12 +327,14 @@ class TensorNameMap:
 
				         # Post feed-forward norm
			
 
				         MODEL_TENSOR.FFN_PRE_NORM: (
			
 
				             "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
			
 
				+            "layers.{bid}.pre_feedforward_layernorm",       # embeddinggemma
			
 
				             "model.layers.{bid}.pre_ff_layernorm.weight",
			
 
				         ),
			
 
				 
			
 
				         # Post feed-forward norm
			
 
				         MODEL_TENSOR.FFN_POST_NORM: (
			
 
				             "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
			
 
				+            "layers.{bid}.post_feedforward_layernorm",       # embeddinggemma
			
 
				             "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
			
 
				             "model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2
			
 
				             "model.layers.{bid}.feed_forward.up_proj",
			
@@ -362,6 +371,7 @@ class TensorNameMap:
 
				             "transformer.h.{bid}.mlp.dense_h_to_4h",                  # falcon
			
 
				             "h.{bid}.mlp.dense_h_to_4h",                              # bloom
			
 
				             "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
			
 
				+            "layers.{bid}.mlp.up_proj",                               # embeddinggemma
			
 
				             "layers.{bid}.feed_forward.w3",                           # llama-pth
			
 
				             "encoder.layer.{bid}.intermediate.dense",                 # bert
			
 
				             "transformer.layer.{bid}.ffn.lin1",                       # distillbert
			
@@ -421,6 +431,7 @@ class TensorNameMap:
 
				         # Feed-forward gate
			
 
				         MODEL_TENSOR.FFN_GATE: (
			
 
				             "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact olmo2
			
 
				+            "layers.{bid}.mlp.gate_proj",                 # embeddinggemma
			
 
				             "layers.{bid}.feed_forward.w1",               # llama-pth
			
 
				             "transformer.h.{bid}.mlp.w2",                 # qwen
			
 
				             "transformer.h.{bid}.mlp.c_fc2",              # jais
			
@@ -461,6 +472,7 @@ class TensorNameMap:
 
				             "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
			
 
				             "h.{bid}.mlp.dense_4h_to_h",                              # bloom
			
 
				             "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
			
 
				+            "layers.{bid}.mlp.down_proj",                             # embeddinggemma
			
 
				             "layers.{bid}.feed_forward.w2",                           # llama-pth
			
 
				             "encoder.layer.{bid}.output.dense",                       # bert
			
 
				             "transformer.layer.{bid}.ffn.lin2",                       # distillbert
			
@@ -513,6 +525,7 @@ class TensorNameMap:
 
				             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
			
 
				             "model.layers.{bid}.self_attn.query_layernorm",                   # hunyuan
			
 
				             "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo2
			
 
				+            "layers.{bid}.self_attn.q_norm",                                  # embeddinggemma
			
 
				             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
			
 
				             "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
			
 
				             "transformer.layers.{bid}.attn.q_norm",                           # openelm
			
@@ -525,6 +538,7 @@ class TensorNameMap:
 
				             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
			
 
				             "model.layers.{bid}.self_attn.key_layernorm",                     # hunyuan
			
 
				             "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo2
			
 
				+            "layers.{bid}.self_attn.k_norm",                                  # embeddinggemma
			
 
				             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
			
 
				             "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
			
 
				             "transformer.layers.{bid}.attn.k_norm",                           # openelm
			
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -45,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 
				     { LLM_ARCH_GEMMA2,           "gemma2"           },
			
 
				     { LLM_ARCH_GEMMA3,           "gemma3"           },
			
 
				     { LLM_ARCH_GEMMA3N,          "gemma3n"          },
			
 
				+    { LLM_ARCH_GEMMA_EMBEDDING,  "gemma-embedding"  },
			
 
				     { LLM_ARCH_STARCODER2,       "starcoder2"       },
			
 
				     { LLM_ARCH_MAMBA,            "mamba"            },
			
 
				     { LLM_ARCH_MAMBA2,           "mamba2"           },
			
@@ -1038,6 +1039,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
 
				             { LLM_TENSOR_LAUREL_POST_NORM,     "blk.%d.laurel_post_norm" },
			
 
				         },
			
 
				     },
			
 
				+    {
			
 
				+        LLM_ARCH_GEMMA_EMBEDDING,
			
 
				+        {
			
 
				+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
			
 
				+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
			
 
				+            { LLM_TENSOR_OUTPUT,          "output" },
			
 
				+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
			
 
				+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
			
 
				+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
			
 
				+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
			
 
				+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
			
 
				+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
			
 
				+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
			
 
				+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
			
 
				+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
			
 
				+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
			
 
				+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
			
 
				+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
			
 
				+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
			
 
				+        },
			
 
				+    },
			
 
				     {
			
 
				         LLM_ARCH_STARCODER2,
			
 
				         {
			
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -49,6 +49,7 @@ enum llm_arch {
 
				     LLM_ARCH_GEMMA2,
			
 
				     LLM_ARCH_GEMMA3,
			
 
				     LLM_ARCH_GEMMA3N,
			
 
				+    LLM_ARCH_GEMMA_EMBEDDING,
			
 
				     LLM_ARCH_STARCODER2,
			
 
				     LLM_ARCH_MAMBA,
			
 
				     LLM_ARCH_MAMBA2,
			
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -258,6 +258,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
 
				     }
			
 
				 }
			
 
				 
			
 
				+static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
			
 
				+    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
			
 
				+    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
			
 
				+                          (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
			
 
				+                          (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
			
 
				+                          (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
			
 
				+    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
			
 
				+    LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
			
 
				+    LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
			
 
				+
			
 
				+    LLAMA_LOG_DEBUG("    ");
			
 
				+    for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
			
 
				+        LLAMA_LOG_DEBUG("%2d", j);
			
 
				+    }
			
 
				+    LLAMA_LOG_DEBUG("\n");
			
 
				+
			
 
				+    for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
			
 
				+        LLAMA_LOG_DEBUG(" %2d ", i);
			
 
				+        for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
			
 
				+            float val = data[i * n_kv + j];
			
 
				+            if (val == -INFINITY) {
			
 
				+                LLAMA_LOG_DEBUG(" ∞");
			
 
				+            } else {
			
 
				+                LLAMA_LOG_DEBUG(" 0");
			
 
				+            }
			
 
				+        }
			
 
				+        LLAMA_LOG_DEBUG("\n");
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
			
 
				     const int64_t n_kv     = ubatch->n_tokens;
			
 
				     const int64_t n_tokens = ubatch->n_tokens;
			
@@ -277,21 +307,32 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
				                 for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
			
 
				                     const llama_seq_id s0 = ubatch->seq_id[i0][0];
			
 
				 
			
 
				-                    // TODO: reimplement this like in llama_kv_cache
			
 
				-                    if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
			
 
				-                        if (hparams.use_alibi) {
			
 
				-                            f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
			
 
				-                        } else {
			
 
				-                            f = 0.0f;
			
 
				-                        }
			
 
				-                        break;
			
 
				+                    if (s0 != s1) {
			
 
				+                        continue; // skip different sequences
			
 
				                     }
			
 
				-                }
			
 
				 
			
 
				+                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
			
 
				+                        continue; // skip future tokens for causal attention
			
 
				+                    }
			
 
				+
			
 
				+                    if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
			
 
				+                        continue; // skip masked tokens for SWA
			
 
				+                    }
			
 
				+
			
 
				+                    // TODO: reimplement this like in llama_kv_cache_unified
			
 
				+                    if (hparams.use_alibi) {
			
 
				+                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
			
 
				+                    } else {
			
 
				+                        f = 0.0f;
			
 
				+                    }
			
 
				+                }
			
 
				                 data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
			
 
				             }
			
 
				         }
			
 
				     }
			
 
				+    if (debug) {
			
 
				+        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				 void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
			
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -78,6 +78,11 @@ struct llm_graph_params;
 
				 
			
 
				 class llm_graph_input_i {
			
 
				 public:
			
 
				+    llm_graph_input_i() {
			
 
				+        const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
			
 
				+        debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
			
 
				+    }
			
 
				+
			
 
				     virtual ~llm_graph_input_i() = default;
			
 
				 
			
 
				     virtual void set_input(const llama_ubatch * ubatch) = 0;
			
@@ -90,6 +95,9 @@ public:
 
				         GGML_UNUSED(params);
			
 
				         return false;
			
 
				     }
			
 
				+protected:
			
 
				+    // env: LLAMA_GRAPH_INPUT_DEBUG
			
 
				+    int debug = 0;
			
 
				 };
			
 
				 
			
 
				 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
			
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -1,6 +1,7 @@
 
				 #include "llama-hparams.h"
			
 
				 
			
 
				 #include "ggml.h"
			
 
				+#include <cassert>
			
 
				 
			
 
				 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
			
 
				     if (dense_first) {
			
@@ -178,3 +179,39 @@ uint32_t llama_hparams::n_layer_kv() const {
 
				 
			
 
				     return res;
			
 
				 }
			
 
				+
			
 
				+bool llama_hparams::is_masked_swa(llama_pos p0, llama_pos p1) const {
			
 
				+    assert(p0 >= 0 && p1 >= 0);
			
 
				+
			
 
				+    switch (swa_type) {
			
 
				+        case LLAMA_SWA_TYPE_NONE:
			
 
				+            {
			
 
				+            } break;
			
 
				+        case LLAMA_SWA_TYPE_STANDARD:
			
 
				+            {
			
 
				+                if (p1 - p0 >= (int32_t) n_swa) {
			
 
				+                    return true;
			
 
				+                }
			
 
				+            } break;
			
 
				+        case LLAMA_SWA_TYPE_CHUNKED:
			
 
				+            {
			
 
				+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
			
 
				+
			
 
				+                if (p0 < pos_chunk_start) {
			
 
				+                    return true;
			
 
				+                }
			
 
				+            } break;
			
 
				+        case LLAMA_SWA_TYPE_SYMMETRIC:
			
 
				+            {
			
 
				+                const int32_t half_n_swa = (int32_t) n_swa / 2;
			
 
				+                const int32_t pos_diff = p1 - p0;
			
 
				+
			
 
				+                // Mask if outside the symmetric window
			
 
				+                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
			
 
				+                    return true;
			
 
				+                }
			
 
				+            } break;
			
 
				+    }
			
 
				+
			
 
				+    return false;
			
 
				+}
			
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
 
				 };
			
 
				 
			
 
				 enum llama_swa_type {
			
 
				-    LLAMA_SWA_TYPE_NONE     = 0,
			
 
				-    LLAMA_SWA_TYPE_STANDARD = 1,
			
 
				-    LLAMA_SWA_TYPE_CHUNKED  = 2,
			
 
				+    LLAMA_SWA_TYPE_NONE      = 0,
			
 
				+    LLAMA_SWA_TYPE_STANDARD  = 1,
			
 
				+    LLAMA_SWA_TYPE_CHUNKED   = 2,
			
 
				+    LLAMA_SWA_TYPE_SYMMETRIC = 3,
			
 
				 };
			
 
				 
			
 
				 struct llama_hparams_posnet {
			
@@ -227,6 +228,8 @@ struct llama_hparams {
 
				 
			
 
				     // number of layers for which has_kv() returns true
			
 
				     uint32_t n_layer_kv() const;
			
 
				+
			
 
				+    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
			
 
				 };
			
 
				 
			
 
				 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
			
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
 
				     kv_base = std::make_unique<llama_kv_cache>(
			
 
				             model, type_k, type_v,
			
 
				             v_trans, offload, unified, size_base, n_seq_max, n_pad,
			
 
				-            0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
			
 
				+            0, filter_base, reuse);
			
 
				 
			
 
				     LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);
			
 
				 
			
 
				     kv_swa = std::make_unique<llama_kv_cache>(
			
 
				             model, type_k, type_v,
			
 
				             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
			
 
				-            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
			
 
				+            hparams.n_swa, filter_swa, reuse);
			
 
				 }
			
 
				 
			
 
				 void llama_kv_cache_iswa::clear(bool data) {
			
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -27,11 +27,10 @@ llama_kv_cache::llama_kv_cache(
 
				                  uint32_t   n_seq_max,
			
 
				                  uint32_t   n_pad,
			
 
				                  uint32_t   n_swa,
			
 
				-           llama_swa_type   swa_type,
			
 
				     const layer_filter_cb & filter,
			
 
				     const  layer_reuse_cb & reuse) :
			
 
				     model(model), hparams(model.hparams), v_trans(v_trans),
			
 
				-    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
			
 
				+    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa) {
			
 
				 
			
 
				     GGML_ASSERT(kv_size % n_pad == 0);
			
 
				 
			
@@ -1393,29 +1392,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
 
				 }
			
 
				 
			
 
				 bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
			
 
				-    assert(p0 >= 0 && p1 >= 0);
			
 
				-
			
 
				-    switch (swa_type) {
			
 
				-        case LLAMA_SWA_TYPE_NONE:
			
 
				-            {
			
 
				-            } break;
			
 
				-        case LLAMA_SWA_TYPE_STANDARD:
			
 
				-            {
			
 
				-                if (p1 - p0 >= (int32_t) n_swa) {
			
 
				-                    return true;
			
 
				-                }
			
 
				-            } break;
			
 
				-        case LLAMA_SWA_TYPE_CHUNKED:
			
 
				-            {
			
 
				-                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
			
 
				-
			
 
				-                if (p0 < pos_chunk_start) {
			
 
				-                    return true;
			
 
				-                }
			
 
				-            } break;
			
 
				-    }
			
 
				-
			
 
				-    return false;
			
 
				+    return hparams.is_masked_swa(p0, p1);
			
 
				 }
			
 
				 
			
 
				 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
			
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -89,7 +89,6 @@ public:
 
				                      uint32_t   n_seq_max,
			
 
				                      uint32_t   n_pad,
			
 
				                      uint32_t   n_swa,
			
 
				-               llama_swa_type   swa_type,
			
 
				         const layer_filter_cb & filter,
			
 
				         const  layer_reuse_cb & reuse);
			
 
				 
			
@@ -212,8 +211,6 @@ private:
 
				     // env: LLAMA_KV_CACHE_DEBUG
			
 
				     int debug = 0;
			
 
				 
			
 
				-    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
			
 
				-
			
 
				     std::vector<ggml_context_ptr>        ctxs;
			
 
				     std::vector<ggml_backend_buffer_ptr> bufs;
			
 
				 
			
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -17,7 +17,6 @@ llama_memory_hybrid::llama_memory_hybrid(
 
				                  uint32_t   kv_size,
			
 
				                  uint32_t   n_pad,
			
 
				                  uint32_t   n_swa,
			
 
				-           llama_swa_type   swa_type,
			
 
				                             /* recurrent */
			
 
				                 ggml_type   type_r,
			
 
				                 ggml_type   type_s,
			
@@ -41,7 +40,6 @@ llama_memory_hybrid::llama_memory_hybrid(
 
				         n_seq_max,
			
 
				         n_pad,
			
 
				         n_swa,
			
 
				-        swa_type,
			
 
				         filter_attn == nullptr ?
			
 
				             [&](int32_t il) { return !hparams.is_recurrent(il); }
			
 
				             : filter_attn,
			
--- a/src/llama-memory-hybrid.h
+++ b/src/llama-memory-hybrid.h
@@ -27,7 +27,6 @@ public:
 
				                  uint32_t   kv_size,
			
 
				                  uint32_t   n_pad,
			
 
				                  uint32_t   n_swa,
			
 
				-           llama_swa_type   swa_type,
			
 
				                             /* recurrent */
			
 
				                 ggml_type   type_r,
			
 
				                 ggml_type   type_s,
			
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1142,6 +1142,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
				                     default: type = LLM_TYPE_UNKNOWN;
			
 
				                 }
			
 
				             } break;
			
 
				+        case LLM_ARCH_GEMMA_EMBEDDING:
			
 
				+            {
			
 
				+                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
			
 
				+                hparams.set_swa_pattern(6);
			
 
				+
			
 
				+                hparams.causal_attn = false; // embeddings do not use causal attention
			
 
				+                hparams.rope_freq_base_train_swa  = 10000.0f;
			
 
				+                hparams.rope_freq_scale_train_swa = 1.0f;
			
 
				+
			
 
				+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
			
 
				+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
			
 
				+                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
			
 
				+
			
 
				+                switch (hparams.n_layer) {
			
 
				+                    case 24: type = LLM_TYPE_0_3B; break;
			
 
				+                    default: type = LLM_TYPE_UNKNOWN;
			
 
				+                }
			
 
				+                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
			
 
				+
			
 
				+            } break;
			
 
				         case LLM_ARCH_STARCODER2:
			
 
				             {
			
 
				                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
			
@@ -3484,6 +3504,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
				                     }
			
 
				                 } break;
			
 
				             case LLM_ARCH_GEMMA3:
			
 
				+            case LLM_ARCH_GEMMA_EMBEDDING:
			
 
				                 {
			
 
				                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
			
 
				 
			
@@ -11045,6 +11066,136 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
				     }
			
 
				 };
			
 
				 
			
 
				+struct llm_build_gemma_embedding_iswa : public llm_graph_context {
			
 
				+    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
			
 
				+        const int64_t n_embd_head = hparams.n_embd_head_k;
			
 
				+
			
 
				+        ggml_tensor * cur;
			
 
				+        ggml_tensor * inpL;
			
 
				+
			
 
				+        inpL = build_inp_embd(model.tok_embd);
			
 
				+
			
 
				+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
			
 
				+        if (ubatch.token) {
			
 
				+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
			
 
				+            cb(inpL, "inp_scaled", -1);
			
 
				+        }
			
 
				+
			
 
				+        // inp_pos - contains the positions
			
 
				+        ggml_tensor * inp_pos = build_inp_pos();
			
 
				+
			
 
				+        auto * inp_attn = build_attn_inp_no_cache();
			
 
				+
			
 
				+        ggml_tensor * inp_out_ids = build_inp_out_ids();
			
 
				+
			
 
				+        for (int il = 0; il < n_layer; ++il) {
			
 
				+            const float freq_base_l  = model.get_rope_freq_base (cparams, il);
			
 
				+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
			
 
				+
			
 
				+            // norm
			
 
				+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
			
 
				+            cb(cur, "attn_norm", il);
			
 
				+
			
 
				+            // self-attention
			
 
				+            {
			
 
				+                // compute Q and K and RoPE them
			
 
				+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
			
 
				+                cb(Qcur, "Qcur", il);
			
 
				+
			
 
				+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
			
 
				+                cb(Kcur, "Kcur", il);
			
 
				+
			
 
				+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
			
 
				+                cb(Vcur, "Vcur", il);
			
 
				+
			
 
				+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
			
 
				+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
			
 
				+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
			
 
				+
			
 
				+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
			
 
				+                cb(Qcur, "Qcur_normed", il);
			
 
				+
			
 
				+                Qcur = ggml_rope_ext(
			
 
				+                        ctx0, Qcur, inp_pos, nullptr,
			
 
				+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
			
 
				+                        ext_factor, attn_factor, beta_fast, beta_slow);
			
 
				+
			
 
				+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
			
 
				+                cb(Kcur, "Kcur_normed", il);
			
 
				+
			
 
				+                Kcur = ggml_rope_ext(
			
 
				+                        ctx0, Kcur, inp_pos, nullptr,
			
 
				+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
			
 
				+                        ext_factor, attn_factor, beta_fast, beta_slow);
			
 
				+
			
 
				+                cb(Qcur, "Qcur", il);
			
 
				+                cb(Kcur, "Kcur", il);
			
 
				+                cb(Vcur, "Vcur", il);
			
 
				+
			
 
				+                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
			
 
				+                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
			
 
				+
			
 
				+                cur = build_attn(inp_attn,
			
 
				+                        model.layers[il].wo, NULL,
			
 
				+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
			
 
				+            }
			
 
				+
			
 
				+            if (il == n_layer - 1 && inp_out_ids) {
			
 
				+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
			
 
				+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
			
 
				+            }
			
 
				+
			
 
				+            cur = build_norm(cur,
			
 
				+                    model.layers[il].attn_post_norm, NULL,
			
 
				+                    LLM_NORM_RMS, il);
			
 
				+            cb(cur, "attn_post_norm", il);
			
 
				+
			
 
				+            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
			
 
				+            cb(sa_out, "sa_out", il);
			
 
				+
			
 
				+            cur = build_norm(sa_out,
			
 
				+                    model.layers[il].ffn_norm, NULL,
			
 
				+                    LLM_NORM_RMS, il);
			
 
				+            cb(cur, "ffn_norm", il);
			
 
				+
			
 
				+            // feed-forward network
			
 
				+            {
			
 
				+                cur = build_ffn(cur,
			
 
				+                        model.layers[il].ffn_up,   NULL, NULL,
			
 
				+                        model.layers[il].ffn_gate, NULL, NULL,
			
 
				+                        model.layers[il].ffn_down, NULL, NULL,
			
 
				+                        NULL,
			
 
				+                        LLM_FFN_GELU, LLM_FFN_PAR, il);
			
 
				+                cb(cur, "ffn_out", il);
			
 
				+            }
			
 
				+
			
 
				+            cur = build_norm(cur,
			
 
				+                    model.layers[il].ffn_post_norm, NULL,
			
 
				+                    LLM_NORM_RMS, -1);
			
 
				+            cb(cur, "ffn_post_norm", -1);
			
 
				+
			
 
				+            cur = ggml_add(ctx0, cur, sa_out);
			
 
				+
			
 
				+            cur = build_cvec(cur, il);
			
 
				+            cb(cur, "l_out", il);
			
 
				+
			
 
				+            // input for next layer
			
 
				+            inpL = cur;
			
 
				+        }
			
 
				+
			
 
				+        cur = inpL;
			
 
				+
			
 
				+        cur = build_norm(cur,
			
 
				+                model.output_norm, NULL,
			
 
				+                LLM_NORM_RMS, -1);
			
 
				+
			
 
				+        cb(cur, "result_norm", -1);
			
 
				+        res->t_embd = cur;
			
 
				+
			
 
				+        ggml_build_forward_expand(gf, cur);
			
 
				+    }
			
 
				+};
			
 
				+
			
 
				 // TODO: move up next to build_starcoder
			
 
				 struct llm_build_starcoder2 : public llm_graph_context {
			
 
				     llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
			
@@ -18481,6 +18632,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
				         case LLM_ARCH_NOMIC_BERT_MOE:
			
 
				         case LLM_ARCH_NEO_BERT:
			
 
				         case LLM_ARCH_WAVTOKENIZER_DEC:
			
 
				+        case LLM_ARCH_GEMMA_EMBEDDING:
			
 
				         case LLM_ARCH_DREAM:
			
 
				         case LLM_ARCH_LLADA:
			
 
				             {
			
@@ -18529,7 +18681,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
				                         /* attn_kv_size      */ cparams.n_ctx,
			
 
				                         /* attn_n_pad        */ padding,
			
 
				                         /* attn_n_swa        */ hparams.n_swa,
			
 
				-                        /* attn_swa_type     */ hparams.swa_type,
			
 
				                         /* recurrent_type_k  */ GGML_TYPE_F32,
			
 
				                         /* recurrent_type_v  */ GGML_TYPE_F32,
			
 
				                         /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
			
@@ -18599,7 +18750,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
				                                 cparams.n_seq_max,
			
 
				                                 padding,
			
 
				                                 hparams.n_swa,
			
 
				-                                hparams.swa_type,
			
 
				                                 nullptr,
			
 
				                                 nullptr);
			
 
				                     }
			
@@ -18761,6 +18911,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 
				             {
			
 
				                 llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
			
 
				             } break;
			
 
				+        case LLM_ARCH_GEMMA_EMBEDDING:
			
 
				+            {
			
 
				+                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
			
 
				+            } break;
			
 
				         case LLM_ARCH_STARCODER2:
			
 
				             {
			
 
				                 llm = std::make_unique<llm_build_starcoder2>(*this, params);
			
@@ -19161,6 +19315,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 
				         case LLM_ARCH_GEMMA2:
			
 
				         case LLM_ARCH_GEMMA3:
			
 
				         case LLM_ARCH_GEMMA3N:
			
 
				+        case LLM_ARCH_GEMMA_EMBEDDING:
			
 
				         case LLM_ARCH_STARCODER2:
			
 
				         case LLM_ARCH_OPENELM:
			
 
				         case LLM_ARCH_GPTNEOX: