1 год назад · 4524290e87
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1650,7 +1650,29 @@ class BertModel(Model):
 
				     def set_gguf_parameters(self):
			
 
				         super().set_gguf_parameters()
			
 
				         self.gguf_writer.add_causal_attention(False)
			
 
				-        self.gguf_writer.add_pooling_layer(True)
			
 
				+
			
 
				+        # get pooling path
			
 
				+        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
			
 
				+            modules = json.load(f)
			
 
				+        pooling_path = None
			
 
				+        for mod in modules:
			
 
				+            if mod["type"] == "sentence_transformers.models.Pooling":
			
 
				+                pooling_path = mod["path"]
			
 
				+                break
			
 
				+
			
 
				+        # get pooling type
			
 
				+        pooling_type = gguf.PoolingType.NONE
			
 
				+        if pooling_path is not None:
			
 
				+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
			
 
				+                pooling = json.load(f)
			
 
				+            if pooling["pooling_mode_mean_tokens"]:
			
 
				+                pooling_type = gguf.PoolingType.MEAN
			
 
				+            elif pooling["pooling_mode_cls_token"]:
			
 
				+                pooling_type = gguf.PoolingType.CLS
			
 
				+            else:
			
 
				+                raise NotImplementedError("Only MEAN and CLS pooling types supported")
			
 
				+
			
 
				+        self.gguf_writer.add_pooling_type(pooling_type.value)
			
 
				 
			
 
				     def set_vocab(self):
			
 
				         path = self.dir_model
			
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -40,7 +40,7 @@ class Keys:
 
				         TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
			
 
				         EXPERT_COUNT          = "{arch}.expert_count"
			
 
				         EXPERT_USED_COUNT     = "{arch}.expert_used_count"
			
 
				-        POOLING_LAYER         = "{arch}.pooling_layer"
			
 
				+        POOLING_TYPE          = "{arch}.pooling_type"
			
 
				 
			
 
				     class Attention:
			
 
				         HEAD_COUNT        = "{arch}.attention.head_count"
			
@@ -561,6 +561,12 @@ class RopeScalingType(Enum):
 
				     YARN   = 'yarn'
			
 
				 
			
 
				 
			
 
				+class PoolingType(IntEnum):
			
 
				+    # Pooling mode written as a uint32 under the "{arch}.pooling_type" key;
			
 
				+    # the values match enum llama_pooling_type in llama.h.
			
 
				+    NONE = 0  # no sentence-transformers Pooling module found
			
 
				+    MEAN = 1  # pooling config has pooling_mode_mean_tokens set
			
 
				+    CLS  = 2  # pooling config has pooling_mode_cls_token set
			
 
				+
			
 
				+
			
 
				 class GGMLQuantizationType(IntEnum):
			
 
				     F32  = 0
			
 
				     F16  = 1
			
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -19,6 +19,7 @@ from .constants import (
 
				     GGUFValueType,
			
 
				     Keys,
			
 
				     RopeScalingType,
			
 
				+    PoolingType,
			
 
				     TokenType,
			
 
				 )
			
 
				 
			
@@ -360,8 +361,8 @@ class GGUFWriter:
 
				     def add_causal_attention(self, value: bool) -> None:
			
 
				         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
			
 
				 
			
 
				-    def add_pooling_layer(self, value: bool) -> None:
			
 
				-        self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
			
 
				+    def add_pooling_type(self, value: PoolingType) -> None:
			
 
				+        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value)
			
 
				 
			
 
				     def add_rope_dimension_count(self, count: int) -> None:
			
 
				         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -256,7 +256,7 @@ enum llm_kv {
 
				     LLM_KV_TENSOR_DATA_LAYOUT,
			
 
				     LLM_KV_EXPERT_COUNT,
			
 
				     LLM_KV_EXPERT_USED_COUNT,
			
 
				-    LLM_KV_POOLING_LAYER,
			
 
				+    LLM_KV_POOLING_TYPE,
			
 
				 
			
 
				     LLM_KV_ATTENTION_HEAD_COUNT,
			
 
				     LLM_KV_ATTENTION_HEAD_COUNT_KV,
			
@@ -314,7 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
				     { LLM_KV_TENSOR_DATA_LAYOUT,            "%s.tensor_data_layout"    },
			
 
				     { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"          },
			
 
				     { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"     },
			
 
				-    { LLM_KV_POOLING_LAYER,                 "%s.pooling_layer"         },
			
 
				+    { LLM_KV_POOLING_TYPE ,                 "%s.pooling_type"          },
			
 
				 
			
 
				     { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"             },
			
 
				     { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv"          },
			
@@ -1561,7 +1561,7 @@ struct llama_hparams {
 
				     float f_max_alibi_bias;
			
 
				 
			
 
				     bool causal_attn = true;
			
 
				-    bool pooling_layer = false;
			
 
				+    uint32_t pooling_type = LLAMA_POOLING_NONE;
			
 
				 
			
 
				 
			
 
				     bool operator!=(const llama_hparams & other) const {
			
@@ -1924,7 +1924,8 @@ struct llama_context {
 
				     struct ggml_tensor * inp_pos;       // I32 [n_batch]
			
 
				     struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
			
 
				     struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
			
 
				-    struct ggml_tensor * inp_sum;       // F32 [n_batch, n_batch]
			
 
				+    struct ggml_tensor * inp_mean;      // F32 [n_batch, n_batch]
			
 
				+    struct ggml_tensor * inp_cls;       // I32 [n_batch]
			
 
				 
			
 
				 #ifdef GGML_USE_MPI
			
 
				     ggml_mpi_context * ctx_mpi = NULL;
			
@@ -3086,7 +3087,7 @@ static void llm_load_hparams(
 
				                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
			
 
				                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
			
 
				                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
			
 
				-                ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
			
 
				+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
			
 
				 
			
 
				                 switch (hparams.n_layer) {
			
 
				                     case 3:
			
@@ -3107,7 +3108,7 @@ static void llm_load_hparams(
 
				                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
			
 
				                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
			
 
				                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
			
 
				-                ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
			
 
				+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
			
 
				 
			
 
				                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
			
 
				                     model.type = e_model::MODEL_137M;
			
@@ -4934,7 +4935,7 @@ struct llm_build_context {
 
				     const int32_t n_orig_ctx;
			
 
				 
			
 
				     const bool do_rope_shift;
			
 
				-    const bool do_pooling;
			
 
				+    const uint32_t pooling_type;
			
 
				 
			
 
				     const llm_build_cb & cb;
			
 
				 
			
@@ -4978,7 +4979,7 @@ struct llm_build_context {
 
				         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
			
 
				         n_orig_ctx       (cparams.n_yarn_orig_ctx),
			
 
				         do_rope_shift    (worst_case || kv_self.has_shift),
			
 
				-        do_pooling       (hparams.pooling_layer && cparams.do_pooling),
			
 
				+        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
			
 
				         cb               (cb),
			
 
				         buf_compute_meta (lctx.buf_compute_meta) {
			
 
				             // all initializations should be done in init()
			
@@ -5835,7 +5836,8 @@ struct llm_build_context {
 
				         // get input vectors with right size
			
 
				         const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
			
 
				         struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
			
 
				-        struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
			
 
				+        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
			
 
				+        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
			
 
				 
			
 
				         // construct input embeddings (token, type, position)
			
 
				         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
			
@@ -5952,8 +5954,12 @@ struct llm_build_context {
 
				         cur = inpL;
			
 
				 
			
 
				         // pooling layer
			
 
				-        if (do_pooling) {
			
 
				-            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
			
 
				+        if (pooling_type == LLAMA_POOLING_MEAN) {
			
 
				+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
			
 
				+        } else if (pooling_type == LLAMA_POOLING_CLS) {
			
 
				+            cur = ggml_get_rows(ctx0, cur, inp_cls);
			
 
				+        } else {
			
 
				+            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
			
 
				         }
			
 
				         cb(cur, "result_embd", -1);
			
 
				 
			
@@ -7501,15 +7507,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 
				         }
			
 
				     }
			
 
				 
			
 
				-    {
			
 
				-        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
			
 
				-        float * data = (float *) lctx.inp_sum->data;
			
 
				-
			
 
				-        for (int i = 0; i < batch.n_tokens; ++i) {
			
 
				-            data[i] = 1.0f/float(batch.n_tokens);
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				     if (kv_self.has_shift) {
			
 
				         const int64_t n_ctx = cparams.n_ctx;
			
 
				 
			
@@ -7522,17 +7519,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 
				         }
			
 
				     }
			
 
				 
			
 
				-    if (hparams.pooling_layer && cparams.do_pooling) {
			
 
				+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
			
 
				         const int64_t n_tokens = batch.n_tokens;
			
 
				 
			
 
				-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
			
 
				-        float * data = (float *) lctx.inp_sum->data;
			
 
				+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
			
 
				+        float * data = (float *) lctx.inp_mean->data;
			
 
				 
			
 
				-        memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
			
 
				+        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
			
 
				 
			
 
				+        std::vector<uint64_t> sum(n_tokens, 0);
			
 
				         for (int i = 0; i < n_tokens; ++i) {
			
 
				             const llama_seq_id seq_id = batch.seq_id[i][0];
			
 
				-            data[seq_id*n_tokens + i] = 1.0f;
			
 
				+            sum[seq_id] += 1;
			
 
				+        }
			
 
				+
			
 
				+        std::vector<float> div(n_tokens, 0.0f);
			
 
				+        for (int i = 0; i < n_tokens; ++i) {
			
 
				+            const uint64_t s = sum[i];
			
 
				+            if (s > 0) {
			
 
				+                div[i] = 1.0f/float(s);
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        for (int i = 0; i < n_tokens; ++i) {
			
 
				+            const llama_seq_id seq_id = batch.seq_id[i][0];
			
 
				+            data[seq_id*n_tokens + i] = div[seq_id];
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
			
 
				+        const int64_t n_tokens = batch.n_tokens;
			
 
				+
			
 
				+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
			
 
				+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
			
 
				+
			
 
				+        for (int i = 0; i < n_tokens; ++i) {
			
 
				+            const llama_seq_id seq_id = batch.seq_id[i][0];
			
 
				+            const llama_pos pos = batch.pos[i];
			
 
				+            if (pos == 0) {
			
 
				+                data[seq_id] = i;
			
 
				+            }
			
 
				         }
			
 
				     }
			
 
				 }
			
@@ -11417,14 +11443,16 @@ struct llama_context * llama_new_context_with_model(
 
				             ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
			
 
				             ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
			
 
				             ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
			
 
				-            ctx->inp_sum     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
			
 
				+            ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
			
 
				+            ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
			
 
				 
			
 
				             ggml_set_name(ctx->inp_tokens,  "inp_tokens");
			
 
				             ggml_set_name(ctx->inp_embd,    "inp_embd");
			
 
				             ggml_set_name(ctx->inp_pos,     "inp_pos");
			
 
				             ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
			
 
				             ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
			
 
				-            ggml_set_name(ctx->inp_sum,     "inp_sum");
			
 
				+            ggml_set_name(ctx->inp_mean,    "inp_mean");
			
 
				+            ggml_set_name(ctx->inp_cls,     "inp_cls");
			
 
				 
			
 
				             ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
			
 
				 
			
--- a/llama.h
+++ b/llama.h
@@ -112,6 +112,12 @@ extern "C" {
 
				         LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
			
 
				     };
			
 
				 
			
 
				+    enum llama_pooling_type {
			
 
				+        LLAMA_POOLING_NONE = 0, // no pooling step is added to the embedding graph
			
 
				+        LLAMA_POOLING_MEAN = 1, // average the token embeddings of each sequence
			
 
				+        LLAMA_POOLING_CLS  = 2, // take the embedding of the token at position 0 of each sequence
			
 
				+    };
			
 
				+
			
 
				     enum llama_split_mode {
			
 
				         LLAMA_SPLIT_NONE    = 0, // single GPU
			
 
				         LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs