пре 7 месеци · d17a809ef0
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -236,9 +236,24 @@ int main(int argc, char ** argv) {
 
				                 LOG("\n");
			
 
				             }
			
 
				         } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
			
 
				+            const uint32_t n_cls_out = llama_model_n_cls_out(model);
			
 
				+            std::vector<std::string> cls_out_labels;
			
 
				+
			
 
				+            for (uint32_t i = 0; i < n_cls_out; i++) {
			
 
				+                const char * label = llama_model_cls_label(model, i);
			
 
				+                const std::string label_i(label == nullptr ? "" : label);
			
 
				+                cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
			
 
				+            }
			
 
				+
			
 
				             for (int j = 0; j < n_embd_count; j++) {
			
 
				-                // NOTE: if you change this log - update the tests in ci/run.sh
			
 
				-                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
			
 
				+                for (uint32_t i = 0; i < n_cls_out; i++) {
			
 
				+                    // NOTE: if you change this log - update the tests in ci/run.sh
			
 
				+                    if (n_cls_out == 1) {
			
 
				+                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
			
 
				+                    } else {
			
 
				+                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
			
 
				+                    }
			
 
				+                }
			
 
				             }
			
 
				         } else {
			
 
				             // print the first part of the embeddings or for a single prompt, the full embedding
			
--- a/include/llama.h
+++ b/include/llama.h
@@ -514,6 +514,13 @@ extern "C" {
 
				     // Get the model's RoPE frequency scaling factor
			
 
				     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
			
 
				 
			
 
				+    // Returns the number of classifier outputs (only valid for classifier models)
			
 
				+    // Undefined behavior for non-classifier models
			
 
				+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
			
 
				+
			
 
				+    // Returns the label of the classifier output at index i (i < n_cls_out). Returns nullptr if no label is provided
			
 
				+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
			
 
				+
			
 
				     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
			
 
				 
			
 
				     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
			
@@ -992,7 +999,7 @@ extern "C" {
 
				 
			
 
				     // Get the embeddings for a sequence id
			
 
				     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
			
 
				-    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
			
 
				+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
			
 
				     // otherwise: float[n_embd] (1-dimensional)
			
 
				     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
			
 
				 
			
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -839,16 +839,17 @@ int llama_context::encode(llama_batch & inp_batch) {
 
				                 } break;
			
 
				             case LLAMA_POOLING_TYPE_RANK:
			
 
				                 {
			
 
				-                    // extract the rerank score - a single float per sequence
			
 
				+                    // extract the rerank score - n_cls_out floats per sequence
			
 
				                     auto & embd_seq_out = embd_seq;
			
 
				+                    const uint32_t n_cls_out = hparams.n_cls_out;
			
 
				 
			
 
				                     for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
			
 
				                         const llama_seq_id seq_id = ubatch.seq_id[s][0];
			
 
				                         if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
			
 
				                             continue;
			
 
				                         }
			
 
				-                        embd_seq_out[seq_id].resize(1);
			
 
				-                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
			
 
				+                        embd_seq_out[seq_id].resize(n_cls_out);
			
 
				+                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_cls_out*seq_id)*sizeof(float), n_cls_out*sizeof(float));
			
 
				                     }
			
 
				                 } break;
			
 
				             case LLAMA_POOLING_TYPE_UNSPECIFIED:
			
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -288,9 +288,10 @@ namespace GGUFMeta {
 
				 
			
 
				     template<typename T>
			
 
				     bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
			
 
				-        const int kid = gguf_find_key(meta.get(), key.c_str());
			
 
				+        const gguf_context * ctx = meta.get();
			
 
				+        const int kid = gguf_find_key(ctx, key.c_str());
			
 
				 
			
 
				-        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
			
 
				+        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
			
 
				             if (required) {
			
 
				                 throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
			
 
				             }
			
@@ -298,28 +299,40 @@ namespace GGUFMeta {
 
				         }
			
 
				 
			
 
				         struct GGUFMeta::ArrayInfo arr_info =
			
 
				-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
			
 
				+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
			
 
				 
			
 
				         switch (arr_info.gt) {
			
 
				             case GGUF_TYPE_UINT32:
			
 
				-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
			
 
				-                                                (std::is_same<T, uint32_t>::value)); break;
			
 
				-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
			
 
				+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,     int32_t>::value) ||
			
 
				+                                                (std::is_same<T,    uint32_t>::value)); break;
			
 
				+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,       float>::value)); break;
			
 
				+            case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
			
 
				             default:
			
 
				-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
			
 
				+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
			
 
				         }
			
 
				 
			
 
				-        result.resize(arr_info.length);
			
 
				-        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
			
 
				+        if constexpr (std::is_same<T, std::string>::value) {
			
 
				+            const size_t n_items = gguf_get_arr_n(ctx, kid);
			
 
				+            result.clear();
			
 
				+
			
 
				+            for (size_t i = 0; i < n_items; i++) {
			
 
				+                const T value = gguf_get_arr_str(ctx, kid, i);
			
 
				+                result.emplace_back(value);
			
 
				+            }
			
 
				+        } else {
			
 
				+            result.resize(arr_info.length);
			
 
				+            result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
			
 
				+        }
			
 
				 
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     template<typename T, size_t N_MAX>
			
 
				     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
			
 
				-        const int kid = gguf_find_key(meta.get(), key.c_str());
			
 
				+        const gguf_context * ctx = meta.get();
			
 
				+        const int kid = gguf_find_key(ctx, key.c_str());
			
 
				 
			
 
				-        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
			
 
				+        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
			
 
				             if (required) {
			
 
				                 throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
			
 
				             }
			
@@ -327,22 +340,32 @@ namespace GGUFMeta {
 
				         }
			
 
				 
			
 
				         struct GGUFMeta::ArrayInfo arr_info =
			
 
				-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
			
 
				+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
			
 
				 
			
 
				         switch (arr_info.gt) {
			
 
				             case GGUF_TYPE_UINT32:
			
 
				-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
			
 
				-                                                (std::is_same<T, uint32_t>::value)); break;
			
 
				-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
			
 
				+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,     int32_t>::value) ||
			
 
				+                                                (std::is_same<T,    uint32_t>::value)); break;
			
 
				+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,       float>::value)); break;
			
 
				+            case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
			
 
				             default:
			
 
				-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
			
 
				+                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
			
 
				         }
			
 
				 
			
 
				         if (arr_info.length > N_MAX) {
			
 
				             throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
			
 
				         }
			
 
				 
			
 
				-        std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
			
 
				+        if constexpr (std::is_same<T, std::string>::value) {
			
 
				+            const size_t n_items = gguf_get_arr_n(ctx, kid);
			
 
				+
			
 
				+            for (size_t i = 0; i < n_items; i++) {
			
 
				+                const T value = gguf_get_arr_str(ctx, kid, i);
			
 
				+                result[i] = value;
			
 
				+            }
			
 
				+        } else {
			
 
				+            std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
			
 
				+        }
			
 
				 
			
 
				         return true;
			
 
				     }
			
@@ -352,6 +375,8 @@ namespace GGUFMeta {
 
				         return get_arr(llm_kv(kid), result, required);
			
 
				     }
			
 
				 
			
 
				+    template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
			
 
				+
			
 
				     template<typename T>
			
 
				     bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
			
 
				         auto it = kv_overrides.find(key);
			
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -543,6 +543,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
				     uint32_t n_vocab = 0;
			
 
				     ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
			
 
				 
			
 
				+    // for classifier models
			
 
				+    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
			
 
				+    if (!classifier_labels.empty()) {
			
 
				+        hparams.n_cls_out = classifier_labels.size();
			
 
				+    }
			
 
				+
			
 
				     // arch-specific KVs
			
 
				     switch (arch) {
			
 
				         case LLM_ARCH_LLAMA:
			
@@ -686,7 +692,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
				                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
			
 
				                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
			
 
				                 ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
			
 
				-                ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
			
 
				 
			
 
				                 switch (hparams.n_layer) {
			
 
				                     case 3:
			
@@ -4362,6 +4367,15 @@ void llama_model::print_info() const {
 
				         LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
			
 
				         LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n",     __func__, hparams.ssm_dt_rank);
			
 
				         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms   = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
			
 
				+
			
 
				+        if (!classifier_labels.empty()) {
			
 
				+            LLAMA_LOG_INFO("%s: n_cls_out        = %u\n", __func__, hparams.n_cls_out);
			
 
				+
			
 
				+            size_t i = 0;
			
 
				+            for (auto label : classifier_labels) {
			
 
				+                LLAMA_LOG_INFO("%s: cls_label[%2zu]    = %s\n", __func__, i++, label.c_str());
			
 
				+            }
			
 
				+        }
			
 
				     }
			
 
				 
			
 
				     LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, type_name().c_str());
			
@@ -13602,6 +13616,18 @@ int32_t llama_model_n_swa(const llama_model * model) {
 
				     return model->hparams.n_swa;
			
 
				 }
			
 
				 
			
 
				+uint32_t llama_model_n_cls_out(const struct llama_model * model) {
			
 
				+    return model->hparams.n_cls_out;
			
 
				+}
			
 
				+
			
 
				+const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
			
 
				+    if (i < model->classifier_labels.size()) {
			
 
				+        return model->classifier_labels[i].c_str();
			
 
				+    }
			
 
				+
			
 
				+    return nullptr;
			
 
				+}
			
 
				+
			
 
				 // deprecated
			
 
				 int32_t llama_n_ctx_train(const llama_model * model) {
			
 
				     return llama_model_n_ctx_train(model);
			
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -329,6 +329,9 @@ struct llama_model {
 
				     llama_hparams hparams = {};
			
 
				     llama_vocab   vocab;
			
 
				 
			
 
				+    // for classifier models
			
 
				+    std::vector<std::string> classifier_labels;
			
 
				+
			
 
				     struct ggml_tensor * tok_embd   = nullptr;
			
 
				     struct ggml_tensor * type_embd  = nullptr;
			
 
				     struct ggml_tensor * pos_embd   = nullptr;