@@ -54,6 +54,7 @@
 #include <cassert>
 #include <cinttypes>
 #include <climits>
+#include <cmath>
 #include <cstdarg>
 #include <cstddef>
 #include <cstdint>
@@ -235,6 +236,10 @@ enum llm_kv {
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
+    LLM_KV_ROPE_SCALING_TYPE,
+    LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
+    LLM_KV_ROPE_SCALING_FINETUNED,

     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
@@ -276,9 +281,13 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_LAYERNORM_EPS,     "%s.attention.layer_norm_epsilon"     },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },

-    { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base"       },
-    { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"    },
+    { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
+    { LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base"                       },
+    { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"                    },
+    { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type"                    },
+    { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"                  },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"               },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model"  },
     { LLM_KV_TOKENIZER_LIST,  "tokenizer.ggml.tokens" },
@@ -552,6 +561,22 @@ do { \
     } \
 } while (0)

+static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
+    { LLAMA_ROPE_SCALING_NONE,   "none"   },
+    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
+};
+
+static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
+    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
+        if (kv.second == name) {
+            return kv.first;
+        }
+    }
+
+    return LLAMA_ROPE_SCALING_UNSPECIFIED;
+}
+
 //
 // ggml helpers
 //
@@ -1035,8 +1060,11 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;

-    float rope_freq_base_train;
-    float rope_freq_scale_train;
+    float    rope_freq_base_train;
+    float    rope_freq_scale_train;
+    uint32_t n_yarn_orig_ctx;
+    int8_t   rope_scaling_type_train : 3;
+    bool     rope_finetuned : 1;

     float f_clamp_kqv;
     float f_max_alibi_bias;
@@ -1051,6 +1079,8 @@ struct llama_hparams {
         if (this->n_layer != other.n_layer) return true;
         if (this->n_rot != other.n_rot) return true;
         if (this->n_ff != other.n_ff) return true;
+        if (this->rope_finetuned != other.rope_finetuned) return true;
+        if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

         const float EPSILON = 1e-9;
@@ -1081,8 +1111,16 @@ struct llama_cparams {
     uint32_t n_threads;       // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing

-    float rope_freq_base;
-    float rope_freq_scale;
+    float    rope_freq_base;
+    float    rope_freq_scale;
+
+    uint32_t n_yarn_orig_ctx;
+    // These hyperparameters are not exposed in GGUF, because all
+    // existing YaRN models use the same values for them.
+    float    yarn_ext_factor;
+    float    yarn_attn_factor;
+    float    yarn_beta_fast;
+    float    yarn_beta_slow;

     bool mul_mat_q;
 };
@@ -2014,14 +2052,30 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

+    hparams.rope_finetuned = false;
+    GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
+                 kv(LLM_KV_ROPE_SCALING_FINETUNED));
+
+    hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
+    GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
+                 kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
+
     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
     GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));

+    std::string rope_scaling("linear");
+    GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
+    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
+
     // rope_freq_scale (inverse of the kv) is optional
-    float ropescale = 1.0f;
-    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    hparams.rope_freq_scale_train = 1.0f/ropescale;
+    float ropescale = 0.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
+    if (ropescale == 0.0f) { // try the old key name
+        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    }
+    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

     // sanity check for n_rot (optional)
     {
@@ -2371,6 +2425,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & hparams = model.hparams;
     const auto & vocab = model.vocab;

+    const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
@@ -2389,8 +2445,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
     LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+    LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
+    LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
@@ -3047,21 +3106,11 @@ static void llm_load_tensors(
     model.t_load_us = ggml_time_us() - model.t_start_us;
 }

-static bool llama_model_load(
-        const std::string & fname,
-        llama_model & model,
-        int n_gpu_layers,
-        int main_gpu,
-        const float * tensor_split,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void *progress_callback_user_data) {
+static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, use_mmap);
+        llama_model_loader ml(fname, params.use_mmap);

-        model.hparams.vocab_only = vocab_only;
+        model.hparams.vocab_only = params.vocab_only;

         llm_load_arch (ml, model);
         llm_load_hparams(ml, model);
@@ -3073,15 +3122,15 @@
             throw std::runtime_error("vocab size mismatch");
         }

-        if (vocab_only) {
+        if (params.vocab_only) {
             LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
             return true;
         }

         llm_load_tensors(
-            ml, model, n_gpu_layers,
-            main_gpu, tensor_split,
-            use_mlock, progress_callback, progress_callback_user_data);
+            ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
+            params.progress_callback, params.progress_callback_user_data
+        );
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
         return false;
@@ -3150,6 +3199,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 static void llm_build_k_shift(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
         const llama_kv_cache & kv,
         struct ggml_cgraph * graph,
         llm_rope_type type,
@@ -3162,6 +3212,11 @@ static void llm_build_k_shift(
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
     const int64_t n_embd_head = hparams.n_embd_head();
+    const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
+    const float ext_factor = cparams.yarn_ext_factor;
+    const float attn_factor = cparams.yarn_attn_factor;
+    const float beta_fast = cparams.yarn_beta_fast;
+    const float beta_slow = cparams.yarn_beta_slow;

     GGML_ASSERT(n_embd_head % n_rot == 0);
@@ -3185,7 +3240,8 @@ static void llm_build_k_shift(
                         ggml_element_size(kv.k)*n_embd_head,
                         ggml_element_size(kv.k)*n_embd_gqa,
                         ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
-                    K_shift, n_rot, rope_type, 0, freq_base, freq_scale);
+                    K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
         cb(tmp, "K_shifted", il);
         ggml_build_forward_expand(graph, tmp);
     }
@@ -3442,12 +3498,17 @@ struct llm_build_context {

     const float freq_base;
     const float freq_scale;
+    const float ext_factor;
+    const float attn_factor;
+    const float beta_fast;
+    const float beta_slow;
     const float norm_eps;
     const float norm_rms_eps;

     const int32_t n_tokens;
     const int32_t n_kv;     // size of KV cache to consider (n_kv <= n_ctx)
     const int32_t kv_head;  // index of where we store new KV data in the cache
+    const int32_t n_orig_ctx;

     const bool do_rope_shift;
@@ -3477,11 +3538,16 @@ struct llm_build_context {
         n_embd_gqa (hparams.n_embd_gqa()),
         freq_base (cparams.rope_freq_base),
         freq_scale (cparams.rope_freq_scale),
+        ext_factor (cparams.yarn_ext_factor),
+        attn_factor (cparams.yarn_attn_factor),
+        beta_fast (cparams.yarn_beta_fast),
+        beta_slow (cparams.yarn_beta_slow),
         norm_eps (hparams.f_norm_eps),
         norm_rms_eps (hparams.f_norm_rms_eps),
         n_tokens (batch.n_tokens),
         n_kv (worst_case ? n_ctx : kv_self.n),
         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
+        n_orig_ctx (cparams.n_yarn_orig_ctx),
         do_rope_shift (worst_case || kv_self.has_shift),
         cb (cb),
         buf_compute (lctx.buf_compute) {
@@ -3532,7 +3598,7 @@ struct llm_build_context {

         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
         }

         for (int il = 0; il < n_layer; ++il) {
@@ -3556,10 +3622,18 @@ struct llm_build_context {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

-                Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Kcur, "Kcur", il);

                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
@@ -3634,7 +3708,7 @@ struct llm_build_context {

         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
         }

         for (int il = 0; il < n_layer; ++il) {
@@ -3658,8 +3732,16 @@ struct llm_build_context {

                 switch (model.type) {
                     case MODEL_7B:
-                        Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
-                        Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                        Qcur = ggml_rope_custom(
+                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                            n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                        Kcur = ggml_rope_custom(
+                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                            n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                        );
                         break;
                     case MODEL_13B:
                         Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
@@ -3746,7 +3828,7 @@ struct llm_build_context {

         // shift the entire K-cache if needed
         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
         }

         for (int il = 0; il < n_layer; ++il) {
@@ -3786,10 +3868,16 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

                 // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Kcur, "Kcur", il);

                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
@@ -3960,7 +4048,7 @@ struct llm_build_context {
         cb(KQ_mask, "KQ_mask", -1);

         if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
         }

         for (int il = 0; il < n_layer; ++il) {
@@ -4053,13 +4141,15 @@ struct llm_build_context {
                 cb(kpass, "kpass", il);

                 struct ggml_tensor * qrotated = ggml_rope_custom(
-                    ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
-                );
+                    ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(qrotated, "qrotated", il);

                 struct ggml_tensor * krotated = ggml_rope_custom(
-                    ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
-                );
+                    ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(krotated, "krotated", il);

                 // ggml currently only supports concatenation on dim=2
@@ -7883,8 +7973,13 @@ struct llama_context_params llama_context_default_params() {
        /*.n_batch =*/ 512,
        /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
        /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
+       /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
        /*.rope_freq_base =*/ 0.0f,
        /*.rope_freq_scale =*/ 0.0f,
+       /*.yarn_ext_factor =*/ NAN,
+       /*.yarn_attn_factor =*/ 1.0f,
+       /*.yarn_beta_fast =*/ 32.0f,
+       /*.yarn_beta_slow =*/ 1.0f,
        /*.mul_mat_q =*/ true,
        /*.f16_kv =*/ true,
        /*.logits_all =*/ false,
@@ -7971,10 +8066,7 @@ struct llama_model * llama_load_model_from_file(
         };
     }

-    if (!llama_model_load(path_model, *model, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split,
-                params.use_mmap, params.use_mlock, params.vocab_only,
-                params.progress_callback, params.progress_callback_user_data)) {
+    if (!llama_model_load(path_model, *model, params)) {
         LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
         delete model;
         return nullptr;
@@ -8000,13 +8092,35 @@ struct llama_context * llama_new_context_with_model(
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;

-    cparams.n_batch = params.n_batch;
-    cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
-    cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
-    cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
-    cparams.n_threads = params.n_threads;
-    cparams.n_threads_batch = params.n_threads_batch;
-    cparams.mul_mat_q = params.mul_mat_q;
+    cparams.n_batch          = params.n_batch;
+    cparams.n_threads        = params.n_threads;
+    cparams.n_threads_batch  = params.n_threads_batch;
+    cparams.yarn_ext_factor  = params.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor;
+    cparams.yarn_beta_fast   = params.yarn_beta_fast;
+    cparams.yarn_beta_slow   = params.yarn_beta_slow;
+    cparams.mul_mat_q        = params.mul_mat_q;
+
+    cparams.n_ctx            = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
+    cparams.rope_freq_base   = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
+    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+
+    cparams.n_yarn_orig_ctx  = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
+                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
+                                                              hparams.n_ctx_train;
+
+    auto rope_scaling_type = params.rope_scaling_type;
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
+        rope_scaling_type = hparams.rope_scaling_type_train;
+    }
+
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
+        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
+    }
+
+    if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
+    }

     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);