@@ -929,23 +929,22 @@ static const size_t kB = 1024;
 static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab     = 32000;
-    uint32_t n_ctx_train = 2048; // the context size used during training
-    uint32_t n_ctx       = 512;  // the context size used during inference
-    uint32_t n_embd      = 4096;
-    uint32_t n_head      = 32;
-    uint32_t n_head_kv   = 32;
-    uint32_t n_layer     = 32;
-    uint32_t n_rot       = 64;
-    uint32_t n_ff        = 11008;
-
-    float f_norm_eps     = 1e-5;
-    float f_norm_rms_eps = 1e-5;
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -1076,7 +1075,7 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab   vocab;
 
     struct ggml_tensor * tok_embeddings;
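The two hunks above are linked: once the compile-time LLaMA-7B defaults are gone, llama_hparams must be value-initialized at its single point of use, which is what the `= {}` does. It matters because operator!= compares raw bytes via memcmp, so a default-constructed instance (indeterminate field values, plus any padding bytes) would make comparisons nondeterministic. A minimal sketch of the pitfall, using a hypothetical stand-in struct rather than llama_hparams itself:

#include <cstring>

// Hypothetical stand-in for llama_hparams; illustration only.
struct hparams_like {
    unsigned n_vocab;
    float    rope_freq_base;

    // Byte-wise inequality, mirroring llama_hparams::operator!=
    bool operator!=(const hparams_like & other) const {
        return static_cast<bool>(std::memcmp(this, &other, sizeof(hparams_like)));
    }
};

int main() {
    hparams_like a = {}; // zero-initialized: every byte is well defined
    hparams_like b = {};
    return (a != b) ? 1 : 0; // reliably 0; with a plain `hparams_like a;`
                             // the comparison would read indeterminate bytes
}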
@@ -1674,28 +1673,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    // TODO: manually setting rope freq base and scale should override this
-    // FIXME: partial fix when the param specified is not the default value, but
-    //        will not work for overriding the model value to the params default
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
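The restructured loader gives 0.0f the role of an "unset" sentinel: the GGUF keys are consulted only while the parameter is still zero, so any explicit non-zero value supplied through llama_context_params wins unconditionally. That makes the removed TODO/FIXME heuristic, which could not tell "user picked the default" apart from "user left it unset", unnecessary. A sketch of the resulting precedence, with a hypothetical helper standing in for the GGUF_GET_KEY lookup:

#include <cstdio>

// Hypothetical stand-in for the optional GGUF_GET_KEY lookup; returns
// true and writes *out only when the model file stores the key.
static bool model_rope_freq_base(float * out) { (void)out; return false; }

// Mirrors the precedence in llm_load_hparams above:
//   explicit non-zero user value > model value > hard-coded fallback
static float resolve_rope_freq_base(float user_value) {
    if (user_value != 0.0f) {
        return user_value;        // explicit override always wins
    }
    float v = 10000.0f;           // fallback when the key is absent
    model_rope_freq_base(&v);     // overwrite only if the model stores one
    return v;
}

int main() {
    std::printf("%g\n", resolve_rope_freq_base(0.0f));       // 10000 here (no key)
    std::printf("%g\n", resolve_rope_freq_base(1000000.0f)); // 1e+06: user wins
    return 0;
}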
@@ -6188,8 +6176,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 10000.0f,
-        /*.rope_freq_scale             =*/ 1.0f,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
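With the API defaults switched to 0.0f, callers that never touch the RoPE fields inherit whatever the model file specifies, and an explicit value, even one equal to the old default of 10000.0f, now pins the parameter regardless of the model metadata. A hedged usage sketch: the struct, function, and field names come from this patch, while the numeric overrides are purely illustrative:

#include "llama.h" // assumes the llama.cpp API as of this patch

int main() {
    llama_context_params params = llama_context_default_params();
    // params.rope_freq_base == 0.0f and params.rope_freq_scale == 0.0f,
    // so llm_load_hparams() reads LLM_KV_ROPE_FREQ_BASE and
    // LLM_KV_ROPE_SCALE_LINEAR from the GGUF file, falling back to
    // 10000.0f and 1.0f when those keys are absent.

    // Non-zero values skip the model metadata entirely:
    params.rope_freq_base  = 10000.0f; // pin the base even if the model differs
    params.rope_freq_scale = 0.5f;     // e.g. 2x linear context extension
    return 0;
}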