@@ -929,23 +929,22 @@ static const size_t kB = 1024;
 static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab     = 32000;
-    uint32_t n_ctx_train = 2048; // the context size used during training
-    uint32_t n_ctx       = 512;  // the context size used during inference
-    uint32_t n_embd      = 4096;
-    uint32_t n_head      = 32;
-    uint32_t n_head_kv   = 32;
-    uint32_t n_layer     = 32;
-    uint32_t n_rot       = 64;
-    uint32_t n_ff        = 11008;
-
-    float f_norm_eps     = 1e-5;
-    float f_norm_rms_eps = 1e-5;
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -1076,7 +1075,7 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab   vocab;
 
     struct ggml_tensor * tok_embeddings;
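The two hunks above are linked: once the compile-time LLaMA-7B defaults are gone, llama_hparams must be value-initialized at its single point of use, which is what the `= {}` does. It matters because operator!= compares raw bytes via memcmp, so a default-constructed instance (indeterminate field values, plus any padding bytes) would make comparisons nondeterministic. A minimal sketch of the pitfall, using a hypothetical stand-in struct rather than llama_hparams itself:

#include <cstring>

// Hypothetical stand-in for llama_hparams; illustration only.
struct hparams_like {
    unsigned n_vocab;
    float    rope_freq_base;

    // Byte-wise inequality, mirroring llama_hparams::operator!=
    bool operator!=(const hparams_like & other) const {
        return static_cast<bool>(std::memcmp(this, &other, sizeof(hparams_like)));
    }
};

int main() {
    hparams_like a = {}; // zero-initialized: every byte is well defined
    hparams_like b = {};
    return (a != b) ? 1 : 0; // reliably 0; with a plain `hparams_like a;`
                             // the comparison would read indeterminate bytes
}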
@@ -1674,28 +1673,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    // TODO: manually setting rope freq base and scale should override this
-    // FIXME: partial fix when the param specified is not the default value, but
-    //        will not work for overriding the model value to the params default
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
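The restructured loader gives 0.0f the role of an "unset" sentinel: the GGUF keys are consulted only while the parameter is still zero, so any explicit non-zero value supplied through llama_context_params wins unconditionally. That makes the removed TODO/FIXME heuristic, which could not tell "user picked the default" apart from "user left it unset", unnecessary. A sketch of the resulting precedence, with a hypothetical helper standing in for the GGUF_GET_KEY lookup:

#include <cstdio>

// Hypothetical stand-in for the optional GGUF_GET_KEY lookup; returns
// true and writes *out only when the model file stores the key.
static bool model_rope_freq_base(float * out) { (void)out; return false; }

// Mirrors the precedence in llm_load_hparams above:
//   explicit non-zero user value > model value > hard-coded fallback
static float resolve_rope_freq_base(float user_value) {
    if (user_value != 0.0f) {
        return user_value;        // explicit override always wins
    }
    float v = 10000.0f;           // fallback when the key is absent
    model_rope_freq_base(&v);     // overwrite only if the model stores one
    return v;
}

int main() {
    std::printf("%g\n", resolve_rope_freq_base(0.0f));       // 10000 here (no key)
    std::printf("%g\n", resolve_rope_freq_base(1000000.0f)); // 1e+06: user wins
    return 0;
}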
@@ -6188,8 +6176,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 10000.0f,
-        /*.rope_freq_scale             =*/ 1.0f,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
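With the API defaults switched to 0.0f, callers that never touch the RoPE fields inherit whatever the model file specifies, and an explicit value, even one equal to the old default of 10000.0f, now pins the parameter regardless of the model metadata. A hedged usage sketch: the struct, function, and field names come from this patch, while the numeric overrides are purely illustrative:

#include "llama.h" // assumes the llama.cpp API as of this patch

int main() {
    llama_context_params params = llama_context_default_params();
    // params.rope_freq_base == 0.0f and params.rope_freq_scale == 0.0f,
    // so llm_load_hparams() reads LLM_KV_ROPE_FREQ_BASE and
    // LLM_KV_ROPE_SCALE_LINEAR from the GGUF file, falling back to
    // 10000.0f and 1.0f when those keys are absent.

    // Non-zero values skip the model metadata entirely:
    params.rope_freq_base  = 10000.0f; // pin the base even if the model differs
    params.rope_freq_scale = 0.5f;     // e.g. 2x linear context extension
    return 0;
}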