
speculative : update default params (#11954)

* speculative : update default params

* speculative : do not discard the last drafted token
Georgi Gerganov, 11 months ago
Commit abd4d0bc4f
4 changed files with 9 additions and 9 deletions
  1. common/common.h (+2 -2)
  2. common/speculative.cpp (+5 -5)
  3. common/speculative.h (+1 -1)
  4. examples/server/server.cpp (+1 -1)

common/common.h (+2 -2)

@@ -178,10 +178,10 @@ struct common_params_speculative {
 
     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
     int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float   p_split      =  0.1f; // speculative decoding split probability
-    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
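
The two relaxed defaults work together: p_min is the confidence the draft model's top token needs for drafting to continue, and n_min (per the comment above, the minimum number of draft tokens to use) is how large a draft must be before it is worth a speculative verification pass at all. Lowering them to 0.75 and 0 makes speculation kick in more often. Below is a minimal self-contained sketch of that interplay, not the library's actual control flow; sample_top is a hypothetical stand-in sampler, not a real llama.cpp function:

    // Minimal sketch of how n_min and p_min gate drafting (hypothetical code).
    #include <utility>
    #include <vector>

    std::pair<int, float> sample_top() {
        // hypothetical stand-in for the draft model's sampler:
        // returns (token id, top-token probability)
        return { 0, 0.8f };
    }

    struct spec_params {
        int   n_max = 16;    // maximum tokens to draft
        int   n_min = 0;     // was 5: drafts of any size are now used
        float p_min = 0.75f; // was 0.9f: less-confident tokens are now kept
    };

    std::vector<int> collect_draft(const spec_params & params) {
        std::vector<int> draft;
        for (int i = 0; i < params.n_max; ++i) {
            auto [id, p] = sample_top();
            draft.push_back(id); // the token itself is always kept
            if (p < params.p_min) {
                break;           // low confidence only stops further drafting
            }
        }
        if ((int) draft.size() < params.n_min) {
            draft.clear();       // too small to be worth a verification pass
        }
        return draft;
    }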

common/speculative.cpp (+5 -5)

@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
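
Taken together, the two hunks move the p_min check in common_speculative_gen_draft from before the token is accepted to after it has been pushed into the result. That is what "do not discard the last drafted token" means: a token that falls below p_min is still kept, its low confidence only stops further drafting. A self-contained toy comparison of the two orderings, with made-up probabilities standing in for cur_p->data[0].p:

    // Toy before/after comparison; llama.cpp types are stubbed out.
    #include <cstdio>
    #include <vector>

    static const float probs[] = { 0.95f, 0.90f, 0.70f, 0.99f }; // fake top-token probs

    std::vector<int> draft(bool keep_last, float p_min) {
        std::vector<int> result;
        for (int i = 0; i < 4; ++i) {
            if (!keep_last && probs[i] < p_min) {
                break;           // old order: the low-confidence token is discarded
            }
            result.push_back(i); // stand-in for the drafted token id
            if (keep_last && probs[i] < p_min) {
                break;           // new order: the token is kept, drafting stops
            }
        }
        return result;
    }

    int main() {
        printf("old: %zu tokens\n", draft(false, 0.75f).size()); // -> 2
        printf("new: %zu tokens\n", draft(true,  0.75f).size()); // -> 3
    }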

common/speculative.h (+1 -1)

@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16;  // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.9f; // min probability required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

examples/server/server.cpp (+1 -1)

@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
         // Use OpenAI API logprobs only if n_probs wasn't provided
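
With the floor lowered from 2 to 0, a per-request "speculative.n_min" of 0 or 1 is now honored instead of being silently bumped to 2 (it is still capped at n_max first, as the line above it shows). A self-contained sketch of the clamp with made-up request values:

    // Toy demonstration of the relaxed clamp; the request values are invented.
    #include <algorithm>
    #include <cstdio>

    int clamp_n_min(int n_min, int n_max, int floor_val) {
        n_min = std::min(n_max, n_min);    // n_min may never exceed n_max
        return std::max(n_min, floor_val); // then apply the lower bound
    }

    int main() {
        // a request asking for n_min = 1 with the default n_max = 16:
        printf("old floor: %d\n", clamp_n_min(1, 16, 2)); // -> 2 (forced up)
        printf("new floor: %d\n", clamp_n_min(1, 16, 0)); // -> 1 (honored)
    }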