před 2 roky · 5eaf9964fc
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -129,6 +129,8 @@ static void sampler_queue(
 
				     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
			
 
				 
			
 
				     const float         temp              = params.temp;
			
 
				+    const float         dynatemp_range    = params.dynatemp_range;
			
 
				+    const float         dynatemp_exponent = params.dynatemp_exponent;
			
 
				     const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
			
 
				     const float         top_p             = params.top_p;
			
 
				     const float         min_p             = params.min_p;
			
@@ -143,7 +145,15 @@ static void sampler_queue(
 
				             case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
			
 
				             case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
			
 
				             case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
			
 
				-            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
			
 
				+            case 't':
			
 
				+                if (dynatemp_range > 0) {
			
 
				+                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
			
 
				+                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
			
 
				+                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
			
 
				+                } else {
			
 
				+                    llama_sample_temp(ctx_main, &cur_p, temp);
			
 
				+                }
			
 
				+                break;
			
 
				             default : break;
			
 
				         }
			
 
				     }
			
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
 
				     float       tfs_z                 = 1.00f;    // 1.0 = disabled
			
 
				     float       typical_p             = 1.00f;    // 1.0 = disabled
			
 
				     float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
			
 
				+    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
			
 
				+    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
			
 
				     int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
			
 
				     float       penalty_repeat        = 1.10f;    // 1.0 = disabled
			
 
				     float       penalty_freq          = 0.00f;    // 0.0 = disabled
			
--- a/llama.cpp
+++ b/llama.cpp
@@ -8151,6 +8151,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
 
				     }
			
 
				 }
			
 
				 
			
 
				+// Dynamic-temperature ("entropy") sampler.
				+//
				+// Measures how uncertain the current candidate distribution is and picks a
				+// temperature between min_temp and max_temp accordingly:
				+//
				+//     dyn_temp = min_temp + (max_temp - min_temp) * (entropy / max_entropy) ^ exponent_val
				+//
				+// where entropy is the Shannon entropy of the softmax probabilities and
				+// max_entropy = log(candidates_p->size) is the entropy of a uniform distribution.
				+// Every logit is then divided by dyn_temp and the probabilities are recomputed.
				+//
				+// Parameters:
				+//   ctx          - may be NULL; only used to accumulate the elapsed time in t_sample_us
				+//   candidates_p - candidate array, modified in place (left untouched if size <= 1)
				+//   min_temp     - temperature used when the normalized entropy is 0
				+//   max_temp     - temperature used when the normalized entropy is 1
				+//   exponent_val - exponent applied to the normalized entropy before interpolation
				+//
				+// If the resulting temperature is not strictly positive (min_temp == 0 with zero
				+// entropy, powf() underflowing to 0, or a NaN result), dividing the logits by it
				+// would fill the array with inf/NaN. That case is handled as the zero-temperature
				+// limit: only the candidate with the highest logit keeps a finite logit and gets p = 1.
				+void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
				+    const int64_t t_start_sample_us = ggml_time_us();
				+
				+    // no need to do anything if there is only one (or zero) candidates
				+    if (candidates_p->size <= 1) {
				+        return;
				+    }
				+
				+    // Calculate maximum possible entropy (uniform distribution over all candidates)
				+    const float max_entropy = -logf(1.0f / candidates_p->size);
				+
				+    llama_sample_softmax(nullptr, candidates_p);
				+
				+    // Calculate entropy of the softmax probabilities
				+    float entropy = 0.0f;
				+    for (size_t i = 0; i < candidates_p->size; ++i) {
				+        const float prob = candidates_p->data[i].p;
				+        if (prob > 0.0f) { // Ensure no log(0)
				+            entropy -= prob * logf(prob);
				+        }
				+    }
				+
				+    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
				+    const float normalized_entropy = entropy / max_entropy;
				+
				+    // Map the normalized entropy to the desired temperature range using the power function
				+    const float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
				+
				+#ifdef DEBUG
				+    LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
				+    LLAMA_LOG_INFO("Entropy: %f\n", entropy);
				+    LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
				+    LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
				+    LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
				+    LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
				+#endif
				+
				+    // Written as a negated comparison so that a NaN temperature also takes this branch
				+    if (!(dyn_temp > 0.0f)) {
				+        // Zero-temperature limit: keep only the highest-logit candidate.
				+        // The maximum is searched explicitly so this branch does not depend on the array order.
				+        size_t best = 0;
				+        for (size_t i = 1; i < candidates_p->size; ++i) {
				+            if (candidates_p->data[i].logit > candidates_p->data[best].logit) {
				+                best = i;
				+            }
				+        }
				+        for (size_t i = 0; i < candidates_p->size; ++i) {
				+            if (i == best) {
				+                candidates_p->data[i].p = 1.0f;
				+            } else {
				+                candidates_p->data[i].logit = -INFINITY;
				+                candidates_p->data[i].p     = 0.0f;
				+            }
				+        }
				+    } else {
				+        // Apply the dynamically calculated temperature scaling
				+        for (size_t i = 0; i < candidates_p->size; ++i) {
				+            candidates_p->data[i].logit /= dyn_temp;
				+        }
				+
				+        // Re-compute softmax probabilities after scaling logits with dynamic temperature.
				+        // NOTE(review): data[0] is used as the maximum logit, which presumably relies on
				+        // llama_sample_softmax() leaving the candidates sorted in descending order - confirm.
				+        const double max_l_double = candidates_p->data[0].logit;
				+        double cum_sum_double = 0.0;
				+        for (size_t i = 0; i < candidates_p->size; ++i) {
				+            const double p = exp(candidates_p->data[i].logit - max_l_double);
				+            candidates_p->data[i].p = p; // Store the scaled probability
				+            cum_sum_double += p;
				+        }
				+        for (size_t i = 0; i < candidates_p->size; ++i) {
				+            candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
				+        }
				+    }
				+
				+#ifdef DEBUG
				+    // Print the updated top 25 probabilities after temperature scaling
				+    LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
				+    for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
				+        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
				+    }
				+#endif
				+
				+    if (ctx) {
				+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
				+    }
				+}
			
 
				+
			
 
				 void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
			
 
				     const int64_t t_start_sample_us = ggml_time_us();
			
 
				 
			
--- a/llama.h
+++ b/llama.h
@@ -775,6 +775,14 @@ extern "C" {
 
				                            float   p,
			
 
				                           size_t   min_keep);
			
 
				 
			
 
				+    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
			
 
				+    /// The entropy of the softmax distribution over the candidates is normalized by the maximum possible
			
 
				+    /// entropy (log of the candidate count) and mapped to a temperature
			
 
				+    /// min_temp + (max_temp - min_temp) * normalized_entropy^exponent_val; the logits are divided by that
			
 
				+    /// temperature and the probabilities are recomputed. Arrays with fewer than two candidates are left untouched.
			
 
				+    /// @param ctx Context whose sampling-time counter is updated; may be NULL.
			
 
				+    /// @param candidates_p Candidate tokens; logits and probabilities are modified in place.
			
 
				+    /// @param min_temp Temperature used when the normalized entropy is 0.
			
 
				+    /// @param max_temp Temperature used when the normalized entropy is 1.
			
 
				+    /// @param exponent_val Exponent applied to the normalized entropy before it is mapped to the temperature range.
			
 
				+    LLAMA_API void llama_sample_entropy(
			
 
				+            struct llama_context * ctx,
			
 
				+          llama_token_data_array * candidates_p,
			
 
				+                           float   min_temp,
			
 
				+                           float   max_temp,
			
 
				+                           float   exponent_val);
			
 
				+
			
 
				     LLAMA_API void llama_sample_temp(
			
 
				             struct llama_context * ctx,
			
 
				           llama_token_data_array * candidates,