2 weeks ago · f5f8812f7c
--- a/include/llama.h
+++ b/include/llama.h
@@ -1292,7 +1292,9 @@ extern "C" {
 
				     // available samplers:
			
 
				 
			
 
				     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
			
 
				-    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
			
 
				+
			
 
				+    /// Pass seed == LLAMA_DEFAULT_SEED to use a random seed.
			
 
				+    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
			
 
				 
			
 
				     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
			
 
				     /// Setting k <= 0 makes this a noop
			
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2142,7 +2142,7 @@ struct llama_sampler_xtc {
 
				     const uint32_t seed;
			
 
				     uint32_t       seed_cur;
			
 
				 
			
 
				-    std::mt19937    rng;
			
 
				+    std::mt19937   rng;
			
 
				 };
			
 
				 
			
 
				 static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
			
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -4,7 +4,6 @@
 
				 #include "server-task.h"
			
 
				 #include "server-queue.h"
			
 
				 
			
 
				-#include "arg.h"
			
 
				 #include "common.h"
			
 
				 #include "llama.h"
			
 
				 #include "log.h"
			
@@ -16,7 +15,6 @@
 
				 #include <cstddef>
			
 
				 #include <cinttypes>
			
 
				 #include <memory>
			
 
				-#include <unordered_set>
			
 
				 #include <filesystem>
			
 
				 
			
 
				 // fix problem with std::min and std::max
			
@@ -2927,9 +2925,14 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
 
				             if (task.params.n_cmpl > 1) {
			
 
				                 task.n_children = task.params.n_cmpl - 1;
			
 
				                 for (size_t j = 0; j < task.n_children; j++) {
			
 
				-                    server_task child = task.create_child(
			
 
				-                        task.id,
			
 
				-                        rd.get_new_id());
			
 
				+                    server_task child = task.create_child(task.id, rd.get_new_id());
			
 
				+
			
 
				+                    // use a different sampling seed for each child
			
 
				+                    // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
			
 
				+                    if (child.params.sampling.seed != LLAMA_DEFAULT_SEED) {
			
 
				+                        child.params.sampling.seed += j + 1;
			
 
				+                    }
			
 
				+
			
 
				                     tasks.push_back(std::move(child));
			
 
				                 }
			
 
				             }
			
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -503,5 +503,4 @@ def test_chat_completions_multiple_choices():
 
				     assert len(res.body["choices"]) == 2
			
 
				     for choice in res.body["choices"]:
			
 
				         assert "assistant" == choice["message"]["role"]
			
 
				-        assert match_regex("Suddenly", choice["message"]["content"])
			
 
				         assert choice["finish_reason"] == "length"