
server : refactor multitask handling (#9274)

* server : remove multitask from server_task

* refactor completions handler

* fix embeddings

* use res_ok everywhere

* small change for handle_slots_action

* use unordered_set everywhere

* (try) fix test

* no more "mutable" lambda

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* use deque (see the sketch below this message)

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
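
The "use unordered_set everywhere" and "use deque" items describe a container change rather than new behaviour. A minimal sketch of that pattern, using simplified, illustrative names (the trimmed-down server_task and task_queue below are not the actual server.cpp definitions): a deque holds pending tasks in FIFO order, and an unordered_set tracks the task ids a client is still waiting on.

    // Sketch only: simplified names, not the actual server.cpp structures.
    #include <deque>
    #include <mutex>
    #include <unordered_set>

    struct server_task {
        int id = -1;
        // ... request payload omitted ...
    };

    struct task_queue {
        std::mutex              mutex;
        std::deque<server_task> tasks;       // FIFO of pending tasks; cheap push_back/pop_front
        std::unordered_set<int> waiting_ids; // ids still awaited by a client; O(1) lookup/erase

        void post(server_task task) {
            std::lock_guard<std::mutex> lock(mutex);
            waiting_ids.insert(task.id);
            tasks.push_back(std::move(task));
        }

        bool pop(server_task & out) {
            std::lock_guard<std::mutex> lock(mutex);
            if (tasks.empty()) {
                return false;
            }
            out = std::move(tasks.front());
            tasks.pop_front();
            return true;
        }
    };

A deque keeps popping from the front O(1) without the shift cost of erasing the first element of a vector, which matches the intent of the "use deque" item.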
Xuan Son Nguyen 1 year ago
parent
commit
6e7d133a5f

File diff suppressed because it is too large
+ 287 - 408
examples/server/server.cpp


+ 2 - 2
examples/server/tests/features/parallel.feature

@@ -52,8 +52,8 @@ Feature: Parallel
     Then all prompts are predicted with <n_predict> tokens
     Examples:
       | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
+      | disabled  | 200       |
+      | enabled   | 200       |
 
   Scenario Outline: Multi users OAI completions compatibility no v1
     Given a system prompt You are a writer.

+ 1 - 1
examples/server/tests/features/steps/steps.py

@@ -818,7 +818,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
     for prompt_no in range(context.n_prompts):
         shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
         context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
-    await asyncio.sleep(0.1)
+    await asyncio.sleep(0.01)
 
 
 @step('the slot {slot_id:d} is saved with filename "{filename}"')

+ 3 - 0
examples/server/tests/features/wrong_usages.feature

@@ -8,9 +8,12 @@ Feature: Wrong usage of llama.cpp server
   Scenario: Infinite loop
     Given a server listening on localhost:8080
     And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   42 as server seed
+    And   2048 KV cache size
     # Uncomment below to fix the issue
     #And   64 server max tokens to predict
     Then  the server is starting
+    Then  the server is healthy
     Given a prompt:
       """
       Go to: infinite loop

+ 33 - 0
examples/server/utils.hpp

@@ -3,6 +3,14 @@
 #include "llama.h"
 #include "common.h"
 
+#ifndef NDEBUG
+// crash the server in debug mode, otherwise send an http 500 error
+#define CPPHTTPLIB_NO_EXCEPTIONS 1
+#endif
+// increase max payload length to allow use of larger context size
+#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+#include "httplib.h"
+
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
@@ -279,6 +287,18 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
     return std::string::npos;
 }
 
+static bool json_is_array_of_numbers(json data) {
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number()) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
@@ -343,6 +363,19 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
     return out;
 }
 
+static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
+    const std::string str =
+        std::string(event) + ": " +
+        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+        "\n\n";
+
+    LOG_VERBOSE("data stream", {
+        { "to_send", str }
+    });
+
+    return sink.write(str.c_str(), str.size());
+}
+
 //
 // OAI utils
 //
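
As a usage note for the new server_sent_event() helper: it is meant to be called from inside an httplib chunked content provider, which supplies the DataSink. A minimal sketch, assuming a bare httplib::Server and a placeholder route and payload (not the server's actual handlers):

    // Sketch only: placeholder route and payload, not the real completions handler.
    httplib::Server svr;
    svr.Get("/stream-demo", [](const httplib::Request &, httplib::Response & res) {
        res.set_chunked_content_provider("text/event-stream",
            [](size_t /* offset */, httplib::DataSink & sink) {
                json chunk = {{"content", "hello"}};
                if (!server_sent_event(sink, "data", chunk)) {
                    return false; // write failed, e.g. the client disconnected
                }
                sink.done();      // close the SSE stream after the single event
                return true;
            });
    });

Returning false from the provider tells httplib to stop calling it, which is why server_sent_event() forwards the return value of sink.write().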

Some files were not shown because too many files changed in this diff