
server: tests: add truncated prompt tests, better kv cache size (#5933)

* server: tests: add truncated prompt tests, better size

* server, tests : update regex

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Pierrick Hymbert 1 year ago
commit fd72d2d2a5

+ 19 - 4
examples/server/server.cpp

@@ -1128,6 +1128,7 @@ struct server_context {

            LOG_VERBOSE("stopped by limit", {
                {"id_slot",   slot.id},
+                {"id_task",   slot.id_task},
                {"n_decoded", slot.n_decoded},
                {"n_predict", slot.params.n_predict},
            });
@@ -1141,6 +1142,8 @@ struct server_context {
        }

        LOG_VERBOSE("next token", {
+            {"id_slot",        slot.id},
+            {"id_task",        slot.id_task},
            {"token",          result.tok},
            {"token_text",     tokens_to_output_formatted_string(ctx, result.tok)},
            {"has_next_token", slot.has_next_token},
@@ -1750,6 +1753,15 @@ struct server_context {
                        slot.n_past = 0;
                        slot.n_prompt_tokens = prompt_tokens.size();

+                        LOG_VERBOSE("prompt tokenized", {
+                            {"id_slot",         slot.id},
+                            {"id_task",         slot.id_task},
+                            {"n_ctx",           slot.n_ctx},
+                            {"n_keep",          slot.params.n_keep},
+                            {"n_prompt_tokens", slot.n_prompt_tokens},
+                            {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+                        });
+
                        if (slot.embedding) {
                            // this prompt is too large to process - discard it
                            if (slot.n_prompt_tokens > n_batch) {
@@ -1788,10 +1800,13 @@ struct server_context {
                                slot.n_prompt_tokens = prompt_tokens.size();

                                LOG_VERBOSE("input truncated", {
-                                    {"n_ctx",         slot.n_ctx},
-                                    {"n_keep",        slot.params.n_keep},
-                                    {"n_left",        n_left},
-                                    {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+                                    {"id_slot",         slot.id},
+                                    {"id_task",         slot.id_task},
+                                    {"n_ctx",           slot.n_ctx},
+                                    {"n_keep",          slot.params.n_keep},
+                                    {"n_left",          n_left},
+                                    {"n_prompt_tokens", slot.n_prompt_tokens},
+                                    {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
                                });

                                GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
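
The added `id_slot` and `id_task` fields let verbose log events from concurrent requests be correlated. Purely as an illustration (the key names come from the diff above; the surrounding JSON layout and the concrete values are assumptions), an "input truncated" event now carries enough context to tell which request was trimmed and to what size:

```python
# Hypothetical shape of an "input truncated" verbose log event after this
# change. Key names match the LOG_VERBOSE call above; layout and values
# are illustrative only.
import json

event = {
    "msg": "input truncated",
    "id_slot": 0,            # slot that is processing the request
    "id_task": 42,           # queued task the slot is serving
    "n_ctx": 128,            # per-slot context size (assumed value)
    "n_keep": 0,             # tokens always kept at the start of the prompt
    "n_left": 128,           # n_ctx - n_keep
    "n_prompt_tokens": 109,  # prompt size after truncation
}
print(json.dumps(event, indent=2))
```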

+ 3 - 2
examples/server/tests/features/parallel.feature

@@ -6,8 +6,8 @@ Feature: Parallel
    Given a server listening on localhost:8080
    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And   42 as server seed
-    And   512 as batch size
-    And   64 KV cache size
+    And   128 as batch size
+    And   256 KV cache size
    And   2 slots
    And   continuous batching
    Then  the server is starting
@@ -76,6 +76,7 @@ Feature: Parallel
      | disabled  | 128       |
      | enabled   | 64        |

+
  Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
    Given a prompt:
      """

+ 30 - 11
examples/server/tests/features/server.feature

@@ -10,11 +10,10 @@ Feature: llama.cpp server
      # KV Cache corresponds to the total amount of tokens
      # that can be stored across all independent sequences: #4130
      # see --ctx-size and #5568
-    And   32 KV cache size
-    And   512 as batch size
-    And   1 slots
-    And   embeddings extraction
-    And   32 server max tokens to predict
+    And   256 KV cache size
+    And   32 as batch size
+    And   2 slots
+    And   64 server max tokens to predict
    And   prometheus compatible metrics exposed
    Then  the server is starting
    Then  the server is healthy
@@ -23,18 +22,35 @@ Feature: llama.cpp server
    Then the server is ready
    And  all slots are idle

+
  Scenario Outline: Completion
    Given a prompt <prompt>
    And   <n_predict> max tokens to predict
    And   a completion request with no api error
    Then  <n_predicted> tokens are predicted matching <re_content>
+    And   the completion is <truncated> truncated
+    And   <n_prompt> prompt tokens are processed
    And   prometheus metrics are exposed
    And   metric llamacpp:tokens_predicted is <n_predicted>

    Examples: Prompts
-      | prompt                           | n_predict | re_content                       | n_predicted |
-      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
-      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |
+      | prompt                                                                    | n_predict | re_content                    | n_prompt | n_predicted | truncated |
+      | I believe the meaning of life is                                          | 8         | (read\|going)+                | 18       | 8           | not       |
+      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids)+ | 46       | 64          | not       |
+
+  Scenario: Completion prompt truncated
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And   a completion request with no api error
+    Then  64 tokens are predicted matching fun|Annaks|popcorns
+    And   the completion is  truncated
+    And   109 prompt tokens are processed
+

  Scenario Outline: OAI Compatibility
    Given a model <model>
@@ -44,11 +60,14 @@ Feature: llama.cpp server
    And   streaming is <enable_streaming>
    Given an OAI compatible chat completions request with no api error
    Then  <n_predicted> tokens are predicted matching <re_content>
+    And   <n_prompt> prompt tokens are processed
+    And   the completion is <truncated> truncated

    Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+           | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+ | 32          | enabled          |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_prompt | n_predicted | enable_streaming | truncated |
+      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+          | 77       | 8           | disabled         | not       |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird)+ | -1       | 64          | enabled          |           |
+

  Scenario: Tokenize / Detokenize
    When tokenizing:
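
The new truncation scenario reads two fields of the completion response that the test steps below rely on: the top-level `truncated` flag and `timings.prompt_n`. A minimal manual check against a server started like the Background above (localhost:8080, 256 KV cache, 2 slots) might look like this; the endpoint and field names follow the test steps, everything else is illustrative:

```python
# Sketch: reproduce the "Completion prompt truncated" check by hand against
# a running llama.cpp server (assumed to listen on localhost:8080).
import requests

long_prompt = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 30

resp = requests.post(
    "http://localhost:8080/completion",
    json={"prompt": long_prompt, "n_predict": 64},
).json()

print("truncated:", resp["truncated"])                # expected True for an oversized prompt
print("prompt tokens:", resp["timings"]["prompt_n"])  # tokens actually kept after truncation
```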

+ 29 - 6
examples/server/tests/features/steps/steps.py

@@ -196,12 +196,30 @@ async def step_request_completion(context, api_error):

@step(u'{predicted_n:d} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n, re_content)


@step(u'{predicted_n:d} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n)
+
+
+@step(u'the completion is  truncated')
+def step_assert_completion_truncated(context):
+    step_assert_completion_truncated(context, '')
+
+
+@step(u'the completion is {truncated} truncated')
+def step_assert_completion_truncated(context, truncated):
+    truncated = truncated != "not"
+    assert context.completion['truncated'] == truncated, f'{context.completion}'
+
+
+@step(u'{n_prompt:d} prompt tokens are processed')
+def step_impl(context, n_prompt):
+    assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"


@step(u'a user prompt {user_prompt}')
@@ -722,7 +740,8 @@ async def oai_chat_completions(user_prompt,
    completion_response = {
        'content': '',
        'timings': {
-            'predicted_n': 0
+            'predicted_n': 0,
+            'prompt_n': 0
        }
    }
    if async_client:
@@ -763,7 +782,8 @@ async def oai_chat_completions(user_prompt,
                        completion_response = {
                            'content': chat_completion_raw['choices'][0]['message'],
                            'timings': {
-                                'predicted_n': chat_completion_raw['usage']['completion_tokens']
+                                'predicted_n': chat_completion_raw['usage']['completion_tokens'],
+                                'prompt_n': chat_completion_raw['usage']['prompt_tokens']
                            }
                        }
                    else:
@@ -792,13 +812,16 @@ async def oai_chat_completions(user_prompt,
                if 'content' in delta:
                    completion_response['content'] += delta['content']
                    completion_response['timings']['predicted_n'] += 1
+                completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
        else:
            assert len(chat_completion.choices) == 1
            completion_response = {
                'content': chat_completion.choices[0].message.content,
                'timings': {
-                    'predicted_n': chat_completion.usage.completion_tokens
-                }
+                    'predicted_n': chat_completion.usage.completion_tokens,
+                    'prompt_n': chat_completion.usage.prompt_tokens
+                    },
+                'truncated': chat_completion.choices[0].finish_reason != 'stop'
            }
    if debug:
        print("OAI response formatted to llama.cpp:", completion_response)