@@ -189,12 +189,12 @@ xychart-beta
         "pp": {
             "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
         },
         "tg": {
             "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
         },
     }
     with open("results.github.env", 'a') as github_env:
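The guards added above keep summary generation from crashing with a KeyError when a Prometheus series was never scraped, reporting 0 instead. A minimal standalone sketch of the same pattern; the sample values and the safe_mean helper are hypothetical, not part of the script:

from statistics import mean

# Hypothetical scrape result: the 'prompt_tokens_seconds' series is missing.
prometheus_metrics = {'predicted_tokens_seconds': [21.3, 22.1, 20.8]}

def safe_mean(metrics, name):
    # Fall back to 0 when the series was never scraped, mirroring the
    # `... if name in metrics else 0` guard in the patch.
    return round(mean(metrics[name]), 2) if name in metrics else 0

print(safe_mean(prometheus_metrics, 'prompt_tokens_seconds'))     # 0
print(safe_mean(prometheus_metrics, 'predicted_tokens_seconds'))  # 21.4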
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
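The two new flags disable TCP connection reuse between requests and between VU iterations, so each request pays the connection setup cost instead of hitting a warm keep-alive socket, and --out csv streams every individual data point to k6-results.csv alongside the end-of-run summary JSON. A sketch of how that CSV could be post-processed, assuming k6's default CSV columns metric_name and metric_value:

import csv
from collections import defaultdict
from statistics import mean

# Group every streamed data point by metric; assumes k6's default CSV
# columns 'metric_name' and 'metric_value'.
samples = defaultdict(list)
with open('k6-results.csv', newline='') as f:
    for row in csv.DictReader(f):
        samples[row['metric_name']].append(float(row['metric_value']))

for name, values in sorted(samples.items()):
    print(f"{name}: n={len(values)} avg={mean(values):.2f}")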
@@ -231,7 +234,7 @@ def start_server(args):
     server_process = start_server_background(args)

     attempts = 0
-    max_attempts = 20
+    max_attempts = 600
     if 'GITHUB_ACTIONS' in os.environ:
         max_attempts *= 2

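With a 0.5 s sleep per attempt, raising max_attempts from 20 to 600 extends the startup timeout from roughly 10 s to 5 min (10 min on GitHub Actions, where max_attempts is doubled), presumably to give the server time to download the model itself now that the script no longer pre-fetches it (see the --model removal below).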
@@ -242,7 +245,15 @@ def start_server(args):
         print(f"bench: waiting for server to start ...")
         time.sleep(0.5)

-    print("bench: server started.")
+    attempts = 0
+    while not is_server_ready(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not ready"
+        print(f"bench: waiting for server to be ready ...")
+        time.sleep(0.5)
+
+    print("bench: server started and ready.")
     return server_process


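The readiness loop added here mirrors the listening loop right above it: first wait for the TCP port to open, then poll /health until it returns 200. If the duplication ever becomes a maintenance burden, the two loops could share a helper; a hypothetical sketch (wait_until is not in the script):

import time

def wait_until(predicate, max_attempts, delay_seconds=0.5, what="condition"):
    # Hypothetical helper unifying the two polling loops above.
    for _ in range(max_attempts):
        if predicate():
            return
        print(f"bench: waiting for {what} ...")
        time.sleep(delay_seconds)
    assert False, f"{what} not reached"

# Usage, preserving the listen-then-ready order of the patch:
# wait_until(lambda: is_server_listening(args.host, args.port), max_attempts, what="server to start")
# wait_until(lambda: is_server_ready(args.host, args.port), max_attempts, what="server to be ready")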
@@ -255,11 +266,6 @@ def start_server_background(args):
         '--host', args.host,
         '--port', args.port,
     ]
-    model_file = args.model_path_prefix + os.path.sep + args.hf_file
-    model_dir = os.path.dirname(model_file)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    server_args.extend(['--model', model_file])
     server_args.extend(['--hf-repo', args.hf_repo])
     server_args.extend(['--hf-file', args.hf_file])
     server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
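With the --model argument gone, the server resolves and downloads the GGUF itself from --hf-repo / --hf-file, so the script no longer needs to pre-create a local model directory. Illustratively, the argument vector built by start_server_background now looks like this (the host, port, repo, file, and layer values are hypothetical):

# Illustrative final argument vector; values are hypothetical.
server_args = [
    '--host', '127.0.0.1',
    '--port', '8080',
    '--hf-repo', 'ggml-org/models',
    '--hf-file', 'phi-2/ggml-model-q4_0.gguf',
    '--n-gpu-layers', '33',
]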
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
         return _is_server_listening


+def is_server_ready(server_fqdn, server_port):
+    url = f"http://{server_fqdn}:{server_port}/health"
+    response = requests.get(url)
+    return response.status_code == 200
+
+
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())

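The new is_server_ready helper assumes the /health endpoint is reachable, which holds here because it is only polled after is_server_listening has succeeded. A slightly more defensive variant, purely a suggestion and not part of the patch, would also tolerate connections dropped while the model is still loading:

import requests

def is_server_ready(server_fqdn, server_port):
    url = f"http://{server_fqdn}:{server_port}/health"
    try:
        # The health endpoint returns 200 once the model is loaded.
        return requests.get(url).status_code == 200
    except requests.exceptions.ConnectionError:
        return False

For reference, escape_metric_name uppercases a metric name and replaces every character outside [A-Z0-9] with an underscore, e.g.:

print(escape_metric_name("llamacpp:prompt_tokens_seconds"))  # LLAMACPP_PROMPT_TOKENS_SECONDS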