@@ -1,4 +1,5 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
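+// note: k6/x/sse is not part of k6 core; running this script requires a custom k6 binary built (e.g. with xk6) against an SSE extension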
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,10 @@ const data = new SharedArray('conversations', function () {
 
 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
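+// prompt processing speed in tokens/s, measured from request start to the first streamed event (see below)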
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,68 @@ export default function () {
             }
         ],
         "model": model,
-        "stream": false,
+        "stream": true,
         "seed": 42,
         "max_tokens": max_tokens
     }
 
-    const body = JSON.stringify(payload)
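+    // stream the completion over SSE instead of a blocking POST, so prompt processing and token generation can be timed separately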
+    const params = {method: 'POST', body: JSON.stringify(payload)}
+
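+    // startTime marks the request start; promptEvalEndTime is set when the first event arrives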
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
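+            // the first SSE event signals that prompt processing has finished and generation has started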
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
 
-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
-    })
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
 
-    check(res, {'success completion': (r) => r.status === 200})
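+            // token counts are read from the usage object when the server includes one in a chunk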
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
 
-    if (res.status === 200) {
-        const completions = res.json()
+        client.on('error', function (e) {
+            console.error('An unexpected error occurred: ', e.error())
+            throw e
+        })
+    })
 
-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+    check(res, {'success completion': (r) => r.status === 200})
 
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+    const endTime = new Date()
 
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
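+    // prompt processing speed: prompt tokens divided by time to first event, scaled to tokens/s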
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / promptEvalTime * 1.e3)
+    }
 
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
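+    // generation speed: completion tokens divided by the time from first event to end of stream, in tokens/s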
+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
     }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
 
     sleep(0.3)
 }