Browse Source

ci: bench: add more ftype, fix triggers and bot comment (#6466)

* ci: bench: change trigger path to not spawn on each PR

* ci: bench: add more file types for phi-2: q8_0 and f16.
- do not show the comment by default

* ci: bench: add seed parameter in k6 script

* ci: bench: artefact name for the perf job

* Add the iteration count to the commit status, further reduce the auto-comment

* ci: bench: add per slot metric in the commit status

* Fix trailing spaces
Pierrick Hymbert 1 year ago
parent
commit
7a2c92637a
3 changed files with 39 additions and 17 deletions
  1. 26 11
      .github/workflows/bench.yml
  2. 12 6
      examples/server/bench/bench.py
  3. 1 0
      examples/server/bench/script.js

+ 26 - 11
.github/workflows/bench.yml

@@ -24,10 +24,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
   pull_request_target:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
   schedule:
     -  cron: '04 2 * * *'
 
@@ -42,6 +42,16 @@ jobs:
       RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
       N_USERS: 8
       DURATION: 10m
+
+    strategy:
+      matrix:
+        model: [phi-2]
+        ftype: [q4_0, q8_0, f16]
+        include:
+          - model: phi-2
+            ftype: q4_0
+            pr_comment_enabled: "true"
+
     if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
     steps:
       - name: Clone
@@ -116,7 +126,7 @@ jobs:
               --scenario script.js \
               --duration ${{ github.event.inputs.duration || env.DURATION }} \
               --hf-repo ggml-org/models	 \
-              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
               --model-path-prefix /models \
               --parallel ${{ env.N_USERS }} \
               -ngl 33 \
@@ -134,7 +144,7 @@ jobs:
 
       - uses: actions/upload-artifact@v4
         with:
-          name: benchmark-results
+          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
           compression-level: 9
           path: |
             examples/server/bench/*.jpg
@@ -146,7 +156,7 @@ jobs:
         with:
           authToken: ${{secrets.GITHUB_TOKEN}}
           sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-baseline
+          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
           description: |
             ${{ env.BENCH_RESULTS }}
           state: 'success'
@@ -203,11 +213,19 @@ jobs:
       - name: Comment PR
         uses: mshick/add-pr-comment@v2
         id: comment_pr
-        if: ${{ github.event.pull_request != '' }}
+        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
         with:
-          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
+          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
           message: |
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+            <p align="center">
+
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            </p>
+
+            <details>
+
+            <summary>Expand details for performance related PR only</summary>
 
             - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
             - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
@@ -215,9 +233,6 @@ jobs:
             - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
             - ${{ env.BENCH_GRAPH_XLABEL }}
 
-            <details>
-
-            <summary>Time series</summary>
 
             <p align="center">
 

+ 12 - 6
examples/server/bench/bench.py

@@ -16,6 +16,7 @@ import matplotlib
 import matplotlib.dates
 import matplotlib.pyplot as plt
 import requests
+from statistics import mean
 
 
 def main(args_in: list[str] | None = None) -> None:
@@ -109,6 +110,7 @@ def main(args_in: list[str] | None = None) -> None:
 
     # Prometheus
     end_time = time.time()
+    prometheus_metrics = {}
     if is_server_listening("0.0.0.0", 9090):
         metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                    'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
@@ -127,6 +129,7 @@ def main(args_in: list[str] | None = None) -> None:
                 values = metric_data['data']['result'][0]['values']
                 timestamps, metric_values = zip(*values)
                 metric_values = [float(value) for value in metric_values]
+                prometheus_metrics[metric] = metric_values
                 timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
                 plt.figure(figsize=(16, 10), dpi=80)
                 plt.plot(timestamps_dt, metric_values, label=metric)
@@ -176,17 +179,20 @@ xychart-beta
 
     # 140 chars max for commit status description
     bench_results = {
+        "i": iterations,
         "req": {
-            "p90": data['metrics']["http_req_duration"]["p(90)"],
-            "avg": data['metrics']["http_req_duration"]["avg"],
+            "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
+            "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
         },
         "pp": {
-            "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
-            "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
+            "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
         },
         "tg": {
-            "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
-            "avg": data['metrics']["llamacpp_tokens_second"]["avg"],
+            "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
+            "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
         },
     }
     with open("results.github.env", 'a') as github_env:

+ 1 - 0
examples/server/bench/script.js

@@ -87,6 +87,7 @@ export default function () {
         ],
         "model": model,
         "stream": false,
+        "seed": 42,
         "max_tokens": max_tokens
     }