# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggml-org/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m
  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME: no way found to avoid duplicating the runs-on label here
      N_USERS: 8
      DURATION: 10m
    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"
    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || github.event_name == 'pull_request_target'
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
      - name: Install python env
        id: pipenv
        run: |
          cd tools/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
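
      # Prometheus is used to scrape the llama.cpp server metrics exposed during the bench
      # (scrape config in tools/server/bench/prometheus.yml); the loop below waits for it to listen on :9090.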
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=tools/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'
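
      # Build a custom k6 binary with the xk6-sse extension, which adds SSE support
      # so the bench scenario (script.js) can consume the server's streamed responses.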
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd tools/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse
      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release -j $(nproc) --target llama-server
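
      # The ShareGPT dataset serves as the prompt source for the bench scenario.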
      - name: Download the dataset
        id: download_dataset
        run: |
          cd tools/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
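
      # bench.py runs the k6 scenario (script.js) against the llama-server binary built above
      # and writes its summary metrics to results.github.env, which is exported for the reporting steps below.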
      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux
          cd tools/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch $HEAD_REF \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048
          cat results.github.env >> $GITHUB_ENV
          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json
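
      # Keep the generated graphs, raw results and logs as a build artifact.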
      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            tools/server/bench/*.jpg
            tools/server/bench/*.json
            tools/server/bench/*.log
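
      # Publish the bench summary (BENCH_RESULTS) as a commit status so it is visible from the checks on the commit/PR.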
      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'
      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # imgur uploads are flaky (intermittent 503s), so do not fail the job on error
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            tools/server/bench/prompt_tokens_seconds.jpg
            tools/server/bench/predicted_tokens_seconds.jpg
            tools/server/bench/kv_cache_usage_ratio.jpg
            tools/server/bench/requests_processing.jpg
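
      # Export each *.mermaid chart as a multiline environment variable using the
      # "NAME<<EOF ... EOF" $GITHUB_ENV syntax, so the PR comment below can embed the charts.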
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux
          cd tools/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux
          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
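
      # Post a single benchmark comment on the PR; the message-id lets the action update the
      # existing comment on subsequent runs instead of adding a new one.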
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">
            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 🚀
            </p>
            <details>
            <summary>Expand details for performance-related PRs only</summary>
            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}
            <p align="center">
            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
            <details>
            <summary>More</summary>
            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```
            </details>
            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
            <details>
            <summary>More</summary>
            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```
            </details>
            </p>
            <details>
            <summary>Details</summary>
            <p align="center">
            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
            <details>
            <summary>More</summary>
            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```
            </details>
            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
            <details>
            <summary>More</summary>
            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```
            </details>
            </p>
            </details>
            </details>