# File: bench.yml.disabled — GitHub Actions workflow, disabled by renaming (see TODO below)
  1. # TODO: there have been some issues with the workflow, so disabling for now
  2. # https://github.com/ggerganov/llama.cpp/issues/7893
  3. #
  4. # Benchmark
  5. name: Benchmark
  6. on:
  7. workflow_dispatch:
  8. inputs:
  9. gpu-series:
  10. description: 'Azure GPU series to run with'
  11. required: true
  12. type: choice
  13. options:
  14. - Standard_NC4as_T4_v3
  15. - Standard_NC24ads_A100_v4
  16. - Standard_NC80adis_H100_v5
  17. sha:
  18. description: 'Commit SHA1 to build'
  19. required: false
  20. type: string
  21. duration:
  22. description: 'Duration of the bench'
  23. type: string
  24. default: 10m
  25. push:
  26. branches:
  27. - master
  28. paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  29. pull_request_target:
  30. types: [opened, synchronize, reopened]
  31. paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  32. schedule:
  33. - cron: '04 2 * * *'
  34. concurrency:
  35. group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  36. cancel-in-progress: true
  37. jobs:
  38. bench-server-baseline:
  39. runs-on: Standard_NC4as_T4_v3
  40. env:
  41. RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
  42. N_USERS: 8
  43. DURATION: 10m
  44. strategy:
  45. matrix:
  46. model: [phi-2]
  47. ftype: [q4_0, q8_0, f16]
  48. include:
  49. - model: phi-2
  50. ftype: q4_0
  51. pr_comment_enabled: "true"
  52. if: |
  53. inputs.gpu-series == 'Standard_NC4as_T4_v3'
  54. || (
  55. github.event_name == 'schedule'
  56. && github.ref_name == 'master'
  57. && github.repository_owner == 'ggerganov'
  58. )
  59. || github.event_name == 'pull_request_target'
  60. || (
  61. github.event_name == 'push'
  62. && github.event.ref == 'refs/heads/master'
  63. && github.repository_owner == 'ggerganov'
  64. )
  65. steps:
  66. - name: Clone
  67. id: checkout
  68. uses: actions/checkout@v4
  69. with:
  70. fetch-depth: 0
  71. ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
  72. - name: Install python env
  73. id: pipenv
  74. run: |
  75. cd examples/server/bench
  76. python3 -m venv venv
  77. source venv/bin/activate
  78. pip install -r requirements.txt
  79. - name: Prometheus
  80. id: install_prometheus
  81. run: |
  82. wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
  83. tar xzf prometheus*.tar.gz --strip-components=1
  84. ./prometheus --config.file=examples/server/bench/prometheus.yml &
  85. while ! nc -z localhost 9090; do
  86. sleep 0.1
  87. done
  88. - name: Set up Go
  89. uses: actions/setup-go@v5
  90. with:
  91. go-version: '1.21'
  92. - name: Install k6 and xk6-sse
  93. id: k6_installation
  94. run: |
  95. cd examples/server/bench
  96. go install go.k6.io/xk6/cmd/xk6@latest
  97. xk6 build master \
  98. --with github.com/phymbert/xk6-sse
  99. - name: Build
  100. id: cmake_build
  101. run: |
  102. set -eux
  103. cmake -B build \
  104. -DGGML_NATIVE=OFF \
  105. -DLLAMA_BUILD_SERVER=ON \
  106. -DLLAMA_CURL=ON \
  107. -DLLAMA_CUBLAS=ON \
  108. -DCUDAToolkit_ROOT=/usr/local/cuda \
  109. -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
  110. -DCMAKE_CUDA_ARCHITECTURES=75 \
  111. -DLLAMA_FATAL_WARNINGS=OFF \
  112. -DLLAMA_ALL_WARNINGS=OFF \
  113. -DCMAKE_BUILD_TYPE=Release;
  114. cmake --build build --config Release -j $(nproc) --target llama-server
  115. - name: Download the dataset
  116. id: download_dataset
  117. run: |
  118. cd examples/server/bench
  119. wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
  120. - name: Server bench
  121. id: server_bench
  122. env:
  123. HEAD_REF: ${{ github.head_ref || github.ref_name }}
  124. run: |
  125. set -eux
  126. cd examples/server/bench
  127. source venv/bin/activate
  128. python bench.py \
  129. --runner-label ${{ env.RUNNER_LABEL }} \
  130. --name ${{ github.job }} \
  131. --branch $HEAD_REF \
  132. --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
  133. --scenario script.js \
  134. --duration ${{ github.event.inputs.duration || env.DURATION }} \
  135. --hf-repo ggml-org/models \
  136. --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
  137. --model-path-prefix /models \
  138. --parallel ${{ env.N_USERS }} \
  139. -ngl 33 \
  140. --batch-size 2048 \
  141. --ubatch-size 256 \
  142. --ctx-size 16384 \
  143. --n-prompts 1000 \
  144. --max-prompt-tokens 1024 \
  145. --max-tokens 2048
  146. cat results.github.env >> $GITHUB_ENV
  147. # Remove dataset as we do not want it in the artefact
  148. rm ShareGPT_V3_unfiltered_cleaned_split.json
  149. - uses: actions/upload-artifact@v4
  150. with:
  151. name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
  152. compression-level: 9
  153. path: |
  154. examples/server/bench/*.jpg
  155. examples/server/bench/*.json
  156. examples/server/bench/*.log
  157. - name: Commit status
  158. uses: Sibz/github-status-action@v1
  159. with:
  160. authToken: ${{secrets.GITHUB_TOKEN}}
  161. sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
  162. context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
  163. description: |
  164. ${{ env.BENCH_RESULTS }}
  165. state: 'success'
  166. - name: Upload benchmark images
  167. uses: devicons/public-upload-to-imgur@v2.2.2
  168. continue-on-error: true # Important as it looks unstable: 503
  169. id: imgur_step
  170. with:
  171. client_id: ${{secrets.IMGUR_CLIENT_ID}}
  172. path: |
  173. examples/server/bench/prompt_tokens_seconds.jpg
  174. examples/server/bench/predicted_tokens_seconds.jpg
  175. examples/server/bench/kv_cache_usage_ratio.jpg
  176. examples/server/bench/requests_processing.jpg
  177. - name: Extract mermaid
  178. id: set_mermaid
  179. run: |
  180. set -eux
  181. cd examples/server/bench
  182. PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
  183. echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
  184. echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
  185. echo "EOF" >> $GITHUB_ENV
  186. PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
  187. echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
  188. echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
  189. echo "EOF" >> $GITHUB_ENV
  190. KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
  191. echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
  192. echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
  193. echo "EOF" >> $GITHUB_ENV
  194. REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
  195. echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
  196. echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
  197. echo "EOF" >> $GITHUB_ENV
  198. - name: Extract image url
  199. id: extract_image_url
  200. continue-on-error: true
  201. run: |
  202. set -eux
  203. echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
  204. echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
  205. echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
  206. echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
  207. - name: Comment PR
  208. uses: mshick/add-pr-comment@v2
  209. id: comment_pr
  210. if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
  211. with:
  212. message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
  213. message: |
  214. <p align="center">
  215. 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
  216. </p>
  217. <details>
  218. <summary>Expand details for performance related PR only</summary>
  219. - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
  220. - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
  221. - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
  222. - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
  223. - ${{ env.BENCH_GRAPH_XLABEL }}
  224. <p align="center">
  225. <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
  226. <details>
  227. <summary>More</summary>
  228. ```mermaid
  229. ${{ env.PROMPT_TOKENS_SECONDS }}
  230. ```
  231. </details>
  232. <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
  233. <details>
  234. <summary>More</summary>
  235. ```mermaid
  236. ${{ env.PREDICTED_TOKENS_SECONDS }}
  237. ```
  238. </details>
  239. </p>
  240. <details>
  241. <summary>Details</summary>
  242. <p align="center">
  243. <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
  244. <details>
  245. <summary>More</summary>
  246. ```mermaid
  247. ${{ env.KV_CACHE_USAGE_RATIO }}
  248. ```
  249. </details>
  250. <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
  251. <details>
  252. <summary>More</summary>
  253. ```mermaid
  254. ${{ env.REQUESTS_PROCESSING }}
  255. ```
  256. </details>
  257. </p>
  258. </details>
  259. </details>