# Benchmark
# Runs a baseline llama.cpp server benchmark on an Azure T4 GPU runner:
# builds the server with CUDA, drives it with k6 via bench.py, scrapes
# metrics through Prometheus, and publishes results as artifacts, a commit
# status, and a PR comment with mermaid time-series charts.
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m
  push:
    branches:
      - master
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  schedule:
    # Nightly run at 02:04 UTC.
    - cron: '04 2 * * *'

# Only one benchmark per ref at a time; newer runs cancel older ones.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m
    # Run for: manual dispatch targeting this GPU series, the nightly
    # schedule, any pull request, or pushes to master.
    # NOTE(review): `github.event.push.ref` does not look like a documented
    # payload field (push payloads expose `github.event.ref`) — confirm;
    # harmless here since `github.ref_name == 'master'` already covers
    # pushes to master.
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Install k6
        id: k6_installation
        run: |
          cd examples/server/bench
          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
          tar xzf k6*.tar.gz --strip-components=1

      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
          cmake --build . --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        run: |
          set -eux
          cd examples/server/bench
          source venv/bin/activate
          BENCH_K6_BIN_PATH=./k6 python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch ${{ github.head_ref || github.ref_name }} \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file phi-2/ggml-model-q4_0.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          # bench.py emits KEY=VALUE pairs consumed by the later steps.
          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        continue-on-error: true # If not authorized on external repo
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-baseline
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        # Each .mermaid chart is exported as a multi-line env var using the
        # GITHUB_ENV heredoc (<<EOF) syntax so the PR comment can embed it.
        run: |
          set -eux
          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        # Was IMAGE_O (letter O) upstream; renamed to IMAGE_0 here and at its
        # single use below for consistency with IMAGE_1..IMAGE_3.
        run: |
          set -eux

          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' }}
        with:
          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
          message: |
            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <details>

            <summary>Time series</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>

            </details>

            </details>