script.js

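// Note: the `k6/x/sse` import comes from a k6 SSE extension, so this script needs a k6 binary
// built with that extension (e.g. via xk6); stock k6 does not ship an SSE module. Assuming such
// a binary and a llama.cpp server listening on the default URL below, a run might look like:
//
//   ./k6 run -e SERVER_BENCH_N_PROMPTS=480 -e SERVER_BENCH_MAX_TOKENS=512 script.js
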
import sse from 'k6/x/sse'
import {check, sleep} from 'k6'
import {SharedArray} from 'k6/data'
import {Counter, Rate, Trend} from 'k6/metrics'
import exec from 'k6/execution'

// Server chat completions prefix
const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'

// Total number of prompts to select from the dataset - default: 10 min / 10 s per request * 8 users = 480
const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8

// Model name to request
const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'

// Dataset path
const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'

// Max tokens to predict
const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512

// Max prompt tokens
const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024

// Max slot context
const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048

export function setup() {
    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
}
const data = new SharedArray('conversations', function () {
    // Crude whitespace/punctuation split, only used to approximate token counts for filtering
    const tokenizer = (message) => message.split(/[\s,'".?]/)

    return JSON.parse(open(dataset_path))
        // Filter out conversations with fewer than 2 turns.
        .filter(data => data["conversations"].length >= 2)
        // Keep only conversations that start with a human turn.
        .filter(data => data["conversations"][0]["from"] === "human")
        .map(data => {
            return {
                prompt: data["conversations"][0]["value"],
                n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length,
                n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length,
            }
        })
        // Filter out too short sequences.
        .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
        // Filter out too long sequences.
        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
        // Keep only the first n_prompt prompts.
        .slice(0, n_prompt)
})
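
// Custom k6 metrics reported by this script: Trend metrics record one value per request
// (k6 derives min/avg/max/percentiles), Counter metrics accumulate totals over the whole run,
// and Rate metrics track the fraction of requests for which add() received a truthy value.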
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // Abort the test if more than 80% of completions are truncated (i.e. hit max_tokens instead of a stop condition)
            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
    duration: '10m',
    vus: 8,
}
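
// Each VU iteration picks one conversation (round-robin over the shared dataset) and issues a
// single streamed chat completion request, recording latency and token metrics for it.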
export default function () {
    const conversation = data[exec.scenario.iterationInInstance % data.length]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, an AI assistant.",
            },
            {
                "role": "user",
                "content": conversation.prompt,
            }
        ],
        "model": model,
        "stream": true,
        "stream_options": {
            "include_usage": true, // `false` is not yet supported by the llama.cpp server
        },
        "seed": 42,
        "max_tokens": max_tokens,
        "stop": ["<|im_end|>"] // Temporary workaround for the phi-2 base model (i.e. not instruction tuned), since the server expects the model to always emit BOS
    }
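
    // With stream: true and include_usage: true, the server sends completion deltas as SSE events
    // and, following the OpenAI-compatible convention, a final chunk carrying a `usage` object with
    // the prompt/completion token counts, which the event handler below relies on.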
    const params = {method: 'POST', body: JSON.stringify(payload)}

    const startTime = new Date()
    let promptEvalEndTime = null
    let prompt_tokens = 0
    let completions_tokens = 0
    let finish_reason = null
    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
        client.on('event', function (event) {
            // The first event marks the end of prompt processing, i.e. the time to first token
            if (promptEvalEndTime == null) {
                promptEvalEndTime = new Date()
                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
            }

            if (event.data === '[DONE]' || event.data === '') {
                return
            }

            let chunk = JSON.parse(event.data)

            if (chunk.choices && chunk.choices.length > 0) {
                let choice = chunk.choices[0]
                if (choice.finish_reason) {
                    finish_reason = choice.finish_reason
                }
            }

            if (chunk.usage) {
                prompt_tokens = chunk.usage.prompt_tokens
                llamacpp_prompt_tokens.add(prompt_tokens)
                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)

                completions_tokens = chunk.usage.completion_tokens
                llamacpp_completion_tokens.add(completions_tokens)
                llamacpp_completion_tokens_total_counter.add(completions_tokens)
            }
        })

        client.on('error', function (e) {
            console.log('An unexpected error occurred: ', e.error())
            throw e
        })
    })
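
    // Derived throughput metrics: prompt processing speed = prompt tokens / time to first token,
    // generation speed = completion tokens / time from first token to end of stream.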
    check(res, {'success completion': (r) => r.status === 200})

    const endTime = new Date()

    const promptEvalTime = promptEvalEndTime - startTime
    if (promptEvalTime > 0) {
        llamacpp_prompt_processing_second.add(prompt_tokens / promptEvalTime * 1.e3)
    }

    const completion_time = endTime - promptEvalEndTime
    if (completions_tokens > 0 && completion_time > 0) {
        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
    }

    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
    llamacpp_completions_stop_rate.add(finish_reason === 'stop')

    sleep(0.3)
}