import sse from 'k6/x/sse'
import {check, sleep} from 'k6'
import {SharedArray} from 'k6/data'
import {Counter, Rate, Trend} from 'k6/metrics'
import exec from 'k6/execution'
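
// Note: 'k6/x/sse' is not part of core k6; it assumes a k6 binary built with an
// SSE extension (xk6-sse) that exposes this module.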

// Server chat completions prefix
const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'

// Number of total prompts in the dataset - default: 10 minutes / 10 seconds per request * 8 users
const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8

// Model name to request
const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'

// Dataset path
const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'

// Max tokens to predict per completion
const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512

// Max prompt tokens
const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024

// Max slot context
const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048

export function setup() {
    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
}

// Load and pre-filter the ShareGPT-style dataset once, shared across all VUs.
const data = new SharedArray('conversations', function () {
    // Rough token count: split on whitespace and common punctuation.
    const tokenizer = (message) => message.split(/[\s,'".?]/)

    return JSON.parse(open(dataset_path))
        // Filter out the conversations with less than 2 turns.
        .filter(data => data["conversations"].length >= 2)
        // Keep only conversations started by a human.
        .filter(data => data["conversations"][0]["from"] === "human")
        .map(data => {
            return {
                prompt: data["conversations"][0]["value"],
                n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length,
                n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length,
            }
        })
        // Filter out too short sequences.
        .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
        // Filter out too long sequences.
        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
        // Keep only the first n_prompt prompts.
        .slice(0, n_prompt)
})
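
// Custom metrics reported in the k6 summary: Trends record per-iteration values,
// Counters accumulate totals across the run, Rates track boolean outcomes.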
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // Abort the test if more than 80% of completions are truncated.
            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
    duration: '10m',
    vus: 8,
}
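
// Executed once per VU iteration: pick a conversation, stream a chat completion
// over SSE and record the resulting token and throughput metrics.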
export default function () {
    const conversation = data[exec.scenario.iterationInInstance % data.length]

    const payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, an AI assistant.",
            },
            {
                "role": "user",
                "content": conversation.prompt,
            }
        ],
        "model": model,
        "stream": true,
        "seed": 42,
        "max_tokens": max_tokens,
        "stop": ["<|im_end|>"] // Temporary workaround for the phi-2 base (i.e. non-instruct) model, since the server expects the model to always emit BOS
    }

    const params = {method: 'POST', body: JSON.stringify(payload)}

    const startTime = new Date()
    let promptEvalEndTime = null
    let prompt_tokens = 0
    let completions_tokens = 0
    let finish_reason = null
    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
        client.on('event', function (event) {
            // The first streamed event marks the end of prompt processing.
            if (promptEvalEndTime == null) {
                promptEvalEndTime = new Date()
            }

            let chunk = JSON.parse(event.data)
            let choice = chunk.choices[0]
            if (choice.finish_reason) {
                finish_reason = choice.finish_reason
            }

            // Record token counts when a chunk includes usage statistics.
            if (chunk.usage) {
                prompt_tokens = chunk.usage.prompt_tokens
                llamacpp_prompt_tokens.add(prompt_tokens)
                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)

                completions_tokens = chunk.usage.completion_tokens
                llamacpp_completion_tokens.add(completions_tokens)
                llamacpp_completion_tokens_total_counter.add(completions_tokens)
            }
        })

        client.on('error', function (e) {
            console.log('An unexpected error occurred: ', e.error())
            throw e
        })
    })

    check(res, {'success completion': (r) => r.status === 200})

    const endTime = new Date()

    const promptEvalTime = promptEvalEndTime - startTime
    if (promptEvalTime > 0) {
        // Prompt processing speed in tokens per second.
        llamacpp_prompt_processing_second.add(prompt_tokens / promptEvalTime * 1.e3)
    }

    const completion_time = endTime - promptEvalEndTime
    if (completions_tokens > 0 && completion_time > 0) {
        // Generation speed in tokens per second.
        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
    }

    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
    llamacpp_completions_stop_rate.add(finish_reason === 'stop')

    sleep(0.3)
}
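
// Example invocation (assumes this file is saved as script.js and that k6 was
// built with an SSE extension exposing 'k6/x/sse', e.g. xk6-sse; URL and model
// alias below are placeholders):
//
//   k6 run script.js -e SERVER_BENCH_URL=http://localhost:8080/v1 -e SERVER_BENCH_MODEL_ALIAS=my-model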