@@ -189,12 +189,12 @@ xychart-beta
         "pp": {
             "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
         },
         "tg": {
             "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
         },
     }
     with open("results.github.env", 'a') as github_env:
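The guards added above keep summary generation from crashing with a KeyError when a Prometheus series was never scraped, reporting 0 instead. A minimal standalone sketch of the same pattern; the sample values and the safe_mean helper are hypothetical, not part of the script:

from statistics import mean

# Hypothetical scrape result: the 'prompt_tokens_seconds' series is missing.
prometheus_metrics = {'predicted_tokens_seconds': [21.3, 22.1, 20.8]}

def safe_mean(metrics, name):
    # Fall back to 0 when the series was never scraped, mirroring the
    # `... if name in metrics else 0` guard in the patch.
    return round(mean(metrics[name]), 2) if name in metrics else 0

print(safe_mean(prometheus_metrics, 'prompt_tokens_seconds'))     # 0
print(safe_mean(prometheus_metrics, 'predicted_tokens_seconds'))  # 21.4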
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
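The two new flags disable TCP connection reuse between requests and between VU iterations, so each request pays the connection setup cost instead of hitting a warm keep-alive socket, and --out csv streams every individual data point to k6-results.csv alongside the end-of-run summary JSON. A sketch of how that CSV could be post-processed, assuming k6's default CSV columns metric_name and metric_value:

import csv
from collections import defaultdict
from statistics import mean

# Group every streamed data point by metric; assumes k6's default CSV
# columns 'metric_name' and 'metric_value'.
samples = defaultdict(list)
with open('k6-results.csv', newline='') as f:
    for row in csv.DictReader(f):
        samples[row['metric_name']].append(float(row['metric_value']))

for name, values in sorted(samples.items()):
    print(f"{name}: n={len(values)} avg={mean(values):.2f}")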
@@ -231,7 +234,7 @@ def start_server(args):
     server_process = start_server_background(args)

     attempts = 0
-    max_attempts = 20
+    max_attempts = 600
     if 'GITHUB_ACTIONS' in os.environ:
         max_attempts *= 2

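With a 0.5 s sleep per attempt, raising max_attempts from 20 to 600 extends the startup timeout from roughly 10 s to 5 min (10 min on GitHub Actions, where max_attempts is doubled), presumably to give the server time to download the model itself now that the script no longer pre-fetches it (see the --model removal below).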
@@ -242,7 +245,15 @@ def start_server(args):
         print(f"bench: waiting for server to start ...")
         time.sleep(0.5)

-    print("bench: server started.")
+    attempts = 0
+    while not is_server_ready(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not ready"
+        print(f"bench: waiting for server to be ready ...")
+        time.sleep(0.5)
+
+    print("bench: server started and ready.")
     return server_process


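The readiness loop added here mirrors the listening loop right above it: first wait for the TCP port to open, then poll /health until it returns 200. If the duplication ever becomes a maintenance burden, the two loops could share a helper; a hypothetical sketch (wait_until is not in the script):

import time

def wait_until(predicate, max_attempts, delay_seconds=0.5, what="condition"):
    # Hypothetical helper unifying the two polling loops above.
    for _ in range(max_attempts):
        if predicate():
            return
        print(f"bench: waiting for {what} ...")
        time.sleep(delay_seconds)
    assert False, f"{what} not reached"

# Usage, preserving the listen-then-ready order of the patch:
# wait_until(lambda: is_server_listening(args.host, args.port), max_attempts, what="server to start")
# wait_until(lambda: is_server_ready(args.host, args.port), max_attempts, what="server to be ready")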
@@ -255,11 +266,6 @@ def start_server_background(args):
         '--host', args.host,
         '--port', args.port,
     ]
-    model_file = args.model_path_prefix + os.path.sep + args.hf_file
-    model_dir = os.path.dirname(model_file)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    server_args.extend(['--model', model_file])
     server_args.extend(['--hf-repo', args.hf_repo])
     server_args.extend(['--hf-file', args.hf_file])
     server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
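With the --model argument gone, the server resolves and downloads the GGUF itself from --hf-repo / --hf-file, so the script no longer needs to pre-create a local model directory. Illustratively, the argument vector built by start_server_background now looks like this (the host, port, repo, file, and layer values are hypothetical):

# Illustrative final argument vector; values are hypothetical.
server_args = [
    '--host', '127.0.0.1',
    '--port', '8080',
    '--hf-repo', 'ggml-org/models',
    '--hf-file', 'phi-2/ggml-model-q4_0.gguf',
    '--n-gpu-layers', '33',
]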
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
         return _is_server_listening


+def is_server_ready(server_fqdn, server_port):
+    url = f"http://{server_fqdn}:{server_port}/health"
+    response = requests.get(url)
+    return response.status_code == 200
+
+
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())

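The new is_server_ready helper assumes the /health endpoint is reachable, which holds here because it is only polled after is_server_listening has succeeded. A slightly more defensive variant, purely a suggestion and not part of the patch, would also tolerate connections dropped while the model is still loading:

import requests

def is_server_ready(server_fqdn, server_port):
    url = f"http://{server_fqdn}:{server_port}/health"
    try:
        # The health endpoint returns 200 once the model is loaded.
        return requests.get(url).status_code == 200
    except requests.exceptions.ConnectionError:
        return False

For reference, escape_metric_name uppercases a metric name and replaces every character outside [A-Z0-9] with an underscore, e.g.:

print(escape_metric_name("llamacpp:prompt_tokens_seconds"))  # LLAMACPP_PROMPT_TOKENS_SECONDS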