# bench.py (13 KB)
  1. from __future__ import annotations
  2. import argparse
  3. import json
  4. import os
  5. import re
  6. import signal
  7. import socket
  8. import subprocess
  9. import sys
  10. import threading
  11. import time
  12. import traceback
  13. from contextlib import closing
  14. from datetime import datetime
  15. import matplotlib
  16. import matplotlib.dates
  17. import matplotlib.pyplot as plt
  18. import requests
  19. from statistics import mean
  20. def main(args_in: list[str] | None = None) -> None:
  21. parser = argparse.ArgumentParser(description="Start server benchmark scenario")
  22. parser.add_argument("--name", type=str, help="Bench name", required=True)
  23. parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
  24. parser.add_argument("--branch", type=str, help="Branch name", default="detached")
  25. parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
  26. parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
  27. parser.add_argument("--port", type=int, help="Server listen host", default="8080")
  28. parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
  29. parser.add_argument("--n-prompts", type=int,
  30. help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
  31. parser.add_argument("--max-prompt-tokens", type=int,
  32. help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
  33. required=True)
  34. parser.add_argument("--max-tokens", type=int,
  35. help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
  36. required=True)
  37. parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
  38. parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
  39. parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
  40. parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
  41. parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
  42. parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
  43. parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
  44. parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
  45. parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
  46. args = parser.parse_args(args_in)
  47. start_time = time.time()
  48. # Start the server and performance scenario
  49. try:
  50. server_process = start_server(args)
  51. except Exception:
  52. print("bench: server start error :")
  53. traceback.print_exc(file=sys.stdout)
  54. sys.exit(1)
  55. # start the benchmark
  56. iterations = 0
  57. data = {}
  58. try:
  59. start_benchmark(args)
  60. with open("results.github.env", 'w') as github_env:
  61. # parse output
  62. with open('k6-results.json', 'r') as bench_results:
  63. # Load JSON data from file
  64. data = json.load(bench_results)
  65. for metric_name in data['metrics']:
  66. for metric_metric in data['metrics'][metric_name]:
  67. value = data['metrics'][metric_name][metric_metric]
  68. if isinstance(value, float) or isinstance(value, int):
  69. value = round(value, 2)
  70. data['metrics'][metric_name][metric_metric]=value
  71. github_env.write(
  72. f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
  73. iterations = data['root_group']['checks']['success completion']['passes']
  74. except Exception:
  75. print("bench: error :")
  76. traceback.print_exc(file=sys.stdout)
  77. # Stop the server
  78. if server_process:
  79. try:
  80. print(f"bench: shutting down server pid={server_process.pid} ...")
  81. if os.name == 'nt':
  82. interrupt = signal.CTRL_C_EVENT
  83. else:
  84. interrupt = signal.SIGINT
  85. server_process.send_signal(interrupt)
  86. server_process.wait(0.5)
  87. except subprocess.TimeoutExpired:
  88. print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
  89. server_process.kill() # SIGKILL
  90. server_process.wait()
  91. while is_server_listening(args.host, args.port):
  92. time.sleep(0.1)
  93. title = (f"llama.cpp {args.name} on {args.runner_label}\n "
  94. f"duration={args.duration} {iterations} iterations")
  95. xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
  96. f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
  97. f"branch={args.branch} commit={args.commit}")
  98. # Prometheus
  99. end_time = time.time()
  100. prometheus_metrics = {}
  101. if is_server_listening("0.0.0.0", 9090):
  102. metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
  103. 'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
  104. for metric in metrics:
  105. resp = requests.get(f"http://localhost:9090/api/v1/query_range",
  106. params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
  107. with open(f"{metric}.json", 'w') as metric_json:
  108. metric_json.write(resp.text)
  109. if resp.status_code != 200:
  110. print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
  111. else:
  112. metric_data = resp.json()
  113. values = metric_data['data']['result'][0]['values']
  114. timestamps, metric_values = zip(*values)
  115. metric_values = [float(value) for value in metric_values]
  116. prometheus_metrics[metric] = metric_values
  117. timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps]
  118. plt.figure(figsize=(16, 10), dpi=80)
  119. plt.plot(timestamps_dt, metric_values, label=metric)
  120. plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
  121. plt.yticks(fontsize=12, alpha=.7)
  122. ylabel = f"llamacpp:{metric}"
  123. plt.title(title,
  124. fontsize=14, wrap=True)
  125. plt.grid(axis='both', alpha=.3)
  126. plt.ylabel(ylabel, fontsize=22)
  127. plt.xlabel(xlabel, fontsize=14, wrap=True)
  128. plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
  129. plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
  130. plt.gcf().autofmt_xdate()
  131. # Remove borders
  132. plt.gca().spines["top"].set_alpha(0.0)
  133. plt.gca().spines["bottom"].set_alpha(0.3)
  134. plt.gca().spines["right"].set_alpha(0.0)
  135. plt.gca().spines["left"].set_alpha(0.3)
  136. # Save the plot as a jpg image
  137. plt.savefig(f'{metric}.jpg', dpi=60)
  138. plt.close()
  139. # Mermaid format in case images upload failed
  140. with open(f"{metric}.mermaid", 'w') as mermaid_f:
  141. mermaid = (
  142. f"""---
  143. config:
  144. xyChart:
  145. titleFontSize: 12
  146. width: 900
  147. height: 600
  148. themeVariables:
  149. xyChart:
  150. titleColor: "#000000"
  151. ---
  152. xychart-beta
  153. title "{title}"
  154. y-axis "llamacpp:{metric}"
  155. x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
  156. line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
  157. """)
  158. mermaid_f.write(mermaid)
  159. # 140 chars max for commit status description
  160. bench_results = {
  161. "i": iterations,
  162. "req": {
  163. "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
  164. "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
  165. },
  166. "pp": {
  167. "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
  168. "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
  169. "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
  170. },
  171. "tg": {
  172. "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
  173. "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
  174. "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
  175. },
  176. }
  177. with open("results.github.env", 'a') as github_env:
  178. github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
  179. github_env.write(f"BENCH_ITERATIONS={iterations}\n")
  180. title = title.replace('\n', ' ')
  181. xlabel = xlabel.replace('\n', ' ')
  182. github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
  183. github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
  184. def start_benchmark(args):
  185. k6_path = './k6'
  186. if 'BENCH_K6_BIN_PATH' in os.environ:
  187. k6_path = os.environ['BENCH_K6_BIN_PATH']
  188. k6_args = [
  189. 'run', args.scenario,
  190. '--no-color',
  191. ]
  192. k6_args.extend(['--duration', args.duration])
  193. k6_args.extend(['--iterations', args.n_prompts])
  194. k6_args.extend(['--vus', args.parallel])
  195. k6_args.extend(['--summary-export', 'k6-results.json'])
  196. args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
  197. args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
  198. print(f"bench: starting k6 with: {args}")
  199. k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
  200. if k6_completed.returncode != 0:
  201. raise Exception("bench: unable to run k6")
  202. def start_server(args):
  203. server_process = start_server_background(args)
  204. attempts = 0
  205. max_attempts = 20
  206. if 'GITHUB_ACTIONS' in os.environ:
  207. max_attempts *= 2
  208. while not is_server_listening(args.host, args.port):
  209. attempts += 1
  210. if attempts > max_attempts:
  211. assert False, "server not started"
  212. print(f"bench: waiting for server to start ...")
  213. time.sleep(0.5)
  214. print("bench: server started.")
  215. return server_process
  216. def start_server_background(args):
  217. # Start the server
  218. server_path = '../../../build/bin/llama-server'
  219. if 'LLAMA_SERVER_BIN_PATH' in os.environ:
  220. server_path = os.environ['LLAMA_SERVER_BIN_PATH']
  221. server_args = [
  222. '--host', args.host,
  223. '--port', args.port,
  224. ]
  225. model_file = args.model_path_prefix + os.path.sep + args.hf_file
  226. model_dir = os.path.dirname(model_file)
  227. if not os.path.exists(model_dir):
  228. os.makedirs(model_dir)
  229. server_args.extend(['--model', model_file])
  230. server_args.extend(['--hf-repo', args.hf_repo])
  231. server_args.extend(['--hf-file', args.hf_file])
  232. server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
  233. server_args.extend(['--ctx-size', args.ctx_size])
  234. server_args.extend(['--parallel', args.parallel])
  235. server_args.extend(['--batch-size', args.batch_size])
  236. server_args.extend(['--ubatch-size', args.ubatch_size])
  237. server_args.extend(['--n-predict', args.max_tokens * 2])
  238. server_args.extend(['--defrag-thold', "0.1"])
  239. server_args.append('--cont-batching')
  240. server_args.append('--metrics')
  241. server_args.append('--flash-attn')
  242. args = [str(arg) for arg in [server_path, *server_args]]
  243. print(f"bench: starting server with: {' '.join(args)}")
  244. pkwargs = {
  245. 'stdout': subprocess.PIPE,
  246. 'stderr': subprocess.PIPE
  247. }
  248. server_process = subprocess.Popen(
  249. args,
  250. **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
  251. def server_log(in_stream, out_stream):
  252. for line in iter(in_stream.readline, b''):
  253. print(line.decode('utf-8'), end='', file=out_stream)
  254. thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
  255. thread_stdout.start()
  256. thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
  257. thread_stderr.start()
  258. return server_process
  259. def is_server_listening(server_fqdn, server_port):
  260. with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
  261. result = sock.connect_ex((server_fqdn, server_port))
  262. _is_server_listening = result == 0
  263. if _is_server_listening:
  264. print(f"server is listening on {server_fqdn}:{server_port}...")
  265. return _is_server_listening
  266. def escape_metric_name(metric_name):
  267. return re.sub('[^A-Z0-9]', '_', metric_name.upper())
  268. if __name__ == '__main__':
  269. main()