# test_completion.py

import pytest
import requests
import time
from openai import OpenAI
from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()
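

# Basic /completion request: verify the prompt/predicted token counts in "timings",
# the "truncated" flag, the content against a regex, and that "tokens" is a list of
# token IDs only when "return_tokens" is set.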
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
])
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
        "return_tokens": return_tokens,
    })
    assert res.status_code == 200
    assert res.body["timings"]["prompt_n"] == n_prompt
    assert res.body["timings"]["predicted_n"] == n_predicted
    assert res.body["truncated"] == truncated
    assert type(res.body["has_new_line"]) == bool
    assert match_regex(re_content, res.body["content"])
    if return_tokens:
        assert len(res.body["tokens"]) > 0
        assert all(type(tok) == int for tok in res.body["tokens"])
    else:
        assert res.body["tokens"] == []
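

# Streaming variant: accumulate chunks until the final chunk with "stop" set,
# which carries the timings and generation_settings to validate.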
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
])
def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
    global server
    server.start()
    res = server.make_stream_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
        "stream": True,
    })
    content = ""
    for data in res:
        assert "stop" in data and type(data["stop"]) == bool
        if data["stop"]:
            assert data["timings"]["prompt_n"] == n_prompt
            assert data["timings"]["predicted_n"] == n_predicted
            assert data["truncated"] == truncated
            assert data["stop_type"] == "limit"
            assert type(data["has_new_line"]) == bool
            assert "generation_settings" in data
            assert server.n_predict is not None
            assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
            assert data["generation_settings"]["seed"] == server.seed
            assert match_regex(re_content, content)
        else:
            assert len(data["tokens"]) > 0
            assert all(type(tok) == int for tok in data["tokens"])
            content += data["content"]
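

# Streaming and non-streaming requests for the same prompt should produce the
# same content once the streamed chunks are concatenated.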
def test_completion_stream_vs_non_stream():
    global server
    server.start()
    res_stream = server.make_stream_request("POST", "/completion", data={
        "n_predict": 8,
        "prompt": "I believe the meaning of life is",
        "stream": True,
    })
    res_non_stream = server.make_request("POST", "/completion", data={
        "n_predict": 8,
        "prompt": "I believe the meaning of life is",
    })
    content_stream = ""
    for data in res_stream:
        content_stream += data["content"]
    assert content_stream == res_non_stream.body["content"]
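

# The server exposes an OpenAI-compatible /v1 endpoint, so the official OpenAI
# client can be pointed at it; the API key is a dummy value.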
def test_completion_with_openai_library():
    global server
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="I believe the meaning of life is",
        max_tokens=8,
    )
    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
    assert res.choices[0].finish_reason == "length"
    assert res.choices[0].text is not None
    assert match_regex("(going|bed)+", res.choices[0].text)


def test_completion_stream_with_openai_library():
    global server
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="I believe the meaning of life is",
        max_tokens=8,
        stream=True,
    )
    output_text = ''
    for data in res:
        choice = data.choices[0]
        if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
    assert match_regex("(going|bed)+", output_text)
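

# Slow test using a real instruct model pulled from Hugging Face: the stop
# sequences should cut the generation before it runs into further dialogue turns.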
# Test case from https://github.com/ggml-org/llama.cpp/issues/13780
@pytest.mark.slow
def test_completion_stream_with_openai_library_stops():
    global server
    server.model_hf_repo = "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M"
    server.model_hf_file = None
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="System: You are helpfull assistant.\nAssistant:\nHey! How could I help?\nUser:\nTell me a joke.\nAssistant:\n",
        stop=["User:\n", "Assistant:\n"],
        max_tokens=200,
        stream=True,
    )
    output_text = ''
    for data in res:
        choice = data.choices[0]
        if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
    assert match_regex("Sure, here's one for[\\s\\S]*", output_text), f'Unexpected output: {output_text}'
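

# With a fixed seed and temperature 0, repeated requests must return identical
# content, for each tested slot count.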
@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res


@pytest.mark.parametrize("n_slots", [1, 2])
def test_different_result_different_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for seed in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": seed,
            "temperature": 1.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] != last_res.body["content"]
        last_res = res
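

# Repeated greedy requests with a fixed seed should stay consistent for each
# tested logical batch size (n_batch).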
# TODO: figure out why this does not work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
    global server
    server.n_batch = n_batch
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": temperature,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res
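

# With a fixed seed, enabling prompt caching should not change the generated content.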
@pytest.mark.skip(reason="This test fails on Linux, needs to be fixed")
def test_cache_vs_nocache_prompt():
    global server
    server.start()
    res_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": True,
    })
    res_no_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res_cache.body["content"] == res_no_cache.body["content"]
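

# A very long prompt (the string repeated 32 times) with caching disabled is
# expected to be rejected with HTTP 400, presumably because it does not fit
# within what the preset server accepts.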
def test_nocache_long_input_prompt():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is"*32,
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res.status_code == 400
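

# Prompts can be given as token IDs from /tokenize, as plain strings, or as a
# mix of both; an array of prompts yields one result per prompt.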
def test_completion_with_tokens_input():
    global server
    server.temperature = 0.0
    server.start()
    prompt_str = "I believe the meaning of life is"
    res = server.make_request("POST", "/tokenize", data={
        "content": prompt_str,
        "add_special": True,
    })
    assert res.status_code == 200
    tokens = res.body["tokens"]

    # single completion
    res = server.make_request("POST", "/completion", data={
        "prompt": tokens,
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str

    # batch completion
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, tokens],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens in one sequence
    res = server.make_request("POST", "/completion", data={
        "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str
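

# Run several completions concurrently (via the parallel_function_calls helper
# from utils) and poll /slots while they are in flight to check how many slots
# are busy.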
@pytest.mark.parametrize("n_slots,n_requests", [
    (1, 3),
    (2, 2),
    (2, 4),
    (4, 2),  # some slots must be idle
    (4, 6),
])
def test_completion_parallel_slots(n_slots: int, n_requests: int):
    global server
    server.n_slots = n_slots
    server.temperature = 0.0
    server.start()

    PROMPTS = [
        ("Write a very long book.", "(very|special|big)+"),
        ("Write another a poem.", "(small|house)+"),
        ("What is LLM?", "(Dad|said)+"),
        ("The sky is blue and I love it.", "(climb|leaf)+"),
        ("Write another very long music lyrics.", "(friends|step|sky)+"),
        ("Write a very long joke.", "(cat|Whiskers)+"),
    ]

    def check_slots_status():
        should_all_slots_busy = n_requests >= n_slots
        time.sleep(0.1)
        res = server.make_request("GET", "/slots")
        n_busy = sum([1 for slot in res.body if slot["is_processing"]])
        if should_all_slots_busy:
            assert n_busy == n_slots
        else:
            assert n_busy <= n_slots

    tasks = []
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        tasks.append((server.make_request, ("POST", "/completion", {
            "prompt": prompt,
            "seed": 42,
            "temperature": 1.0,
        })))
    tasks.append((check_slots_status, ()))
    results = parallel_function_calls(tasks)

    # check results
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        res = results[i]
        assert res.status_code == 200
        assert type(res.body["content"]) == str
        assert len(res.body["content"]) > 10
        # FIXME: the result is not deterministic when using a slot other than slot 0
        # assert match_regex(re_content, res.body["content"])
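

# "response_fields" restricts the response body to the requested fields; nested
# fields are addressed with a slash, e.g. "generation_settings/n_predict".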
@pytest.mark.parametrize(
    "prompt,n_predict,response_fields",
    [
        ("I believe the meaning of life is", 8, []),
        ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
    ],
)
def test_completion_response_fields(
    prompt: str, n_predict: int, response_fields: list[str]
):
    global server
    server.start()
    res = server.make_request(
        "POST",
        "/completion",
        data={
            "n_predict": n_predict,
            "prompt": prompt,
            "response_fields": response_fields,
        },
    )
    assert res.status_code == 200
    assert "content" in res.body
    assert len(res.body["content"])
    if len(response_fields):
        assert res.body["generation_settings/n_predict"] == n_predict
        assert res.body["prompt"] == "<s> " + prompt
        assert isinstance(res.body["content"], str)
        assert len(res.body) == len(response_fields)
    else:
        assert len(res.body)
        assert "generation_settings" in res.body
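

# "n_probs" adds per-token log-probabilities: each of the 5 predicted tokens
# carries its own logprob plus 10 top_logprobs candidates.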
def test_n_probs():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
    })
    assert res.status_code == 200
    assert "completion_probabilities" in res.body
    assert len(res.body["completion_probabilities"]) == 5
    for tok in res.body["completion_probabilities"]:
        assert "id" in tok and tok["id"] > 0
        assert "token" in tok and type(tok["token"]) == str
        assert "logprob" in tok and tok["logprob"] <= 0.0
        assert "bytes" in tok and type(tok["bytes"]) == list
        assert len(tok["top_logprobs"]) == 10
        for prob in tok["top_logprobs"]:
            assert "id" in prob and prob["id"] > 0
            assert "token" in prob and type(prob["token"]) == str
            assert "logprob" in prob and prob["logprob"] <= 0.0
            assert "bytes" in prob and type(prob["bytes"]) == list
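

# Streaming variant: every non-final chunk carries the probabilities for
# exactly one token.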
def test_n_probs_stream():
    global server
    server.start()
    res = server.make_stream_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
        "stream": True,
    })
    for data in res:
        if data["stop"] == False:
            assert "completion_probabilities" in data
            assert len(data["completion_probabilities"]) == 1
            for tok in data["completion_probabilities"]:
                assert "id" in tok and tok["id"] > 0
                assert "token" in tok and type(tok["token"]) == str
                assert "logprob" in tok and tok["logprob"] <= 0.0
                assert "bytes" in tok and type(tok["bytes"]) == list
                assert len(tok["top_logprobs"]) == 10
                for prob in tok["top_logprobs"]:
                    assert "id" in prob and prob["id"] > 0
                    assert "token" in prob and type(prob["token"]) == str
                    assert "logprob" in prob and prob["logprob"] <= 0.0
                    assert "bytes" in prob and type(prob["bytes"]) == list
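

# With "post_sampling_probs" the server reports plain probabilities in [0, 1]
# ("prob"/"top_probs") instead of log-probabilities.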
def test_n_probs_post_sampling():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
        "post_sampling_probs": True,
    })
    assert res.status_code == 200
    assert "completion_probabilities" in res.body
    assert len(res.body["completion_probabilities"]) == 5
    for tok in res.body["completion_probabilities"]:
        assert "id" in tok and tok["id"] > 0
        assert "token" in tok and type(tok["token"]) == str
        assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
        assert "bytes" in tok and type(tok["bytes"]) == list
        assert len(tok["top_probs"]) == 10
        for prob in tok["top_probs"]:
            assert "id" in prob and prob["id"] > 0
            assert "token" in prob and type(prob["token"]) == str
            assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
            assert "bytes" in prob and type(prob["bytes"]) == list
        # because the test model usually outputs tokens with either 100% or 0% probability, we need to check all the top_probs
        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
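

# Biasing a set of common words with -100 should keep them out of the output.
# The bias can be given as [token_id, bias] pairs (via /tokenize), as
# [string, bias] pairs, or in OpenAI-style dict form.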
@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)])
def test_logit_bias(tokenize, openai_style):
    global server
    server.start()

    exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"]

    logit_bias = []
    if tokenize:
        res = server.make_request("POST", "/tokenize", data={
            "content": " " + " ".join(exclude) + " ",
        })
        assert res.status_code == 200
        tokens = res.body["tokens"]
        logit_bias = [[tok, -100] for tok in tokens]
    else:
        logit_bias = [[" " + tok + " ", -100] for tok in exclude]
    if openai_style:
        logit_bias = {el[0]: -100 for el in logit_bias}

    res = server.make_request("POST", "/completion", data={
        "n_predict": 64,
        "prompt": "What is the best book",
        "logit_bias": logit_bias,
        "temperature": 0.0
    })
    assert res.status_code == 200
    output_text = res.body["content"]
    assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude)
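

# A client-side timeout aborts the request; after a short wait the only slot
# should be reported as free again by /slots.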
def test_cancel_request():
    global server
    server.n_ctx = 4096
    server.n_predict = -1
    server.n_slots = 1
    server.server_slots = True
    server.start()
    # send a request that will take a long time, but cancel it before it finishes
    try:
        server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
        }, timeout=0.1)
    except requests.exceptions.ReadTimeout:
        pass  # expected
    # make sure the slot is free
    time.sleep(1)  # wait for HTTP_POLLING_SECONDS
    res = server.make_request("GET", "/slots")
    assert res.body[0]["is_processing"] == False