# test_tool_call.py
import pytest
from utils import *

# Shared server handle; (re)assigned by the autouse `create_server` fixture
# before each test, then configured and started inside each test body.
server: ServerProcess

# Generous startup timeout: slow tests may first download multi-GB GGUF models.
TIMEOUT_SERVER_START = 15*60
# Per-request timeout for chat-completion HTTP calls.
TIMEOUT_HTTP_REQUEST = 60
  6. @pytest.fixture(autouse=True)
  7. def create_server():
  8. global server
  9. server = ServerPreset.tinyllama2()
  10. server.model_alias = "tinyllama-2-tool-call"
  11. server.server_port = 8081
# Trivial tool whose single argument is a boolean constant. Used to verify that
# tool-call grammars are enforced without relying on model intelligence.
TEST_TOOL = {
    "type": "function",
    "function": {
        "name": "test",
        "description": "",
        "parameters": {
            "type": "object",
            "properties": {
                "success": {"type": "boolean", "const": True},
            },
            "required": ["success"]
        }
    }
}

# Code-interpreter-style tool: the model is expected to produce a `code` string.
PYTHON_TOOL = {
    "type": "function",
    "function": {
        "name": "python",
        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
        "parameters": {
            "type": "object",
            "properties": {
                "code": {
                    "type": "string",
                    "description": "The code to run in the ipython interpreter."
                }
            },
            "required": ["code"]
        }
    }
}

# Classic weather-lookup tool with a single free-form `location` argument.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
                }
            },
            "required": ["location"]
        }
    }
}
  60. def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
  61. global server
  62. n_predict = 512
  63. # server = ServerPreset.stories15m_moe()
  64. server.jinja = True
  65. server.n_predict = n_predict
  66. server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
  67. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  68. res = server.make_request("POST", "/chat/completions", data={
  69. "max_tokens": n_predict,
  70. "messages": [
  71. {"role": "system", "content": "You are a coding assistant."},
  72. {"role": "user", "content": "Write an example"},
  73. ],
  74. "tool_choice": "required",
  75. "tools": [tool],
  76. "parallel_tool_calls": False,
  77. "temperature": 0.0,
  78. "top_k": 1,
  79. "top_p": 1.0,
  80. })
  81. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  82. choice = res.body["choices"][0]
  83. tool_calls = choice["message"].get("tool_calls")
  84. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  85. tool_call = tool_calls[0]
  86. assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
  87. expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
  88. assert expected_function_name == tool_call["function"]["name"]
  89. actual_arguments = tool_call["function"]["arguments"]
  90. assert isinstance(actual_arguments, str)
  91. if argument_key is not None:
  92. actual_arguments = json.loads(actual_arguments)
  93. assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
  94. @pytest.mark.parametrize("template_name,tool,argument_key", [
  95. ("google-gemma-2-2b-it", TEST_TOOL, "success"),
  96. ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
  97. ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
  98. ])
  99. def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
  100. do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
  101. @pytest.mark.slow
  102. @pytest.mark.parametrize("template_name,tool,argument_key", [
  103. ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
  104. ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
  105. ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
  106. ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
  107. ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
  108. ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
  109. ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
  110. ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
  111. ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
  112. ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
  113. ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
  114. ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
  115. ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
  116. ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
  117. ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
  118. ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
  119. ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
  120. ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
  121. ])
  122. def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
  123. do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
  124. @pytest.mark.slow
  125. @pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
  126. (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  127. (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  128. (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
  129. # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
  130. (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  131. (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  132. (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  133. (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  134. (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
  135. (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  136. (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  137. (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
  138. (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  139. (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  140. # (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
  141. (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
  142. (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
  143. # (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
  144. (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  145. (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  146. (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
  147. (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
  148. (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
  149. (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"),
  150. (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  151. (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  152. (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
  153. (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  154. (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  155. # (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
  156. # TODO: fix these
  157. # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  158. # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  159. ])
  160. def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  161. global server
  162. n_predict = 512
  163. server.n_slots = 1
  164. server.jinja = True
  165. server.n_ctx = 8192
  166. server.n_predict = n_predict
  167. server.model_hf_repo = hf_repo
  168. server.model_hf_file = None
  169. if isinstance(template_override, tuple):
  170. (template_hf_repo, template_variant) = template_override
  171. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  172. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  173. elif isinstance(template_override, str):
  174. server.chat_template = template_override
  175. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  176. res = server.make_request("POST", "/chat/completions", data={
  177. "max_tokens": n_predict,
  178. "messages": [
  179. {"role": "system", "content": "You are a coding assistant."},
  180. {"role": "user", "content": "Write an example"},
  181. ],
  182. "tool_choice": "required",
  183. "tools": [tool],
  184. "parallel_tool_calls": False,
  185. "temperature": 0.0,
  186. "top_k": 1,
  187. "top_p": 1.0,
  188. }, timeout=TIMEOUT_HTTP_REQUEST)
  189. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  190. choice = res.body["choices"][0]
  191. tool_calls = choice["message"].get("tool_calls")
  192. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  193. tool_call = tool_calls[0]
  194. assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
  195. expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
  196. assert expected_function_name == tool_call["function"]["name"]
  197. actual_arguments = tool_call["function"]["arguments"]
  198. assert isinstance(actual_arguments, str)
  199. if argument_key is not None:
  200. actual_arguments = json.loads(actual_arguments)
  201. assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
  202. def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
  203. global server
  204. server.jinja = True
  205. server.n_predict = n_predict
  206. server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
  207. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  208. res = server.make_request("POST", "/chat/completions", data={
  209. "max_tokens": n_predict,
  210. "messages": [
  211. {"role": "system", "content": "You are a coding assistant."},
  212. {"role": "user", "content": "say hello world with python"},
  213. ],
  214. "tools": tools if tools else None,
  215. "tool_choice": tool_choice,
  216. "temperature": 0.0,
  217. "top_k": 1,
  218. "top_p": 1.0,
  219. }, timeout=TIMEOUT_HTTP_REQUEST)
  220. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  221. choice = res.body["choices"][0]
  222. assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
  223. @pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
  224. ("meta-llama-Llama-3.3-70B-Instruct", 128, [], None),
  225. ("meta-llama-Llama-3.3-70B-Instruct", 128, [TEST_TOOL], None),
  226. ("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'),
  227. ])
  228. def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
  229. do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
  230. @pytest.mark.slow
  231. @pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
  232. ("meetkai-functionary-medium-v3.2", 256, [], None),
  233. ("meetkai-functionary-medium-v3.2", 256, [TEST_TOOL], None),
  234. ("meetkai-functionary-medium-v3.2", 256, [PYTHON_TOOL], 'none'),
  235. ("meetkai-functionary-medium-v3.1", 256, [], None),
  236. ("meetkai-functionary-medium-v3.1", 256, [TEST_TOOL], None),
  237. ("meetkai-functionary-medium-v3.1", 256, [PYTHON_TOOL], 'none'),
  238. ("meta-llama-Llama-3.2-3B-Instruct", 256, [], None),
  239. ("meta-llama-Llama-3.2-3B-Instruct", 256, [TEST_TOOL], None),
  240. ("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'),
  241. ])
  242. def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
  243. do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
  244. @pytest.mark.slow
  245. @pytest.mark.parametrize("hf_repo,template_override", [
  246. ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  247. ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
  248. ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  249. ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
  250. ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  251. ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
  252. ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  253. ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
  254. ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
  255. ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
  256. ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  257. ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
  258. ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
  259. ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
  260. ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  261. ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
  262. ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
  263. ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  264. # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
  265. ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  266. # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  267. ])
  268. def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  269. global server
  270. n_predict = 512
  271. server.n_slots = 1
  272. server.jinja = True
  273. server.n_ctx = 8192
  274. server.n_predict = n_predict
  275. server.model_hf_repo = hf_repo
  276. server.model_hf_file = None
  277. if isinstance(template_override, tuple):
  278. (template_hf_repo, template_variant) = template_override
  279. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  280. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  281. elif isinstance(template_override, str):
  282. server.chat_template = template_override
  283. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  284. res = server.make_request("POST", "/chat/completions", data={
  285. "max_tokens": n_predict,
  286. "messages": [
  287. {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
  288. {"role": "user", "content": "What is the weather in Istanbul?"},
  289. ],
  290. "tools": [WEATHER_TOOL],
  291. }, timeout=TIMEOUT_HTTP_REQUEST)
  292. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  293. choice = res.body["choices"][0]
  294. tool_calls = choice["message"].get("tool_calls")
  295. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  296. tool_call = tool_calls[0]
  297. assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
  298. assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
  299. actual_arguments = json.loads(tool_call["function"]["arguments"])
  300. assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
  301. location = actual_arguments["location"]
  302. assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
  303. assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
  304. @pytest.mark.slow
  305. @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
  306. (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
  307. (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  308. (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
  309. (None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  310. (None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
  311. (None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
  312. (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  313. (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  314. (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
  315. (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  316. # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
  317. ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  318. # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
  319. ])
  320. def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  321. global server
  322. # n_predict = 512
  323. server.n_slots = 1
  324. server.jinja = True
  325. server.n_ctx = 8192 * 2
  326. server.n_predict = n_predict
  327. server.model_hf_repo = hf_repo
  328. server.model_hf_file = None
  329. if isinstance(template_override, tuple):
  330. (template_hf_repo, template_variant) = template_override
  331. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  332. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  333. elif isinstance(template_override, str):
  334. server.chat_template = template_override
  335. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  336. res = server.make_request("POST", "/chat/completions", data={
  337. "max_tokens": n_predict,
  338. "messages": [
  339. {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."},
  340. {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"},
  341. {
  342. "role": "assistant",
  343. "content": None,
  344. "tool_calls": [
  345. {
  346. "id": "call_6789",
  347. "type": "function",
  348. "function": {
  349. "name": "calculate",
  350. "arguments": "{\"expression\":\"sin(30 * pi / 180)\"}"
  351. }
  352. }
  353. ]
  354. },
  355. {
  356. "role": "tool",
  357. "name": "calculate",
  358. "content": "0.55644242476",
  359. "tool_call_id": "call_6789"
  360. }
  361. ],
  362. "tools": [
  363. {
  364. "type":"function",
  365. "function":{
  366. "name":"calculate",
  367. "description":"A calculator function that computes values of arithmetic expressions in the Python syntax",
  368. "parameters":{
  369. "type":"object",
  370. "properties":{
  371. "expression":{
  372. "type":"string",
  373. "description":"An arithmetic expression to compute the value of (Python syntad, assuming all floats)"
  374. }
  375. },
  376. "required":["expression"]
  377. }
  378. }
  379. }
  380. ]
  381. }, timeout=TIMEOUT_HTTP_REQUEST)
  382. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  383. choice = res.body["choices"][0]
  384. tool_calls = choice["message"].get("tool_calls")
  385. assert tool_calls is None, f'Expected no tool call in {choice["message"]}'
  386. content = choice["message"].get("content")
  387. assert content is not None, f'Expected content in {choice["message"]}'
  388. if result_override is not None:
  389. assert re.match(result_override, content), f'Expected {result_override}, got {content}'
  390. else:
  391. assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \
  392. f'Expected something like "The y coordinate is 0.56.", got {content}'
  393. @pytest.mark.slow
  394. @pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [
  395. (128, 'deepseek', "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  396. (128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  397. (1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  398. (1024, 'none', "^I need[\\s\\S]*?</think>\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  399. (1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
  400. ])
  401. def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  402. global server
  403. server.n_slots = 1
  404. server.reasoning_format = reasoning_format
  405. server.jinja = True
  406. server.n_ctx = 8192 * 2
  407. server.n_predict = n_predict
  408. server.model_hf_repo = hf_repo
  409. server.model_hf_file = None
  410. if isinstance(template_override, tuple):
  411. (template_hf_repo, template_variant) = template_override
  412. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  413. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  414. elif isinstance(template_override, str):
  415. server.chat_template = template_override
  416. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  417. res = server.make_request("POST", "/chat/completions", data={
  418. "max_tokens": n_predict,
  419. "messages": [
  420. {"role": "user", "content": "What's the sum of 102 and 7?"},
  421. ]
  422. }, timeout=TIMEOUT_HTTP_REQUEST)
  423. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  424. choice = res.body["choices"][0]
  425. assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
  426. content = choice["message"].get("content")
  427. if expect_content is None:
  428. assert content is None, f'Expected no content in {choice["message"]}'
  429. else:
  430. assert re.match(expect_content, content), f'Expected {expect_content}, got {content}'
  431. reasoning_content = choice["message"].get("reasoning_content")
  432. if expect_reasoning_content is None:
  433. assert reasoning_content is None, f'Expected no reasoning content in {choice["message"]}'
  434. else:
  435. assert re.match(expect_reasoning_content, reasoning_content), f'Expected {expect_reasoning_content}, got {reasoning_content}'
  436. @pytest.mark.slow
  437. @pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
  438. (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  439. # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
  440. (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  441. (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
  442. (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
  443. (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
  444. ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  445. (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
  446. (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
  447. (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
  448. ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
  449. (None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
  450. (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  451. (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
  452. (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  453. (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
  454. (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
  455. (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
  456. (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  457. (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
  458. # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
  459. (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  460. ])
  461. def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  462. global server
  463. server.n_slots = 1
  464. server.jinja = True
  465. server.n_ctx = 8192
  466. server.n_predict = 512 # High because of DeepSeek R1
  467. server.model_hf_repo = hf_repo
  468. server.model_hf_file = None
  469. if isinstance(template_override, tuple):
  470. (template_hf_repo, template_variant) = template_override
  471. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  472. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  473. elif isinstance(template_override, str):
  474. server.chat_template = template_override
  475. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  476. res = server.make_request("POST", "/chat/completions", data={
  477. "max_tokens": 256,
  478. "messages": [
  479. {"role": "system", "content": "You are a coding assistant."},
  480. {"role": "user", "content": "say hello world with python"},
  481. ],
  482. "tools": [PYTHON_TOOL],
  483. # Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test.
  484. "temperature": 0.0,
  485. "top_k": 1,
  486. "top_p": 1.0,
  487. }, timeout=TIMEOUT_HTTP_REQUEST)
  488. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  489. choice = res.body["choices"][0]
  490. tool_calls = choice["message"].get("tool_calls")
  491. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  492. tool_call = tool_calls[0]
  493. assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
  494. assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
  495. actual_arguments = tool_call["function"]["arguments"]
  496. if expected_arguments_override is not None:
  497. assert actual_arguments == expected_arguments_override
  498. else:
  499. actual_arguments = json.loads(actual_arguments)
  500. assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
  501. code = actual_arguments["code"]
  502. assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
  503. assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'