test_tool_call.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. import pytest
  2. from utils import *
  3. server: ServerProcess
  4. TIMEOUT_SERVER_START = 15*60
  5. TIMEOUT_HTTP_REQUEST = 60
  6. @pytest.fixture(autouse=True)
  7. def create_server():
  8. global server
  9. server = ServerPreset.tinyllama2()
  10. server.model_alias = "tinyllama-2-tool-call"
  11. server.server_port = 8081
  12. TEST_TOOL = {
  13. "type":"function",
  14. "function": {
  15. "name": "test",
  16. "description": "",
  17. "parameters": {
  18. "type": "object",
  19. "properties": {
  20. "success": {"type": "boolean", "const": True},
  21. },
  22. "required": ["success"]
  23. }
  24. }
  25. }
  26. PYTHON_TOOL = {
  27. "type": "function",
  28. "function": {
  29. "name": "python",
  30. "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
  31. "parameters": {
  32. "type": "object",
  33. "properties": {
  34. "code": {
  35. "type": "string",
  36. "description": "The code to run in the ipython interpreter."
  37. }
  38. },
  39. "required": ["code"]
  40. }
  41. }
  42. }
  43. WEATHER_TOOL = {
  44. "type":"function",
  45. "function":{
  46. "name":"get_current_weather",
  47. "description":"Get the current weather in a given location",
  48. "parameters":{
  49. "type":"object",
  50. "properties":{
  51. "location":{
  52. "type":"string",
  53. "description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
  54. }
  55. },
  56. "required":["location"]
  57. }
  58. }
  59. }
  60. def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
  61. global server
  62. n_predict = 512
  63. # server = ServerPreset.stories15m_moe()
  64. server.jinja = True
  65. server.n_predict = n_predict
  66. server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
  67. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  68. res = server.make_request("POST", "/chat/completions", data={
  69. "max_tokens": n_predict,
  70. "messages": [
  71. {"role": "system", "content": "You are a coding assistant."},
  72. {"role": "user", "content": "Write an example"},
  73. ],
  74. "tool_choice": "required",
  75. "tools": [tool],
  76. "parallel_tool_calls": False,
  77. "temperature": 0.0,
  78. "top_k": 1,
  79. "top_p": 1.0,
  80. })
  81. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  82. choice = res.body["choices"][0]
  83. tool_calls = choice["message"].get("tool_calls")
  84. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  85. tool_call = tool_calls[0]
  86. expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
  87. assert expected_function_name == tool_call["function"]["name"]
  88. actual_arguments = tool_call["function"]["arguments"]
  89. assert isinstance(actual_arguments, str)
  90. if argument_key is not None:
  91. actual_arguments = json.loads(actual_arguments)
  92. assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
  93. @pytest.mark.parametrize("template_name,tool,argument_key", [
  94. ("google-gemma-2-2b-it", TEST_TOOL, "success"),
  95. ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
  96. ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
  97. ])
  98. def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
  99. do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
  100. @pytest.mark.slow
  101. @pytest.mark.parametrize("template_name,tool,argument_key", [
  102. ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
  103. ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
  104. ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
  105. ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
  106. ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
  107. ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
  108. ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
  109. ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
  110. ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
  111. ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
  112. ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
  113. ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
  114. ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
  115. ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
  116. ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
  117. ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
  118. ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
  119. ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
  120. ])
  121. def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
  122. do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
  123. @pytest.mark.slow
  124. @pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
  125. (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  126. (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  127. (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
  128. # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
  129. (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  130. (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  131. (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  132. (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  133. (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
  134. (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  135. (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  136. (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
  137. (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  138. (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  139. (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
  140. (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
  141. (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
  142. (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
  143. (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  144. (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  145. (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
  146. (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
  147. (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
  148. (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"),
  149. (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  150. (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  151. (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
  152. (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  153. (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  154. (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
  155. # TODO: fix these
  156. # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  157. # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  158. ])
  159. def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  160. global server
  161. n_predict = 512
  162. server.n_slots = 1
  163. server.jinja = True
  164. server.n_ctx = 8192
  165. server.n_predict = n_predict
  166. server.model_hf_repo = hf_repo
  167. server.model_hf_file = None
  168. if isinstance(template_override, tuple):
  169. (template_hf_repo, template_variant) = template_override
  170. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  171. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  172. elif isinstance(template_override, str):
  173. server.chat_template = template_override
  174. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  175. res = server.make_request("POST", "/chat/completions", data={
  176. "max_tokens": n_predict,
  177. "messages": [
  178. {"role": "system", "content": "You are a coding assistant."},
  179. {"role": "user", "content": "Write an example"},
  180. ],
  181. "tool_choice": "required",
  182. "tools": [tool],
  183. "parallel_tool_calls": False,
  184. "temperature": 0.0,
  185. "top_k": 1,
  186. "top_p": 1.0,
  187. }, timeout=TIMEOUT_HTTP_REQUEST)
  188. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  189. choice = res.body["choices"][0]
  190. tool_calls = choice["message"].get("tool_calls")
  191. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  192. tool_call = tool_calls[0]
  193. expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
  194. assert expected_function_name == tool_call["function"]["name"]
  195. actual_arguments = tool_call["function"]["arguments"]
  196. assert isinstance(actual_arguments, str)
  197. if argument_key is not None:
  198. actual_arguments = json.loads(actual_arguments)
  199. assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
  200. def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
  201. global server
  202. server.jinja = True
  203. server.n_predict = n_predict
  204. server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
  205. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  206. res = server.make_request("POST", "/chat/completions", data={
  207. "max_tokens": n_predict,
  208. "messages": [
  209. {"role": "system", "content": "You are a coding assistant."},
  210. {"role": "user", "content": "say hello world with python"},
  211. ],
  212. "tools": tools if tools else None,
  213. "tool_choice": tool_choice,
  214. "temperature": 0.0,
  215. "top_k": 1,
  216. "top_p": 1.0,
  217. }, timeout=TIMEOUT_HTTP_REQUEST)
  218. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  219. choice = res.body["choices"][0]
  220. assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
  221. @pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
  222. ("meta-llama-Llama-3.3-70B-Instruct", 128, [], None),
  223. ("meta-llama-Llama-3.3-70B-Instruct", 128, [TEST_TOOL], None),
  224. ("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'),
  225. ])
  226. def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
  227. do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
  228. @pytest.mark.slow
  229. @pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
  230. ("meetkai-functionary-medium-v3.2", 256, [], None),
  231. ("meetkai-functionary-medium-v3.2", 256, [TEST_TOOL], None),
  232. ("meetkai-functionary-medium-v3.2", 256, [PYTHON_TOOL], 'none'),
  233. ("meetkai-functionary-medium-v3.1", 256, [], None),
  234. ("meetkai-functionary-medium-v3.1", 256, [TEST_TOOL], None),
  235. ("meetkai-functionary-medium-v3.1", 256, [PYTHON_TOOL], 'none'),
  236. ("meta-llama-Llama-3.2-3B-Instruct", 256, [], None),
  237. ("meta-llama-Llama-3.2-3B-Instruct", 256, [TEST_TOOL], None),
  238. ("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'),
  239. ])
  240. def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
  241. do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
  242. @pytest.mark.slow
  243. @pytest.mark.parametrize("hf_repo,template_override", [
  244. ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  245. ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
  246. ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  247. ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
  248. ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  249. ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
  250. ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  251. ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
  252. ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
  253. ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
  254. ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  255. ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
  256. ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
  257. ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
  258. ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  259. ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
  260. # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
  261. ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  262. # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
  263. # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  264. ])
  265. def test_weather_tool_call(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  266. global server
  267. server.n_slots = 1
  268. server.jinja = True
  269. server.n_ctx = 8192
  270. server.n_predict = 512
  271. server.model_hf_repo = hf_repo
  272. server.model_hf_file = None
  273. if isinstance(template_override, tuple):
  274. (template_hf_repo, template_variant) = template_override
  275. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  276. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  277. elif isinstance(template_override, str):
  278. server.chat_template = template_override
  279. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  280. res = server.make_request("POST", "/chat/completions", data={
  281. "max_tokens": 256,
  282. "messages": [
  283. {"role": "user", "content": "What is the weather in Istanbul?"},
  284. ],
  285. "tools": [WEATHER_TOOL],
  286. }, timeout=TIMEOUT_HTTP_REQUEST)
  287. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  288. choice = res.body["choices"][0]
  289. tool_calls = choice["message"].get("tool_calls")
  290. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  291. tool_call = tool_calls[0]
  292. assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
  293. actual_arguments = json.loads(tool_call["function"]["arguments"])
  294. assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
  295. location = actual_arguments["location"]
  296. assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
  297. assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
  298. @pytest.mark.slow
  299. @pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
  300. (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
  301. (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
  302. (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
  303. (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
  304. (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
  305. ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
  306. ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
  307. (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),
  308. ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)),
  309. ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
  310. (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
  311. (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
  312. (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
  313. (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
  314. (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
  315. (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
  316. (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
  317. (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
  318. # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
  319. (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
  320. # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
  321. ])
  322. def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
  323. global server
  324. server.n_slots = 1
  325. server.jinja = True
  326. server.n_ctx = 8192
  327. server.n_predict = 128
  328. server.model_hf_repo = hf_repo
  329. server.model_hf_file = None
  330. if isinstance(template_override, tuple):
  331. (template_hf_repo, template_variant) = template_override
  332. server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
  333. assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
  334. elif isinstance(template_override, str):
  335. server.chat_template = template_override
  336. server.start(timeout_seconds=TIMEOUT_SERVER_START)
  337. res = server.make_request("POST", "/chat/completions", data={
  338. "max_tokens": 256,
  339. "messages": [
  340. {"role": "system", "content": "You are a coding assistant."},
  341. {"role": "user", "content": "say hello world with python"},
  342. ],
  343. "tools": [PYTHON_TOOL],
  344. # Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test.
  345. "temperature": 0.0,
  346. "top_k": 1,
  347. "top_p": 1.0,
  348. }, timeout=TIMEOUT_HTTP_REQUEST)
  349. assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
  350. choice = res.body["choices"][0]
  351. tool_calls = choice["message"].get("tool_calls")
  352. assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
  353. tool_call = tool_calls[0]
  354. assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
  355. actual_arguments = tool_call["function"]["arguments"]
  356. if expected_arguments_override is not None:
  357. assert actual_arguments == expected_arguments_override
  358. else:
  359. actual_arguments = json.loads(actual_arguments)
  360. assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
  361. code = actual_arguments["code"]
  362. assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
  363. assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'