// server.cpp
  1. #include "server-context.h"
  2. #include "server-http.h"
  3. #include "arg.h"
  4. #include "common.h"
  5. #include "llama.h"
  6. #include "log.h"
  7. #include <atomic>
  8. #include <signal.h>
  9. #include <thread> // for std::thread::hardware_concurrency
  10. #if defined(_WIN32)
  11. #include <windows.h>
  12. #endif
  13. static std::function<void(int)> shutdown_handler;
  14. static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
  15. static inline void signal_handler(int signal) {
  16. if (is_terminating.test_and_set()) {
  17. // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
  18. // this is for better developer experience, we can remove when the server is stable enough
  19. fprintf(stderr, "Received second interrupt, terminating immediately.\n");
  20. exit(1);
  21. }
  22. shutdown_handler(signal);
  23. }
  24. // wrapper function that handles exceptions and logs errors
  25. // this is to make sure handler_t never throws exceptions; instead, it returns an error response
  26. static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
  27. return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr {
  28. std::string message;
  29. try {
  30. return func(req);
  31. } catch (const std::exception & e) {
  32. message = e.what();
  33. } catch (...) {
  34. message = "unknown error";
  35. }
  36. auto res = std::make_unique<server_http_res>();
  37. res->status = 500;
  38. try {
  39. json error_data = format_error_response(message, ERROR_TYPE_SERVER);
  40. res->status = json_value(error_data, "code", 500);
  41. res->data = safe_json_to_str({{ "error", error_data }});
  42. LOG_WRN("got exception: %s\n", res->data.c_str());
  43. } catch (const std::exception & e) {
  44. LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str());
  45. res->data = "Internal Server Error";
  46. }
  47. return res;
  48. };
  49. }
int main(int argc, char ** argv) {
    // own arguments required by this example
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
        return 1;
    }

    // TODO: should we have a separate n_parallel parameter for the server?
    // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
    // TODO: this is a common configuration that is suitable for most local use cases
    // however, overriding the parameters is a bit confusing - figure out something more intuitive
    if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) {
        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__);
        params.n_parallel = 4;
        params.kv_unified = true;
    }

    common_init();

    // struct that contains llama context and inference
    server_context ctx_server;

    llama_backend_init();
    llama_numa_init(params.numa);

    // NOTE(review): hardware_concurrency() returns unsigned but is printed with %d —
    // works in practice, %u would be exact
    LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
    LOG_INF("\n");
    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    LOG_INF("\n");

    server_http_context ctx_http;
    if (!ctx_http.init(params)) {
        LOG_ERR("%s: failed to initialize HTTP server\n", __func__);
        return 1;
    }

    //
    // Router
    //

    // register API routes; the lambda lets routes report readiness without owning ctx_http
    server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); });

    ctx_http.get ("/health",                   ex_wrapper(routes.get_health));     // public endpoint (no API key check)
    ctx_http.get ("/v1/health",                ex_wrapper(routes.get_health));     // public endpoint (no API key check)
    ctx_http.get ("/metrics",                  ex_wrapper(routes.get_metrics));
    ctx_http.get ("/props",                    ex_wrapper(routes.get_props));
    ctx_http.post("/props",                    ex_wrapper(routes.post_props));
    ctx_http.post("/api/show",                 ex_wrapper(routes.get_api_show));
    ctx_http.get ("/models",                   ex_wrapper(routes.get_models));     // public endpoint (no API key check)
    ctx_http.get ("/v1/models",                ex_wrapper(routes.get_models));     // public endpoint (no API key check)
    ctx_http.get ("/api/tags",                 ex_wrapper(routes.get_models));     // ollama specific endpoint. public endpoint (no API key check)
    ctx_http.post("/completion",               ex_wrapper(routes.post_completions)); // legacy
    ctx_http.post("/completions",              ex_wrapper(routes.post_completions));
    ctx_http.post("/v1/completions",           ex_wrapper(routes.post_completions_oai));
    ctx_http.post("/chat/completions",         ex_wrapper(routes.post_chat_completions));
    ctx_http.post("/v1/chat/completions",      ex_wrapper(routes.post_chat_completions));
    ctx_http.post("/api/chat",                 ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
    ctx_http.post("/v1/messages",              ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
    ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
    ctx_http.post("/infill",                   ex_wrapper(routes.post_infill));
    ctx_http.post("/embedding",                ex_wrapper(routes.post_embeddings)); // legacy
    ctx_http.post("/embeddings",               ex_wrapper(routes.post_embeddings));
    ctx_http.post("/v1/embeddings",            ex_wrapper(routes.post_embeddings_oai));
    ctx_http.post("/rerank",                   ex_wrapper(routes.post_rerank));
    ctx_http.post("/reranking",                ex_wrapper(routes.post_rerank));
    ctx_http.post("/v1/rerank",                ex_wrapper(routes.post_rerank));
    ctx_http.post("/v1/reranking",             ex_wrapper(routes.post_rerank));
    ctx_http.post("/tokenize",                 ex_wrapper(routes.post_tokenize));
    ctx_http.post("/detokenize",               ex_wrapper(routes.post_detokenize));
    ctx_http.post("/apply-template",           ex_wrapper(routes.post_apply_template));

    // LoRA adapters hotswap
    ctx_http.get ("/lora-adapters",            ex_wrapper(routes.get_lora_adapters));
    ctx_http.post("/lora-adapters",            ex_wrapper(routes.post_lora_adapters));

    // Save & load slots
    ctx_http.get ("/slots",                    ex_wrapper(routes.get_slots));
    ctx_http.post("/slots/:id_slot",           ex_wrapper(routes.post_slots));

    //
    // Start the server
    //

    // setup clean up function, to be called before exit
    // (stops HTTP first so no new requests arrive while the server context shuts down)
    auto clean_up = [&ctx_http, &ctx_server]() {
        SRV_INF("%s: cleaning up before exit...\n", __func__);
        ctx_http.stop();
        ctx_server.terminate();
        llama_backend_free();
    };

    // start the HTTP server before loading the model to be able to serve /health requests
    if (!ctx_http.start()) {
        clean_up();
        LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
        return 1;
    }

    // load the model
    LOG_INF("%s: loading model\n", __func__);

    if (!ctx_server.load_model(params)) {
        clean_up();
        // the HTTP thread is already running at this point, so it must be joined
        // before exiting (unlike the start() failure path above)
        if (ctx_http.thread.joinable()) {
            ctx_http.thread.join();
        }
        LOG_ERR("%s: exiting due to model loading error\n", __func__);
        return 1;
    }

    ctx_server.init();
    // flip the readiness flag so /health (via the routes lambda) reports ready
    ctx_http.is_ready.store(true);

    LOG_INF("%s: model loaded\n", __func__);

    shutdown_handler = [&](int) {
        // this will unblock start_loop()
        ctx_server.terminate();
    };

    // TODO: refactor in common/console
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
    sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
    // only Ctrl+C is mapped to SIGINT here; other console events fall through
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif

    LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
    LOG_INF("%s: starting the main loop...\n", __func__);

    // this call blocks the main thread until ctx_server.terminate() is called
    ctx_server.start_loop();

    clean_up();
    if (ctx_http.thread.joinable()) {
        ctx_http.thread.join();
    }
    llama_memory_breakdown_print(ctx_server.get_llama_context());

    return 0;
}