// server-context.h (4.5 KB)
  1. #include "server-http.h"
  2. #include "server-task.h"
  3. #include "server-queue.h"
  4. #include <nlohmann/json_fwd.hpp>
  5. #include <cstddef>
  6. #include <memory>
  7. struct server_context_impl; // private implementation
  8. struct server_context_meta {
  9. std::string build_info;
  10. std::string model_name;
  11. std::string model_path;
  12. bool has_mtmd;
  13. bool has_inp_image;
  14. bool has_inp_audio;
  15. json json_webui_settings;
  16. int slot_n_ctx;
  17. enum llama_pooling_type pooling_type;
  18. // chat template
  19. std::string chat_template;
  20. std::string chat_template_tool_use;
  21. // tokens
  22. std::string bos_token_str;
  23. std::string eos_token_str;
  24. llama_token fim_pre_token;
  25. llama_token fim_sub_token;
  26. llama_token fim_mid_token;
  27. // model meta
  28. enum llama_vocab_type model_vocab_type;
  29. int32_t model_vocab_n_tokens;
  30. int32_t model_n_ctx_train;
  31. int32_t model_n_embd_inp;
  32. uint64_t model_n_params;
  33. uint64_t model_size;
  34. };
  35. struct server_context {
  36. std::unique_ptr<server_context_impl> impl;
  37. server_context();
  38. ~server_context();
  39. // load the model and initialize llama_context
  40. // returns true on success
  41. bool load_model(const common_params & params);
  42. // this function will block main thread until termination
  43. void start_loop();
  44. // terminate main loop (will unblock start_loop)
  45. void terminate();
  46. // get the underlaying llama_context, can return nullptr if sleeping
  47. // not thread-safe, should only be used from the main thread
  48. llama_context * get_llama_context() const;
  49. // get a new response reader, used by CLI application
  50. server_response_reader get_response_reader();
  51. // get server metadata (read-only), can only be called after load_model()
  52. // not thread-safe, should only be used from the main thread
  53. server_context_meta get_meta() const;
  54. };
  55. // forward declarations
  56. struct server_res_generator;
  57. struct server_routes {
  58. server_routes(const common_params & params, server_context & ctx_server);
  59. void init_routes();
  60. // note: this is not thread-safe and can only when ctx_http.is_ready is false
  61. void update_meta(const server_context & ctx_server) {
  62. this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
  63. }
  64. // handlers using lambda function, so that they can capture `this` without `std::bind`
  65. // they won't be called until ctx_http.is_ready is set to true
  66. server_http_context::handler_t get_health;
  67. server_http_context::handler_t get_metrics;
  68. server_http_context::handler_t get_slots;
  69. server_http_context::handler_t post_slots;
  70. server_http_context::handler_t get_props;
  71. server_http_context::handler_t post_props;
  72. server_http_context::handler_t get_api_show;
  73. server_http_context::handler_t post_infill;
  74. server_http_context::handler_t post_completions;
  75. server_http_context::handler_t post_completions_oai;
  76. server_http_context::handler_t post_chat_completions;
  77. server_http_context::handler_t post_anthropic_messages;
  78. server_http_context::handler_t post_anthropic_count_tokens;
  79. server_http_context::handler_t post_apply_template;
  80. server_http_context::handler_t get_models;
  81. server_http_context::handler_t post_tokenize;
  82. server_http_context::handler_t post_detokenize;
  83. server_http_context::handler_t post_embeddings;
  84. server_http_context::handler_t post_embeddings_oai;
  85. server_http_context::handler_t post_rerank;
  86. server_http_context::handler_t get_lora_adapters;
  87. server_http_context::handler_t post_lora_adapters;
  88. private:
  89. std::unique_ptr<server_res_generator> handle_completions_impl(
  90. const server_http_req & req,
  91. server_task_type type,
  92. const json & data,
  93. const std::vector<raw_buffer> & files,
  94. task_response_type res_type);
  95. std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
  96. std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
  97. std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
  98. std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
  99. // using unique_ptr to allow late initialization of const
  100. std::unique_ptr<const server_context_meta> meta;
  101. const common_params & params;
  102. const server_context_impl & ctx_server;
  103. server_queue & queue_tasks;
  104. server_response & queue_results;
  105. std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
  106. };