| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- #include "server-http.h"
- #include "server-task.h"
- #include "server-queue.h"
- #include <nlohmann/json_fwd.hpp>
- #include <cstddef>
- #include <memory>
- struct server_context_impl; // private implementation
- struct server_context_meta { // plain data record of server/model metadata; server_context::get_meta() returns one by value
- std::string build_info;
- std::string model_name;
- std::string model_path;
- bool has_mtmd; // note: the scalar/enum members of this struct have no default initializers, so a default-constructed instance leaves them indeterminate
- bool has_inp_image;
- bool has_inp_audio;
- json json_webui_settings; // NOTE(review): the `json` alias is presumably declared by one of the included project headers - confirm
- int slot_n_ctx;
- enum llama_pooling_type pooling_type;
- // chat templates (kept as strings)
- std::string chat_template;
- std::string chat_template_tool_use;
- // tokens: BOS/EOS are kept as strings, the fim_* entries are kept as llama_token values
- std::string bos_token_str;
- std::string eos_token_str;
- llama_token fim_pre_token;
- llama_token fim_sub_token;
- llama_token fim_mid_token;
- // model meta
- enum llama_vocab_type model_vocab_type;
- int32_t model_vocab_n_tokens;
- int32_t model_n_ctx_train;
- int32_t model_n_embd_inp;
- uint64_t model_n_params;
- uint64_t model_size; // NOTE(review): unit is not visible in this header (presumably bytes) - confirm
- };
- struct server_context { // public facade: its only data member is the pointer to the private implementation
- std::unique_ptr<server_context_impl> impl; // server_context_impl is only forward-declared in this header
- server_context(); // declared only; the definition lives outside this header
- ~server_context(); // declared only; must be defined where server_context_impl is a complete type, as unique_ptr requires for deletion
- // load the model and initialize the llama_context
- // returns true on success
- bool load_model(const common_params & params);
- // this function blocks the main thread until termination
- void start_loop();
- // terminate the main loop (this unblocks start_loop)
- void terminate();
- // get the underlying llama_context, can return nullptr if sleeping
- // not thread-safe, should only be used from the main thread
- llama_context * get_llama_context() const;
- // get a new response reader, used by the CLI application
- server_response_reader get_response_reader();
- // get server metadata (read-only), can only be called after load_model()
- // not thread-safe, should only be used from the main thread
- server_context_meta get_meta() const; // returned by value, so the caller receives its own copy
- };
- // forward declarations
- struct server_res_generator;
- struct server_routes { // holds one handler_t per HTTP endpoint plus the private helpers and references they share
- server_routes(const common_params & params, server_context & ctx_server); // NOTE(review): presumably binds the reference members declared at the end of this struct - confirm in the definition
- void init_routes(); // NOTE(review): presumably assigns the handler_t members below - confirm in the definition
- // note: this is not thread-safe and can only be called when ctx_http.is_ready is false
- void update_meta(const server_context & ctx_server) {
- this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta()); // replaces any previously stored metadata with a fresh copy
- }
- // handlers are lambda functions, so that they can capture `this` without `std::bind`
- // they won't be called until ctx_http.is_ready is set to true
- server_http_context::handler_t get_health;
- server_http_context::handler_t get_metrics;
- server_http_context::handler_t get_slots;
- server_http_context::handler_t post_slots;
- server_http_context::handler_t get_props;
- server_http_context::handler_t post_props;
- server_http_context::handler_t get_api_show;
- server_http_context::handler_t post_infill;
- server_http_context::handler_t post_completions;
- server_http_context::handler_t post_completions_oai;
- server_http_context::handler_t post_chat_completions;
- server_http_context::handler_t post_anthropic_messages;
- server_http_context::handler_t post_anthropic_count_tokens;
- server_http_context::handler_t post_apply_template;
- server_http_context::handler_t get_models;
- server_http_context::handler_t post_tokenize;
- server_http_context::handler_t post_detokenize;
- server_http_context::handler_t post_embeddings;
- server_http_context::handler_t post_embeddings_oai;
- server_http_context::handler_t post_rerank;
- server_http_context::handler_t get_lora_adapters;
- server_http_context::handler_t post_lora_adapters;
- private:
- std::unique_ptr<server_res_generator> handle_completions_impl(
- const server_http_req & req,
- server_task_type type,
- const json & data,
- const std::vector<raw_buffer> & files,
- task_response_type res_type);
- std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
- std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
- std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot); // the request parameter is intentionally left unnamed in this declaration
- std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
- // using unique_ptr to allow late initialization of a const object (set through update_meta())
- std::unique_ptr<const server_context_meta> meta;
- const common_params & params; // reference members: the referenced objects must outlive this server_routes
- const server_context_impl & ctx_server;
- server_queue & queue_tasks;
- server_response & queue_results;
- std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false); // NOTE(review): the effect of bypass_sleep is not visible in this header - see the definition
- };
|