| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174 |
- #pragma once
- #include "common.h"
- #include "server-http.h"
- #include <mutex>
- #include <condition_variable>
- #include <functional>
- #include <memory>
/**
 * state diagram:
 *
 * UNLOADED ──► LOADING ──► LOADED
 *  ▲            │            │
 *  └───failed───┘            │
 *  ▲                         │
 *  └────────unloaded─────────┘
 */
// Status of a model instance; the allowed transitions are shown in the
// state diagram above. Numeric values are spelled out explicitly and match
// the implicit numbering (0, 1, 2).
enum server_model_status {
    // TODO: also add downloading state when the logic is added
    SERVER_MODEL_STATUS_UNLOADED = 0, // start state; also the target of the "failed" and "unloaded" transitions
    SERVER_MODEL_STATUS_LOADING  = 1, // entered from UNLOADED; leads to LOADED, or back to UNLOADED on failure
    SERVER_MODEL_STATUS_LOADED   = 2, // reached from LOADING; returns to UNLOADED when unloaded
};
- static server_model_status server_model_status_from_string(const std::string & status_str) {
- if (status_str == "unloaded") {
- return SERVER_MODEL_STATUS_UNLOADED;
- }
- if (status_str == "loading") {
- return SERVER_MODEL_STATUS_LOADING;
- }
- if (status_str == "loaded") {
- return SERVER_MODEL_STATUS_LOADED;
- }
- throw std::runtime_error("invalid server model status");
- }
- static std::string server_model_status_to_string(server_model_status status) {
- switch (status) {
- case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
- case SERVER_MODEL_STATUS_LOADING: return "loading";
- case SERVER_MODEL_STATUS_LOADED: return "loaded";
- default: return "unknown";
- }
- }
- struct server_model_meta {
- std::string name;
- std::string path;
- std::string path_mmproj; // only available if in_cache=false
- bool in_cache = false; // if true, use -hf; use -m otherwise
- int port = 0;
- server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
- int64_t last_used = 0; // for LRU unloading
- std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
- int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
- bool is_active() const {
- return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
- }
- bool is_failed() const {
- return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
- }
- };
- struct subprocess_s;
- struct server_models {
- private:
- struct instance_t {
- std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread
- std::thread th;
- server_model_meta meta;
- FILE * stdin_file = nullptr;
- };
- std::mutex mutex;
- std::condition_variable cv;
- std::map<std::string, instance_t> mapping;
- common_params base_params;
- std::vector<std::string> base_args;
- std::vector<std::string> base_env;
- void update_meta(const std::string & name, const server_model_meta & meta);
- // unload least recently used models if the limit is reached
- void unload_lru();
- public:
- server_models(const common_params & params, int argc, char ** argv, char ** envp);
- // check if a model instance exists
- bool has_model(const std::string & name);
- // return a copy of model metadata
- std::optional<server_model_meta> get_meta(const std::string & name);
- // return a copy of all model metadata
- std::vector<server_model_meta> get_all_meta();
- // if auto_load is true, load the model with previous args if any
- void load(const std::string & name, bool auto_load);
- void unload(const std::string & name);
- void unload_all();
- // update the status of a model instance
- void update_status(const std::string & name, server_model_status status);
- // wait until the model instance is fully loaded
- // return when the model is loaded or failed to load
- void wait_until_loaded(const std::string & name);
- // load the model if not loaded, otherwise do nothing
- // return false if model is already loaded; return true otherwise (meta may need to be refreshed)
- bool ensure_model_loaded(const std::string & name);
- // proxy an HTTP request to the model instance
- server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
- // notify the router server that a model instance is ready
- // return the monitoring thread (to be joined by the caller)
- static std::thread setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);
- };
- struct server_models_routes {
- common_params params;
- server_models models;
- server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
- : params(params), models(params, argc, argv, envp) {
- init_routes();
- }
- void init_routes();
- // handlers using lambda function, so that they can capture `this` without `std::bind`
- server_http_context::handler_t get_router_props;
- server_http_context::handler_t proxy_get;
- server_http_context::handler_t proxy_post;
- server_http_context::handler_t get_router_models;
- server_http_context::handler_t post_router_models_load;
- server_http_context::handler_t post_router_models_status;
- server_http_context::handler_t post_router_models_unload;
- };
- /**
- * A simple HTTP proxy that forwards requests to another server
- * and relays the responses back.
- */
- struct server_http_proxy : server_http_res {
- std::function<void()> cleanup = nullptr;
- public:
- server_http_proxy(const std::string & method,
- const std::string & host,
- int port,
- const std::string & path,
- const std::map<std::string, std::string> & headers,
- const std::string & body,
- const std::function<bool()> should_stop);
- ~server_http_proxy() {
- if (cleanup) {
- cleanup();
- }
- }
- private:
- std::thread thread;
- struct msg_t {
- std::map<std::string, std::string> headers;
- int status = 0;
- std::string data;
- };
- };
|