server-common.h

#pragma once

#include "common.h"
#include "log.h"
#include "llama.h"
#include "chat.h"
#include "mtmd.h"

#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>

#include <string>
#include <vector>
#include <cinttypes>

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"

const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

using json = nlohmann::ordered_json;

#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)

#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)

using raw_buffer = std::vector<uint8_t>;

template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    // Fallback null to default value
    if (body.contains(key) && !body.at(key).is_null()) {
        try {
            return body.at(key);
        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
            return default_value;
        }
    } else {
        return default_value;
    }
}
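
// illustrative usage sketch (not part of the original header): reading optional request fields
// with a typed fallback; `body` and the field names below are hypothetical request JSON
//
//   int         n_predict = json_value(body, "n_predict", 128);  // 128 if the key is missing or null
//   std::string model     = json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
//   // a value of the wrong type (e.g. "n_predict": "many") logs a warning and returns the default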

// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,
    ERROR_TYPE_AUTHENTICATION,
    ERROR_TYPE_SERVER,
    ERROR_TYPE_NOT_FOUND,
    ERROR_TYPE_PERMISSION,
    ERROR_TYPE_UNAVAILABLE,         // custom error
    ERROR_TYPE_NOT_SUPPORTED,       // custom error
    ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error
};

// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
    common_grammar_trigger value;

    server_grammar_trigger() = default;
    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
    server_grammar_trigger(const json & in) {
        value.type  = (common_grammar_trigger_type) in.at("type").get<int>();
        value.value = in.at("value").get<std::string>();
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            value.token = (llama_token) in.at("token").get<int>();
        }
    }

    json to_json() const {
        json out {
            {"type",  (int) value.type},
            {"value", value.value},
        };
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            out["token"] = (int) value.token;
        }
        return out;
    }
};
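
// illustrative round-trip sketch (not part of the original header): a trigger serialized with
// to_json() can be reconstructed from the same JSON, e.g. when passing sampling params between tasks
//
//   server_grammar_trigger trig(some_trigger); // some_trigger is a hypothetical common_grammar_trigger
//   json j = trig.to_json();                   // {"type": ..., "value": ...} plus "token" for TOKEN triggers
//   server_grammar_trigger restored(j);        // parses the same fields back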

json format_error_response(const std::string & message, const enum error_type type);

//
// random string / id
//

std::string random_string();
std::string gen_chatcmplid();
std::string gen_tool_call_id();

//
// lora utils
//

// check whether the given lora set has only aloras activated (empty => false)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);

// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
bool lora_should_clear_cache(
    const std::vector<common_adapter_lora_info> & current,
    const std::vector<common_adapter_lora_info> & next);

std::vector<common_adapter_lora_info> parse_lora_request(
    const std::vector<common_adapter_lora_info> & lora_base,
    const json & data);

bool are_lora_equal(
    const std::vector<common_adapter_lora_info> & l1,
    const std::vector<common_adapter_lora_info> & l2);

// get the ids of all enabled loras
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);

//
// server_tokens
//

/**
 * server_tokens is a helper to manage the input tokens and images for the server.
 * it is made this way to simplify the logic of KV cache management.
 */
struct server_tokens {
    bool has_mtmd = false;

private: // disallow accessing these members directly, risking out-of-sync state

    // maps a **start** index in tokens to the image chunk
    // note: the order needs to be in-sync with tokens
    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;

    // list of tokens
    // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by a media chunk
    // otherwise, it is a normal text token
    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
    // note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
    llama_tokens tokens;

    // for ex. with an input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
    //     [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
    // idx  0   1   2   3   4    5      6      7      8      9      10
    // pos  0   1   2   3   4    5      5      5      7      7      7
    // map_idx_to_media will contain: {5, img0}, {8, img1}

public:
    server_tokens() = default;
    ~server_tokens() = default;

    // Prevent copying
    // TODO: server_tokens should be copyable - remove this:
    server_tokens(const server_tokens &) = delete;
    server_tokens & operator=(const server_tokens &) = delete;

    // Allow moving (usually implicitly generated if members are movable)
    server_tokens(server_tokens &&) = default;
    server_tokens & operator=(server_tokens &&) = default;

    // Allow accessing elements using the [] operator
    llama_token operator[](size_t index) { return tokens[index]; }
    const llama_token & operator[](size_t index) const { return tokens[index]; }

    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
    server_tokens(const llama_tokens & tokens, bool has_mtmd);

    // for debugging
    std::string str() const;

    llama_pos pos_next() const;

    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;

    void push_back(llama_token tok);

    // will create a copy of the chunk if it contains non-text data
    void push_back(const mtmd_input_chunk * chunk);

    // appends server tokens, updates the media map. copies media chunks.
    void push_back(server_tokens & tokens);

    // for compatibility with context shift and prompt truncation
    void insert(const llama_tokens & inp_tokens);

    // for compatibility with speculative decoding, ctx shift, slot save/load
    const llama_tokens & get_text_tokens() const;

    // for compatibility with speculative decoding
    void set_token(llama_pos pos, llama_token id);

    size_t size() const { return tokens.size(); }

    bool empty() const { return tokens.empty(); }

    void clear() {
        map_idx_to_media.clear();
        tokens.clear();
    }

    void keep_first(size_t n);

    std::string detokenize(const llama_context * ctx, bool special) const;

    size_t get_common_prefix(const server_tokens & b) const;

    // make sure all text tokens are within the vocab range
    bool validate(const struct llama_context * ctx) const;

    // encode and decode the image chunk
    int32_t process_chunk(
        llama_context * ctx,
        mtmd_context * mctx,
        size_t idx,
        llama_pos pos,
        int32_t seq_id,
        size_t & n_tokens_out) const;
};
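
// illustrative usage sketch (not part of the original header): the typical prefix-reuse flow when a
// slot receives a new prompt; `prompt_tokens` and `cache_tokens` below are hypothetical
//
//   server_tokens prompt(prompt_tokens, /* has_mtmd */ false);
//   size_t n_common = cache_tokens.get_common_prefix(prompt); // part already present in the KV cache
//   cache_tokens.keep_first(n_common);                        // drop the mismatching suffix
//   // the remaining text tokens / media chunks are then appended and decoded starting at pos_next()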

//
// tokenizer and input processing utils
//

bool json_is_array_of_numbers(const json & data);

// does the array contain BOTH numbers & strings?
bool json_is_array_of_mixed_numbers_strings(const json & data);

// does the array contain any individual integers/tokens?
bool json_is_array_and_contains_numbers(const json & data);

// get values by path (key1 / key2)
json json_get_nested_values(const std::vector<std::string> & paths, const json & js);

/**
 * this handles 2 cases:
 * - only string, example: "string"
 * - mixed string and tokens, example: [12, 34, "string", 56, 78]
 */
llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);

// return the last index of a character that can form a valid string
// if the last character is potentially cut in half, return the index before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
size_t validate_utf8(const std::string & text);
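
// illustrative sketch (not part of the original header): when streaming partial output, emit only the
// prefix that is known to be complete UTF-8 and keep the tail for the next chunk; `text` is hypothetical
//
//   size_t n_valid = validate_utf8(text);
//   std::string to_send = text.substr(0, n_valid); // safe to send as-is
//   // bytes in [n_valid, text.size()) may be a multibyte character cut in half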

// process an mtmd prompt, return the server_tokens containing both text tokens and media chunks
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);

/**
 * break the input "prompt" object into multiple prompts if needed, then tokenize them
 * this supports these cases:
 * - "prompt": "string"
 * - "prompt": [12, 34, 56]
 * - "prompt": [12, 34, "string", 56, 78]
 * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
 * and multiple prompts (multi-tasks):
 * - "prompt": ["string1", "string2"]
 * - "prompt": ["string1", [12, 34, 56]]
 * - "prompt": [[12, 34, 56], [78, 90, 12]]
 * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ] }]
 */
std::vector<server_tokens> tokenize_input_prompts(
    const llama_vocab * vocab,
    mtmd_context * mctx,
    const json & json_prompt,
    bool add_special,
    bool parse_special);
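
// illustrative usage sketch (not part of the original header): a single prompt yields one element,
// an array of prompts yields one server_tokens per task; `body` below is hypothetical request JSON
//
//   json prompt = body.at("prompt"); // e.g. ["string1", [12, 34, 56]]
//   std::vector<server_tokens> inputs = tokenize_input_prompts(vocab, mctx, prompt, /* add_special */ true, /* parse_special */ true);
//   // inputs.size() == 2 here - one entry per prompt / task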

//
// OAI utils
//

// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);

struct oaicompat_parser_options {
    bool use_jinja;
    bool prefill_assistant;
    common_reasoning_format reasoning_format;
    std::map<std::string, std::string> chat_template_kwargs;
    common_chat_templates * tmpls;
    bool allow_image;
    bool allow_audio;
    bool enable_thinking = true;
};

// used by /chat/completions endpoint
json oaicompat_chat_params_parse(
    json & body, /* openai api json semantics */
    const oaicompat_parser_options & opt,
    std::vector<raw_buffer> & out_files);

// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);

// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);

// TODO: move it to server-task.cpp
json format_response_rerank(
    const json & request,
    const json & ranks,
    bool is_tei_format,
    std::vector<std::string> & texts,
    int top_n);

//
// other utils
//

std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);

std::string safe_json_to_str(const json & data);

std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);

// format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);

// format server-sent event (SSE), return the formatted string to send
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_sse(const json & data);

// format Anthropic-style SSE with event types
std::string format_anthropic_sse(const json & data);
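
// illustrative sketch (not part of the original header): typical SSE framing for OAI-compatible
// streaming; the exact payload fields depend on the endpoint, and the wire format shown is an assumption
//
//   std::string ev = format_oai_sse(json {{"choices", json::array()}});
//   // ev is expected to look like: "data: {\"choices\":[]}\n\n"
//   // a json array input would instead produce one "data: ..." event per item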

bool is_valid_utf8(const std::string & str);

//
// formatting output responses
// TODO: move these to server-task.cpp
//

llama_tokens format_prompt_infill(
    const llama_vocab * vocab,
    const json & input_prefix,
    const json & input_suffix,
    const json & input_extra,
    const int n_batch,
    const int n_predict,
    const int n_ctx,
    const bool spm_infill,
    const llama_tokens & tokens_prompt);

// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
server_tokens format_prompt_rerank(
    const struct llama_model * model,
    const struct llama_vocab * vocab,
    mtmd_context * mctx,
    const std::string & query,
    const std::string & doc);