server.cpp 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124
#include "common.h"
#include "llama.h"
#include "build-info.h"

#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif

#include "httplib.h"
#include "json.hpp"

// auto generated files (update with ./deps.sh)
#include "index.html.hpp"
#include "index.js.hpp"
#include "completion.js.hpp"

#include <stdexcept>
// Verbose logging is compiled in unless the build defines SERVER_VERBOSE to
// something other than 1 (the LOG_VERBOSE macro below tests this value).
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif

// Shorthand used throughout the file: unqualified httplib names and `json`
// for nlohmann::json.
using namespace httplib;
using json = nlohmann::json;
  19. struct server_params {
  20. std::string hostname = "127.0.0.1";
  21. std::string public_path = "examples/server/public";
  22. int32_t port = 8080;
  23. int32_t read_timeout = 600;
  24. int32_t write_timeout = 600;
  25. };
// completion token output with probabilities
struct completion_token_output {
    // One candidate token and the probability recorded for it.
    struct token_prob {
        llama_token tok;
        float prob;
    };

    // Candidates recorded for this step (filled from the sampler's candidate
    // array, at most n_probs entries).
    std::vector<token_prob> probs;
    // The token produced for this step; -1 is used as "no token".
    llama_token tok;
};
  35. static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
  36. size_t i;
  37. for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
  38. return i;
  39. }
// Selects how findStoppingStrings() looks for a stop word in the text.
enum stop_type {
    STOP_FULL,    // search for a complete stop word
    STOP_PARTIAL, // search for a prefix of a stop word at the very end of the text
};
  44. static bool ends_with(const std::string & str, const std::string & suffix) {
  45. return str.size() >= suffix.size() &&
  46. 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
  47. }
  48. static size_t find_partial_stop_string(const std::string & stop,
  49. const std::string & text) {
  50. if (!text.empty() && !stop.empty()) {
  51. const char text_last_char = text.back();
  52. for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
  53. if (stop[char_index] == text_last_char) {
  54. const std::string current_partial = stop.substr(0, char_index + 1);
  55. if (ends_with(text, current_partial)) {
  56. return text.size() - char_index - 1;
  57. }
  58. }
  59. }
  60. }
  61. return std::string::npos;
  62. }
  63. template<class Iter>
  64. static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
  65. std::string ret;
  66. for (; begin != end; ++begin) {
  67. ret += llama_token_to_str(ctx, *begin);
  68. }
  69. return ret;
  70. }
  71. static void server_log(const char * level, const char * function, int line,
  72. const char * message, const nlohmann::ordered_json & extra) {
  73. nlohmann::ordered_json log {
  74. { "timestamp", time(nullptr) },
  75. { "level", level },
  76. { "function", function },
  77. { "line", line },
  78. { "message", message },
  79. };
  80. if (!extra.empty()) {
  81. log.merge_patch(extra);
  82. }
  83. const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
  84. fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
  85. fflush(stdout);
  86. }
  87. // format incomplete utf-8 multibyte character for output
  88. static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
  89. std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
  90. // if first bit is 1, meaning it's a partial character
  91. if (out.size() > 0 && (out[0] & 0x80) == 0x80) {
  92. std::stringstream ss;
  93. ss<< std::hex << (out[0] & 0xff);
  94. std::string res ( ss.str() );
  95. out = "byte: \\x" + res;
  96. }
  97. return out;
  98. }
  99. // convert a vector of completion_token_output to json
  100. static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> probs) {
  101. json out = json::array();
  102. for (const auto & prob : probs) {
  103. json probs_for_token = json::array();
  104. for (const auto & p : prob.probs) {
  105. std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
  106. probs_for_token.push_back(json {
  107. { "tok_str", tok_str },
  108. { "prob", p.prob },
  109. });
  110. }
  111. std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
  112. out.push_back(json {
  113. {"content", tok_str},
  114. {"probs", probs_for_token},
  115. });
  116. }
  117. return out;
  118. }
// Runtime switch for LOG_VERBOSE; turned on by the -v / --verbose argument.
static bool server_verbose = false;

// LOG_VERBOSE expands to nothing unless SERVER_VERBOSE == 1. When compiled in,
// it forwards to server_log() only while server_verbose is true.
#if SERVER_VERBOSE != 1
# define LOG_VERBOSE(MSG, ...)
#else
# define LOG_VERBOSE(MSG, ...) \
do { \
if (server_verbose) { \
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while(0)
#endif

// Unconditional log helpers. The variadic part is the `extra` JSON object
// passed to server_log(); pass {} when there is nothing to add.
#define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
  133. struct llama_server_context {
  134. bool stream = false;
  135. bool has_next_token = false;
  136. std::string generated_text;
  137. std::vector<completion_token_output> generated_token_probs;
  138. size_t num_prompt_tokens = 0;
  139. size_t num_tokens_predicted = 0;
  140. size_t n_past = 0;
  141. size_t n_remain = 0;
  142. std::vector<llama_token> embd;
  143. std::vector<llama_token> last_n_tokens;
  144. llama_model * model = nullptr;
  145. llama_context * ctx = nullptr;
  146. gpt_params params;
  147. bool truncated = false;
  148. bool stopped_eos = false;
  149. bool stopped_word = false;
  150. bool stopped_limit = false;
  151. std::string stopping_word;
  152. int32_t multibyte_pending = 0;
  153. std::mutex mutex;
  154. std::unique_lock<std::mutex> lock() {
  155. return std::unique_lock<std::mutex>(mutex);
  156. }
  157. ~llama_server_context() {
  158. if (ctx) {
  159. llama_free(ctx);
  160. ctx = nullptr;
  161. }
  162. if (model) {
  163. llama_free_model(model);
  164. model = nullptr;
  165. }
  166. }
  167. void rewind() {
  168. params.antiprompt.clear();
  169. num_prompt_tokens = 0;
  170. num_tokens_predicted = 0;
  171. generated_text = "";
  172. generated_text.reserve(params.n_ctx);
  173. generated_token_probs.clear();
  174. truncated = false;
  175. stopped_eos = false;
  176. stopped_word = false;
  177. stopped_limit = false;
  178. stopping_word = "";
  179. multibyte_pending = 0;
  180. n_remain = 0;
  181. n_past = 0;
  182. }
  183. bool loadModel(const gpt_params & params_) {
  184. params = params_;
  185. std::tie(model, ctx) = llama_init_from_gpt_params(params);
  186. if (model == nullptr) {
  187. LOG_ERROR("unable to load model", { { "model", params_.model } });
  188. return false;
  189. }
  190. last_n_tokens.resize(params.n_ctx);
  191. std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
  192. return true;
  193. }
  194. void loadPrompt() {
  195. params.prompt.insert(0, 1, ' '); // always add a first space
  196. std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
  197. num_prompt_tokens = prompt_tokens.size();
  198. if (params.n_keep < 0) {
  199. params.n_keep = (int)num_prompt_tokens;
  200. }
  201. params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
  202. // if input prompt is too big, truncate like normal
  203. if (num_prompt_tokens>= (size_t)params.n_ctx) {
  204. const int n_left = (params.n_ctx - params.n_keep) / 2;
  205. std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
  206. const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
  207. new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
  208. std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
  209. LOG_VERBOSE("input truncated", {
  210. { "n_ctx", params.n_ctx },
  211. { "n_keep", params.n_keep },
  212. { "n_left", n_left },
  213. { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) },
  214. });
  215. truncated = true;
  216. prompt_tokens = new_tokens;
  217. } else {
  218. const size_t ps = num_prompt_tokens;
  219. std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
  220. std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
  221. }
  222. // compare the evaluated prompt with the new prompt
  223. n_past = common_part(embd, prompt_tokens);
  224. embd = prompt_tokens;
  225. if (n_past == num_prompt_tokens) {
  226. // we have to evaluate at least 1 token to generate logits.
  227. n_past--;
  228. }
  229. LOG_VERBOSE("prompt ingested", {
  230. { "n_past", n_past },
  231. { "cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past) },
  232. { "to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) },
  233. });
  234. has_next_token = true;
  235. }
  236. void beginCompletion() {
  237. // number of tokens to keep when resetting context
  238. n_remain = params.n_predict;
  239. llama_set_rng_seed(ctx, params.seed);
  240. }
  241. completion_token_output nextToken() {
  242. completion_token_output result;
  243. result.tok = -1;
  244. if (embd.size() >= (size_t)params.n_ctx) {
  245. // Reset context
  246. const int n_left = (params.n_ctx - params.n_keep) / 2;
  247. std::vector<llama_token> new_tokens(embd.begin(), embd.begin() + params.n_keep);
  248. new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end());
  249. embd = new_tokens;
  250. n_past = params.n_keep;
  251. truncated = true;
  252. LOG_VERBOSE("input truncated", {
  253. { "n_ctx", params.n_ctx },
  254. { "n_keep", params.n_keep },
  255. { "n_left", n_left },
  256. { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) },
  257. });
  258. }
  259. while (n_past < embd.size()) {
  260. int n_eval = (int)embd.size() - n_past;
  261. if (n_eval > params.n_batch) {
  262. n_eval = params.n_batch;
  263. }
  264. if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) {
  265. LOG_ERROR("failed to eval", {
  266. { "n_eval", n_eval },
  267. { "n_past", n_past },
  268. { "n_threads", params.n_threads },
  269. { "embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) },
  270. });
  271. has_next_token = false;
  272. return result;
  273. }
  274. n_past += n_eval;
  275. }
  276. if (params.n_predict == 0) {
  277. has_next_token = false;
  278. result.tok = llama_token_eos();
  279. return result;
  280. }
  281. // out of user input, sample next token
  282. const float temp = params.temp;
  283. const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
  284. const float top_p = params.top_p;
  285. const float tfs_z = params.tfs_z;
  286. const float typical_p = params.typical_p;
  287. const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
  288. const float repeat_penalty = params.repeat_penalty;
  289. const float alpha_presence = params.presence_penalty;
  290. const float alpha_frequency = params.frequency_penalty;
  291. const int mirostat = params.mirostat;
  292. const float mirostat_tau = params.mirostat_tau;
  293. const float mirostat_eta = params.mirostat_eta;
  294. const bool penalize_nl = params.penalize_nl;
  295. const int32_t n_probs = params.n_probs;
  296. {
  297. auto * logits = llama_get_logits(ctx);
  298. auto n_vocab = llama_n_vocab(ctx);
  299. // Apply params.logit_bias map
  300. for (const auto & it : params.logit_bias) {
  301. logits[it.first] += it.second;
  302. }
  303. std::vector<llama_token_data> candidates;
  304. candidates.reserve(n_vocab);
  305. for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
  306. candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
  307. }
  308. llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
  309. // Apply penalties
  310. float nl_logit = logits[llama_token_nl()];
  311. auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
  312. llama_sample_repetition_penalty(ctx, &candidates_p,
  313. last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
  314. last_n_repeat, repeat_penalty);
  315. llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
  316. last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
  317. last_n_repeat, alpha_frequency, alpha_presence);
  318. if (!penalize_nl) {
  319. logits[llama_token_nl()] = nl_logit;
  320. }
  321. if (temp <= 0) {
  322. // Greedy sampling
  323. result.tok = llama_sample_token_greedy(ctx, &candidates_p);
  324. if (n_probs > 0) {
  325. llama_sample_softmax(ctx, &candidates_p);
  326. }
  327. } else {
  328. if (mirostat == 1) {
  329. static float mirostat_mu = 2.0f * mirostat_tau;
  330. const int mirostat_m = 100;
  331. llama_sample_temperature(ctx, &candidates_p, temp);
  332. result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
  333. } else if (mirostat == 2) {
  334. static float mirostat_mu = 2.0f * mirostat_tau;
  335. llama_sample_temperature(ctx, &candidates_p, temp);
  336. result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
  337. } else {
  338. // Temperature sampling
  339. size_t min_keep = std::max(1, n_probs);
  340. llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
  341. llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
  342. llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
  343. llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
  344. llama_sample_temperature(ctx, &candidates_p, temp);
  345. result.tok = llama_sample_token(ctx, &candidates_p);
  346. }
  347. }
  348. for (size_t i = 0; i < std::min(candidates_p.size, (size_t) n_probs); ++i) {
  349. result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
  350. }
  351. last_n_tokens.erase(last_n_tokens.begin());
  352. last_n_tokens.push_back(result.tok);
  353. num_tokens_predicted++;
  354. }
  355. // add it to the context
  356. embd.push_back(result.tok);
  357. // decrement remaining sampling budget
  358. --n_remain;
  359. if (!embd.empty() && embd.back() == llama_token_eos()) {
  360. //stopping_word = llama_token_to_str(ctx, embd.back());
  361. has_next_token = false;
  362. stopped_eos = true;
  363. LOG_VERBOSE("eos token found", {});
  364. return result;
  365. }
  366. has_next_token = params.n_predict == -1 || n_remain != 0;
  367. return result;
  368. }
  369. size_t findStoppingStrings(const std::string & text, const size_t last_token_size,
  370. const stop_type type) {
  371. size_t stop_pos = std::string::npos;
  372. for (const std::string & word : params.antiprompt) {
  373. size_t pos;
  374. if (type == STOP_FULL) {
  375. const size_t tmp = word.size() + last_token_size;
  376. const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
  377. pos = text.find(word, from_pos);
  378. }
  379. else {
  380. pos = find_partial_stop_string(word, text);
  381. }
  382. if (pos != std::string::npos &&
  383. (stop_pos == std::string::npos || pos < stop_pos)) {
  384. if (type == STOP_FULL) {
  385. stopping_word = word;
  386. stopped_word = true;
  387. has_next_token = false;
  388. }
  389. stop_pos = pos;
  390. }
  391. }
  392. return stop_pos;
  393. }
  394. completion_token_output doCompletion() {
  395. const completion_token_output token_with_probs = nextToken();
  396. const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
  397. generated_text += token_text;
  398. if (params.n_probs > 0) {
  399. generated_token_probs.push_back(token_with_probs);
  400. }
  401. if (multibyte_pending > 0) {
  402. multibyte_pending -= token_text.size();
  403. } else if (token_text.size() == 1) {
  404. const char c = token_text[0];
  405. // 2-byte characters: 110xxxxx 10xxxxxx
  406. if ((c & 0xE0) == 0xC0) {
  407. multibyte_pending = 1;
  408. // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
  409. } else if ((c & 0xF0) == 0xE0) {
  410. multibyte_pending = 2;
  411. // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  412. } else if ((c & 0xF8) == 0xF0) {
  413. multibyte_pending = 3;
  414. } else {
  415. multibyte_pending = 0;
  416. }
  417. }
  418. if (multibyte_pending > 0 && !has_next_token) {
  419. has_next_token = true;
  420. n_remain++;
  421. }
  422. if (!has_next_token && n_remain == 0) {
  423. stopped_limit = true;
  424. }
  425. LOG_VERBOSE("next token", {
  426. { "token", token_with_probs.tok },
  427. { "token_text", tokens_to_output_formatted_string(ctx, token_with_probs.tok) },
  428. { "has_next_token", has_next_token },
  429. { "n_remain", n_remain },
  430. { "num_tokens_predicted", num_tokens_predicted },
  431. { "stopped_eos", stopped_eos },
  432. { "stopped_word", stopped_word },
  433. { "stopped_limit", stopped_limit },
  434. { "stopping_word", stopping_word },
  435. });
  436. return token_with_probs;
  437. }
  438. std::vector<float> getEmbedding() {
  439. static const int n_embd = llama_n_embd(ctx);
  440. if (!params.embedding) {
  441. LOG_WARNING("embedding disabled", {
  442. { "params.embedding", params.embedding },
  443. });
  444. return std::vector<float>(n_embd, 0.0f);
  445. }
  446. const float * data = llama_get_embeddings(ctx);
  447. std::vector<float> embedding(data, data + n_embd);
  448. return embedding;
  449. }
  450. };
  451. static void server_print_usage(const char * argv0, const gpt_params & params,
  452. const server_params & sparams) {
  453. fprintf(stderr, "usage: %s [options]\n", argv0);
  454. fprintf(stderr, "\n");
  455. fprintf(stderr, "options:\n");
  456. fprintf(stderr, " -h, --help show this help message and exit\n");
  457. fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
  458. fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
  459. fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
  460. fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
  461. fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
  462. fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
  463. if (llama_mlock_supported()) {
  464. fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
  465. }
  466. if (llama_mmap_supported()) {
  467. fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
  468. }
  469. #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
  470. fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
  471. fprintf(stderr, " number of layers to store in VRAM\n");
  472. fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
  473. fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
  474. fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
  475. fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
  476. fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
  477. #endif
  478. fprintf(stderr, " -m FNAME, --model FNAME\n");
  479. fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
  480. fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
  481. fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
  482. fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
  483. fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
  484. fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
  485. fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
  486. fprintf(stderr, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
  487. fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
  488. fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
  489. fprintf(stderr, "\n");
  490. }
  491. static void server_params_parse(int argc, char ** argv, server_params & sparams,
  492. gpt_params & params) {
  493. gpt_params default_params;
  494. server_params default_sparams;
  495. std::string arg;
  496. bool invalid_param = false;
  497. for (int i = 1; i < argc; i++) {
  498. arg = argv[i];
  499. if (arg == "--port") {
  500. if (++i >= argc) {
  501. invalid_param = true;
  502. break;
  503. }
  504. sparams.port = std::stoi(argv[i]);
  505. } else if (arg == "--host") {
  506. if (++i >= argc) {
  507. invalid_param = true;
  508. break;
  509. }
  510. sparams.hostname = argv[i];
  511. } else if (arg == "--path") {
  512. if (++i >= argc) {
  513. invalid_param = true;
  514. break;
  515. }
  516. sparams.public_path = argv[i];
  517. } else if (arg == "--timeout" || arg == "-to") {
  518. if (++i >= argc) {
  519. invalid_param = true;
  520. break;
  521. }
  522. sparams.read_timeout = std::stoi(argv[i]);
  523. sparams.write_timeout = std::stoi(argv[i]);
  524. } else if (arg == "-m" || arg == "--model") {
  525. if (++i >= argc) {
  526. invalid_param = true;
  527. break;
  528. }
  529. params.model = argv[i];
  530. } else if (arg == "-a" || arg == "--alias") {
  531. if (++i >= argc) {
  532. invalid_param = true;
  533. break;
  534. }
  535. params.model_alias = argv[i];
  536. } else if (arg == "-h" || arg == "--help") {
  537. server_print_usage(argv[0], default_params, default_sparams);
  538. exit(0);
  539. } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
  540. if (++i >= argc) {
  541. invalid_param = true;
  542. break;
  543. }
  544. params.n_ctx = std::stoi(argv[i]);
  545. } else if (arg == "--memory-f32" || arg == "--memory_f32") {
  546. params.memory_f16 = false;
  547. } else if (arg == "--threads" || arg == "-t") {
  548. if (++i >= argc) {
  549. invalid_param = true;
  550. break;
  551. }
  552. params.n_threads = std::stoi(argv[i]);
  553. } else if (arg == "-b" || arg == "--batch-size") {
  554. if (++i >= argc) {
  555. invalid_param = true;
  556. break;
  557. }
  558. params.n_batch = std::stoi(argv[i]);
  559. params.n_batch = std::min(512, params.n_batch);
  560. } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
  561. if (++i >= argc) {
  562. invalid_param = true;
  563. break;
  564. }
  565. #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
  566. params.n_gpu_layers = std::stoi(argv[i]);
  567. #else
  568. LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
  569. "See main README.md for information on enabling GPU BLAS support", { { "n_gpu_layers", params.n_gpu_layers } });
  570. #endif
  571. }
  572. else if (arg == "--tensor-split" || arg == "-ts") {
  573. if (++i >= argc) {
  574. invalid_param = true;
  575. break;
  576. }
  577. #ifdef GGML_USE_CUBLAS
  578. std::string arg_next = argv[i];
  579. // split string by , and /
  580. const std::regex regex{ R"([,/]+)" };
  581. std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
  582. std::vector<std::string> split_arg{ it, {} };
  583. GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
  584. for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
  585. if (i_device < split_arg.size()) {
  586. params.tensor_split[i_device] = std::stof(split_arg[i_device]);
  587. }
  588. else {
  589. params.tensor_split[i_device] = 0.0f;
  590. }
  591. }
  592. #else
  593. LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
  594. #endif // GGML_USE_CUBLAS
  595. }
  596. else if (arg == "--low-vram" || arg == "-lv")
  597. {
  598. #ifdef GGML_USE_CUBLAS
  599. params.low_vram = true;
  600. #else
  601. fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
  602. #endif // GGML_USE_CUBLAS
  603. }
  604. else if (arg == "--main-gpu" || arg == "-mg") {
  605. if (++i >= argc) {
  606. invalid_param = true;
  607. break;
  608. }
  609. #ifdef GGML_USE_CUBLAS
  610. params.main_gpu = std::stoi(argv[i]);
  611. #else
  612. LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
  613. #endif
  614. } else if (arg == "--lora") {
  615. if (++i >= argc) {
  616. invalid_param = true;
  617. break;
  618. }
  619. params.lora_adapter = argv[i];
  620. params.use_mmap = false;
  621. } else if (arg == "--lora-base") {
  622. if (++i >= argc) {
  623. invalid_param = true;
  624. break;
  625. }
  626. params.lora_base = argv[i];
  627. } else if (arg == "-v" || arg == "--verbose") {
  628. #if SERVER_VERBOSE != 1
  629. LOG_WARNING("server.cpp is not built with verbose logging.", {});
  630. #else
  631. server_verbose = true;
  632. #endif
  633. } else if (arg == "--mlock") {
  634. params.use_mlock = true;
  635. } else if (arg == "--no-mmap") {
  636. params.use_mmap = false;
  637. } else if (arg == "--embedding") {
  638. params.embedding = true;
  639. } else {
  640. fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
  641. server_print_usage(argv[0], default_params, default_sparams);
  642. exit(1);
  643. }
  644. }
  645. if (invalid_param) {
  646. fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
  647. server_print_usage(argv[0], default_params, default_sparams);
  648. exit(1);
  649. }
  650. }
  651. static json format_generation_settings(llama_server_context & llama) {
  652. const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
  653. const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
  654. eos_bias->second < 0.0f && std::isinf(eos_bias->second);
  655. return json {
  656. { "seed", llama.params.seed },
  657. { "temp", llama.params.temp },
  658. { "top_k", llama.params.top_k },
  659. { "top_p", llama.params.top_p },
  660. { "tfs_z", llama.params.tfs_z },
  661. { "typical_p", llama.params.typical_p },
  662. { "repeat_last_n", llama.params.repeat_last_n },
  663. { "repeat_penalty", llama.params.repeat_penalty },
  664. { "presence_penalty", llama.params.presence_penalty },
  665. { "frequency_penalty", llama.params.frequency_penalty },
  666. { "mirostat", llama.params.mirostat },
  667. { "mirostat_tau", llama.params.mirostat_tau },
  668. { "mirostat_eta", llama.params.mirostat_eta },
  669. { "penalize_nl", llama.params.penalize_nl },
  670. { "stop", llama.params.antiprompt },
  671. { "n_predict", llama.params.n_predict },
  672. { "n_keep", llama.params.n_keep },
  673. { "ignore_eos", ignore_eos },
  674. { "stream", llama.stream },
  675. { "logit_bias", llama.params.logit_bias },
  676. { "n_probs", llama.params.n_probs },
  677. };
  678. }
  679. static json format_embedding_response(llama_server_context & llama) {
  680. return json {
  681. { "embedding", llama.getEmbedding() },
  682. };
  683. }
  684. static json format_final_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
  685. json res = json {
  686. { "content", content },
  687. { "stop", true },
  688. { "model", llama.params.model_alias },
  689. { "tokens_predicted", llama.num_tokens_predicted },
  690. { "tokens_evaluated", llama.num_prompt_tokens },
  691. { "generation_settings", format_generation_settings(llama) },
  692. { "prompt", llama.params.prompt },
  693. { "truncated", llama.truncated },
  694. { "stopped_eos", llama.stopped_eos },
  695. { "stopped_word", llama.stopped_word },
  696. { "stopped_limit", llama.stopped_limit },
  697. { "stopping_word", llama.stopping_word },
  698. };
  699. if (llama.params.n_probs > 0) {
  700. res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
  701. }
  702. return res;
  703. }
  704. static json format_partial_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
  705. json res = json {
  706. { "content", content },
  707. { "stop", false },
  708. };
  709. if (llama.params.n_probs > 0) {
  710. res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
  711. }
  712. return res;
  713. }
  714. static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
  715. return json {
  716. { "tokens", tokens }
  717. };
  718. }
  719. static void parse_options_completion(const json & body, llama_server_context & llama) {
  720. gpt_params default_params;
  721. llama.stream = body.value("stream", false);
  722. llama.params.n_predict = body.value("n_predict", default_params.n_predict);
  723. llama.params.top_k = body.value("top_k", default_params.top_k);
  724. llama.params.top_p = body.value("top_p", default_params.top_p);
  725. llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
  726. llama.params.typical_p = body.value("typical_p", default_params.typical_p);
  727. llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
  728. llama.params.temp = body.value("temperature", default_params.temp);
  729. llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
  730. llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
  731. llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
  732. llama.params.mirostat = body.value("mirostat", default_params.mirostat);
  733. llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
  734. llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
  735. llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
  736. llama.params.n_keep = body.value("n_keep", default_params.n_keep);
  737. llama.params.seed = body.value("seed", default_params.seed);
  738. llama.params.prompt = body.value("prompt", default_params.prompt);
  739. llama.params.n_probs = body.value("n_probs", default_params.n_probs);
  740. llama.params.logit_bias.clear();
  741. if (body.value("ignore_eos", false)) {
  742. llama.params.logit_bias[llama_token_eos()] = -INFINITY;
  743. }
  744. const auto & logit_bias = body.find("logit_bias");
  745. if (logit_bias != body.end() && logit_bias->is_array()) {
  746. const int n_vocab = llama_n_vocab(llama.ctx);
  747. for (const auto & el : *logit_bias) {
  748. if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
  749. llama_token tok = el[0].get<llama_token>();
  750. if (tok >= 0 && tok < n_vocab) {
  751. if (el[1].is_number()) {
  752. llama.params.logit_bias[tok] = el[1].get<float>();
  753. } else if (el[1].is_boolean() && !el[1].get<bool>()) {
  754. llama.params.logit_bias[tok] = -INFINITY;
  755. }
  756. }
  757. }
  758. }
  759. }
  760. llama.params.antiprompt.clear();
  761. const auto & stop = body.find("stop");
  762. if (stop != body.end() && stop->is_array()) {
  763. for (const auto & word : *stop) {
  764. if (!word.empty()) {
  765. llama.params.antiprompt.push_back(word);
  766. }
  767. }
  768. }
  769. LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
  770. }
  771. static void log_server_request(const Request & req, const Response & res) {
  772. LOG_INFO("request", {
  773. { "remote_addr", req.remote_addr },
  774. { "remote_port", req.remote_port },
  775. { "status", res.status },
  776. { "method", req.method },
  777. { "path", req.path },
  778. { "params", req.params },
  779. });
  780. LOG_VERBOSE("request", {
  781. { "request", req.body },
  782. { "response", res.body },
  783. });
  784. }
  785. int main(int argc, char ** argv) {
  786. // own arguments required by this example
  787. gpt_params params;
  788. server_params sparams;
  789. // struct that contains llama context and inference
  790. llama_server_context llama;
  791. server_params_parse(argc, argv, sparams, params);
  792. if (params.model_alias == "unknown") {
  793. params.model_alias = params.model;
  794. }
  795. llama_init_backend(params.numa);
  796. LOG_INFO("build info", {
  797. { "build", BUILD_NUMBER },
  798. { "commit", BUILD_COMMIT }
  799. });
  800. LOG_INFO("system info", {
  801. { "n_threads", params.n_threads },
  802. { "total_threads", std::thread::hardware_concurrency() },
  803. { "system_info", llama_print_system_info() },
  804. });
  805. // load the model
  806. if (!llama.loadModel(params)) {
  807. return 1;
  808. }
  809. Server svr;
  810. svr.set_default_headers({
  811. { "Server", "llama.cpp" },
  812. { "Access-Control-Allow-Origin", "*" },
  813. { "Access-Control-Allow-Headers", "content-type" }
  814. });
  815. // this is only called if no index.js is found in the public --path
  816. svr.Get("/index.js", [](const Request &, Response & res) {
  817. res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
  818. return false;
  819. });
  820. // this is only called if no index.html is found in the public --path
  821. svr.Get("/", [](const Request &, Response & res) {
  822. res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
  823. return false;
  824. });
  825. // this is only called if no index.html is found in the public --path
  826. svr.Get("/completion.js", [](const Request &, Response & res) {
  827. res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
  828. return false;
  829. });
  830. svr.Post("/completion", [&llama](const Request & req, Response & res) {
  831. auto lock = llama.lock();
  832. llama.rewind();
  833. llama_reset_timings(llama.ctx);
  834. parse_options_completion(json::parse(req.body), llama);
  835. llama.loadPrompt();
  836. llama.beginCompletion();
  837. if (!llama.stream) {
  838. size_t stop_pos = std::string::npos;
  839. while (llama.has_next_token) {
  840. const completion_token_output token_with_probs = llama.doCompletion();
  841. const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
  842. stop_pos = llama.findStoppingStrings(llama.generated_text,
  843. token_text.size(), STOP_FULL);
  844. }
  845. if (stop_pos == std::string::npos) {
  846. stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
  847. }
  848. if (stop_pos != std::string::npos) {
  849. llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
  850. llama.generated_text.end());
  851. }
  852. const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
  853. llama_print_timings(llama.ctx);
  854. res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
  855. "application/json");
  856. } else {
  857. const auto chunked_content_provider = [&](size_t, DataSink & sink) {
  858. size_t sent_count = 0;
  859. size_t sent_token_probs_index = 0;
  860. while (llama.has_next_token) {
  861. const completion_token_output token_with_probs = llama.doCompletion();
  862. const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
  863. if (llama.multibyte_pending > 0) {
  864. continue;
  865. }
  866. size_t pos = std::min(sent_count, llama.generated_text.size());
  867. const std::string str_test = llama.generated_text.substr(pos);
  868. size_t stop_pos =
  869. llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
  870. if (stop_pos != std::string::npos) {
  871. llama.generated_text.erase(
  872. llama.generated_text.begin() + pos + stop_pos,
  873. llama.generated_text.end());
  874. pos = std::min(sent_count, llama.generated_text.size());
  875. } else {
  876. stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
  877. STOP_PARTIAL);
  878. }
  879. const std::string to_send = llama.generated_text.substr(pos, stop_pos);
  880. sent_count += to_send.size();
  881. std::vector<completion_token_output> probs_output = {};
  882. if (llama.params.n_probs > 0) {
  883. const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
  884. size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
  885. size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
  886. if (probs_pos < probs_stop_pos) {
  887. probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
  888. }
  889. sent_token_probs_index = probs_stop_pos;
  890. }
  891. const json data = llama.has_next_token
  892. ? format_partial_response(llama, to_send, probs_output)
  893. // Generation is done, send extra information.
  894. : format_final_response(llama, to_send, llama.generated_token_probs);
  895. const std::string str =
  896. "data: " +
  897. data.dump(-1, ' ', false, json::error_handler_t::replace) +
  898. "\n\n";
  899. LOG_VERBOSE("data stream", {
  900. { "to_send", str }
  901. });
  902. if (!sink.write(str.data(), str.size())) {
  903. LOG_VERBOSE("stream closed", {});
  904. llama_print_timings(llama.ctx);
  905. return false;
  906. }
  907. }
  908. llama_print_timings(llama.ctx);
  909. sink.done();
  910. return true;
  911. };
  912. res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
  913. }
  914. });
  915. svr.Options(R"(/.*)", [](const Request &, Response & res) {
  916. return res.set_content("", "application/json");
  917. });
  918. svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
  919. auto lock = llama.lock();
  920. const json body = json::parse(req.body);
  921. const std::string content = body.value("content", "");
  922. const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
  923. const json data = format_tokenizer_response(tokens);
  924. return res.set_content(data.dump(), "application/json");
  925. });
  926. svr.Post("/embedding", [&llama](const Request & req, Response & res) {
  927. auto lock = llama.lock();
  928. const json body = json::parse(req.body);
  929. llama.rewind();
  930. llama_reset_timings(llama.ctx);
  931. llama.params.prompt = body.value("content", "");
  932. llama.params.n_predict = 0;
  933. llama.loadPrompt();
  934. llama.beginCompletion();
  935. llama.doCompletion();
  936. const json data = format_embedding_response(llama);
  937. return res.set_content(data.dump(), "application/json");
  938. });
  939. svr.set_logger(log_server_request);
  940. svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
  941. const auto * fmt = "500 Internal Server Error\n%s";
  942. char buf[BUFSIZ];
  943. try {
  944. std::rethrow_exception(std::move(ep));
  945. } catch (std::exception & e) {
  946. snprintf(buf, sizeof(buf), fmt, e.what());
  947. } catch (...) {
  948. snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
  949. }
  950. res.set_content(buf, "text/plain");
  951. res.status = 500;
  952. });
  953. svr.set_error_handler([](const Request &, Response & res) {
  954. res.set_content("File Not Found", "text/plain");
  955. res.status = 404;
  956. });
  957. // set timeouts and change hostname and port
  958. svr.set_read_timeout(sparams.read_timeout);
  959. svr.set_write_timeout(sparams.write_timeout);
  960. if (!svr.bind_to_port(sparams.hostname, sparams.port)) {
  961. fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
  962. return 1;
  963. }
  964. // Set the base directory for serving static files
  965. svr.set_base_dir(sparams.public_path);
  966. // to make it ctrl+clickable:
  967. fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
  968. LOG_INFO("HTTP server listening", {
  969. { "hostname", sparams.hostname },
  970. { "port", sparams.port },
  971. });
  972. if (!svr.listen_after_bind()) {
  973. return 1;
  974. }
  975. return 0;
  976. }