@@ -47,6 +47,7 @@ struct server_params
};

bool server_verbose = false;
+bool server_log_json = true;

static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
@@ -302,12 +303,43 @@ struct llama_client_slot
}

void print_timings() const {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
- LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
- LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+ char buffer[512];
+ double t_token = t_prompt_processing / num_prompt_tokens_processed;
+ double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+ sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+ t_prompt_processing, num_prompt_tokens_processed,
+ t_token, n_tokens_second);
+ LOG_INFO(buffer, {
+ {"slot_id", id},
+ {"task_id", task_id},
+ {"t_prompt_processing", t_prompt_processing},
+ {"num_prompt_tokens_processed", num_prompt_tokens_processed},
+ {"t_token", t_token},
+ {"n_tokens_second", n_tokens_second},
+ });
+
+ t_token = t_token_generation / n_decoded;
+ n_tokens_second = 1e3 / t_token_generation * n_decoded;
+ sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+ t_token_generation, n_decoded,
+ t_token, n_tokens_second);
+ LOG_INFO(buffer, {
+ {"slot_id", id},
+ {"task_id", task_id},
+ {"t_token_generation", t_token_generation},
+ {"n_decoded", n_decoded},
+ {"t_token", t_token},
+ {"n_tokens_second", n_tokens_second},
+ });
+
+ sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+ LOG_INFO(buffer, {
+ {"slot_id", id},
+ {"task_id", task_id},
+ {"t_prompt_processing", t_prompt_processing},
+ {"t_token_generation", t_token_generation},
+ {"t_total", t_prompt_processing + t_token_generation},
+ });
}
};
@@ -399,7 +431,7 @@ struct llama_server_context
params = params_;
if (!params.mmproj.empty()) {
multimodal = true;
- LOG_TEE("Multi Modal Mode Enabled");
+ LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -452,7 +484,7 @@ struct llama_server_context

const int32_t n_ctx_slot = n_ctx / params.n_parallel;

- LOG_TEE("Available slots:\n");
+ LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
for (int i = 0; i < params.n_parallel; i++)
{
llama_client_slot slot;
@@ -461,7 +493,10 @@ struct llama_server_context
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;

- LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+ LOG_INFO("new slot", {
+ {"slot_id", slot.id},
+ {"n_ctx_slot", slot.n_ctx}
+ });

const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
@@ -471,7 +506,12 @@ struct llama_server_context
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
- LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+
+ LOG_INFO("slot self-extend", {
+ {"slot_id", slot.id},
+ {"ga_n", ga_n},
+ {"ga_w", ga_w}
+ });
}

slot.ga_i = 0;
@@ -765,10 +805,16 @@ struct llama_server_context
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
- LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+ LOG_ERROR("failed to load image", {
+ {"slot_id", slot->id},
+ {"img_sl_id", img_sl.id}
+ });
return false;
}
- LOG_TEE("slot %i - loaded image\n", slot->id);
+ LOG_VERBOSE("image loaded", {
+ {"slot_id", slot->id},
+ {"img_sl_id", img_sl.id}
+ });
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
@@ -828,7 +874,10 @@ struct llama_server_context

all_slots_are_idle = false;

- LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+ LOG_INFO("slot is processing task", {
+ {"slot_id", slot->id},
+ {"task_id", slot->task_id},
+ });

return true;
}
@@ -1391,7 +1440,7 @@ struct llama_server_context
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
- LOG_VERBOSE("no slot is available", {});
+ LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
queue_tasks.defer(task);
break;
}
@@ -1467,7 +1516,17 @@ struct llama_server_context
}
slots_data.push_back(slot_data);
}
- LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
+ LOG_INFO("slot data", {
+ {"task_id", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots}
+ });
+ LOG_VERBOSE("slot data", {
+ {"task_id", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots},
+ {"slots", slots_data}
+ });
task_result res;
res.id = task.id;
res.multitask_id = task.multitask_id;
@@ -1519,7 +1578,7 @@ struct llama_server_context
bool update_slots() {
if (system_need_update)
{
- LOG_TEE("updating system prompt\n");
+ LOG_INFO("updating system prompt", {});
update_system_prompt();
}
@@ -1529,12 +1588,13 @@ struct llama_server_context
{
if (system_prompt.empty() && clean_kv_cache)
{
- LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+ LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
kv_cache_clear();
}
return true;
}

+ LOG_VERBOSE("posting NEXT_RESPONSE", {});
task_server task;
task.type = TASK_TYPE_NEXT_RESPONSE;
task.target_id = -1;
@@ -1548,10 +1608,20 @@ struct llama_server_context
{
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
- const int n_left = system_tokens.size() + slot.n_past - n_keep;
+ const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = n_left / 2;

- LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
+ LOG_INFO("slot context shift", {
+ {"slot_id", slot.id},
+ {"task_id", slot.task_id},
+ {"n_keep", n_keep},
+ {"n_left", n_left},
+ {"n_discard", n_discard},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()}
+ });
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
@@ -1565,17 +1635,12 @@ struct llama_server_context
slot.n_past -= n_discard;

slot.truncated = true;
-
- LOG_VERBOSE("context shift", {
- { "n_ctx", n_ctx },
- { "n_keep", n_keep },
- { "n_left", n_left },
- });
}
}
}

// decode any currently ongoing sequences
+ LOG_VERBOSE("decoding ongoing sequences", {});
for (auto & slot : slots)
{
// release the slot
@@ -1585,7 +1650,15 @@ struct llama_server_context
slot.command = NONE;
slot.t_last_used = ggml_time_us();

- LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+ LOG_INFO("slot released", {
+ {"slot_id", slot.id},
+ {"task_id", slot.task_id},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()},
+ {"truncated", slot.truncated}
+ });
queue_tasks.notify_slot_changed();

continue;
}
slot.ga_i = ga_i;
|
|
|
}
|
|
|
|
|
|
- LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
|
|
+ LOG_INFO("slot progression", {
|
|
|
+ { "slot_id", slot.id },
|
|
|
+ { "task_id", slot.task_id },
|
|
|
+ { "n_past", slot.n_past },
|
|
|
+ { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
|
|
|
+ });
|
|
|
}
|
|
|
|
|
|
slot.cache_tokens = prompt_tokens;
|
|
|
@@ -1741,7 +1819,10 @@ struct llama_server_context
|
|
|
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
|
|
{
|
|
|
// we have to evaluate at least 1 token to generate logits.
|
|
|
- LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
|
|
|
+ LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
|
|
+ { "slot_id", slot.id },
|
|
|
+ { "task_id", slot.task_id }
|
|
|
+ });
|
|
|
slot.n_past--;
|
|
|
if (slot.ga_i > 0)
|
|
|
{
|
|
|
@@ -1749,9 +1830,13 @@ struct llama_server_context
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
|
|
-
|
|
|
- llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
|
|
+ int p0 = (int) system_tokens.size() + slot.n_past;
|
|
|
+ LOG_INFO("kv cache rm [p0, end)", {
|
|
|
+ { "slot_id", slot.id },
|
|
|
+ { "task_id", slot.task_id },
|
|
|
+ { "p0", p0 }
|
|
|
+ });
|
|
|
+ llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
|
|
|
|
|
LOG_VERBOSE("prompt ingested", {
|
|
|
{"n_past", slot.n_past},
|
|
|
@@ -1786,7 +1871,13 @@ struct llama_server_context

if (has_images && !ingest_images(slot, n_batch))
{
- LOG_TEE("failed processing images\n");
+ LOG_ERROR("failed processing images", {
+ "slot_id", slot.id,
|
|
|
+ "task_id", slot.task_id,
|
|
|
+ });
+ // FIXME @phymbert: to be properly tested
+ // early returning without changing the slot state will block the slot for ever
+ // no one at the moment is checking the return value
return false;
}
@@ -1928,6 +2019,8 @@ struct llama_server_context
slot.i_batch = -1;
}
}
+
+ LOG_VERBOSE("slots updated", {});
return true;
}
@@ -2005,6 +2098,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -ctv TYPE, --cache-type-v TYPE\n");
printf(" KV cache data type for V (default: f16)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
+ printf(" --log-format log output format: json or text (default: json)\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
@@ -2458,6 +2552,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.mmproj = argv[i];
}
+ else if (arg == "--log-format")
+ {
+ if (++i >= argc)
+ {
+ invalid_param = true;
+ break;
+ }
+ if (std::strcmp(argv[i], "json") == 0)
+ {
+ server_log_json = true;
+ }
+ else if (std::strcmp(argv[i], "text") == 0)
+ {
+ server_log_json = false;
+ }
+ else
+ {
+ invalid_param = true;
+ break;
+ }
+ }
else if (arg == "--log-disable")
{
log_set_target(stdout);
@@ -2571,32 +2686,40 @@ static json format_partial_response(

static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{
- return json{
- {"tokens", tokens}};
+ return json {
+ {"tokens", tokens}
+ };
}

static json format_detokenized_response(std::string content)
{
- return json{
- {"content", content}};
+ return json {
+ {"content", content}
+ };
}

static void log_server_request(const httplib::Request &req, const httplib::Response &res)
{
+ // skip GH copilot requests when using default port
+ if (req.path == "/v1/health" || req.path == "/v1/completions")
+ {
+ return;
+ }
+
LOG_INFO("request", {
- {"remote_addr", req.remote_addr},
- {"remote_port", req.remote_port},
- {"status", res.status},
- {"method", req.method},
- {"path", req.path},
- {"params", req.params},
- });
+ {"remote_addr", req.remote_addr},
+ {"remote_port", req.remote_port},
+ {"status", res.status},
+ {"method", req.method},
+ {"path", req.path},
+ {"params", req.params},
+ });

LOG_VERBOSE("request", {
- {"request", req.body},
- {"response", res.body},
- });
+ {"request", req.body},
+ {"response", res.body},
+ });
}

struct token_translator
@@ -2873,9 +2996,6 @@ int main(int argc, char **argv)
// Set the base directory for serving static files
svr.set_base_dir(sparams.public_path);

- // to make it ctrl+clickable:
- LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-
std::unordered_map<std::string, std::string> log_data;
log_data["hostname"] = sparams.hostname;
log_data["port"] = std::to_string(sparams.port);