@@ -47,6 +47,7 @@ struct server_params
};

bool server_verbose = false;
+bool server_log_json = true;

static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
@@ -302,12 +303,43 @@ struct llama_client_slot
}

void print_timings() const {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
- LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
- LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+ char buffer[512];
+ double t_token = t_prompt_processing / num_prompt_tokens_processed;
+ double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+ sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+ t_prompt_processing, num_prompt_tokens_processed,
+ t_token, n_tokens_second);
+ LOG_INFO(buffer, {
+ {"slot_id", id},
+ {"task_id", task_id},
+ {"t_prompt_processing", t_prompt_processing},
+ {"num_prompt_tokens_processed", num_prompt_tokens_processed},
+ {"t_token", t_token},
+ {"n_tokens_second", n_tokens_second},
+ });
+
+ t_token = t_token_generation / n_decoded;
+ n_tokens_second = 1e3 / t_token_generation * n_decoded;
+ sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+ t_token_generation, n_decoded,
+ t_token, n_tokens_second);
+ LOG_INFO(buffer, {
+ {"slot_id", id},
+ {"task_id", task_id},
+ {"t_token_generation", t_token_generation},
+ {"n_decoded", n_decoded},
+ {"t_token", t_token},
+ {"n_tokens_second", n_tokens_second},
+ });
+
+ sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+ LOG_INFO(buffer, {
+ {"slot_id", id},
+ {"task_id", task_id},
+ {"t_prompt_processing", t_prompt_processing},
+ {"t_token_generation", t_token_generation},
+ {"t_total", t_prompt_processing + t_token_generation},
+ });
}
};
@@ -399,7 +431,7 @@ struct llama_server_context
params = params_;
if (!params.mmproj.empty()) {
multimodal = true;
- LOG_TEE("Multi Modal Mode Enabled");
+ LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -452,7 +484,7 @@ struct llama_server_context

const int32_t n_ctx_slot = n_ctx / params.n_parallel;

- LOG_TEE("Available slots:\n");
+ LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
for (int i = 0; i < params.n_parallel; i++)
{
llama_client_slot slot;
@@ -461,7 +493,10 @@ struct llama_server_context
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;

- LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+ LOG_INFO("new slot", {
+ {"slot_id", slot.id},
+ {"n_ctx_slot", slot.n_ctx}
+ });

const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
@@ -471,7 +506,12 @@ struct llama_server_context
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
- LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+
+ LOG_INFO("slot self-extend", {
+ {"slot_id", slot.id},
+ {"ga_n", ga_n},
+ {"ga_w", ga_w}
+ });
}

slot.ga_i = 0;
@@ -765,10 +805,16 @@ struct llama_server_context
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
- LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+ LOG_ERROR("failed to load image", {
+ {"slot_id", slot->id},
+ {"img_sl_id", img_sl.id}
+ });
return false;
}
- LOG_TEE("slot %i - loaded image\n", slot->id);
+ LOG_VERBOSE("image loaded", {
+ {"slot_id", slot->id},
+ {"img_sl_id", img_sl.id}
+ });
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
@@ -828,7 +874,10 @@ struct llama_server_context

all_slots_are_idle = false;

- LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+ LOG_INFO("slot is processing task", {
+ {"slot_id", slot->id},
+ {"task_id", slot->task_id},
+ });

return true;
}
@@ -1391,7 +1440,7 @@ struct llama_server_context
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
- LOG_VERBOSE("no slot is available", {});
+ LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
queue_tasks.defer(task);
break;
}
@@ -1467,7 +1516,17 @@ struct llama_server_context
}
slots_data.push_back(slot_data);
}
- LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
+ LOG_INFO("slot data", {
+ {"task_id", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots}
+ });
+ LOG_VERBOSE("slot data", {
+ {"task_id", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots},
+ {"slots", slots_data}
+ });
task_result res;
res.id = task.id;
res.multitask_id = task.multitask_id;
@@ -1519,7 +1578,7 @@ struct llama_server_context
bool update_slots() {
if (system_need_update)
{
- LOG_TEE("updating system prompt\n");
+ LOG_INFO("updating system prompt", {});
update_system_prompt();
}
@@ -1529,12 +1588,13 @@ struct llama_server_context
{
if (system_prompt.empty() && clean_kv_cache)
{
- LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+ LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
kv_cache_clear();
}
return true;
}

+ LOG_VERBOSE("posting NEXT_RESPONSE", {});
task_server task;
task.type = TASK_TYPE_NEXT_RESPONSE;
task.target_id = -1;
@@ -1548,10 +1608,20 @@ struct llama_server_context
{
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
- const int n_left = system_tokens.size() + slot.n_past - n_keep;
+ const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = n_left / 2;

- LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
+ LOG_INFO("slot context shift", {
+ {"slot_id", slot.id},
+ {"task_id", slot.task_id},
+ {"n_keep", n_keep},
+ {"n_left", n_left},
+ {"n_discard", n_discard},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()}
+ });
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
@@ -1565,17 +1635,12 @@ struct llama_server_context
slot.n_past -= n_discard;

slot.truncated = true;
-
- LOG_VERBOSE("context shift", {
- { "n_ctx", n_ctx },
- { "n_keep", n_keep },
- { "n_left", n_left },
- });
}
}
}

// decode any currently ongoing sequences
+ LOG_VERBOSE("decoding ongoing sequences", {});
for (auto & slot : slots)
{
// release the slot
@@ -1585,7 +1650,15 @@ struct llama_server_context
slot.command = NONE;
slot.t_last_used = ggml_time_us();

- LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+ LOG_INFO("slot released", {
+ {"slot_id", slot.id},
+ {"task_id", slot.task_id},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()},
+ {"truncated", slot.truncated}
+ });
queue_tasks.notify_slot_changed();

continue;
}
slot.ga_i = ga_i;
|
|
|
}
|
|
|
|
|
|
- LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
|
|
+ LOG_INFO("slot progression", {
|
|
|
+ { "slot_id", slot.id },
|
|
|
+ { "task_id", slot.task_id },
|
|
|
+ { "n_past", slot.n_past },
|
|
|
+ { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
|
|
|
+ });
|
|
|
}
|
|
|
|
|
|
slot.cache_tokens = prompt_tokens;
|
|
|
@@ -1741,7 +1819,10 @@ struct llama_server_context
|
|
|
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
|
|
{
|
|
|
// we have to evaluate at least 1 token to generate logits.
|
|
|
- LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
|
|
|
+ LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
|
|
+ { "slot_id", slot.id },
|
|
|
+ { "task_id", slot.task_id }
|
|
|
+ });
|
|
|
slot.n_past--;
|
|
|
if (slot.ga_i > 0)
|
|
|
{
|
|
|
@@ -1749,9 +1830,13 @@ struct llama_server_context
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
|
|
-
|
|
|
- llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
|
|
+ int p0 = (int) system_tokens.size() + slot.n_past;
|
|
|
+ LOG_INFO("kv cache rm [p0, end)", {
|
|
|
+ { "slot_id", slot.id },
|
|
|
+ { "task_id", slot.task_id },
|
|
|
+ { "p0", p0 }
|
|
|
+ });
|
|
|
+ llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
|
|
|
|
|
LOG_VERBOSE("prompt ingested", {
|
|
|
{"n_past", slot.n_past},
|
|
|
@@ -1786,7 +1871,13 @@ struct llama_server_context

if (has_images && !ingest_images(slot, n_batch))
{
- LOG_TEE("failed processing images\n");
+ LOG_ERROR("failed processing images", {
+ "slot_id", slot.id,
|
|
|
+ "task_id", slot.task_id,
|
|
|
+ });
+ // FIXME @phymbert: to be properly tested
+ // early returning without changing the slot state will block the slot for ever
+ // no one at the moment is checking the return value
return false;
}
@@ -1928,6 +2019,8 @@ struct llama_server_context
slot.i_batch = -1;
}
}
+
+ LOG_VERBOSE("slots updated", {});
return true;
}
@@ -2005,6 +2098,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -ctv TYPE, --cache-type-v TYPE\n");
printf(" KV cache data type for V (default: f16)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
+ printf(" --log-format log output format: json or text (default: json)\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
@@ -2458,6 +2552,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.mmproj = argv[i];
}
+ else if (arg == "--log-format")
+ {
+ if (++i >= argc)
+ {
+ invalid_param = true;
+ break;
+ }
+ if (std::strcmp(argv[i], "json") == 0)
+ {
+ server_log_json = true;
+ }
+ else if (std::strcmp(argv[i], "text") == 0)
+ {
+ server_log_json = false;
+ }
+ else
+ {
+ invalid_param = true;
+ break;
+ }
+ }
else if (arg == "--log-disable")
{
log_set_target(stdout);
@@ -2571,32 +2686,40 @@ static json format_partial_response(

static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{
- return json{
- {"tokens", tokens}};
+ return json {
+ {"tokens", tokens}
+ };
}

static json format_detokenized_response(std::string content)
{
- return json{
- {"content", content}};
+ return json {
+ {"content", content}
+ };
}

static void log_server_request(const httplib::Request &req, const httplib::Response &res)
{
+ // skip GH copilot requests when using default port
+ if (req.path == "/v1/health" || req.path == "/v1/completions")
+ {
+ return;
+ }
+
LOG_INFO("request", {
- {"remote_addr", req.remote_addr},
- {"remote_port", req.remote_port},
- {"status", res.status},
- {"method", req.method},
- {"path", req.path},
- {"params", req.params},
- });
+ {"remote_addr", req.remote_addr},
+ {"remote_port", req.remote_port},
+ {"status", res.status},
+ {"method", req.method},
+ {"path", req.path},
+ {"params", req.params},
+ });

LOG_VERBOSE("request", {
- {"request", req.body},
- {"response", res.body},
- });
+ {"request", req.body},
+ {"response", res.body},
+ });
}

struct token_translator
@@ -2873,9 +2996,6 @@ int main(int argc, char **argv)
// Set the base directory for serving static files
svr.set_base_dir(sparams.public_path);

- // to make it ctrl+clickable:
- LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-
std::unordered_map<std::string, std::string> log_data;
log_data["hostname"] = sparams.hostname;
log_data["port"] = std::to_string(sparams.port);