|
@@ -158,7 +158,7 @@ struct server_slot {
|
|
|
double t_prompt_processing; // ms
|
|
double t_prompt_processing; // ms
|
|
|
double t_token_generation; // ms
|
|
double t_token_generation; // ms
|
|
|
|
|
|
|
|
- std::function<void(int)> callback_on_release;
|
|
|
|
|
|
|
+ std::function<void(int /* slot_id */)> callback_on_release;
|
|
|
|
|
|
|
|
// Speculative decoding stats
|
|
// Speculative decoding stats
|
|
|
int32_t n_draft_total = 0; // Total draft tokens generated
|
|
int32_t n_draft_total = 0; // Total draft tokens generated
|
|
@@ -298,17 +298,6 @@ struct server_slot {
|
|
|
return n_draft_max;
|
|
return n_draft_max;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // note: a slot can also be either a parent or a child
|
|
|
|
|
- // TODO: move to server_task
|
|
|
|
|
- bool is_parent() const {
|
|
|
|
|
- return task->n_children > 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // TODO: move to server_task
|
|
|
|
|
- bool is_child() const {
|
|
|
|
|
- return task->id_parent >= 0;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
void release() {
|
|
void release() {
|
|
|
if (is_processing()) {
|
|
if (is_processing()) {
|
|
|
GGML_ASSERT(task);
|
|
GGML_ASSERT(task);
|
|
@@ -321,7 +310,7 @@ struct server_slot {
|
|
|
state = SLOT_STATE_IDLE;
|
|
state = SLOT_STATE_IDLE;
|
|
|
|
|
|
|
|
// do not keep context of the child slots - the parent's context is enough
|
|
// do not keep context of the child slots - the parent's context is enough
|
|
|
- if (is_child()) {
|
|
|
|
|
|
|
+ if (task->is_child()) {
|
|
|
prompt_clear(false);
|
|
prompt_clear(false);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -805,8 +794,8 @@ private:
|
|
|
|
|
|
|
|
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
|
|
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
|
|
|
|
|
|
|
|
- slot.callback_on_release = [this](int) {
|
|
|
|
|
- queue_tasks.pop_deferred_task();
|
|
|
|
|
|
|
+ slot.callback_on_release = [this](int slot_id) {
|
|
|
|
|
+ queue_tasks.pop_deferred_task(slot_id);
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
slot.reset();
|
|
slot.reset();
|
|
@@ -920,9 +909,9 @@ private:
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- server_slot * get_slot_by_id(int id) {
|
|
|
|
|
|
|
+ server_slot * get_slot_by_id(int id_slot) {
|
|
|
for (server_slot & slot : slots) {
|
|
for (server_slot & slot : slots) {
|
|
|
- if (slot.id == id) {
|
|
|
|
|
|
|
+ if (slot.id == id_slot) {
|
|
|
return &slot;
|
|
return &slot;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -1196,12 +1185,11 @@ private:
|
|
|
|
|
|
|
|
slot.task = std::make_unique<const server_task>(std::move(task));
|
|
slot.task = std::make_unique<const server_task>(std::move(task));
|
|
|
|
|
|
|
|
- slot.state = slot.is_child()
|
|
|
|
|
|
|
+ slot.state = slot.task->is_child()
|
|
|
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
|
|
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
|
|
|
: SLOT_STATE_STARTED;
|
|
: SLOT_STATE_STARTED;
|
|
|
|
|
|
|
|
- SLT_INF(slot, "processing task, is_child = %d\n", slot.is_child());
|
|
|
|
|
-
|
|
|
|
|
|
|
+ SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -1596,9 +1584,7 @@ private:
|
|
|
|
|
|
|
|
// tokenize the input if it's set by CLI, return false on error
|
|
// tokenize the input if it's set by CLI, return false on error
|
|
|
bool tokenize_cli_input(server_task & task) {
|
|
bool tokenize_cli_input(server_task & task) {
|
|
|
- if (task.cli_input == nullptr) {
|
|
|
|
|
- return true; // nothing to do
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ GGML_ASSERT(task.cli_input != nullptr);
|
|
|
try {
|
|
try {
|
|
|
auto & opt = oai_parser_opt;
|
|
auto & opt = oai_parser_opt;
|
|
|
common_chat_templates_inputs inputs;
|
|
common_chat_templates_inputs inputs;
|
|
@@ -1632,6 +1618,64 @@ private:
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ std::vector<server_slot *> get_free_slots(size_t n_slots_needed, int exclude_id_slot) {
|
|
|
|
|
+ std::vector<server_slot *> free_slots;
|
|
|
|
|
+ for (auto & slot : slots) {
|
|
|
|
|
+ if (!slot.is_processing() && slot.id != exclude_id_slot) {
|
|
|
|
|
+ free_slots.push_back(&slot);
|
|
|
|
|
+ }
|
|
|
|
|
+ if (free_slots.size() >= n_slots_needed) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return free_slots;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // launch multiple slots for parent + child tasks
|
|
|
|
|
+ bool launch_slots_with_parent_task(server_slot & parent_slot, std::vector<server_slot *> & child_slots, server_task && parent_task) {
|
|
|
|
|
+ GGML_ASSERT(!parent_slot.is_processing());
|
|
|
|
|
+ GGML_ASSERT(parent_task.is_parent());
|
|
|
|
|
+ GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
|
|
|
|
|
+
|
|
|
|
|
+ int id_parent = parent_task.id;
|
|
|
|
|
+
|
|
|
|
|
+ SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());
|
|
|
|
|
+
|
|
|
|
|
+ // to be called in case of failure to release all launched slots
|
|
|
|
|
+ auto release_slots = [this, id_parent]() {
|
|
|
|
|
+ for (auto & slot : slots) {
|
|
|
|
|
+ if (slot.is_processing() && (
|
|
|
|
|
+ slot.task->id == id_parent ||
|
|
|
|
|
+ slot.task->id_parent == id_parent
|
|
|
|
|
+ )) {
|
|
|
|
|
+ slot.release();
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ };
|
|
|
|
|
+
|
|
|
|
|
+ // launch all child tasks first
|
|
|
|
|
+ size_t idx = 0;
|
|
|
|
|
+ GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
|
|
|
|
|
+ for (auto * slot : child_slots) {
|
|
|
|
|
+ int id_child = parent_task.child_tasks[idx].id;
|
|
|
|
|
+ if (!launch_slot_with_task(*slot, std::move(parent_task.child_tasks[idx]))) {
|
|
|
|
|
+ SRV_ERR("failed to launch slot with child task, id_task = %d\n", id_child);
|
|
|
|
|
+ release_slots();
|
|
|
|
|
+ return false;
|
|
|
|
|
+ }
|
|
|
|
|
+ idx++;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // finally, launch the parent task
|
|
|
|
|
+ if (!launch_slot_with_task(parent_slot, std::move(parent_task))) {
|
|
|
|
|
+ SRV_ERR("failed to launch slot with task, id_task = %d\n", id_parent);
|
|
|
|
|
+ release_slots();
|
|
|
|
|
+ return false;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return true;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
void process_single_task(server_task && task) {
|
|
void process_single_task(server_task && task) {
|
|
|
switch (task.type) {
|
|
switch (task.type) {
|
|
|
case SERVER_TASK_TYPE_COMPLETION:
|
|
case SERVER_TASK_TYPE_COMPLETION:
|
|
@@ -1639,31 +1683,55 @@ private:
|
|
|
case SERVER_TASK_TYPE_EMBEDDING:
|
|
case SERVER_TASK_TYPE_EMBEDDING:
|
|
|
case SERVER_TASK_TYPE_RERANK:
|
|
case SERVER_TASK_TYPE_RERANK:
|
|
|
{
|
|
{
|
|
|
- if (!tokenize_cli_input(task)) {
|
|
|
|
|
- break;
|
|
|
|
|
|
|
+ // special case: if input is provided via CLI, tokenize it first
|
|
|
|
|
+ // otherwise, no need to tokenize as it's already done inside the HTTP thread
|
|
|
|
|
+ if (task.cli_input != nullptr) {
|
|
|
|
|
+ if (!tokenize_cli_input(task)) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
const int id_slot = task.id_slot;
|
|
const int id_slot = task.id_slot;
|
|
|
|
|
+ const int id_task = task.id;
|
|
|
|
|
+
|
|
|
|
|
+ server_slot * slot = id_slot != -1
|
|
|
|
|
+ ? get_slot_by_id(id_slot)
|
|
|
|
|
+ : get_available_slot(task);
|
|
|
|
|
|
|
|
- server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
|
|
|
|
|
|
+ //
|
|
|
|
|
+ // slot scheduling logic
|
|
|
|
|
+ //
|
|
|
|
|
|
|
|
if (slot == nullptr) {
|
|
if (slot == nullptr) {
|
|
|
// if no slot is available, we defer this task for processing later
|
|
// if no slot is available, we defer this task for processing later
|
|
|
- SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
|
|
|
|
|
|
|
+ SRV_DBG("no slot is available, defer task, id_task = %d\n", id_task);
|
|
|
queue_tasks.defer(std::move(task));
|
|
queue_tasks.defer(std::move(task));
|
|
|
break;
|
|
break;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
if (slot->is_processing()) {
|
|
if (slot->is_processing()) {
|
|
|
// if requested slot is unavailable, we defer this task for processing later
|
|
// if requested slot is unavailable, we defer this task for processing later
|
|
|
- SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
|
|
|
|
|
|
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", id_task);
|
|
|
queue_tasks.defer(std::move(task));
|
|
queue_tasks.defer(std::move(task));
|
|
|
break;
|
|
break;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- if (!launch_slot_with_task(*slot, std::move(task))) {
|
|
|
|
|
- SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
|
|
|
|
|
- break;
|
|
|
|
|
|
|
+ if (task.is_parent()) {
|
|
|
|
|
+ // try getting free slots for all child tasks
|
|
|
|
|
+ size_t n_child_tasks = task.child_tasks.size();
|
|
|
|
|
+ std::vector<server_slot *> child_slots = get_free_slots(n_child_tasks, slot->id);
|
|
|
|
|
+ if (child_slots.size() < n_child_tasks) {
|
|
|
|
|
+ SRV_DBG("not enough free slots for child tasks, n_free = %zu, n_children = %zu, defer task, id_task = %d\n", child_slots.size(), n_child_tasks, id_task);
|
|
|
|
|
+ queue_tasks.defer(std::move(task));
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ if (!launch_slots_with_parent_task(*slot, child_slots, std::move(task))) {
|
|
|
|
|
+ SRV_ERR("failed to launch slot with parent task, id_task = %d\n", id_task);
|
|
|
|
|
+ break; // drop the task
|
|
|
|
|
+ }
|
|
|
|
|
+ } else if (!launch_slot_with_task(*slot, std::move(task))) {
|
|
|
|
|
+ SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
|
|
|
|
|
+ break; // drop the task
|
|
|
}
|
|
}
|
|
|
} break;
|
|
} break;
|
|
|
case SERVER_TASK_TYPE_CANCEL:
|
|
case SERVER_TASK_TYPE_CANCEL:
|
|
@@ -1932,7 +2000,7 @@ private:
|
|
|
GGML_ABORT("not supported by multimodal");
|
|
GGML_ABORT("not supported by multimodal");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- if (slot.is_parent() || slot.is_child()) {
|
|
|
|
|
|
|
+ if (slot.task->is_parent() || slot.task->is_child()) {
|
|
|
send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
|
|
send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
|
|
|
slot.release();
|
|
slot.release();
|
|
|
continue;
|
|
continue;
|
|
@@ -2079,21 +2147,6 @@ private:
|
|
|
|
|
|
|
|
// this slot still has a prompt to be processed
|
|
// this slot still has a prompt to be processed
|
|
|
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
|
|
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
|
|
|
- // wait for all children to be launched
|
|
|
|
|
- if (slot.is_parent()) {
|
|
|
|
|
- int n_launched = 0;
|
|
|
|
|
- for (auto & other : slots) {
|
|
|
|
|
- if (other.is_processing() && other.is_child() && other.task->id_parent == slot.task->id) {
|
|
|
|
|
- ++n_launched;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- if (n_launched < slot.task->n_children) {
|
|
|
|
|
- SLT_DBG(slot, "waiting for children to be launched, n_children = %d, n_launched = %d\n", slot.task->n_children, n_launched);
|
|
|
|
|
- continue;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
const auto & input_tokens = slot.task->tokens;
|
|
const auto & input_tokens = slot.task->tokens;
|
|
|
|
|
|
|
|
// TODO: maybe move branch to outside of this loop in the future
|
|
// TODO: maybe move branch to outside of this loop in the future
|
|
@@ -2647,9 +2700,7 @@ private:
|
|
|
|
|
|
|
|
// handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
|
|
// handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
|
|
|
for (auto & slot : slots) {
|
|
for (auto & slot : slots) {
|
|
|
- if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
|
|
|
|
|
- SLT_INF(slot, "parent task prompt done, n_children = %d\n", slot.task->n_children);
|
|
|
|
|
-
|
|
|
|
|
|
|
+ if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) {
|
|
|
std::vector<server_slot *> children;
|
|
std::vector<server_slot *> children;
|
|
|
for (auto & other : slots) {
|
|
for (auto & other : slots) {
|
|
|
if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
|
|
if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
|
|
@@ -2657,17 +2708,15 @@ private:
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // we can only proceed if all child slots are having the correct tasks
|
|
|
|
|
- if (slot.task->n_children == (int) children.size()) {
|
|
|
|
|
- // copy state to the child slots
|
|
|
|
|
- for (auto & child : children) {
|
|
|
|
|
- SLT_INF(slot, " - copying state to child %d\n", child->id);
|
|
|
|
|
|
|
+ // all children slots should already launched by launch_slots_with_parent_task()
|
|
|
|
|
+ // copy state to the child slots
|
|
|
|
|
+ for (auto & child : children) {
|
|
|
|
|
+ SLT_INF(slot, " - copying state to child %d\n", child->id);
|
|
|
|
|
|
|
|
- GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
|
|
|
|
|
|
|
+ GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
|
|
|
|
|
|
|
|
- slot.copy_state_to(*child);
|
|
|
|
|
- child->state = SLOT_STATE_DONE_PROMPT;
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ slot.copy_state_to(*child);
|
|
|
|
|
+ child->state = SLOT_STATE_DONE_PROMPT;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -2943,7 +2992,9 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
|
|
// Everything else, including multimodal completions.
|
|
// Everything else, including multimodal completions.
|
|
|
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
|
|
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
|
|
|
}
|
|
}
|
|
|
- tasks.reserve(inputs.size());
|
|
|
|
|
|
|
+
|
|
|
|
|
+ // tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
|
|
|
|
|
+
|
|
|
for (size_t i = 0; i < inputs.size(); i++) {
|
|
for (size_t i = 0; i < inputs.size(); i++) {
|
|
|
server_task task = server_task(type);
|
|
server_task task = server_task(type);
|
|
|
|
|
|
|
@@ -2964,23 +3015,13 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
|
|
|
|
|
|
|
// prepare child tasks
|
|
// prepare child tasks
|
|
|
if (task.params.n_cmpl > 1) {
|
|
if (task.params.n_cmpl > 1) {
|
|
|
- task.n_children = task.params.n_cmpl - 1;
|
|
|
|
|
-
|
|
|
|
|
- for (int j = 0; j < task.n_children; j++) {
|
|
|
|
|
- server_task child = task.create_child(task.id, rd.get_new_id());
|
|
|
|
|
-
|
|
|
|
|
- // use different sampling seed for each child
|
|
|
|
|
- // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
|
|
|
|
|
- if (child.params.sampling.seed != LLAMA_DEFAULT_SEED) {
|
|
|
|
|
- child.params.sampling.seed += j + 1;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- tasks.push_back(std::move(child));
|
|
|
|
|
|
|
+ int n_children = task.params.n_cmpl - 1;
|
|
|
|
|
+ for (int j = 0; j < n_children; j++) {
|
|
|
|
|
+ task.add_child(task.id, rd.get_new_id());
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // note: the parent task always launches first
|
|
|
|
|
- tasks.insert(tasks.begin(), std::move(task));
|
|
|
|
|
|
|
+ tasks.push_back(std::move(task));
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
rd.post_tasks(std::move(tasks));
|
|
rd.post_tasks(std::move(tasks));
|