@@ -1558,6 +1558,7 @@ struct llama_server_context
     void process_tasks()
     {
         std::unique_lock<std::mutex> lock(mutex_tasks);
+        std::vector<task_server> deferred_tasks;
         while (!queue_tasks.empty())
         {
             task_server task = queue_tasks.front();
@@ -1568,9 +1569,8 @@ struct llama_server_context
                     llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
                     if (slot == nullptr)
                     {
-                        LOG_TEE("slot unavailable\n");
-                        // send error result
-                        send_error(task, "slot unavailable");
+                        // if no slot is available, we defer this task for processing later
+                        deferred_tasks.push_back(task);
                         break;
                     }
 
@@ -1616,6 +1616,12 @@ struct llama_server_context
             }
         }
 
+        // add all the deferred tasks back to the queue
+        for (task_server &task : deferred_tasks)
+        {
+            queue_tasks.push_back(task);
+        }
+
         // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
         std::vector<task_result> agg_results;
         auto queue_iterator = queue_multitasks.begin();
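
For readers skimming the patch, here is a minimal standalone sketch of the defer-and-requeue pattern it introduces. The `task_server` and `llama_client_slot` definitions, the `queue_tasks` container, and the always-busy `get_slot` stub below are simplified stand-ins rather than the real llama.cpp types; only the control flow is meant to mirror the change above.

```cpp
#include <deque>
#include <vector>

// simplified stand-ins for the real server types (the actual llama.cpp
// structs carry much more state)
struct task_server { int id; };
struct llama_client_slot { /* ... */ };

static std::deque<task_server> queue_tasks;

// stub that pretends every slot is busy, to exercise the deferral path
static llama_client_slot *get_slot(int /*slot_id*/) { return nullptr; }

static void process_tasks()
{
    std::vector<task_server> deferred_tasks;
    while (!queue_tasks.empty())
    {
        task_server task = queue_tasks.front();
        queue_tasks.pop_front();

        llama_client_slot *slot = get_slot(-1);
        if (slot == nullptr)
        {
            // no free slot: hold the task instead of sending an error
            deferred_tasks.push_back(task);
            continue;
        }
        // ... dispatch the task to the slot here ...
    }

    // put the deferred tasks back so the next call can retry them
    for (task_server &task : deferred_tasks)
    {
        queue_tasks.push_back(task);
    }
}
```

Collecting deferred tasks in a local vector and requeueing them only after the loop drains means a single `process_tasks()` call cannot spin on the same unschedulable task; it is simply retried on a later call, once a slot has freed up.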