|
@@ -2866,10 +2866,12 @@ struct server_context {
|
|
|
|
|
|
|
|
// if context shifting is disabled, make sure that we don't run out of context
|
|
// if context shifting is disabled, make sure that we don't run out of context
|
|
|
if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
|
|
if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
|
|
|
|
|
+ slot.truncated = true;
|
|
|
slot.stop = STOP_TYPE_LIMIT;
|
|
slot.stop = STOP_TYPE_LIMIT;
|
|
|
slot.has_next_token = false;
|
|
slot.has_next_token = false;
|
|
|
|
|
|
|
|
- SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
|
|
|
|
|
|
|
+ SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
|
|
|
|
|
+ slot.n_past, slot.n_prompt_tokens(), slot.n_decoded, slot.n_ctx);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// check the limits
|
|
// check the limits
|
|
@@ -2929,16 +2931,6 @@ struct server_context {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // if context shift is disabled, we stop when it reaches the context limit
|
|
|
|
|
- if (slot.n_past >= slot.n_ctx) {
|
|
|
|
|
- slot.truncated = true;
|
|
|
|
|
- slot.stop = STOP_TYPE_LIMIT;
|
|
|
|
|
- slot.has_next_token = false;
|
|
|
|
|
-
|
|
|
|
|
- SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
|
|
|
|
|
- slot.n_decoded, slot.n_prompt_tokens(), slot.n_past, slot.n_ctx);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
if (llama_vocab_is_eog(vocab, result.tok)) {
|
|
if (llama_vocab_is_eog(vocab, result.tok)) {
|
|
|
slot.stop = STOP_TYPE_EOS;
|
|
slot.stop = STOP_TYPE_EOS;
|
|
|
slot.has_next_token = false;
|
|
slot.has_next_token = false;
|
|
@@ -2946,19 +2938,6 @@ struct server_context {
|
|
|
SLT_DBG(slot, "%s", "stopped by EOS\n");
|
|
SLT_DBG(slot, "%s", "stopped by EOS\n");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- const auto n_ctx_train = llama_model_n_ctx_train(model);
|
|
|
|
|
-
|
|
|
|
|
- if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= n_ctx_train) {
|
|
|
|
|
- slot.truncated = true;
|
|
|
|
|
- slot.stop = STOP_TYPE_LIMIT;
|
|
|
|
|
- slot.has_next_token = false; // stop prediction
|
|
|
|
|
-
|
|
|
|
|
- SLT_WRN(slot,
|
|
|
|
|
- "n_predict (%d) is set for infinite generation. "
|
|
|
|
|
- "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
|
|
|
|
|
- slot.task->params.n_predict, n_ctx_train);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
|
|
SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
|
|
|
|
|
|
|
|
return slot.has_next_token; // continue
|
|
return slot.has_next_token; // continue
|