@@ -2016,11 +2016,6 @@ struct server_context {
                 params_base.n_cache_reuse = 0;
                 SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
             }
-
-            if (!params_base.speculative.model.path.empty()) {
-                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
-                return false;
-            }
         }

         return true;
@@ -3215,8 +3210,14 @@ struct server_context {

                     if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
                         const auto pos_min = llama_kv_self_seq_pos_min(ctx, slot.id);
-                        if (pos_min > 0) {
-                            SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min);
+                        if (pos_min == -1) {
+                            SLT_ERR(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min);
+                            GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
+                        }
+
+                        const auto n_swa = llama_model_n_swa(model);
+                        if (pos_min > slot.n_past - n_swa) {
+                            SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
                             SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
                                     "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
                             llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
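
For illustration, a minimal self-contained sketch of the reuse condition the second hunk introduces (an assumption-laden example, not the upstream implementation; the helper name and the literal values are made up): with sliding-window attention (SWA) of size n_swa, the next token at position n_past attends back to roughly positions n_past - n_swa .. n_past - 1, so the cached prefix can only be reused if the oldest cached position pos_min still covers that window.

#include <cstdio>

// Hypothetical helper mirroring the check above: returns true when the cached
// prefix can be reused, false when the server must re-process the full prompt.
static bool can_reuse_swa_cache(int n_past, int pos_min, int n_swa) {
    // pos_min == -1 means the sequence has no cached cells at all; combined
    // with n_past > 0 this is the inconsistent state the hunk aborts on.
    if (n_past > 0 && pos_min == -1) {
        return false;
    }
    // Negation of the `pos_min > slot.n_past - n_swa` condition in the diff.
    return pos_min <= n_past - n_swa;
}

int main() {
    // 100 prompt tokens matched, window of 32, oldest cached position 60:
    // 60 <= 100 - 32, so reuse is fine.
    std::printf("%d\n", can_reuse_swa_cache(100, 60, 32)); // prints 1
    // Oldest cached position 80: part of the attention window has been
    // pruned, so the full prompt is re-processed (the SLT_WRN branch above).
    std::printf("%d\n", can_reuse_swa_cache(100, 80, 32)); // prints 0
    return 0;
}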