@@ -285,6 +285,9 @@ llama_context::llama_context(
         const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
+        // avoid reserving graphs with zero outputs
+        n_outputs = 1;
+
         LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
 
         // resolve automatic Flash Attention use
@@ -1368,6 +1371,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
 ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
 
+    GGML_ASSERT(n_outputs >= 1);
     if (n_tokens % n_seqs != 0) {
         n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
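
For context, a minimal standalone sketch of the pattern the patch enforces: the reservation site pins the worst-case output count to 1 (per the added comment, reserving a graph with zero outputs is to be avoided), and the reserve path asserts that it never receives zero. The helper name reserve_worst_case_graph and the use of plain assert() instead of GGML_ASSERT are illustrative assumptions, not llama.cpp API:

    // Illustrative sketch only -- mirrors the patch's guard, not llama.cpp internals.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for llama_context::graph_reserve(): reject zero outputs up front.
    static void reserve_worst_case_graph(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs) {
        assert(n_outputs >= 1 && "reserving a graph with zero outputs is not supported");
        if (n_tokens % n_seqs != 0) {
            n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
        }
        std::printf("reserve: n_tokens = %u, n_seqs = %u, n_outputs = %u\n", n_tokens, n_seqs, n_outputs);
    }

    int main() {
        uint32_t n_outputs = 0; // no logits requested yet at construction time
        n_outputs = 1;          // same idea as the patch: avoid reserving graphs with zero outputs
        reserve_worst_case_graph(/*n_tokens=*/512, /*n_seqs=*/4, n_outputs);
        return 0;
    }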