The return type of 'output_reserve' is already 'uint32_t', so the explicit cast is unnecessary and can be removed.
@@ -181,7 +181,7 @@ llama_context::llama_context(
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
- if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+ if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}