|
|
@@ -116,8 +116,6 @@ llama_context::llama_context(
|
|
|
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
|
|
}
|
|
|
|
|
|
- logits_all = params.logits_all;
|
|
|
-
|
|
|
if (!hparams.vocab_only) {
|
|
|
// GPU backends
|
|
|
for (auto * dev : model.devices) {
|
|
|
@@ -890,7 +888,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
|
|
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
|
|
n_outputs_all += batch.logits[i] != 0;
|
|
|
}
|
|
|
- } else if (logits_all || embd_pooled) {
|
|
|
+ } else if (embd_pooled) {
|
|
|
n_outputs_all = n_tokens_all;
|
|
|
} else {
|
|
|
// keep last output only
|
|
|
@@ -1853,13 +1851,12 @@ llama_context_params llama_context_default_params() {
|
|
|
/*.cb_eval_user_data =*/ nullptr,
|
|
|
/*.type_k =*/ GGML_TYPE_F16,
|
|
|
/*.type_v =*/ GGML_TYPE_F16,
|
|
|
- /*.logits_all =*/ false,
|
|
|
+ /*.abort_callback =*/ nullptr,
|
|
|
+ /*.abort_callback_data =*/ nullptr,
|
|
|
/*.embeddings =*/ false,
|
|
|
/*.offload_kqv =*/ true,
|
|
|
/*.flash_attn =*/ false,
|
|
|
/*.no_perf =*/ true,
|
|
|
- /*.abort_callback =*/ nullptr,
|
|
|
- /*.abort_callback_data =*/ nullptr,
|
|
|
};
|
|
|
|
|
|
return result;
|