@@ -759,6 +759,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple tokens).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
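
Below the hunk, a minimal usage sketch showing the new getters paired with the existing setter. It assumes an already-initialized llama_context (model loading and context creation omitted), and report_threads is a hypothetical helper name, not part of the llama.cpp API:

#include <inttypes.h>
#include <stdio.h>
#include "llama.h"

// Minimal sketch: set the thread counts, then read them back
// through the new getters. Assumes ctx was created elsewhere.
static void report_threads(struct llama_context * ctx) {
    // 4 threads for single-token generation, 8 for prompt/batch processing.
    llama_set_n_threads(ctx, 4, 8);

    printf("n_threads       = %" PRIu32 "\n", llama_n_threads(ctx));
    printf("n_threads_batch = %" PRIu32 "\n", llama_n_threads_batch(ctx));
}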