@@ -759,6 +759,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple tokens).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
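
Below the hunk, a minimal usage sketch showing the new getters paired with the existing setter. It assumes an already-initialized llama_context (model loading and context creation omitted), and report_threads is a hypothetical helper name, not part of the llama.cpp API:

#include <inttypes.h>
#include <stdio.h>
#include "llama.h"

// Minimal sketch: set the thread counts, then read them back
// through the new getters. Assumes ctx was created elsewhere.
static void report_threads(struct llama_context * ctx) {
    // 4 threads for single-token generation, 8 for prompt/batch processing.
    llama_set_n_threads(ctx, 4, 8);

    printf("n_threads       = %" PRIu32 "\n", llama_n_threads(ctx));
    printf("n_threads_batch = %" PRIu32 "\n", llama_n_threads_batch(ctx));
}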