@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -1229,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
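For reference, the environment-variable gate in the first hunk can be tried in isolation. The following is a minimal standalone sketch, not the llama.cpp code itself, just the same getenv/atoi pattern with the same default of false:

#include <cstdio>
#include <cstdlib>

int main() {
    // Same parsing as in the diff: unset or "0" leaves the flag off,
    // any other non-zero integer value (e.g. LLAMA_SET_ROWS=1) turns it on.
    const char * LLAMA_SET_ROWS = std::getenv("LLAMA_SET_ROWS");
    const bool supports_set_rows = LLAMA_SET_ROWS ? (std::atoi(LLAMA_SET_ROWS) != 0) : false;

    std::printf("supports_set_rows = %s\n", supports_set_rows ? "true" : "false");
    return 0;
}

With that default of false, the new ggml_backend_sched_reset() calls added to encode() and decode() run before the backend synchronization, so the CPU-side work of the reset can overlap with device computation that is still in flight.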