
llama : pad KV cache size (#4280)

* llama : pad KV cache size to 32

* metal : try to improve batched decoding
Georgi Gerganov 2 years ago
Parent
Commit
d7b800b8bc
2 changed files with 2 additions and 3 deletions
  1. ggml-metal.m (+1, -1)
  2. llama.cpp (+1, -2)

+ 1 - 1
ggml-metal.m

@@ -1083,7 +1083,7 @@ void ggml_metal_graph_compute(
 
                             // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                             // to the matrix-vector kernel
-                            int ne11_mm_min = 1;
+                            int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16;
 
 #if 0
                             // the numbers below are measured on M2 Ultra for 7B and 13B models
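
The changed value only feeds the dispatch heuristic: for F16 weights the batched matrix-matrix path is still preferred for any batch size above the old threshold, while for quantized weight types the matrix-vector kernel is kept until the batch grows past 16 rows. Below is a minimal C sketch of that selection logic; the function name, the enum stand-in, the kernel name strings, and the exact comparison against the threshold are illustrative assumptions, not copied from ggml-metal.m.

```c
// Stand-in for the relevant ggml type ids (illustrative, not the real enum).
typedef enum { GGML_TYPE_F16, GGML_TYPE_Q4_0 } ggml_type;

// Pick between the batched matrix-matrix kernel and the matrix-vector kernel
// based on the batch size ne11 (rows of src1) and the weight type src0t.
static const char * pick_mul_mat_kernel(ggml_type src0t, int ne11) {
    // Break-even point: below it the matrix-vector kernel is faster, above it
    // the matrix-matrix kernel wins. After this commit the threshold depends
    // on the weight type instead of being a constant 1.
    const int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16;

    if (ne11 > ne11_mm_min) {
        return "kernel_mul_mm"; // batched matrix-matrix kernel (name illustrative)
    }
    return "kernel_mul_mv";     // matrix-vector kernel (name illustrative)
}
```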

+ 1 - 2
llama.cpp

@@ -5744,8 +5744,7 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
 
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
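
For reference, ggml's GGML_PAD(x, n) rounds x up to the next multiple of n, so the patched line makes the attended KV cache view grow in steps of 32 cells instead of tracking the exact used-cell count. The sketch below reproduces that arithmetic in isolation; the context size and the cell_max sample values are illustrative, and min_i/max_i are local helpers standing in for std::min/std::max.

```c
#include <stdio.h>

// Same rounding as ggml's GGML_PAD(x, n): round x up to the next multiple of n
// (n must be a power of two).
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

static int min_i(int a, int b) { return a < b ? a : b; }
static int max_i(int a, int b) { return a > b ? a : b; }

int main(void) {
    const int n_ctx = 4096;                          // illustrative context size
    const int cell_max[] = { 1, 31, 33, 100, 4090 }; // hypothetical llama_kv_cache_cell_max() results

    for (int i = 0; i < (int)(sizeof(cell_max)/sizeof(cell_max[0])); ++i) {
        // Same expression as the patched line: never below 32, padded up to a
        // multiple of 32, and clamped to the context size.
        const int kv_n = min_i(n_ctx, max_i(32, GGML_PAD(cell_max[i], 32)));
        printf("cell_max = %4d -> kv_self.n = %4d\n", cell_max[i], kv_n);
    }
    return 0;
}
```

With the padding, kv_self.n stays constant across small changes in cell count (e.g. both 33 and 60 used cells map to a view of 64), which keeps tensor shapes stable between decode calls.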