
graph : avoid huge warm-up graphs for MoE models (#14753)

* graph : avoid huge warm-up graphs for MoE models

ggml-ci

* cont : bump max nodes to 8x model tensors
Georgi Gerganov, 6 months ago
Commit d498af3d5a

2 changed files with 6 additions and 3 deletions:

  1. src/llama-context.cpp (+1, −1)
  2. src/llama-graph.cpp (+5, −2)
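
For context: when a llama_context is created, llama.cpp first builds a worst-case "warm-up" graph to reserve compute buffers before any real tokens are processed. In build_moe_ffn, the expert-aggregation loop emits graph nodes once per aggregated expert, so a warm-up pass that iterates over every expert inflates the node count dramatically on models with many experts. A back-of-the-envelope sketch of the effect (the layer and expert counts below are hypothetical, chosen only for illustration):

#include <cstdio>

int main() {
    const int n_layer       = 48;   // hypothetical number of MoE layers
    const int n_expert      = 128;  // total experts (worst-case warm-up loop bound)
    const int n_expert_used = 8;    // hparams.n_expert_used (top-k per token)

    // each aggregated expert costs roughly one view node plus one add node
    printf("aggregation nodes, loop over n_expert      : %d\n", 2*n_expert*n_layer);      // 12288
    printf("aggregation nodes, loop over n_expert_used : %d\n", 2*n_expert_used*n_layer); //   768
    return 0;
}

The two hunks below address this from both sides: a node budget that scales with the model instead of a fixed 65536-node floor, and an aggregation loop bounded by the model's configured top-k.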

+ 1 - 1
src/llama-context.cpp

@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
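
graph_max_nodes() is the budget used when sizing the metadata buffers for compute graphs, so lowering the floor from 65536 to 1024 directly shrinks what small models reserve, while the 8x (up from 5x) multiplier keeps headroom for models with many tensors. A minimal sketch of how such a budget typically turns into a ggml allocation, using ggml's public overhead helpers (the alloc_meta_graph function itself is illustrative, not the actual reservation code in llama-context.cpp):

#include "ggml.h"

// illustrative: allocate a graph with room for max_nodes nodes; the
// metadata cost grows linearly with the budget, which is why a hard
// 65536-node floor over-allocates for small models
static ggml_cgraph * alloc_meta_graph(size_t max_nodes) {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*max_nodes +
                          ggml_graph_overhead_custom(max_nodes, false),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true, // metadata only, no tensor data
    };
    ggml_context * ctx = ggml_init(params);
    return ggml_new_graph_custom(ctx, max_nodes, /*grads =*/ false);
}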

+ 5 - 2
src/llama-graph.cpp

@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
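
The added comment is the heart of the fix: the loop bound switches from the graph-level n_expert_used, which (judging by that comment) can be inflated during warm-up, presumably to the full expert count so the routing reserves worst-case buffers, to hparams.n_expert_used, the model's fixed top-k. The routing still budgets for the worst case, but the add-chain that sums expert outputs never needs more than top-k terms at inference time. A condensed, stand-alone sketch of the pattern (aggregate_experts is a hypothetical helper, not a function in llama.cpp):

#include "ggml.h"

// each expert slice contributes one view node and, after the first,
// one add node, so the add-chain length tracks the loop bound n_used
static ggml_tensor * aggregate_experts(
        ggml_context * ctx, ggml_tensor * experts,
        int64_t n_embd, int64_t n_tokens, uint32_t n_used) {
    ggml_tensor * out = nullptr;
    for (uint32_t i = 0; i < n_used; ++i) {
        // expert i, viewed as an [n_embd, n_tokens] slice
        ggml_tensor * e = ggml_view_2d(ctx, experts, n_embd, n_tokens,
                experts->nb[2], i*experts->nb[1]);
        out = out == nullptr ? e : ggml_add(ctx, out, e);
    }
    // a single used expert would otherwise return a non-contiguous view
    return n_used == 1 ? ggml_cont(ctx, out) : out;
}

Passing hparams.n_expert_used as n_used caps the chain at the model's real top-k regardless of how the warm-up pass widens the routing.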