
llama-graph: avoid expand_forward for fusion (#17633)

Aman Gupta, 1 month ago
Parent
Commit 6eea666912
2 changed files with 1 addition and 7 deletions
  1. ggml/src/ggml-cuda/ggml-cuda.cu (+1 -1)
  2. src/llama-graph.cpp (+0 -6)

ggml/src/ggml-cuda/ggml-cuda.cu (+1 -1)

@@ -3274,7 +3274,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
                     }
                 }
-                prev_i = i;
 
 #ifdef GGML_CUDA_DEBUG
                 const int nodes_fused = i - prev_i - 1;
@@ -3282,6 +3281,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
                 }
 #endif
+                prev_i = i;
 
                 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
                     continue;
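
The moved line changes what the debug counter sees: nodes_fused is derived from how far i advanced past the previously recorded prev_i, so assigning prev_i = i before the #ifdef GGML_CUDA_DEBUG read made i - prev_i - 1 evaluate to -1 on every iteration and the log never fired. A standalone sketch of the corrected ordering (plain C++; the fusion step is faked with a simple stride, and everything outside the prev_i/nodes_fused names is an illustrative assumption, not the actual backend loop):

    #include <cstdio>

    int main() {
        const int n_nodes = 12;
        int prev_i = -1;
        for (int i = 0; i < n_nodes; ++i) {
            // Stand-in for the backend's op fusion, which can advance i
            // past several graph nodes handled together with node i.
            if (i % 4 == 0 && i + 2 < n_nodes) {
                i += 2;
            }
            // Patched order: read the counter first...
            const int nodes_fused = i - prev_i - 1;
            if (nodes_fused > 0) {
                printf("nodes_fused: %d\n", nodes_fused);
            }
            // ...then record where this iteration ended up. Assigning
            // prev_i before the read (the pre-patch order) pins
            // nodes_fused at -1 and silences the debug output.
            prev_i = i;
        }
        return 0;
    }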

src/llama-graph.cpp (+0 -6)

@@ -810,9 +810,6 @@ ggml_tensor * llm_graph_context::build_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1090,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
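
For context, these are the two FFN builders where an eager ggml_build_forward_expand(gf, cur) used to sit right after the activation (the old comment: "expand here so that we can fuse ffn gate"); the commit drops those mid-graph expands and leaves graph construction to the natural topological order. A minimal sketch of the gated-FFN build pattern using the public ggml API (the shapes, weight names, and FFN wiring are illustrative assumptions, not the actual build_ffn code):

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_cgraph  * gf  = ggml_new_graph(ctx);

        const int n_embd = 8, n_ff = 16, n_tokens = 4;
        struct ggml_tensor * inp    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
        struct ggml_tensor * w_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        struct ggml_tensor * w_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        struct ggml_tensor * w_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff,   n_embd);

        struct ggml_tensor * gate = ggml_silu(ctx, ggml_mul_mat(ctx, w_gate, inp)); // ffn_gate
        struct ggml_tensor * up   = ggml_mul_mat(ctx, w_up, inp);                   // ffn_up

        // Pre-patch, build_ffn expanded the graph at this point so the gate
        // nodes were pinned early; post-patch there is no mid-graph expand
        // and only the final tensor is expanded below.
        struct ggml_tensor * cur = ggml_mul(ctx, gate, up);  // ffn_gate_par
        cur = ggml_mul_mat(ctx, w_down, cur);                // ffn_down

        ggml_build_forward_expand(gf, cur); // single expand of the final output

        ggml_free(ctx);
        return 0;
    }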