@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -19,6 +17,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
 
     auto * inp_attn = build_attn_inp_kv();
 
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
@@ -67,9 +67,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
         }
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            ggml_tensor * inp_out_ids = build_inp_out_ids();
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
         cb(ffn_inp, "ffn_inp", il);
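
The diff hoists the `build_inp_out_ids()` call out of the per-layer loop: the out-ids input is now constructed once up front and the last-layer branch merely reuses it when filtering rows with `ggml_get_rows`. Below is a minimal sketch of that pattern, assuming stubbed stand-ins for the ggml/llama internals — `Tensor`, `build_out_ids()` and `get_rows()` are hypothetical placeholders, not the real API:

```cpp
#include <cstdio>

struct Tensor { const char * name; };

static Tensor out_ids_storage = { "inp_out_ids" };
static Tensor rows_storage    = { "rows" };

// stand-in for build_inp_out_ids(): creates the input tensor that
// selects which token rows actually need output logits
static Tensor * build_out_ids() { return &out_ids_storage; }

// stand-in for ggml_get_rows(): keeps only the selected rows
static Tensor * get_rows(Tensor * src, Tensor * ids) {
    (void) src; (void) ids;
    return &rows_storage;
}

int main() {
    const int n_layer = 4;

    // built once, before the loop -- mirroring the added hunk above
    Tensor * inp_out_ids = build_out_ids();

    Tensor * cur = &rows_storage;
    for (int il = 0; il < n_layer; ++il) {
        // ... per-layer attention / FFN graph construction ...
        if (il == n_layer - 1) {
            // the last layer filters rows, reusing the tensor built
            // above instead of constructing a fresh one per build
            cur = get_rows(cur, inp_out_ids);
        }
    }

    std::printf("last-layer rows tensor: %s\n", cur->name);
    return 0;
}
```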