|
|
@@ -4495,6 +4495,13 @@ static bool llm_load_tensors(
|
|
|
|
|
|
auto & hparams = model.hparams;
|
|
|
|
|
|
+#ifdef GGML_USE_SYCL
|
|
|
+ // disable MoE with SYCL until mul_mat_id is updated
|
|
|
+ if (hparams.n_expert > 0) {
|
|
|
+ n_gpu_layers = 0;
|
|
|
+ }
|
|
|
+#endif
|
|
|
+
|
|
|
model.split_mode = split_mode;
|
|
|
model.main_gpu = main_gpu;
|
|
|
model.n_gpu_layers = n_gpu_layers;
|
|
|
@@ -6099,6 +6106,100 @@ static struct ggml_tensor * llm_build_ffn(
|
|
|
return cur;
|
|
|
}
|
|
|
|
|
|
+static struct ggml_tensor * llm_build_moe_ffn( // builds the mixture-of-experts FFN graph nodes for one layer; returns the sum of the weighted outputs of the selected experts
|
|
|
+ struct ggml_context * ctx, // ggml context handed to every ggml_* call below
|
|
|
+ struct ggml_tensor * cur, // input activations; ne[0] is read as n_embd and ne[1] as n_tokens
|
|
|
+ struct ggml_tensor * gate_inp, // router weights: ggml_mul_mat(gate_inp, cur) produces the per-expert logits
|
|
|
+ struct ggml_tensor * up_exps, // expert weights for the "up" branch, applied through ggml_mul_mat_id
|
|
|
+ struct ggml_tensor * gate_exps, // expert weights for the "gate" branch, applied through ggml_mul_mat_id
|
|
|
+ struct ggml_tensor * down_exps, // expert weights for the final "down" projection, applied through ggml_mul_mat_id
|
|
|
+ int64_t n_expert, // expert count; used to reshape probs to [1, n_expert, n_tokens]
|
|
|
+ int64_t n_expert_used, // k passed to ggml_top_k, i.e. how many experts are evaluated per token
|
|
|
+ llm_ffn_op_type type_op, // activation for the gate branch; only LLM_FFN_SILU and LLM_FFN_GELU are handled
|
|
|
+ bool norm_w, // when true, the selected weights are divided by their per-token sum
|
|
|
+ const llm_build_cb & cb, // callback invoked with (tensor, name, il) for each named intermediate tensor
|
|
|
+ int il) { // layer index, forwarded unchanged to cb
|
|
|
+ int64_t n_embd = cur->ne[0];
|
|
|
+ int64_t n_tokens = cur->ne[1];
|
|
|
+
|
|
|
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
|
|
|
+ cb(logits, "ffn_moe_logits", il);
|
|
|
+
|
|
|
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
|
|
|
+ cb(probs, "ffn_moe_probs", il);
|
|
|
+
|
|
|
+ // select experts: top-k over probs, n_expert_used entries per token
|
|
|
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
|
|
|
+ cb(selected_experts->src[0], "ffn_moe_argsort", il); // the top-k node's first source tensor gets its own name
|
|
|
+ cb(selected_experts, "ffn_moe_topk", il);
|
|
|
+
|
|
|
+ ggml_tensor * weights = ggml_get_rows(ctx, // gather the probabilities of the selected experts; used below as mixing weights
|
|
|
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
|
|
+ cb(weights, "ffn_moe_weights", il);
|
|
|
+
|
|
|
+ if (norm_w) { // renormalize the selected weights by their row sum
|
|
|
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); // drop the leading size-1 dim before ggml_sum_rows
|
|
|
+
|
|
|
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
|
|
|
+ cb(weights_sum, "ffn_moe_weights_sum", il);
|
|
|
+
|
|
|
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
|
|
|
+ cb(weights, "ffn_moe_weights_norm", il);
|
|
|
+
|
|
|
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); // back to the [1, n_expert_used, n_tokens] layout produced by ggml_get_rows above
|
|
|
+ }
|
|
|
+
|
|
|
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); // insert a size-1 middle dim: [n_embd, 1, n_tokens]
|
|
|
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
|
|
+ cb(up, "ffn_moe_up", il);
|
|
|
+
|
|
|
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
|
|
+ cb(gate, "ffn_moe_gate", il);
|
|
|
+
|
|
|
+ switch (type_op) { // the activation is applied to the gate branch only
|
|
|
+ case LLM_FFN_SILU:
|
|
|
+ {
|
|
|
+ gate = ggml_silu(ctx, gate);
|
|
|
+ cb(gate, "ffn_moe_silu", il);
|
|
|
+ } break;
|
|
|
+ case LLM_FFN_GELU:
|
|
|
+ {
|
|
|
+ gate = ggml_gelu(ctx, gate);
|
|
|
+ cb(gate, "ffn_moe_gelu", il);
|
|
|
+ } break;
|
|
|
+ default: // any other llm_ffn_op_type value is rejected
|
|
|
+ GGML_ASSERT(false);
|
|
|
+ }
|
|
|
+
|
|
|
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
|
|
|
+ cb(par, "ffn_moe_gate_par", il);
|
|
|
+
|
|
|
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
|
|
+ cb(experts, "ffn_moe_down", il);
|
|
|
+
|
|
|
+ experts = ggml_mul(ctx, experts, weights); // scale each selected expert's output by its routing weight
|
|
|
+
|
|
|
+ // aggregate experts: add the n_expert_used slices together into one [n_embd, n_tokens] result
|
|
|
+ ggml_tensor * moe_out = nullptr;
|
|
|
+ for (int i = 0; i < n_expert_used; ++i) {
|
|
|
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, // 2-D view of expert slot i
|
|
|
+ experts->nb[2], i*experts->nb[1]); // row stride nb[2], byte offset i*nb[1]
|
|
|
+
|
|
|
+ if (i == 0) {
|
|
|
+ moe_out = cur_expert;
|
|
|
+ } else {
|
|
|
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (n_expert_used == 1) { // the loop ran once, so moe_out is still the strided view and no ggml_add was created
|
|
|
+ // avoid returning a non-contiguous tensor
|
|
|
+ moe_out = ggml_cont(ctx, moe_out);
|
|
|
+ }
|
|
|
+
|
|
|
+ return moe_out;
|
|
|
+}
|
|
|
+
|
|
|
// if max_alibi_bias > 0 then apply ALiBi
|
|
|
static struct ggml_tensor * llm_build_kqv(
|
|
|
struct ggml_context * ctx,
|
|
|
@@ -6642,7 +6743,15 @@ struct llm_build_context {
|
|
|
LLM_NORM_RMS, cb, il);
|
|
|
cb(cur, "ffn_norm", il);
|
|
|
|
|
|
- cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
|
|
|
+ cur = llm_build_moe_ffn(ctx0, cur,
|
|
|
+ model.layers[il].ffn_gate_inp,
|
|
|
+ model.layers[il].ffn_up_exps,
|
|
|
+ model.layers[il].ffn_gate_exps,
|
|
|
+ model.layers[il].ffn_down_exps,
|
|
|
+ n_expert, n_expert_used,
|
|
|
+ LLM_FFN_SILU, true,
|
|
|
+ cb, il);
|
|
|
+ cb(cur, "ffn_moe_out", il);
|
|
|
}
|
|
|
|
|
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
|
@@ -6674,80 +6783,6 @@ struct llm_build_context {
|
|
|
return gf;
|
|
|
}
|
|
|
|
|
|
- // REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
|
|
|
- ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, bool norm_w, int il) {
|
|
|
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
|
|
- cb(logits, "ffn_moe_logits", il);
|
|
|
-
|
|
|
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
|
|
- cb(probs, "ffn_moe_probs", il);
|
|
|
-
|
|
|
- // select experts
|
|
|
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
|
|
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
|
|
-
|
|
|
- ggml_tensor * weights = ggml_get_rows(ctx0,
|
|
|
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
|
|
- cb(weights, "ffn_moe_weights", il);
|
|
|
-
|
|
|
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
|
|
-
|
|
|
- if (norm_w) {
|
|
|
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
|
|
- cb(weights_sum, "ffn_moe_weights_sum", il);
|
|
|
-
|
|
|
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
|
|
- cb(weights, "ffn_moe_weights_norm", il);
|
|
|
- }
|
|
|
-
|
|
|
- // compute expert outputs
|
|
|
- ggml_tensor * moe_out = nullptr;
|
|
|
-
|
|
|
- for (int i = 0; i < n_expert_used; ++i) {
|
|
|
- ggml_tensor * cur_expert;
|
|
|
-
|
|
|
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
|
|
- cb(cur_up, "ffn_moe_up", il);
|
|
|
-
|
|
|
- ggml_tensor * gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
|
|
- cb(gate, "ffn_moe_gate", il);
|
|
|
-
|
|
|
- switch (type_op) {
|
|
|
- case LLM_FFN_SILU:
|
|
|
- {
|
|
|
- gate = ggml_silu(ctx0, gate);
|
|
|
- cb(gate, "ffn_moe_silu", il);
|
|
|
- } break;
|
|
|
- case LLM_FFN_GELU:
|
|
|
- {
|
|
|
- gate = ggml_gelu(ctx0, gate);
|
|
|
- cb(gate, "ffn_moe_gelu", il);
|
|
|
- } break;
|
|
|
- default:
|
|
|
- GGML_ASSERT(false);
|
|
|
- }
|
|
|
-
|
|
|
- cur_expert = ggml_mul(ctx0, cur_up, gate);
|
|
|
- cb(cur_expert, "ffn_moe_gate_par", il);
|
|
|
-
|
|
|
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
|
|
- cb(cur_expert, "ffn_moe_down", il);
|
|
|
-
|
|
|
- cur_expert = ggml_mul(ctx0, cur_expert,
|
|
|
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
|
|
- cb(cur_expert, "ffn_moe_weighted", il);
|
|
|
-
|
|
|
- if (i == 0) {
|
|
|
- moe_out = cur_expert;
|
|
|
- } else {
|
|
|
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
|
|
- cb(moe_out, "ffn_moe_out", il);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- return moe_out;
|
|
|
- }
|
|
|
-
|
|
|
struct ggml_cgraph * build_baichuan() {
|
|
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
|
|
|
|
@@ -7195,7 +7230,15 @@ struct llm_build_context {
|
|
|
LLM_NORM_RMS, cb, il);
|
|
|
cb(cur, "ffn_norm", il);
|
|
|
|
|
|
- cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, true, il);
|
|
|
+ cur = llm_build_moe_ffn(ctx0, cur,
|
|
|
+ model.layers[il].ffn_gate_inp,
|
|
|
+ model.layers[il].ffn_up_exps,
|
|
|
+ model.layers[il].ffn_gate_exps,
|
|
|
+ model.layers[il].ffn_down_exps,
|
|
|
+ n_expert, n_expert_used,
|
|
|
+ LLM_FFN_GELU, true,
|
|
|
+ cb, il);
|
|
|
+ cb(cur, "ffn_moe_out", il);
|
|
|
|
|
|
// Grok
|
|
|
// if layer_out_norm is present then apply it before adding the input
|
|
|
@@ -7207,7 +7250,6 @@ struct llm_build_context {
|
|
|
cb(cur, "layer_out_norm", il);
|
|
|
}
|
|
|
|
|
|
-
|
|
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
|
cb(cur, "ffn_out", il);
|
|
|
|
|
|
@@ -7331,7 +7373,15 @@ struct llm_build_context {
|
|
|
LLM_NORM, cb, il);
|
|
|
cb(cur, "attn_out_norm", il);
|
|
|
|
|
|
- cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
|
|
|
+ cur = llm_build_moe_ffn(ctx0, cur,
|
|
|
+ model.layers[il].ffn_gate_inp,
|
|
|
+ model.layers[il].ffn_up_exps,
|
|
|
+ model.layers[il].ffn_gate_exps,
|
|
|
+ model.layers[il].ffn_down_exps,
|
|
|
+ n_expert, n_expert_used,
|
|
|
+ LLM_FFN_SILU, true,
|
|
|
+ cb, il);
|
|
|
+ cb(cur, "ffn_moe_out", il);
|
|
|
|
|
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
|
cb(cur, "ffn_out", il);
|
|
|
@@ -8502,12 +8552,6 @@ struct llm_build_context {
|
|
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
|
cb(Vcur, "Vcur", il);
|
|
|
|
|
|
- // these nodes are added to the graph together so that they are not reordered
|
|
|
- // by doing so, the number of splits in the graph is reduced
|
|
|
- ggml_build_forward_expand(gf, Qcur);
|
|
|
- ggml_build_forward_expand(gf, Kcur);
|
|
|
- ggml_build_forward_expand(gf, Vcur);
|
|
|
-
|
|
|
Qcur = ggml_rope_custom(
|
|
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
|
@@ -8658,7 +8702,16 @@ struct llm_build_context {
|
|
|
LLM_NORM_RMS, cb, il);
|
|
|
cb(cur, "ffn_norm", il);
|
|
|
|
|
|
- ggml_tensor * moe_out = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, false, il);
|
|
|
+ ggml_tensor * moe_out = // routed-experts output, kept separate from cur (which still holds the ffn_norm result)
|
|
|
+ llm_build_moe_ffn(ctx0, cur,
|
|
|
+ model.layers[il].ffn_gate_inp,
|
|
|
+ model.layers[il].ffn_up_exps,
|
|
|
+ model.layers[il].ffn_gate_exps,
|
|
|
+ model.layers[il].ffn_down_exps,
|
|
|
+ n_expert, n_expert_used,
|
|
|
+ LLM_FFN_SILU, false, // norm_w = false: selected routing weights are used without renormalization
|
|
|
+ cb, il);
|
|
|
+ cb(moe_out, "ffn_moe_out", il); // name the MoE result itself; passing cur here would rename the ffn_norm tensor instead
|
|
|
|
|
|
// FFN shared expert
|
|
|
{
|