@@ -1212,6 +1212,50 @@ static bool llm_load_tensors(
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
 
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+                } break;
+            case LLM_ARCH_PHIMOE:
+                {
+                    const int64_t n_embd_head = n_embd / n_head;
+
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                    model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
+
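+                        // QKV may be stored as one fused tensor or as separate Q/K/V tensors with
+                        // biases, so the fused tensor is optional and the split tensors are the fallback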
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+                        }
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
+
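+                        // sparse MoE FFN: router weights plus per-expert gate/down/up projections
+                        // stacked along the n_expert dimension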
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
                         layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                         layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                     }
@@ -6266,7 +6310,7 @@ struct llm_build_context {
 
                 struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
-                    NULL,
+                    model.layers[il].attn_norm_b,
                     LLM_NORM_RMS, cb, il);
                 cb(attn_norm_output, "attn_norm", il);
 
@@ -6281,8 +6325,7 @@ struct llm_build_context {
                     Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
                     Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
-                }
-                else {
+                } else {
                     Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                     Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                     Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
@@ -6326,14 +6369,12 @@ struct llm_build_context {
             residual = cur;
 
             cur = llm_build_norm(ctx0, cur, hparams,
-                model.layers[il].ffn_norm, NULL,
+                model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
                 LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            // FF
-            // special-case: the up and gate tensors are merged into a single tensor
-            // TOOD: support into llm_build_ffn
-            {
+            // feed-forward network
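+            // dense FFN when no MoE router tensor (ffn_gate_inp) was loaded, MoE branch otherwise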
+            if (model.layers[il].ffn_gate_inp == nullptr) {
                 cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, NULL, NULL,
                         NULL, NULL, NULL,
@@ -6341,6 +6382,20 @@ struct llm_build_context {
                         NULL,
                         LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
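+                // the router (ffn_gate_inp) selects n_expert_used of the n_expert SiLU experts with
+                // softmax gating; the nullptr argument appears to be the optional expert-probability
+                // bias used by some other MoE models and is unused here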
+                cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        cb, il);
+                cb(cur, "ffn_moe_out", il);
             }
 
             cur = ggml_add(ctx0, residual, cur);
@@ -6353,11 +6408,16 @@ struct llm_build_context {
 
         cur = llm_build_norm(ctx0, inpL, hparams,
             model.output_norm,
-            NULL,
+            model.output_norm_b,
             LLM_NORM_RMS, cb, -1);
         cb(cur, "result_norm", -1);
 
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
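+        // PHIMOE loads a bias for the output head; add it when present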
+        if (model.output_b != nullptr) {
+            cb(cur, "result_output_no_bias", -1);
+            cur = ggml_add(ctx0, cur, model.output_b);
+        }
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -10536,6 +10596,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_phi2();
             } break;
         case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
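+            // PHIMOE reuses the Phi-3 graph; the MoE FFN path is selected per layer inside build_phi3()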
             {
                 result = llm.build_phi3();
             } break;