|
|
@@ -2451,6 +2451,7 @@ struct llama_layer {
|
|
|
// long rope factors
|
|
|
struct ggml_tensor * rope_long = nullptr;
|
|
|
struct ggml_tensor * rope_short = nullptr;
|
|
|
+ struct ggml_tensor * rope_freqs = nullptr;
|
|
|
|
|
|
// bitnet scale
|
|
|
struct ggml_tensor * wq_scale;
|
|
|
@@ -6059,6 +6060,8 @@ static bool llm_load_tensors(
|
|
|
|
|
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
|
|
|
|
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
|
|
+
|
|
|
if (n_expert == 0) {
|
|
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
|
@@ -8536,6 +8539,10 @@ struct llm_build_context {
|
|
|
// choose long/short freq factors based on the context size
|
|
|
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
|
|
|
|
|
+ if (model.layers[il].rope_freqs != nullptr) {
|
|
|
+ return model.layers[il].rope_freqs;
|
|
|
+ }
|
|
|
+
|
|
|
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
|
|
|
return model.layers[il].rope_long;
|
|
|
}
|
|
|
@@ -8730,6 +8737,9 @@ struct llm_build_context {
|
|
|
|
|
|
// self-attention
|
|
|
{
|
|
|
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
|
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
|
|
|
+
|
|
|
// compute Q and K and RoPE them
|
|
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
|
|
cb(Qcur, "Qcur", il);
|
|
|
@@ -8753,14 +8763,14 @@ struct llm_build_context {
|
|
|
}
|
|
|
|
|
|
Qcur = ggml_rope_ext(
|
|
|
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
|
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
|
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
);
|
|
|
cb(Qcur, "Qcur", il);
|
|
|
|
|
|
Kcur = ggml_rope_ext(
|
|
|
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
|
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
|
|
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
|
);
|