@@ -2132,7 +2132,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
-                        if (arch == LLM_ARCH_BERT) {
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+                        if (!layer.wqkv) {
                             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                             layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
 
@@ -2141,12 +2144,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                             layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                             layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-                        } else {
-                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        }
-
-                        if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
                         }
 
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
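
The loader change above hinges on the TENSOR_NOT_REQUIRED flag: create_tensor returns a null pointer when an optional tensor is missing from the model file, so the fused attn_qkv weight and bias (which pack Q, K and V with widths n_embd, n_embd_gqa and n_embd_gqa) are probed first, and the separate Q/K/V tensors only become required on the fallback path. The graph-build hunks below use the same presence checks to pick between the fused and split projections. A minimal, self-contained sketch of that presence-based dispatch follows; the find_optional helper and the in-memory "file" map are illustrative stand-ins, not llama.cpp API.

// Sketch only: plain vectors stand in for ggml tensors, and find_optional()
// mimics create_tensor(..., TENSOR_NOT_REQUIRED) returning nullptr on a miss.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

static std::vector<float> * find_optional(std::map<std::string, std::vector<float>> & tensors, const std::string & name) {
    auto it = tensors.find(name);
    return it == tensors.end() ? nullptr : &it->second;
}

int main() {
    // a classic BERT checkpoint ships separate Q/K/V projections and no fused QKV
    // (tensor names follow the usual GGUF "blk.N.*" convention)
    std::map<std::string, std::vector<float>> file = {
        {"blk.0.attn_q.weight", std::vector<float>(8, 1.0f)},
        {"blk.0.attn_k.weight", std::vector<float>(8, 1.0f)},
        {"blk.0.attn_v.weight", std::vector<float>(8, 1.0f)},
    };

    std::vector<float> * wqkv = find_optional(file, "blk.0.attn_qkv.weight");
    if (!wqkv) {
        // fallback path: no fused tensor, so the separate projections are required
        std::printf("no fused QKV found -> loading separate Q/K/V\n");
    } else {
        // fused path: a single matmul per layer, later split back into Q, K and V
        std::printf("fused QKV found\n");
    }
    return 0;
}
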
@@ -5910,36 +5907,11 @@ struct llm_build_bert : public llm_graph_context {
             ggml_tensor * Vcur;
 
             // self-attention
-            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
-                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
-
-                if (model.layers[il].attn_q_norm) {
-                    Qcur = build_norm(Qcur,
-                            model.layers[il].attn_q_norm,
-                            model.layers[il].attn_q_norm_b,
-                            LLM_NORM, il);
-                }
-
-                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
-
-                if (model.layers[il].attn_k_norm) {
-                    Kcur = build_norm(Kcur,
-                            model.layers[il].attn_k_norm,
-                            model.layers[il].attn_k_norm_b,
-                            LLM_NORM, il);
-                }
-
-                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-            } else {
-                // compute Q and K and RoPE them
+            if (model.layers[il].wqkv) {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
-                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                if (model.layers[il].bqkv) {
                     cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                     cb(cur, "bqkv", il);
                 }
@@ -5947,11 +5919,32 @@ struct llm_build_bert : public llm_graph_context {
                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+            } else {
+                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+            }
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            if (model.layers[il].attn_q_norm) {
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm,
+                        model.layers[il].attn_q_norm_b,
+                        LLM_NORM, il);
+            }
+
+            if (model.layers[il].attn_k_norm) {
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm,
+                        model.layers[il].attn_k_norm_b,
+                        LLM_NORM, il);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
+            // RoPE
+            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
                 Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,