@@ -13333,7 +13333,13 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();

         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }

         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;

@@ -13391,7 +13397,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {