@@ -3260,6 +3260,8 @@ static void llama_kv_cache_seq_add(
 
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) return;
 
     if (cache.recurrent) {
         // for Mamba-like models, only the pos needs to be shifted
@@ -3304,6 +3306,8 @@ static void llama_kv_cache_seq_div(
                           int   d) {
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) return;
 
     if (cache.recurrent) {
         // for Mamba-like models, only the pos needs to be changed
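
For context, here is a minimal, self-contained sketch of what the guard buys. Both functions operate on the half-open range [p0, p1), so p0 == p1 selects no cells; without the early return the loop over the cache would still visit every cell and do nothing. The kv_cell, kv_cache, and seq_add types/names below are simplified stand-ins for illustration, not llama.cpp's actual data structures; the same reasoning applies to the seq_div hunk.

// Sketch only: simplified stand-ins, not llama.cpp's real llama_kv_cache.
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

using llama_pos    = int32_t;
using llama_seq_id = int32_t;

struct kv_cell {
    llama_pos    pos    = -1;
    llama_seq_id seq_id = 0;
};

struct kv_cache {
    std::vector<kv_cell> cells;
};

// Simplified analogue of llama_kv_cache_seq_add: shift positions in [p0, p1) by delta.
static void seq_add(kv_cache & cache, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
    // Same guard as the patch: an empty range means there is nothing to shift,
    // so skip the full walk over the cache cells.
    if (p0 == p1) return;

    for (auto & cell : cache.cells) {
        if (cell.seq_id == seq_id && cell.pos >= p0 && cell.pos < p1) {
            cell.pos += delta;
        }
    }
}

int main() {
    kv_cache cache;
    for (llama_pos i = 0; i < 8; ++i) {
        cache.cells.push_back({ i, 0 });
    }

    seq_add(cache, 0, 2, 2, 5); // empty range: returns immediately, cache untouched
    seq_add(cache, 0, 2, 6, 5); // shifts cells with pos 2..5 by +5

    for (const auto & cell : cache.cells) {
        printf("%d ", cell.pos); // prints: 0 1 7 8 9 10 6 7
    }
    printf("\n");
}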