@@ -20,9 +20,23 @@
#define GGML_DELTA_NET_CHUNK 64
#endif

-// DELTA_NET_RECURRENT kernel
+// ============================================================================
+// OPTIMIZED DELTA_NET_RECURRENT kernel
+// ============================================================================
// Each block processes one (sequence, head) pair
// Token loop is sequential due to state dependency
+//
+// PRECISION NOTES:
+// - Uses FMA (fused multiply-add) for all dot products to minimize rounding errors
+// - State updates are applied sequentially per token, matching the reference evaluation order
+//
+// PERFORMANCE OPTIMIZATIONS:
+// 1. Vectorized loads/stores (float4) for better memory bandwidth utilization
+// 2. Trailing synchronization barrier skipped after the final token's store
+// 3. More aggressive loop unrolling for better ILP
+// 4. Per-token scalars staged via a tiny static __shared__ array, then kept in registers
+// 5. Better memory access patterns for coalescing
+// ============================================================================
__global__ void delta_net_recurrent_f32_kernel(
const float * __restrict__ q_tokens, // [n_tokens, S_v, H_v, n_seqs]
const float * __restrict__ k_tokens, // [n_tokens, S_v, H_v, n_seqs]
@@ -44,7 +58,7 @@ __global__ void delta_net_recurrent_f32_kernel(

const int tid = threadIdx.x;

- // Dynamic shared memory: only vectors and a couple of scalars
+ // Dynamic shared memory: vectors only (per-token scalars are staged through a static __shared__ array)
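+ // Layout of the dynamic block: [q_vec | k_vec | v_vec | kv_mem | delta | out_vec], S_v floats each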
extern __shared__ float smem[];
float * q_vec = smem; // S_v
float * k_vec = q_vec + S_v; // S_v
@@ -52,30 +66,24 @@
float * kv_mem = v_vec + S_v; // S_v
float * delta = kv_mem + S_v; // S_v
float * out_vec = delta + S_v; // S_v
- float * scalars = out_vec + S_v; // 2 floats: [0]=g_exp, [1]=beta

// Offset helper matching CPU layout: [seq][head][i][j]
- // CPU: idx = seq * (S_v * S_v * H_v) + head * (S_v * S_v) + i * S_v + j
- // Tensor layout is actually: [j, i, head, seq] based on nb strides
const size_t state_base = (size_t)head * (size_t)(S_v * S_v) + (size_t)seq * (size_t)(S_v * S_v * H_v);
auto off_state = [=](int i, int j) -> size_t {
return (size_t)j + (size_t)i * S_v + state_base;
};

auto off_tok_vec = [=](int token, int d) -> size_t {
- // Matching CPU: ggml_get_f32_nd(src0, token, d, head, seq)
- // Layout: [token, d, head, seq]
return (size_t)token + (size_t)d * n_tokens + (size_t)head * (n_tokens * S_v) + (size_t)seq * (n_tokens * S_v * H_v);
};
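+ // Note on strides: off_state is contiguous in j (stride 1) and strided by S_v in i;
+ // off_tok_vec is strided by n_tokens in d, so the d-dimension is only contiguous when n_tokens == 1.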
- auto off_scalar_tok = [=](const float * base, int token) -> float {
- // g_exp / beta: ggml_get_f32_nd(src, token, 0, head, seq)
- // Layout: [token, 1, head, seq]
- return base[(size_t)token + (size_t)head * n_tokens + (size_t)seq * (n_tokens * H_v)];
+ auto off_scalar_tok = [=](const float * base, int token) -> size_t {
+ return (size_t)token + (size_t)head * n_tokens + (size_t)seq * (n_tokens * H_v);
};

// Initialize state_out with state_in
- for (int idx = tid; idx < S_v * S_v; idx += blockDim.x) {
+ const int S_v_sq = S_v * S_v;
+ for (int idx = tid; idx < S_v_sq; idx += blockDim.x) {
int i = idx / S_v;
int j = idx % S_v;
state_out[off_state(i, j)] = state_in[off_state(i, j)];
@@ -84,24 +92,44 @@

// Process each token sequentially
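+ // Per-token recurrence, with state S of shape [S_v x S_v] (i rows, j cols):
+ //   S      <- g_exp * S               (step 1)
+ //   kv_mem  = S^T k                   (step 2)
+ //   delta   = beta * (v - kv_mem)     (step 3)
+ //   S      <- S + k delta^T           (step 4, outer product)
+ //   out     = S^T q                   (step 5)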
for (int token = 0; token < n_tokens; token++) {
- // Load q, k, v for this token (handle S_v > blockDim.x)
- for (int d = tid; d < S_v; d += blockDim.x) {
- q_vec[d] = LDG(&q_tokens[off_tok_vec(token, d)]);
- k_vec[d] = LDG(&k_tokens[off_tok_vec(token, d)]);
- v_vec[d] = LDG(&v_tokens[off_tok_vec(token, d)]);
+ // OPTIMIZATION: float4 loads when the d-dimension is contiguous and 16-byte aligned.
+ // With the [token, d, head, seq] layout the d-stride is n_tokens, so this path is
+ // only valid for single-token (n_tokens == 1) calls.
+ const bool can_use_vec4 = (n_tokens == 1) && (S_v % 4 == 0) &&
+ ((uintptr_t)&q_tokens[off_tok_vec(token, 0)] % 16 == 0) &&
+ ((uintptr_t)&k_tokens[off_tok_vec(token, 0)] % 16 == 0) &&
+ ((uintptr_t)&v_tokens[off_tok_vec(token, 0)] % 16 == 0);
+
+ if (can_use_vec4) {
+ const int vec_count = S_v / 4;
+ for (int vec_idx = tid; vec_idx < vec_count; vec_idx += blockDim.x) {
+ const int d = vec_idx * 4;
+ const size_t base_off = off_tok_vec(token, d);
+
+ float4 q4 = *reinterpret_cast<const float4*>(&q_tokens[base_off]);
+ float4 k4 = *reinterpret_cast<const float4*>(&k_tokens[base_off]);
+ float4 v4 = *reinterpret_cast<const float4*>(&v_tokens[base_off]);
+
+ reinterpret_cast<float4*>(&q_vec[d])[0] = q4;
+ reinterpret_cast<float4*>(&k_vec[d])[0] = k4;
+ reinterpret_cast<float4*>(&v_vec[d])[0] = v4;
+ }
+ } else {
+ // Fallback to scalar loads
+ for (int d = tid; d < S_v; d += blockDim.x) {
+ q_vec[d] = LDG(&q_tokens[off_tok_vec(token, d)]);
+ k_vec[d] = LDG(&k_tokens[off_tok_vec(token, d)]);
+ v_vec[d] = LDG(&v_tokens[off_tok_vec(token, d)]);
+ }
}
- // Load scalars
+
+ // Stage the per-token scalars in static shared memory (a shuffle broadcast only reaches
+ // one warp, and the block may span several)
+ __shared__ float scalar_vals[2];
if (tid == 0) {
- scalars[0] = off_scalar_tok(g_tokens_exp, token);
- scalars[1] = off_scalar_tok(beta_tokens, token);
+ scalar_vals[0] = g_tokens_exp[off_scalar_tok(g_tokens_exp, token)];
+ scalar_vals[1] = beta_tokens[off_scalar_tok(beta_tokens, token)];
}
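+ // The barrier below publishes both the freshly loaded q/k/v vectors and the two scalars.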
__syncthreads();
- float g_exp = scalars[0];
- float beta_val = scalars[1];
+ float g_exp = scalar_vals[0];
+ float beta_val = scalar_vals[1];

// 1. state = state * g_exp (element-wise multiplication)
- // CPU: temp_state[idx] *= g_exp;
- for (int idx = tid; idx < S_v * S_v; idx += blockDim.x) {
+ for (int idx = tid; idx < S_v_sq; idx += blockDim.x) {
int i = idx / S_v;
int j = idx % S_v;
state_out[off_state(i, j)] *= g_exp;
@@ -109,29 +137,39 @@
__syncthreads();

// 2. kv_mem[j] = sum_i (state[i,j] * k[i])
- // CPU: kv_mem[j] += temp_state[state_idx] * k_t(i)
+ // OPTIMIZATION: More aggressive unrolling
for (int j = tid; j < S_v; j += blockDim.x) {
float sum = 0.0f;
size_t sidx = state_base + (size_t)j;
- #pragma unroll 4
- for (int i = 0; i < S_v; i++) {
+
+ // Unroll by 8 for better ILP
+ int i = 0;
+ for (; i + 7 < S_v; i += 8) {
+ sum = FMA(state_out[sidx], k_vec[i], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], k_vec[i+1], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], k_vec[i+2], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], k_vec[i+3], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], k_vec[i+4], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], k_vec[i+5], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], k_vec[i+6], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], k_vec[i+7], sum); sidx += S_v;
+ }
+ for (; i < S_v; i++) {
sum = FMA(state_out[sidx], k_vec[i], sum);
- sidx += (size_t)S_v;
+ sidx += S_v;
}
kv_mem[j] = sum;
}
__syncthreads();

// 3. delta = (v - kv_mem) * beta
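+ // delta is the beta-scaled prediction error of the delta rule: how far the current
+ // state's readout for k falls short of the target value v.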
- // CPU: delta[j] = (v_t(j) - kv_mem[j]) * beta_val
for (int j = tid; j < S_v; j += blockDim.x) {
delta[j] = (v_vec[j] - kv_mem[j]) * beta_val;
}
__syncthreads();

// 4. state[i,j] += k[i] * delta[j] (outer product)
- // CPU: temp_state[state_idx] += k_t(i) * delta[j]
- for (int idx = tid; idx < S_v * S_v; idx += blockDim.x) {
+ for (int idx = tid; idx < S_v_sq; idx += blockDim.x) {
int i = idx / S_v;
int j = idx % S_v;
size_t sidx = state_base + (size_t)j + (size_t)i * (size_t)S_v;
@@ -140,26 +178,50 @@
__syncthreads();

// 5. output[j] = sum_i (state[i,j] * q[i])
- // CPU: attn_out_t[j] += temp_state[state_idx] * q_t(i)
+ // OPTIMIZATION: Same unrolling strategy
for (int j = tid; j < S_v; j += blockDim.x) {
float sum = 0.0f;
size_t sidx = state_base + (size_t)j;
- #pragma unroll 4
- for (int i = 0; i < S_v; i++) {
+
+ int i = 0;
+ for (; i + 7 < S_v; i += 8) {
+ sum = FMA(state_out[sidx], q_vec[i], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], q_vec[i+1], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], q_vec[i+2], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], q_vec[i+3], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], q_vec[i+4], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], q_vec[i+5], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], q_vec[i+6], sum); sidx += S_v;
+ sum = FMA(state_out[sidx], q_vec[i+7], sum); sidx += S_v;
+ }
+ for (; i < S_v; i++) {
sum = FMA(state_out[sidx], q_vec[i], sum);
- sidx += (size_t)S_v;
+ sidx += S_v;
}
out_vec[j] = sum;
}
__syncthreads();

// Store output for this token
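+ // Output is contiguous in d (stride 1), so the float4 path below is layout-safe
+ // whenever S_v is a multiple of 4.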
- // CPU output layout: d + head * S_v + token * (S_v * H_v) + seq * (S_v * H_v * n_tokens)
- for (int d = tid; d < S_v; d += blockDim.x) {
- size_t output_idx = (size_t)d + (size_t)head * S_v + (size_t)token * (S_v * H_v) + (size_t)seq * (S_v * H_v * n_tokens);
- output[output_idx] = out_vec[d];
+ const size_t output_base = (size_t)head * S_v + (size_t)token * (S_v * H_v) + (size_t)seq * (S_v * H_v * n_tokens);
+ if (can_use_vec4) {
+ for (int d = tid; d < S_v / 4; d += blockDim.x) {
+ *reinterpret_cast<float4*>(&output[output_base + d * 4]) =
+ *reinterpret_cast<float4*>(&out_vec[d * 4]);
+ }
+ // Handle remainder (empty when S_v is a multiple of 4, kept for safety)
+ for (int d = tid + (S_v / 4) * 4; d < S_v; d += blockDim.x) {
+ output[output_base + d] = out_vec[d];
+ }
+ } else {
+ for (int d = tid; d < S_v; d += blockDim.x) {
+ output[output_base + d] = out_vec[d];
+ }
+ }
+ // OPTIMIZATION: the trailing sync can be skipped after the last token
+ if (token < n_tokens - 1) {
+ __syncthreads();
}
- __syncthreads();
}
}

@@ -206,7 +268,7 @@ void ggml_cuda_op_delta_net_recurrent(ggml_backend_cuda_context & ctx, ggml_tens
if (S_v < 256) block_x = (S_v >= 128 ? 128 : (S_v >= 64 ? 64 : (S_v >= 32 ? 32 : 16)));
dim3 block(block_x, 1, 1);

- // Shared memory: 6 vectors (S_v each) + 2 scalars
+ // Shared memory: 6 vectors (S_v each) + 2 floats of headroom
size_t smem_size = (6 * (size_t)S_v + 2) * sizeof(float);
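+ // NOTE: the per-token scalars now live in a static __shared__ array inside the kernel,
+ // so only the 6 * S_v vector region of this allocation is actually indexed.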
delta_net_recurrent_f32_kernel<<<grid, block, smem_size, stream>>>(
@@ -218,7 +280,15 @@ void ggml_cuda_op_delta_net_recurrent(ggml_backend_cuda_context & ctx, ggml_tens
CUDA_CHECK(cudaGetLastError());
}

-// Chunked kernel
+// Chunked kernel for Gated Delta Net
+//
+// PRECISION NOTES FOR LONG CONTEXTS (40k+ tokens):
+// - g_cumsum values can become large over long sequences. The cumsum operation now uses
+// double precision internally to minimize accumulation errors (see cumsum.cu).
+// - State decay uses the exp(g_j - g_i) formulation, which is numerically stable.
+// - FMA (fused multiply-add) is used throughout to minimize rounding errors.
+// - For debugging long-context issues: verify g_cumsum precision by comparing with the reference.
+//
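+// Stability of the decay term: exp(g_j - g_i) == exp(g_j) / exp(g_i), but the subtracted
+// form keeps the exponent's argument moderate even when both cumulative sums are large,
+// avoiding overflow/underflow of the individual exponentials.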
__global__ void delta_net_chunked_f32_kernel(
const float * __restrict__ q,
const float * __restrict__ k,
@@ -357,44 +427,81 @@ __global__ void delta_net_chunked_f32_kernel(
__syncthreads();

// Compute value = attn_pre @ v_beta [n_tokens_chunk x S_v]
+ // OPTIMIZATION: Better loop unrolling and coalescing
for (int idx = tid; idx < n_tokens_chunk * S_v; idx += blockDim.x) {
const int row = idx / S_v;
const int col = idx % S_v;
float sum = 0.0f;
const float * __restrict__ pv = &v_beta[off_qkv(v_beta, head, chunk, 0, col)];
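+ // pv walks column `col` of v_beta, advancing one row (stride S_v) per step.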
- #pragma unroll 4
- for (int k = 0; k < n_tokens_chunk; ++k) {
+
+ // Unroll by 8 for better ILP
+ int k = 0;
+ for (; k + 7 < n_tokens_chunk; k += 8) {
+ sum = FMA(attn_pre[row * chunk_size + k], LDG(pv), sum); pv += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 1], LDG(pv), sum); pv += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 2], LDG(pv), sum); pv += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 3], LDG(pv), sum); pv += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 4], LDG(pv), sum); pv += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 5], LDG(pv), sum); pv += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 6], LDG(pv), sum); pv += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 7], LDG(pv), sum); pv += S_v;
+ }
+ for (; k < n_tokens_chunk; ++k) {
sum = FMA(attn_pre[row * chunk_size + k], LDG(pv), sum);
- pv += (size_t)S_v;
+ pv += S_v;
}
value[row * S_v + col] = sum;
}
__syncthreads();

// Compute k_cumdecay = attn_pre @ (k_beta * exp(g)) [n_tokens_chunk x S_v]
+ // OPTIMIZATION: Better unrolling
for (int idx = tid; idx < n_tokens_chunk * S_v; idx += blockDim.x) {
const int row = idx / S_v;
const int col = idx % S_v;
float sum = 0.0f;
const float * __restrict__ pk = &k_beta[off_qkv(k_beta, head, chunk, 0, col)];
- #pragma unroll 4
- for (int k = 0; k < n_tokens_chunk; ++k) {
+
+ int k = 0;
+ for (; k + 7 < n_tokens_chunk; k += 8) {
+ sum = FMA(attn_pre[row * chunk_size + k], LDG(pk) * g_exp_buf[k], sum); pk += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 1], LDG(pk) * g_exp_buf[k + 1], sum); pk += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 2], LDG(pk) * g_exp_buf[k + 2], sum); pk += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 3], LDG(pk) * g_exp_buf[k + 3], sum); pk += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 4], LDG(pk) * g_exp_buf[k + 4], sum); pk += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 5], LDG(pk) * g_exp_buf[k + 5], sum); pk += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 6], LDG(pk) * g_exp_buf[k + 6], sum); pk += S_v;
+ sum = FMA(attn_pre[row * chunk_size + k + 7], LDG(pk) * g_exp_buf[k + 7], sum); pk += S_v;
+ }
+ for (; k < n_tokens_chunk; ++k) {
sum = FMA(attn_pre[row * chunk_size + k], LDG(pk) * g_exp_buf[k], sum);
- pk += (size_t)S_v;
+ pk += S_v;
}
k_cumdecay[row * S_v + col] = sum;
}
__syncthreads();

// Compute v_prime = k_cumdecay @ state [n_tokens_chunk x S_v]
+ // OPTIMIZATION: Better unrolling for matrix-matrix multiply
for (int idx = tid; idx < n_tokens_chunk * S_v; idx += blockDim.x) {
const int row = idx / S_v;
const int col = idx % S_v;
float sum = 0.0f;
const float * __restrict__ pstate_col = &state_out[(size_t)col + (size_t)head * (size_t)(S_v * S_v) + (size_t)seq * (size_t)(S_v * S_v * H_v)];
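+ // pstate_col points at column `col` of the [S_v x S_v] state for this (seq, head);
+ // successive k steps move down the rows with stride S_v.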
- #pragma unroll 4
- for (int k = 0; k < S_v; ++k) {
- sum = FMA(k_cumdecay[row * S_v + k], pstate_col[(size_t)k * (size_t)S_v], sum);
+
+ int k = 0;
+ for (; k + 7 < S_v; k += 8) {
+ sum = FMA(k_cumdecay[row * S_v + k], pstate_col[k * S_v], sum);
+ sum = FMA(k_cumdecay[row * S_v + k + 1], pstate_col[(k + 1) * S_v], sum);
+ sum = FMA(k_cumdecay[row * S_v + k + 2], pstate_col[(k + 2) * S_v], sum);
+ sum = FMA(k_cumdecay[row * S_v + k + 3], pstate_col[(k + 3) * S_v], sum);
+ sum = FMA(k_cumdecay[row * S_v + k + 4], pstate_col[(k + 4) * S_v], sum);
+ sum = FMA(k_cumdecay[row * S_v + k + 5], pstate_col[(k + 5) * S_v], sum);
+ sum = FMA(k_cumdecay[row * S_v + k + 6], pstate_col[(k + 6) * S_v], sum);
+ sum = FMA(k_cumdecay[row * S_v + k + 7], pstate_col[(k + 7) * S_v], sum);
+ }
+ for (; k < S_v; ++k) {
+ sum = FMA(k_cumdecay[row * S_v + k], pstate_col[k * S_v], sum);
}
v_prime[row * S_v + col] = sum;
}