1 year ago · 19b7a836f6
--- a/ggml-cuda/mmvq.cu
+++ b/ggml-cuda/mmvq.cu
@@ -117,7 +117,7 @@ static __global__ void mul_mat_vec_q(
 
															             tmp[j][i] = warp_reduce_sum(tmp[j][i]);
														
 
															         }
														
 
															-        if (threadIdx.x < rows_per_cuda_block) {
														
 
															+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
														
 
															             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
														
 
															         }
														
 
															     }