1 year ago · 8c570c9496
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -8330,24 +8330,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
 
				     const int blocks_per_row = ncols / qk;
			
 
				     const int blocks_per_warp = vdr * WARP_SIZE / qi;
			
 
				 
			
 
				-// partial sum for each thread
			
 
				+    const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
			
 
				+
			
 
				+    // partial sum for each thread
			
 
				     float tmp = 0.0f;
			
 
				 
			
 
				     const block_q_t  * x = (const block_q_t  *) vx;
			
 
				     const block_q8_1 * y = (const block_q8_1 *) vy;
			
 
				 
			
 
				-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
			
 
				+    for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
			
 
				          i += blocks_per_warp) {
			
 
				-        const int ibx = row*blocks_per_row + i; // x block index
			
 
				+      const int ibx = row * blocks_per_row + i; // x block index
			
 
				 
			
 
				-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
			
 
				+      const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
			
 
				 
			
 
				-        const int iqs =
			
 
				-            vdr *
			
 
				-            (item_ct1.get_local_id(2) %
			
 
				-             (qi / vdr)); // x block quant index when casting the quants to int
			
 
				+      const int iqs =
			
 
				+          vdr *
			
 
				+          (item_ct1.get_local_id(2) -
			
 
				+           i * qi_vdr); // x block quant index when casting the quants to int
			
 
				 
			
 
				-        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
			
 
				+      tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
			
 
				     }
			
 
				 
			
 
				     // sum up partial sums and write back result