mmvq.cpp

#include "mmvq.hpp"
#include "vecdotq.hpp"
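
// Each mul_mat_vec_* kernel below computes one row of dst = x * y, where x is
// a matrix quantized in block_q_t blocks and y is a vector quantized to q8_1.
// One sub-group of WARP_SIZE work-items cooperates on a row: each work-item
// accumulates partial dot products over a strided subset of the row's blocks,
// the partial sums are combined with a sub-group XOR-shuffle reduction, and
// lane 0 writes the result.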
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
                          const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
                                       const void *__restrict__ vy,
                                       float *__restrict__ dst, const int ncols,
                                       const int nrows,
                                       const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
                                      const void *__restrict__ vy,
                                      float *__restrict__ dst, const int ncols,
                                      const int nrows,
                                      const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
                                     const void *__restrict__ vy,
                                     float *__restrict__ dst, const int ncols,
                                     const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
                                       const void *__restrict__ vy,
                                       float *__restrict__ dst, const int ncols,
                                       const int nrows,
                                       const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
                                     const void *__restrict__ vy,
                                     float *__restrict__ dst, const int ncols,
                                     const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
                                     const void *__restrict__ vy,
                                     float *__restrict__ dst, const int ncols,
                                     const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
                                     const void *__restrict__ vy,
                                     float *__restrict__ dst, const int ncols,
                                     const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
                                      const void *__restrict__ vy,
                                      float *__restrict__ dst, const int ncols,
                                      const int nrows,
                                      const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
                                      const void *__restrict__ vy,
                                      float *__restrict__ dst, const int ncols,
                                      const int nrows,
                                      const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);
    if (row >= nrows) {
        return;
    }
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    // partial sum for each thread
    float tmp = 0.0f;
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
        const int iqs =
            vdr *
            (item_ct1.get_local_id(2) %
             (qi / vdr)); // x block quant index when casting the quants to int
        tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
    }
    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
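
// Host-side launchers. Each launcher configures an nd_range in which a
// work-group holds GGML_SYCL_MMV_Y sub-groups of WARP_SIZE work-items, so one
// work-group covers GGML_SYCL_MMV_Y output rows and block_num_y work-groups
// cover all nrows rows. The kernels are required to run with a sub-group size
// of exactly WARP_SIZE via [[intel::reqd_sub_group_size(WARP_SIZE)]].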
static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                      VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK4_1, QI4_1, block_q4_1,
                                      VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                      VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                      VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK8_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                      VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                      VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                      VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                      VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                      VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                      VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}
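
// The IQ launchers below instantiate the kernels with vdr = 1 and, in most
// cases, a reduced qi (e.g. QI2_XXS/2, QI4_XS/4). Since the kernels derive
// blocks_per_warp = vdr * WARP_SIZE / qi and iqs from qi / vdr, this controls
// how many work-items share one block and how many blocks a sub-group
// processes per loop iteration.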
static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
                                          float *dst, const int ncols,
                                          const int nrows,
                                          dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols,
                                         const int nrows,
                                         dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
            auto ksigns64_ptr_ct1 = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
            auto ksigns64_ptr_ct1 = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
                                          float *dst, const int ncols,
                                          const int nrows,
                                          dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
            auto ksigns64_ptr_ct1 = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
            auto ksigns64_ptr_ct1 = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols,
                                         const int nrows,
                                         dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_NL == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}

static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols,
                                         const int nrows,
                                         dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1)
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                        mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
}
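
// Backend entry point: dispatches on the quantization type of src0 and
// launches one matrix-vector kernel per column of src1. src1 is expected to
// already be quantized to q8_1 (src1_ddq_i); per-column offsets into it are
// computed in units of q8_1 blocks (q8_1_ts bytes per q8_1_bs values).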
void ggml_sycl_op_mul_mat_vec_q(
    ggml_backend_sycl_context & ctx,
    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
    const int64_t src1_ncols, const int64_t src1_padded_col_size,
    const dpct::queue_ptr &stream) {
    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);
    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;
    int id;
    SYCL_CHECK(
        CHECK_TRY_ERROR(id = get_current_device_id()));
    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;
    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
    for (int i = 0; i < src1_ncols; i++)
    {
        const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
        const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
        float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
        switch (src0->type) {
            case GGML_TYPE_Q4_0:
                mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_1:
                mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_0:
                mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_1:
                mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q8_0:
                mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q2_K:
                mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q3_K:
                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_K:
                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_K:
                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q6_K:
                mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ1_S:
                mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ1_M:
                mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XXS:
                mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XS:
                mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_S:
                mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_XXS:
                mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_S:
                mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_NL:
                mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_XS:
                mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            default:
                GGML_ABORT("fatal error");
                break;
        }
    }
    (void) src1;
    (void) dst;
    (void) src1_ddf_i;
}