// mmvq.cpp

#include "mmvq.hpp"
#include "vecdotq.hpp"

#include <cassert>
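
// Work distribution of the generic kernel below: each work-group covers GGML_SYCL_MMV_Y rows
// (dim 1), and the QK_WARP_SIZE work-items along dim 2 handle one row as a single sub-group
// (the launchers request [[intel::reqd_sub_group_size(QK_WARP_SIZE)]]). Groups of qi/vdr
// consecutive lanes share one quant block and stride over the row in steps of blocks_per_warp
// blocks; the partial sums are combined with an XOR-shuffle reduction and lane 0 writes the row.
// Worked numbers as a sketch, assuming QK_WARP_SIZE == 32, for q4_0 (qk = 32, qi = QI4_0 = 4,
// vdr = 2): blocks_per_warp = 2*32/4 = 16, lanes 2t and 2t+1 share block t (iqs = 0 and 2)
// and advance by 16 blocks per iteration.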
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
                          const int ncols, const int nrows, const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
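
// The mul_mat_vec_q_iq*_q8_1 kernels below repeat the same work distribution and reduction as
// mul_mat_vec_q, but call their type-specific vec_dot helper directly so that the lookup tables
// some IQ formats use (iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs, iq2xs_grid, ksigns64, iq3xxs_grid,
// iq3s_grid, iq1s_grid_gpu) can be passed as extra arguments instead of going through the
// vec_dot_q_sycl_t function-pointer template parameter.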
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                       float * __restrict__ dst, const int ncols, const int nrows,
                                       const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                       float * __restrict__ dst, const int ncols, const int nrows,
                                       const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_m_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_nl_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i;  // x block index
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));  // x block quant index when casting the quants to int

        tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
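
// Host-side launchers: block_num_y = ceil(nrows / GGML_SYCL_MMV_Y) work-groups of shape
// (1, GGML_SYCL_MMV_Y, QK_WARP_SIZE) are submitted, so each work-group processes GGML_SYCL_MMV_Y
// rows and the sub-group size is pinned to QK_WARP_SIZE to match the reduction in the kernels.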
static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK8_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                          const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                         const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
            auto ksigns64_ptr_ct1   = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
            auto ksigns64_ptr_ct1   = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                          const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
            auto ksigns64_ptr_ct1    = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
            auto ksigns64_ptr_ct1  = &ksigns64[0];
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                         const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_NL == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, float *dst,
                                         const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
    {
        stream->submit([&](sycl::handler &cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
                    mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
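
// Backend entry point. src1 is expected to be already quantized to q8_1 (src1_ddq_i, with each
// column padded to src1_padded_col_size); the loop walks the src1 columns and dispatches the
// launcher matching src0->type, writing one output column of dst per iteration.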
void ggml_sycl_op_mul_mat_vec_q(
    ggml_backend_sycl_context & ctx,
    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
    const int64_t src1_ncols, const int64_t src1_padded_col_size,
    const dpct::queue_ptr &stream) {

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    int id;
    SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id()));

    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;

    for (int i = 0; i < src1_ncols; i++) {
        const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;

        const char * src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
        float      * dst_dd_i_bs   = dst_dd_i + i * dst->ne[0];

        switch (src0->type) {
            case GGML_TYPE_Q4_0:
                mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_1:
                mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_0:
                mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_1:
                mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q8_0:
                mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q2_K:
                mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q3_K:
                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_K:
                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_K:
                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q6_K:
                mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ1_S:
                mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ1_M:
                mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XXS:
                mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XS:
                mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_S:
                mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_XXS:
                mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_S:
                mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_NL:
                mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_XS:
                mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            default:
                GGML_ABORT("fatal error");
                break;
        }
    }

    (void) src1;
    (void) dst;
    (void) src1_ddf_i;
}