// mmvq.cpp
#include "mmvq.hpp"
#include "ggml.h"
#include "common.hpp"
#include "quants.hpp"
#include "vecdotq.hpp"
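
// Matrix-vector multiplication kernels (mmvq) for the SYCL backend: each kernel
// computes dst[row] = dot(x_row, y), where x is a quantized weight matrix and y
// is expected to have been quantized to q8_1 blocks beforehand. The *_reorder
// kernel additionally works on the "reordered" block layouts described by the
// block_q_t traits from quants.hpp.
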
template <typename reorder_vec_dot_q_sycl>
static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
                                  const int ncols, const int nrows, const sycl::nd_item<3> & nd_item) {
    using block_type = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
    using block_traits = typename block_type::traits;

    const auto sg = nd_item.get_sub_group();
    const int sg_range = sg.get_group_linear_range();
    const int workgroup_id = nd_item.get_group_linear_id();
    const int sg_id = sg.get_group_linear_id();
    const int row = workgroup_id * sg_range + sg_id;

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / block_traits::qk;
    constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
    const int nblocks = nrows * (ncols / block_traits::qk);

    static_assert(blocks_per_subgroup > 0);
    static_assert(block_elements_per_subgroup > 0);

    float partial_sum = 0.0f;
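    // Each subgroup owns one output row: a work item starts at block
    // lane_id / block_elements_per_subgroup and strides by blocks_per_subgroup,
    // so the subgroup as a whole sweeps every block of the row.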
    for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
        const int ibx = row * blocks_per_row + i; // x block index
        const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
        const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx);
        // Y block index that aligns with ibx
        const int iby = i * block_type::block_to_q8_1_ratio();
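        // vy is assumed to use the reordered q8_1 layout implied by the pointer
        // arithmetic below: ncols int8 quants first, then one sycl::half2 {d, s}
        // pair per q8_1 block.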
        const int8_t * q8_1_quant_ptr = (const int8_t *) vy + iby * QK8_1;
        const sycl::half2 * q8_1_ds_ptr = (const sycl::half2 *) ((const char *) vy + ncols + iby * sizeof(sycl::half2));

#pragma unroll
        for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
            // x block quant index when casting the quants to int
            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);

            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
        }
    }

    auto sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum, std::plus<>());

    if (sg.leader()) {
        dst[row] = sum;
    }
}
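
// Generic (non-reordered) kernel: vx is a plain array of block_q_t and vy an
// array of block_q8_1. WARP_SIZE work items accumulate one row and the partial
// sums are combined with an xor-shuffle butterfly reduction.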
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
                          const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // ensures blocks_per_warp > 0
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
            // x block quant index when casting the quants to int
            const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr));

            tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
        }
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
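
// The per-type kernels below follow the same pattern as mul_mat_vec_q; they
// differ only in the vec_dot_* call and in the extra lookup tables (iq grids,
// sign masks) that some of the i-quant dot products take as arguments.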
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                       float * __restrict__ dst, const int ncols, const int nrows,
                                       const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                       float * __restrict__ dst, const int ncols, const int nrows,
                                       const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_m_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_nl_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t * x = (const block_q_t *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx
        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
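
// Host-side launchers. Each one configures a (1, GGML_SYCL_MMV_Y, WARP_SIZE)
// work-group (the reorder variants use num_subgroups * WARP_SIZE work items per
// work-group) and submits the matching kernel with the required sub-group size.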
static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                               const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_0 == 0);
    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
    constexpr size_t num_subgroups = 16;
    GGML_ASSERT(block_num_y % num_subgroups == 0);

    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

    stream->submit([&](sycl::handler & cgh) {
        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
                                                                                           nd_item);
                         });
    });
}

static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_q4_1_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_q5_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_q5_1_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK8_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_q2_K_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_q3_K_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_q4_K_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                               const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
    constexpr size_t num_subgroups = 16;
    GGML_ASSERT(block_num_y % num_subgroups == 0);

    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

    stream->submit([&](sycl::handler & cgh) {
        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols, nrows,
                                                                                           nd_item);
                         });
    });
}

static void mul_mat_vec_q5_K_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                               const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
    constexpr size_t num_subgroups = 16;
    GGML_ASSERT(block_num_y % num_subgroups == 0);

    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

    stream->submit([&](sycl::handler & cgh) {
        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
                                                                                           nd_item);
                         });
    });
}

static void mul_mat_vec_q6_K_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                          const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS / 2, block_iq2_xxs, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq2_xs_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                         const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS / 2, block_iq2_xs, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq2_s_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                        const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S / 2, block_iq2_s, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                          const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS / 2, block_iq3_xxs, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq3_s_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                        const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S / 2, block_iq3_s, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq1_s_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                        const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq1_m_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                        const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq4_nl_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                         const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_NL == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}

static void mul_mat_vec_iq4_xs_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                         const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                 mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS / 4, block_iq4_xs, 1>(
                                     vx, vy, dst, ncols, nrows, item_ct1);
                             });
        });
    }
}
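
// Backend entry point: walks the q8_1-quantized columns of src1 and dispatches
// one mat-vec launch per column based on src0->type, selecting the reorder_*
// variants when the tensor's extra data flags the reorder optimization.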
void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
                                ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
                                const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
                                const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_col_size,
                                const dpct::queue_ptr & stream) {
    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    int id;
    SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id()));

    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;
    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into

    for (int i = 0; i < src1_ncols; i++) {
        const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
        const char * src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
        float * dst_dd_i_bs = dst_dd_i + i * dst->ne[0];

        switch (src0->type) {
            case GGML_TYPE_Q4_0:
                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n");
                    reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                } else {
                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n");
                    mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                }
                break;
            case GGML_TYPE_Q4_1:
                mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_0:
                mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_1:
                mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q8_0:
                mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q2_K:
                mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q3_K:
                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_K:
                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
                    reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                } else {
                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
                    mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                }
                break;
            case GGML_TYPE_Q5_K:
                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q6_K:
                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n");
                    reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                } else {
                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n");
                    mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                }
                break;
            case GGML_TYPE_IQ1_S:
                mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ1_M:
                mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XXS:
                mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XS:
                mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_S:
                mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_XXS:
                mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_S:
                mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_NL:
                mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_XS:
                mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            default:
                GGML_ABORT("fatal error");
        }
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
    GGML_UNUSED(ctx);
}