mmvq.cpp

#include "mmvq.hpp"
#include "vecdotq.hpp"

#include <cassert>
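
// Generic quantized mat-vec kernel: each sub-group of WARP_SIZE work-items accumulates one row of
// dst = x * y, where x is a matrix of quantized blocks (block_q_t) and y is the q8_1-quantized
// vector. Following the usual ggml conventions, qk is the number of weights per quant block, qi
// the number of 32-bit quant values per block, vdr (roughly) how many of those values one
// vec_dot_q_sycl call consumes, and vec_dot_q_sycl the per-block dot product. Per-lane partial
// sums are combined with XOR sub-group shuffles and lane 0 writes the result.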
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
                          const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row = ncols / qk;
    constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // Ensuring blocks_per_warp > 0
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
            // x block quant index when casting the quants to int
            const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr));

            tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
        }
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
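
// The kernels below are per-type copies of mul_mat_vec_q for the IQ formats: same
// row-per-sub-group layout and sub-group reduction, but each calls its vec_dot_iq*_q8_1 routine
// directly and forwards the codebook/sign lookup tables (iq2xxs_grid, ksigns_iq2xs, ksigns64, ...)
// that the routine needs.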
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                       float * __restrict__ dst, const int ncols, const int nrows,
                                       const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq2_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                       float * __restrict__ dst, const int ncols, const int nrows,
                                       const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_m_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                     float * __restrict__ dst, const int ncols, const int nrows,
                                     const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_nl_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}

template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq4_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy,
                                      float * __restrict__ dst, const int ncols, const int nrows,
                                      const sycl::nd_item<3> & item_ct1) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);

    if (row >= nrows) {
        return;
    }

    const int blocks_per_row  = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
    assert(blocks_per_warp > 0);

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row * blocks_per_row + i; // x block index
        const int iby = i * (qk / QK8_1);         // y block index that aligns with ibx

        // x block quant index when casting the quants to int
        const int iqs = vdr * (item_ct1.get_local_id(2) % (qi / vdr));

        tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }

    if (item_ct1.get_local_id(2) == 0) {
        dst[row] = tmp;
    }
}
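
// Host-side launchers. Each work-group covers GGML_SYCL_MMV_Y rows with WARP_SIZE work-items (one
// sub-group) per row, so the grid is ceil(nrows / GGML_SYCL_MMV_Y) work-groups along the innermost
// dimension. The launchers differ only in the divisibility assert and the kernel instantiation.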
static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q4_1_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q5_0_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q5_1_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK8_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_q2_K_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q3_K_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q4_K_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q5_K_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_q6_K_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                       const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
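
// Note: the IQ launchers below instantiate their kernels with a reduced qi argument (QI2_XXS/2,
// QI4_XS/4, ...) and vdr = 1 (vdr = 2 for iq4_nl); presumably this matches the granularity at
// which the corresponding vec_dot_iq* routines step through a block, but the choice is
// format-specific.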
static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                          const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_iq2_xs_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                         const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_iq2_s_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                          const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_iq3_s_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
static void mul_mat_vec_iq1_s_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_iq1_m_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                        const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_iq4_nl_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                         const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_NL == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}

static void mul_mat_vec_iq4_xs_q8_1_sycl(const void * vx, const void * vy, float * dst,
                                         const int ncols, const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        stream->submit([&](sycl::handler & cgh) {
            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                        vx, vy, dst, ncols, nrows, item_ct1);
                });
        });
    }
}
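
// Backend entry point: for each of the src1_ncols columns of the already q8_1-quantized src1,
// dispatch on src0->type to the matching launcher and compute row_high - row_low rows of the
// corresponding dst column.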
void ggml_sycl_op_mul_mat_vec_q(
        ggml_backend_sycl_context & ctx,
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i,
        float * dst_dd_i, const int64_t row_low, const int64_t row_high,
        const int64_t src1_ncols, const int64_t src1_padded_col_size,
        const dpct::queue_ptr & stream) {

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne00     = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    int id;
    SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id()));

    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    for (int i = 0; i < src1_ncols; i++) {
        const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
        const char * src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
        float *      dst_dd_i_bs   = dst_dd_i + i * dst->ne[0];

        switch (src0->type) {
            case GGML_TYPE_Q4_0:
                mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_1:
                mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_0:
                mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_1:
                mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q8_0:
                mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q2_K:
                mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q3_K:
                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_K:
                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q5_K:
                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q6_K:
                mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ1_S:
                mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ1_M:
                mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XXS:
                mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_XS:
                mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ2_S:
                mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_XXS:
                mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ3_S:
                mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_NL:
                mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_IQ4_XS:
                mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            default:
                GGML_ABORT("fatal error");
        }
    }

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
    GGML_UNUSED(ctx);
}