// ggml-metal.metal — Metal compute kernels for ggml (add/mul, activations,
// softmax, norms, quantized matrix multiplication, RoPE, copies).
  1. #include <metal_stdlib>
  2. using namespace metal;
  3. #define MAX(x, y) ((x) > (y) ? (x) : (y))
  4. #define QK4_0 32
  5. #define QR4_0 2
  6. typedef struct {
  7. half d; // delta
  8. uint8_t qs[QK4_0 / 2]; // nibbles / quants
  9. } block_q4_0;
  10. static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, int k) {
  11. const int qk = QK4_0;
  12. assert(k % qk == 0);
  13. const int nb = k / qk;
  14. for (int i = 0; i < nb; i++) {
  15. const half d = x[i].d;
  16. for (int j = 0; j < qk/2; ++j) {
  17. const int x0 = (x[i].qs[j] & 0x0F) - 8;
  18. const int x1 = (x[i].qs[j] >> 4) - 8;
  19. y[i*qk + j + 0 ] = x0*d;
  20. y[i*qk + j + qk/2] = x1*d;
  21. }
  22. }
  23. }
  24. kernel void kernel_add(
  25. device const float * src0,
  26. device const float * src1,
  27. device float * dst,
  28. uint tpig[[thread_position_in_grid]]) {
  29. dst[tpig] = src0[tpig] + src1[tpig];
  30. }
  31. kernel void kernel_mul(
  32. device const float * src0,
  33. device const float * src1,
  34. device float * dst,
  35. uint tpig[[thread_position_in_grid]]) {
  36. dst[tpig] = src0[tpig] * src1[tpig];
  37. }
  38. // assumption: src1 is a row
  39. // broadcast src1 into src0
  40. kernel void kernel_mul_row(
  41. device const float * src0,
  42. device const float * src1,
  43. device float * dst,
  44. constant int64_t & ne00,
  45. uint tpig[[thread_position_in_grid]]) {
  46. dst[tpig] = src0[tpig] * src1[tpig % ne00];
  47. }
  48. kernel void kernel_scale(
  49. device const float * src0,
  50. device float * dst,
  51. constant float & scale,
  52. uint tpig[[thread_position_in_grid]]) {
  53. dst[tpig] = src0[tpig] * scale;
  54. }
  55. kernel void kernel_silu(
  56. device const float * src0,
  57. device float * dst,
  58. uint tpig[[thread_position_in_grid]]) {
  59. float x = src0[tpig];
  60. dst[tpig] = x / (1.0f + exp(-x));
  61. }
  62. kernel void kernel_relu(
  63. device const float * src0,
  64. device float * dst,
  65. uint tpig[[thread_position_in_grid]]) {
  66. dst[tpig] = max(0.0f, src0[tpig]);
  67. }
  68. kernel void kernel_soft_max(
  69. device const float * src0,
  70. device float * dst,
  71. constant int64_t & ne00,
  72. constant int64_t & ne01,
  73. constant int64_t & ne02,
  74. threadgroup float * buf [[threadgroup(0)]],
  75. uint3 tgpig[[threadgroup_position_in_grid]],
  76. uint3 tpitg[[thread_position_in_threadgroup]],
  77. uint3 ntg[[threads_per_threadgroup]]) {
  78. const int64_t i03 = tgpig[2];
  79. const int64_t i02 = tgpig[1];
  80. const int64_t i01 = tgpig[0];
  81. device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
  82. device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
  83. // parallel max
  84. buf[tpitg[0]] = -INFINITY;
  85. for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
  86. buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]);
  87. }
  88. // reduce
  89. threadgroup_barrier(mem_flags::mem_threadgroup);
  90. for (uint i = ntg[0]/2; i > 0; i /= 2) {
  91. if (tpitg[0] < i) {
  92. buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
  93. }
  94. threadgroup_barrier(mem_flags::mem_threadgroup);
  95. }
  96. // broadcast
  97. if (tpitg[0] == 0) {
  98. buf[0] = buf[0];
  99. }
  100. threadgroup_barrier(mem_flags::mem_threadgroup);
  101. const float max = buf[0];
  102. // parallel sum
  103. buf[tpitg[0]] = 0.0f;
  104. for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
  105. buf[tpitg[0]] += exp(psrc0[i00] - max);
  106. }
  107. // reduce
  108. threadgroup_barrier(mem_flags::mem_threadgroup);
  109. for (uint i = ntg[0]/2; i > 0; i /= 2) {
  110. if (tpitg[0] < i) {
  111. buf[tpitg[0]] += buf[tpitg[0] + i];
  112. }
  113. threadgroup_barrier(mem_flags::mem_threadgroup);
  114. }
  115. // broadcast
  116. if (tpitg[0] == 0) {
  117. buf[0] = buf[0];
  118. }
  119. threadgroup_barrier(mem_flags::mem_threadgroup);
  120. const float sum = buf[0];
  121. for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
  122. pdst[i00] = exp(psrc0[i00] - max) / sum;
  123. }
  124. }
  125. kernel void kernel_diag_mask_inf(
  126. device const float * src0,
  127. device float * dst,
  128. constant int64_t & ne00,
  129. constant int64_t & ne01,
  130. constant int & n_past,
  131. uint3 tpig[[thread_position_in_grid]]) {
  132. const int64_t i02 = tpig[2];
  133. const int64_t i01 = tpig[1];
  134. const int64_t i00 = tpig[0];
  135. if (i00 > n_past + i01) {
  136. dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
  137. } else {
  138. dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
  139. }
  140. }
  141. kernel void kernel_get_rows_q4_0(
  142. device const void * src0,
  143. device const int * src1,
  144. device float * dst,
  145. constant int64_t & ne00,
  146. constant uint64_t & nb01,
  147. constant uint64_t & nb1,
  148. uint tpig[[thread_position_in_grid]]) {
  149. const int i = tpig;
  150. const int r = ((device int32_t *) src1)[i];
  151. dequantize_row_q4_0(
  152. (device const block_q4_0 *) ((device char *) src0 + r*nb01),
  153. (device float *) ((device char *) dst + i*nb1), ne00);
  154. }
// RMS normalization over rows of length ne00:
//   y = x / sqrt(mean(x^2) + eps)
// One threadgroup per row (tgpig selects the row); `sum` is threadgroup
// scratch of ntg floats. src0 rows are addressed by byte stride nb01;
// dst rows are contiguous (stride ne00 floats).
// NOTE(review): the tree reduction assumes ntg is a power of two —
// confirm against the host-side dispatch.
kernel void kernel_rms_norm(
        device const void * src0,
        device float * dst,
        constant int64_t & ne00,
        constant uint64_t & nb01,
        constant float & eps,
        threadgroup float * sum [[threadgroup(0)]],
        uint tgpig[[threadgroup_position_in_grid]],
        uint tpitg[[thread_position_in_threadgroup]],
        uint ntg[[threads_per_threadgroup]]) {
    // base of this threadgroup's input row (byte-stride addressing)
    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
    // parallel sum of squares: each thread accumulates a strided slice
    sum[tpitg] = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        sum[tpitg] += x[i00] * x[i00];
    }
    // tree-reduce the partial sums into sum[0]
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg/2; i > 0; i /= 2) {
        if (tpitg < i) {
            sum[tpitg] += sum[tpitg + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    // thread 0 converts the total into the mean of squares
    if (tpitg == 0) {
        sum[0] /= ne00;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    const float mean = sum[0];
    // reciprocal RMS with eps for numerical stability
    const float scale = 1.0f/sqrt(mean + eps);
    device float * y = dst + tgpig*ne00;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        y[i00] = x[i00] * scale;
    }
}
// Matrix-vector style product: dst[r1][r0] = dot(Q4_0 row r0 of src0,
// f32 row r1 of src1), one dst element per threadgroup (tgpig = (r0, r1)).
// Threads are tiled 2-D: tpitg.x strides over the ne00/QK4_0 quant blocks
// of the row, tpitg.y selects a 4-byte sub-chunk within a block.
// NOTE(review): the sub-chunk indexing (x0p + tpitg.y, y0p + tpitg.y + 4)
// only covers a 32-value block if tptg.y == 4 — confirm from the host
// launch. The tree reduction assumes nth = tptg.x*tptg.y is a power of two.
// `sum` is threadgroup scratch of nth floats.
kernel void kernel_mul_mat_q4_0_f32(
        device const void * src0,
        device const float * src1,
        device float * dst,
        constant int64_t & ne00,
        constant int64_t & ne01,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant int64_t & ne10,
        constant int64_t & ne11,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant int64_t & ne0,
        constant int64_t & ne1,
        threadgroup float * sum [[threadgroup(0)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
        uint2 tpig[[thread_position_in_grid]],
        uint2 tpitg[[thread_position_in_threadgroup]],
        uint2 tptg[[threads_per_threadgroup]]) {
    // number of Q4_0 blocks per row
    const int nb = ne00/QK4_0;
    const int64_t r0 = tgpig.x;  // output column / src0 row
    const int64_t r1 = tgpig.y;  // output row / src1 row
    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
    device const float * y = (device const float *) src1 + r1*ne10;
    const uint nth = tptg.x*tptg.y;          // threads in the group
    const uint ith = tptg.y*tpitg.x + tpitg.y;  // flattened thread index
    sum[ith] = 0.0f;
    // each x-lane strides over the quant blocks of the row
    for (int i = tpitg.x; i < nb; i += tptg.x) {
        // view the block's 16 quant bytes as four uchar4 chunks
        device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs;
        // the matching 32 floats of y as eight float4 chunks
        device const float4 * y0p = (device const float4 *) (y + i*QK4_0);
        const float d = (float)((x + i)->d);
        const uchar4 x0v = *(x0p + tpitg.y);
        // low nibbles pair with the first half of the block, high with the
        // second half (hence the +4 float4 offset)
        const float4 y0v = *(y0p + tpitg.y + 0);
        const float4 y1v = *(y0p + tpitg.y + 4);
        float acc = 0.0f;
        for (int j = 0; j < 4; ++j) {
            // re-center nibbles by -8 before scaling
            const int x0 = x0v[j] & 0x0F;
            const int x1 = x0v[j] >> 4;
            const float y0 = y0v[j];
            const float y1 = y1v[j];
            acc += (x0 - 8)*y0 + (x1 - 8)*y1;
        }
        // scale once per block instead of per element
        sum[ith] += acc*d;
    }
    // accumulate the sum from all threads in the threadgroup
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = nth/2; i > 0; i /= 2) {
        if (ith < i) {
            sum[ith] += sum[ith + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    if (ith == 0) {
        dst[r1*ne0 + r0] = sum[0];
    }
}
// Mixed-precision dot product: dst[im][r1][r0] = dot(half row r0 of src0,
// float row r1 of src1) for batch slice im. One threadgroup per output
// element (tgpig = (r0, r1, im)); src0/src1 rows addressed by byte strides
// nb01/nb02 and nb11/nb12. `sum` is threadgroup scratch of tptg.x floats.
// NOTE(review): the tree reduction assumes tptg.x is a power of two —
// confirm against the host-side dispatch.
kernel void kernel_mul_mat_f16_f32(
        device const char * src0,
        device const char * src1,
        device float * dst,
        constant int64_t & ne00,
        constant int64_t & ne01,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant int64_t & ne10,
        constant int64_t & ne11,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant int64_t & ne0,
        constant int64_t & ne1,
        threadgroup float * sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpig[[thread_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]]) {
    const int64_t r0 = tgpig.x;  // src0 row
    const int64_t r1 = tgpig.y;  // src1 row
    const int64_t im = tgpig.z;  // batch slice
    device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
    // each thread accumulates a strided slice of the dot product in fp32
    sum[tpitg.x] = 0.0f;
    for (int i = tpitg.x; i < ne00; i += tptg.x) {
        sum[tpitg.x] += (float) x[i] * (float) y[i];
    }
    // accumulate the sum from all threads in the threadgroup
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = tptg.x/2; i > 0; i /= 2) {
        if (tpitg.x < i) {
            sum[tpitg.x] += sum[tpitg.x + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    // thread 0 writes the finished dot product
    if (tpitg.x == 0) {
        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
    }
}
// Rotary position embedding (RoPE). One thread handles one (i1, i2, i3)
// position (tpig = (i1, i2, i3)) and rotates consecutive pairs of values
// along dim 0. mode bit 0 selects the position source (0: n_past + i2,
// 1: i2); mode bit 1 selects the neox variant, which is NOT implemented —
// the else branch silently does nothing (dst is left unwritten there).
kernel void kernel_rope(
        device const void * src0,
        device float * dst,
        constant int64_t & ne00,
        constant int64_t & ne01,
        constant int64_t & ne02,
        constant int64_t & ne03,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb03,
        constant int64_t & ne0,
        constant int64_t & ne1,
        constant int64_t & ne2,
        constant int64_t & ne3,
        constant uint64_t & nb0,
        constant uint64_t & nb1,
        constant uint64_t & nb2,
        constant uint64_t & nb3,
        constant int & n_past,
        constant int & n_dims,
        constant int & mode,
        uint3 tpig[[thread_position_in_grid]]) {
    const int64_t i3 = tpig[2];
    const int64_t i2 = tpig[1];
    const int64_t i1 = tpig[0];
    const bool is_neox = mode & 2;
    // per-pair frequency decay: theta_k = p * 10000^(-2k/n_dims)
    const float theta_scale = pow(10000.0, -2.0f/n_dims);
    // token position: absolute (n_past + i2) unless mode bit 0 is set
    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
    float theta = (float)p;
    if (!is_neox) {
        // rotate adjacent pairs (x0, x1) by angle theta, decaying per pair
        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);
            theta *= theta_scale;
            device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
            const float x0 = src[0];
            const float x1 = src[1];
            // standard 2-D rotation of the pair
            dst_data[0] = x0*cos_theta - x1*sin_theta;
            dst_data[1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
        // TODO: implement
    }
}
  337. kernel void kernel_cpy_f32_f16(
  338. device const float * src0,
  339. device half * dst,
  340. constant int64_t & ne00,
  341. constant int64_t & ne01,
  342. constant int64_t & ne02,
  343. constant int64_t & ne03,
  344. constant uint64_t & nb00,
  345. constant uint64_t & nb01,
  346. constant uint64_t & nb02,
  347. constant uint64_t & nb03,
  348. constant int64_t & ne0,
  349. constant int64_t & ne1,
  350. constant int64_t & ne2,
  351. constant int64_t & ne3,
  352. constant uint64_t & nb0,
  353. constant uint64_t & nb1,
  354. constant uint64_t & nb2,
  355. constant uint64_t & nb3,
  356. uint3 tgpig[[threadgroup_position_in_grid]],
  357. uint3 tpitg[[thread_position_in_threadgroup]],
  358. uint3 ntg[[threads_per_threadgroup]]) {
  359. const int64_t i03 = tgpig[2];
  360. const int64_t i02 = tgpig[1];
  361. const int64_t i01 = tgpig[0];
  362. const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
  363. const int64_t i3 = n / (ne2*ne1*ne0);
  364. const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
  365. const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
  366. const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
  367. device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
  368. for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
  369. device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
  370. dst_data[i00] = src[0];
  371. }
  372. }
  373. kernel void kernel_cpy_f32_f32(
  374. device const float * src0,
  375. device float * dst,
  376. constant int64_t & ne00,
  377. constant int64_t & ne01,
  378. constant int64_t & ne02,
  379. constant int64_t & ne03,
  380. constant uint64_t & nb00,
  381. constant uint64_t & nb01,
  382. constant uint64_t & nb02,
  383. constant uint64_t & nb03,
  384. constant int64_t & ne0,
  385. constant int64_t & ne1,
  386. constant int64_t & ne2,
  387. constant int64_t & ne3,
  388. constant uint64_t & nb0,
  389. constant uint64_t & nb1,
  390. constant uint64_t & nb2,
  391. constant uint64_t & nb3,
  392. uint3 tgpig[[threadgroup_position_in_grid]],
  393. uint3 tpitg[[thread_position_in_threadgroup]],
  394. uint3 ntg[[threads_per_threadgroup]]) {
  395. const int64_t i03 = tgpig[2];
  396. const int64_t i02 = tgpig[1];
  397. const int64_t i01 = tgpig[0];
  398. const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
  399. const int64_t i3 = n / (ne2*ne1*ne0);
  400. const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
  401. const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
  402. const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
  403. device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
  404. for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
  405. device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
  406. dst_data[i00] = src[0];
  407. }
  408. }