// ggml-metal.metal

#include <metal_stdlib>

using namespace metal;

#define MAX(x, y) ((x) > (y) ? (x) : (y))

#define QK4_0 32
#define QR4_0 2

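// one block of Q4_0 data: QK4_0 (32) weights share a single fp16 scale d,
// with the quantized values packed as 4-bit nibbles, two per byte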
typedef struct {
    half    d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
} block_q4_0;

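// dequantize k Q4_0 values from x into y: each nibble (0..15) is shifted to the
// signed range [-8, 7] and multiplied by the block scale d; low nibbles fill the
// first half of a block, high nibbles the second half.
// e.g. for byte 0x3A with scale d: low nibble 0xA -> (10 - 8)*d, high nibble 0x3 -> (3 - 8)*d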
static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, int k) {
    const int qk = QK4_0;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
        const half d = x[i].d;

        for (int j = 0; j < qk/2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F) - 8;
            const int x1 = (x[i].qs[j] >>   4) - 8;

            y[i*qk + j + 0   ] = x0*d;
            y[i*qk + j + qk/2] = x1*d;
        }
    }
}

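// element-wise addition: dst[i] = src0[i] + src1[i], one element per thread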
kernel void kernel_add(
        device const float * src0,
        device const float * src1,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] + src1[tpig];
}

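// element-wise multiplication: dst[i] = src0[i] * src1[i], one element per thread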
kernel void kernel_mul(
        device const float * src0,
        device const float * src1,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * src1[tpig];
}

// assumption: src1 is a single row of ne00 elements
// broadcast src1 across the rows of src0: dst = src0 * src1
kernel void kernel_mul_row(
        device const float * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * src1[tpig % ne00];
}

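// uniform scaling: dst[i] = src0[i] * scale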
kernel void kernel_scale(
        device const float * src0,
        device       float * dst,
        constant     float & scale,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * scale;
}

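// SiLU activation: silu(x) = x * sigmoid(x) = x / (1 + exp(-x))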
kernel void kernel_silu(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    float x = src0[tpig];
    dst[tpig] = x / (1.0f + exp(-x));
}

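// ReLU activation: dst[i] = max(0, src0[i])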
kernel void kernel_relu(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = max(0.0f, src0[tpig]);
}

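// softmax over the innermost dimension (ne00) of each row:
//   softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
// one threadgroup per row; the max and the sum are computed with parallel reductions
// in threadgroup memory (subtracting the max keeps exp() from overflowing)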
kernel void kernel_soft_max(
        device const float * src0,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        threadgroup float  * buf [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i03 = tgpig[2];
    const int64_t i02 = tgpig[1];
    const int64_t i01 = tgpig[0];

    device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
    device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // parallel max
    buf[tpitg[0]] = -INFINITY;
    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
        buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]);
    }

    // reduce
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg[0]/2; i > 0; i /= 2) {
        if (tpitg[0] < i) {
            buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    // broadcast
    if (tpitg[0] == 0) {
        buf[0] = buf[0];
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);

    const float max = buf[0];

    // parallel sum
    buf[tpitg[0]] = 0.0f;
    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
        buf[tpitg[0]] += exp(psrc0[i00] - max);
    }

    // reduce
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg[0]/2; i > 0; i /= 2) {
        if (tpitg[0] < i) {
            buf[tpitg[0]] += buf[tpitg[0] + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    // broadcast
    if (tpitg[0] == 0) {
        buf[0] = buf[0];
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);

    const float sum = buf[0];

    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
        pdst[i00] = exp(psrc0[i00] - max) / sum;
    }
}

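// causal mask: set dst[i02][i01][i00] = -INF for i00 > n_past + i01 (positions the
// current token must not attend to), otherwise copy src0 unchanged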
kernel void kernel_diag_mask_inf(
        device const float * src0,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant       int & n_past,
        uint3 tpig[[thread_position_in_grid]]) {
    const int64_t i02 = tpig[2];
    const int64_t i01 = tpig[1];
    const int64_t i00 = tpig[0];

    if (i00 > n_past + i01) {
        dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
    } else {
        dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
    }
}

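// row gather (f16): for each index r = src1[i], copy row r of src0 into row i of dst,
// converting half -> float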
kernel void kernel_get_rows_f16(
        device const  void * src0,
        device const   int * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb1,
        uint tpig[[thread_position_in_grid]]) {
    const int i = tpig;
    const int r = ((device int32_t *) src1)[i];

    for (int j = 0; j < ne00; j++) {
        dst[i*nb1 + j] = ((device half *) ((device char *) src0 + r*nb01))[j];
    }
}

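// row gather (Q4_0): for each index r = src1[i], dequantize row r of src0 into row i of dst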
kernel void kernel_get_rows_q4_0(
        device const  void * src0,
        device const   int * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb1,
        uint tpig[[thread_position_in_grid]]) {
    const int i = tpig;
    const int r = ((device int32_t *) src1)[i];

    dequantize_row_q4_0(
            (device const block_q4_0 *) ((device char *) src0 + r*nb01),
                       (device float *) ((device char *)  dst + i*nb1), ne00);
}

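// RMS normalization of each row:
//   y = x / sqrt(mean(x^2) + eps)
// one threadgroup per row; mean(x^2) is computed with a parallel reduction in threadgroup memory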
kernel void kernel_rms_norm(
        device const  void * src0,
        device       float * dst,
        constant   int64_t & ne00,
        constant  uint64_t & nb01,
        constant     float & eps,
        threadgroup float  * sum [[threadgroup(0)]],
        uint tgpig[[threadgroup_position_in_grid]],
        uint tpitg[[thread_position_in_threadgroup]],
        uint   ntg[[threads_per_threadgroup]]) {
    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);

    // parallel sum
    sum[tpitg] = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        sum[tpitg] += x[i00] * x[i00];
    }

    // reduce
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg/2; i > 0; i /= 2) {
        if (tpitg < i) {
            sum[tpitg] += sum[tpitg + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    // broadcast
    if (tpitg == 0) {
        sum[0] /= ne00;
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);

    const float mean  = sum[0];
    const float scale = 1.0f/sqrt(mean + eps);

    device float * y = dst + tgpig*ne00;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        y[i00] = x[i00] * scale;
    }
}

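// matrix multiplication with Q4_0 weights and f32 activations:
// each threadgroup computes one output element dst[r1*ne0 + r0] as the dot product of
// row r0 of src0 (dequantized on the fly, block by block) with row r1 of src1;
// per-thread partial sums are reduced through threadgroup memory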
kernel void kernel_mul_mat_q4_0_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        threadgroup float  * sum [[threadgroup(0)]],
        uint2 tgpig[[threadgroup_position_in_grid]],
        uint2  tpig[[thread_position_in_grid]],
        uint2 tpitg[[thread_position_in_threadgroup]],
        uint2  tptg[[threads_per_threadgroup]]) {
    const int nb = ne00/QK4_0;

    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
    device const float      * y = (device const float      *) src1 + r1*ne10;

    const uint nth = tptg.x*tptg.y;
    const uint ith = tptg.y*tpitg.x + tpitg.y;

    sum[ith] = 0.0f;

    for (int i = tpitg.x; i < nb; i += tptg.x) {
        device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs;
        device const float4 * y0p = (device const float4 *) (y + i*QK4_0);

        const float d = (float)((x + i)->d);

        const uchar4 x0v = *(x0p + tpitg.y);

        const float4 y0v = *(y0p + tpitg.y + 0);
        const float4 y1v = *(y0p + tpitg.y + 4);

        float acc = 0.0f;

        for (int j = 0; j < 4; ++j) {
            const int x0 = x0v[j] & 0x0F;
            const int x1 = x0v[j] >>   4;

            const float y0 = y0v[j];
            const float y1 = y1v[j];

            acc += (x0 - 8)*y0 + (x1 - 8)*y1;
        }

        sum[ith] += acc*d;
    }

    // accumulate the sum from all threads in the threadgroup
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = nth/2; i > 0; i /= 2) {
        if (ith < i) {
            sum[ith] += sum[ith + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    if (ith == 0) {
        dst[r1*ne0 + r0] = sum[0];
    }
}

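// matrix multiplication with f16 weights and f32 activations:
// each threadgroup computes one output element as the dot product of row r0 of src0
// with row r1 of src1 (batch index im), reduced through threadgroup memory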
kernel void kernel_mul_mat_f16_f32(
        device const  char * src0,
        device const  char * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        threadgroup float  * sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3  tpig[[thread_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3  tptg[[threads_per_threadgroup]]) {
    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;
    const int64_t im = tgpig.z;

    device const half  * x = (device const half  *) (src0 + r0*nb01 + im*nb02);
    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

    sum[tpitg.x] = 0.0f;

    for (int i = tpitg.x; i < ne00; i += tptg.x) {
        sum[tpitg.x] += (float) x[i] * (float) y[i];
    }

    // accumulate the sum from all threads in the threadgroup
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = tptg.x/2; i > 0; i /= 2) {
        if (tpitg.x < i) {
            sum[tpitg.x] += sum[tpitg.x + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    if (tpitg.x == 0) {
        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
    }
}

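// rotary position embedding (RoPE): rotate consecutive element pairs (x0, x1) by a
// position-dependent angle theta,
//   (x0, x1) -> (x0*cos(theta) - x1*sin(theta), x0*sin(theta) + x1*cos(theta)),
// where theta starts at the token position and is scaled by theta_scale for each pair;
// only the non-NeoX layout is handled, the NeoX path is still a TODO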
kernel void kernel_rope(
        device const  void * src0,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
        constant   int64_t & ne3,
        constant  uint64_t & nb0,
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
        constant       int & n_past,
        constant       int & n_dims,
        constant       int & mode,
        uint3 tpig[[thread_position_in_grid]]) {
    const int64_t i3 = tpig[2];
    const int64_t i2 = tpig[1];
    const int64_t i1 = tpig[0];

    const bool is_neox = mode & 2;
    const float theta_scale = pow(10000.0, -2.0f/n_dims);

    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

    float theta = (float)p;

    if (!is_neox) {
        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);

            theta *= theta_scale;

            device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

            const float x0 = src[0];
            const float x1 = src[1];

            dst_data[0] = x0*cos_theta - x1*sin_theta;
            dst_data[1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
        // TODO: implement
    }
}

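// copy/convert f32 -> f16: one threadgroup per source row; the row's flat element
// offset is re-expressed in the destination's ne0..ne3 coordinates, so source and
// destination may have different shapes/strides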
kernel void kernel_cpy_f32_f16(
        device const float * src0,
        device        half * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
        constant   int64_t & ne3,
        constant  uint64_t & nb0,
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i03 = tgpig[2];
    const int64_t i02 = tgpig[1];
    const int64_t i01 = tgpig[0];

    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    const int64_t i3 = n / (ne2*ne1*ne0);
    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);

    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
        dst_data[i00] = src[0];
    }
}

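// copy f32 -> f32: same indexing scheme as kernel_cpy_f32_f16, without the type conversion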
kernel void kernel_cpy_f32_f32(
        device const float * src0,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
        constant   int64_t & ne3,
        constant  uint64_t & nb0,
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i03 = tgpig[2];
    const int64_t i02 = tgpig[1];
    const int64_t i01 = tgpig[0];

    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    const int64_t i3 = n / (ne2*ne1*ne0);
    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);

    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
        dst_data[i00] = src[0];
    }
}