// generic_unary_head.comp (3.0 KB)
  // Both extensions are declared `require`, so compilation fails when the
  // implementation does not provide them.
  #extension GL_EXT_shader_16bit_storage : require
  // NOTE(review): no control-flow attribute is used in this part of the file;
  // presumably needed by the shaders that include it - confirm.
  #extension GL_EXT_control_flow_attributes : require
  // Push-constant block shared by the helpers below (instance name: p).
  // Member order is significant for the block layout and must not change.
  layout (push_constant) uniform parameter
  {
      // NOTE(review): presumably the total element count - not read by the
      // helpers in this file section; confirm against the host code.
      uint ne;

      // Source operand: per-dimension extents (ne0x) and strides (nb0x),
      // consumed by src0_idx() / src0_idx_quant().
      uint ne00, ne01, ne02, ne03;
      uint nb00, nb01, nb02, nb03;

      // Destination operand: per-dimension extents (ne1x) and strides (nb1x),
      // consumed by dst_idx() / dst_idx_quant().
      uint ne10, ne11, ne12, ne13;
      uint nb10, nb11, nb12, nb13;

      // Two packed 16-bit values: the upper half is returned by get_aoffset(),
      // the lower half by get_doffset().
      uint misalign_offsets;

      // Free-form float parameters; not referenced in this file section.
      float param1, param2;

      // (mp, L) pairs handed to fastdiv() while decomposing a flat index.
      // ne0_* pairs are used for the source, ne1_* pairs for the destination.
      uint ne0_012mp, ne0_012L;
      uint ne0_01mp, ne0_01L;
      uint ne0_0mp, ne0_0L;
      uint ne1_012mp, ne1_012L;
      uint ne1_01mp, ne1_01L;
      uint ne1_0mp, ne1_0L;
  } p;
  // Binding 0: read-only input array. A_TYPE is not defined in this file
  // section and must be supplied before this code is compiled.
  layout (binding = 0) readonly buffer A {
      A_TYPE data_a[];
  };

  // Binding 1: write-only output array. D_TYPE is likewise supplied externally.
  layout (binding = 1) writeonly buffer D {
      D_TYPE data_d[];
  };
  // Flattens the 3D global invocation ID into a single linear index:
  // x is used as-is, y is scaled by 512 and z by 262144 (= 512 * 512).
  uint get_idx() {
      const uvec3 gid = gl_GlobalInvocationID;
      return gid.x + gid.y * 512 + gid.z * 262144;
  }
  // Returns the upper 16 bits of p.misalign_offsets.
  uint get_aoffset() {
      const uint offsets = p.misalign_offsets;
      return offsets >> 16;
  }
  // Returns the lower 16 bits of p.misalign_offsets.
  uint get_doffset() {
      const uint offsets = p.misalign_offsets;
      return offsets & 0xFFFF;
  }
  // Multiply-and-shift helper: returns (mulhi(n, mp) + n) >> L, where mulhi is
  // the upper 32 bits of the 64-bit product n * mp.
  // The (mp, L) pair is precomputed elsewhere; see init_fastdiv_values in
  // ggml-vulkan.cpp.
  // NOTE(review): presumably this equals n / d for the divisor d that (mp, L)
  // was derived from - confirm against the host code.
  uint fastdiv(uint n, uint mp, uint L) {
      uint hi;
      uint lo;
      // hi receives the most significant 32 bits of n * mp; lo is not used.
      umulExtended(n, mp, hi, lo);
      const uint biased = hi + n;
      return biased >> L;
  }
  // Converts a flat index into a strided source offset.
  // The index is split into four coordinates (dim 3 down to dim 0) using the
  // p.ne0* extents and fastdiv pairs, then each coordinate is multiplied by
  // its p.nb0* stride and the products are summed.
  uint src0_idx(uint idx) {
      // Number of flat elements covered by one step along dim 2 and dim 3.
      const uint span2 = p.ne01 * p.ne00;
      const uint span3 = p.ne02 * span2;

      // Remainder still to be decomposed after each coordinate is peeled off.
      uint rest = idx;

      const uint c3 = fastdiv(rest, p.ne0_012mp, p.ne0_012L);
      rest -= c3 * span3;

      const uint c2 = fastdiv(rest, p.ne0_01mp, p.ne0_01L);
      rest -= c2 * span2;

      const uint c1 = fastdiv(rest, p.ne0_0mp, p.ne0_0L);
      const uint c0 = rest - c1 * p.ne00;

      return c0 * p.nb00 + c1 * p.nb01 + c2 * p.nb02 + c3 * p.nb03;
  }
  // Converts a flat index into a strided destination offset.
  // Same decomposition as the source variant, but driven by the p.ne1*
  // extents / fastdiv pairs and the p.nb1* strides.
  uint dst_idx(uint idx) {
      // Number of flat elements covered by one step along dim 2 and dim 3.
      const uint span2 = p.ne11 * p.ne10;
      const uint span3 = p.ne12 * span2;

      // Remainder still to be decomposed after each coordinate is peeled off.
      uint rest = idx;

      const uint c3 = fastdiv(rest, p.ne1_012mp, p.ne1_012L);
      rest -= c3 * span3;

      const uint c2 = fastdiv(rest, p.ne1_01mp, p.ne1_01L);
      rest -= c2 * span2;

      const uint c1 = fastdiv(rest, p.ne1_0mp, p.ne1_0L);
      const uint c0 = rest - c1 * p.ne10;

      return c0 * p.nb10 + c1 * p.nb11 + c2 * p.nb12 + c3 * p.nb13;
  }
  // Variant of the source offset computation in which the innermost
  // coordinate is integer-divided by qk before its stride is applied.
  // NOTE(review): qk is presumably the number of elements per quantization
  // block - confirm against the callers.
  uint src0_idx_quant(uint idx, uint qk) {
      // Number of flat elements covered by one step along dim 2 and dim 3.
      const uint span2 = p.ne01 * p.ne00;
      const uint span3 = p.ne02 * span2;

      // Remainder still to be decomposed after each coordinate is peeled off.
      uint rest = idx;

      const uint c3 = fastdiv(rest, p.ne0_012mp, p.ne0_012L);
      rest -= c3 * span3;

      const uint c2 = fastdiv(rest, p.ne0_01mp, p.ne0_01L);
      rest -= c2 * span2;

      const uint c1 = fastdiv(rest, p.ne0_0mp, p.ne0_0L);
      const uint c0 = rest - c1 * p.ne00;

      const uint block0 = c0 / qk;
      return block0 * p.nb00 + c1 * p.nb01 + c2 * p.nb02 + c3 * p.nb03;
  }
  // Variant of the destination offset computation in which the innermost
  // coordinate is integer-divided by qk before its stride is applied.
  // NOTE(review): qk is presumably the number of elements per quantization
  // block - confirm against the callers.
  uint dst_idx_quant(uint idx, uint qk) {
      // Number of flat elements covered by one step along dim 2 and dim 3.
      const uint span2 = p.ne11 * p.ne10;
      const uint span3 = p.ne12 * span2;

      // Remainder still to be decomposed after each coordinate is peeled off.
      uint rest = idx;

      const uint c3 = fastdiv(rest, p.ne1_012mp, p.ne1_012L);
      rest -= c3 * span3;

      const uint c2 = fastdiv(rest, p.ne1_01mp, p.ne1_01L);
      rest -= c2 * span2;

      const uint c1 = fastdiv(rest, p.ne1_0mp, p.ne1_0L);
      const uint c0 = rest - c1 * p.ne10;

      const uint block0 = c0 / qk;
      return block0 * p.nb10 + c1 * p.nb11 + c2 * p.nb12 + c3 * p.nb13;
  }