// mul_mat_vec_p021.comp

  #version 450

  // [[unroll]] on the reduction loop in main() comes from this extension.
  #extension GL_EXT_control_flow_attributes : enable
  // NOTE(review): presumably required because A_TYPE / B_TYPE / D_TYPE may be
  // 16-bit storage types; those macros are not defined in this file — confirm
  // against the build that supplies them.
  #extension GL_EXT_shader_16bit_storage : require

  // Number of invocations per workgroup; also the length of the shared
  // reduction array and the column stride of the accumulation loop.
  #define BLOCK_SIZE 32
  // Type used for the accumulation and for the shared partial sums.
  #define FLOAT_TYPE float

  // One-dimensional workgroup: BLOCK_SIZE invocations along x.
  layout(
      local_size_x = BLOCK_SIZE,
      local_size_y = 1,
      local_size_z = 1
  ) in;
  // Storage buffers used by main(). A_TYPE, B_TYPE and D_TYPE are not defined
  // in this file — presumably injected as macros at shader-compile time
  // (TODO confirm).

  // Input x (read-only).
  layout(binding = 0) readonly buffer A {
      A_TYPE data_a[];
  };

  // Input y (read-only).
  layout(binding = 1) readonly buffer B {
      B_TYPE data_b[];
  };

  // Output (write-only).
  layout(binding = 2) writeonly buffer D {
      D_TYPE dst[];
  };
  // Push constants describing the problem size. Member order is part of the
  // host/shader contract and must not be changed.
  layout(push_constant) uniform parameter
  {
      uint ncols_x;      // length of each dot product; column stride of x and y
      uint nrows_x;      // per-channel stride of dst
      uint nchannels_x;  // channel count of x (used in the x index and channel mapping)
      uint nchannels_y;  // channel count of y (used in the channel mapping)
      uint b_offset;     // NOTE(review): not read anywhere in this file — confirm intent
      uint d_offset;     // NOTE(review): not read anywhere in this file — confirm intent
  } p;
  // Workgroup-shared scratch: one partial sum per invocation, reduced in main().
  shared FLOAT_TYPE tmp[BLOCK_SIZE];
  // Compute-shader entry point.
  //
  // Each invocation accumulates a partial dot product over the columns
  // lane, lane + BLOCK_SIZE, lane + 2*BLOCK_SIZE, ... of one x row and one
  // y channel. The BLOCK_SIZE partials of the workgroup are then summed through
  // the shared array tmp, and invocation 0 stores the single result in dst.
  //
  // NOTE(review): p.b_offset and p.d_offset are declared in the push-constant
  // block but are never read here — confirm whether the host expects them to be
  // added to the data_b / dst indices.
  // NOTE(review): the channel mapping divides by (nchannels_y / nchannels_x);
  // this presumably assumes nchannels_y is a non-zero multiple of nchannels_x —
  // verify against the caller.
  // NOTE(review): row is not checked against p.nrows_x; presumably the dispatch
  // dimensions match the matrix exactly — verify against the caller.
  void main() {
      const uint lane = gl_LocalInvocationID.x;   // index inside the workgroup
      const uint row  = gl_GlobalInvocationID.y;  // row of x, also the row of dst
      const uint chan = gl_GlobalInvocationID.z;  // channel of y and of dst

      // Map the y/dst channel onto the x channel it reads from.
      const uint chans_per_x = p.nchannels_y / p.nchannels_x;
      const uint chan_x      = chan / chans_per_x;

      // x is transposed and permuted: indexed as (row, channel_x, col).
      const uint a_base = row*p.nchannels_x*p.ncols_x + chan_x*p.ncols_x;
      // y is not transposed but permuted: ncols_x consecutive values per channel.
      const uint b_base = chan*p.ncols_x;

      // Accumulate this invocation's share of the dot product.
      FLOAT_TYPE partial = FLOAT_TYPE(0.0f);
      for (uint tile = 0; tile < p.ncols_x; tile += BLOCK_SIZE) {
          const uint col = tile + lane;
          if (col < p.ncols_x) {
              const FLOAT_TYPE a_val = FLOAT_TYPE(data_a[a_base + col]);
              partial += a_val * FLOAT_TYPE(data_b[b_base + col]);
          } else {
              // Past the last column: nothing left for this invocation.
              break;
          }
      }
      tmp[lane] = partial;

      // Wait until every invocation has stored its partial sum.
      barrier();

      // Sum the partials by halving the active range each step.
      [[unroll]] for (uint stride = BLOCK_SIZE / 2; stride != 0; stride >>= 1) {
          if (lane < stride) {
              tmp[lane] += tmp[lane + stride];
          }
          // Every invocation reaches this barrier, active in this step or not.
          barrier();
      }

      if (lane == 0) {
          // dst is not transposed and not permuted: indexed as (channel, row).
          dst[chan*p.nrows_x + row] = tmp[0];
      }
  }
  55. }