// mul_mat_vec_nc.comp
  1. #version 450
  2. #extension GL_EXT_control_flow_attributes : enable
  3. #extension GL_EXT_shader_16bit_storage : require
  4. #define BLOCK_SIZE 32
  5. #define FLOAT_TYPE float
  6. layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
  7. layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
  8. layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
  9. layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
  10. layout (push_constant) uniform parameter
  11. {
  12. uint ncols_x;
  13. uint nrows_x;
  14. uint row_stride_x;
  15. uint channel_stride_x;
  16. uint channel_x_divisor;
  17. uint b_offset;
  18. uint d_offset;
  19. } p;
  20. shared FLOAT_TYPE tmp[BLOCK_SIZE];
  21. void main() {
  22. const uint tid = gl_LocalInvocationID.x;
  23. const uint row_x = gl_GlobalInvocationID.y;
  24. const uint channel = gl_GlobalInvocationID.z;
  25. const uint channel_x = channel / p.channel_x_divisor;
  26. const uint nrows_y = p.ncols_x;
  27. const uint nrows_dst = p.nrows_x;
  28. const uint row_dst = row_x;
  29. const uint idst = channel*nrows_dst + row_dst;
  30. tmp[tid] = 0.0f;
  31. for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
  32. const uint col_x = col_x0 + tid;
  33. if (col_x >= p.ncols_x) {
  34. break;
  35. }
  36. const uint row_y = col_x;
  37. const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
  38. const uint iy = channel*nrows_y + row_y;
  39. const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
  40. tmp[tid] += xi * FLOAT_TYPE(data_b[iy]);
  41. }
  42. // sum up partial sums and write back result
  43. barrier();
  44. [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
  45. if (tid < s) {
  46. tmp[tid] += tmp[tid + s];
  47. }
  48. barrier();
  49. }
  50. if (tid == 0) {
  51. dst[idst] = tmp[0];
  52. }
  53. }