argsort.comp 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. #version 450
  2. #include "types.comp"
  3. #define BLOCK_SIZE 1024
  4. #define ASC 0
  5. layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
  6. layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
  7. layout (binding = 1) buffer D {int data_d[];};
  8. layout (push_constant) uniform parameter {
  9. uint ncols;
  10. uint ncols_pad;
  11. uint order;
  12. } p;
  13. shared int dst_row[BLOCK_SIZE];
  14. void swap(uint idx0, uint idx1) {
  15. int tmp = dst_row[idx0];
  16. dst_row[idx0] = dst_row[idx1];
  17. dst_row[idx1] = tmp;
  18. }
  19. void main() {
  20. // bitonic sort
  21. const int col = int(gl_LocalInvocationID.x);
  22. const uint row = gl_WorkGroupID.y;
  23. if (col >= p.ncols_pad) {
  24. return;
  25. }
  26. const uint row_offset = row * p.ncols;
  27. // initialize indices
  28. dst_row[col] = col;
  29. barrier();
  30. for (uint k = 2; k <= p.ncols_pad; k *= 2) {
  31. for (uint j = k / 2; j > 0; j /= 2) {
  32. const uint ixj = col ^ j;
  33. if (ixj > col) {
  34. if ((col & k) == 0) {
  35. if (dst_row[col] >= p.ncols ||
  36. (dst_row[ixj] < p.ncols && (p.order == ASC ?
  37. data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]] :
  38. data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]]))
  39. ) {
  40. swap(col, ixj);
  41. }
  42. } else {
  43. if (dst_row[ixj] >= p.ncols ||
  44. (dst_row[col] < p.ncols && (p.order == ASC ?
  45. data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]] :
  46. data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]]))
  47. ) {
  48. swap(col, ixj);
  49. }
  50. }
  51. }
  52. barrier();
  53. }
  54. }
  55. if (col < p.ncols) {
  56. data_d[row_offset + col] = dst_row[col];
  57. }
  58. }