op_cpy_f16_f32.comp 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  #version 450

  #include "common.comp"

  // Element-wise copy shader: reads IN_TYPE values from `in_` and writes them,
  // converted to OUT_TYPE, into `out_`.
  // The *_SIZE macros are the divisors main() uses to turn the summed nb*
  // offsets into element indices for each buffer.
  // NOTE(review): float16_t support is presumably enabled by common.comp — confirm.
  #define IN_TYPE       float16_t
  #define IN_TYPE_SIZE  2
  #define OUT_TYPE      float
  #define OUT_TYPE_SIZE 4

  // 1024 invocations per workgroup along x.
  layout(local_size_x = 1024) in;

  // Source (read-only) and destination (write-only) storage buffers.
  layout(binding = 0) readonly  buffer tensorIn  { IN_TYPE  in_[];  };
  layout(binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };

  // Push constants consumed by main(). Member order and types are part of the
  // host-visible layout and must not change.
  layout(push_constant) uniform parameter {
      uint inOff, outOff;           // added to the computed element index into in_ / out_
      int  ne00, ne01, ne02;        // source extents used to linearise the workgroup position
      uint nb00, nb01, nb02, nb03;  // source strides, one per source index (presumably bytes — confirm)
      int  ne0, ne1, ne2;           // destination extents used to split the linear index
      uint nb0, nb1, nb2, nb3;      // destination strides, one per destination index (presumably bytes — confirm)
  } pcs;
  // Copies one source row per workgroup from in_ to out_, converting each element
  // from IN_TYPE to OUT_TYPE.
  //
  // gl_WorkGroupID.(x, y, z) selects the source indices (i01, i02, i03). That
  // position is flattened to a linear element count using the source extents,
  // then split again using the destination extents to find where the row starts
  // in out_. The invocations of the workgroup then stride across the ne00
  // elements of the row.
  void main() {
      const uint i01 = gl_WorkGroupID.x;
      const uint i02 = gl_WorkGroupID.y;
      const uint i03 = gl_WorkGroupID.z;

      // Linear index of the first element of this source row (Horner form of
      // i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00).
      const int n = ((int(i03)*pcs.ne02 + int(i02))*pcs.ne01 + int(i01))*pcs.ne00;

      // Split n into destination indices (i3, i2, i1, i0), carrying the
      // remainder from one level to the next.
      const int dstPlane  = pcs.ne1*pcs.ne0;
      const int dstVolume = pcs.ne2*dstPlane;

      const int i3   = n / dstVolume;
      const int rem3 = n - i3*dstVolume;
      const int i2   = rem3 / dstPlane;
      const int rem2 = rem3 - i2*dstPlane;
      const int i1   = rem2 / pcs.ne0;
      const int i0   = rem2 - i1*pcs.ne0;

      // Start of the destination row, as an element index into out_.
      const uint dstOffset = i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0;
      const uint dstBase   = dstOffset / OUT_TYPE_SIZE + pcs.outOff;

      // Part of the source offset that is the same for every element of the row.
      const uint srcRowOffset = i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01;

      // Each invocation handles elements col, col + local size, ... of the row.
      for (uint col = gl_LocalInvocationID.x; col < pcs.ne00; col += gl_WorkGroupSize.x) {
          // Element index into in_; the division is applied to the full sum.
          const uint srcIdx = (srcRowOffset + col*pcs.nb00) / IN_TYPE_SIZE + pcs.inOff;
          out_[dstBase + col] = OUT_TYPE(in_[srcIdx]);
      }
  }