upscale.cu

#include "upscale.cuh"
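
// Nearest-neighbor upscaling of a contiguous F32 tensor by an integer scale_factor
// along dims 0 and 1: every destination element copies the source element at
// (i0 / scale_factor, i1 / scale_factor) within the same ne02/ne03 plane.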
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
    // blockIdx.z: idx of ne02*ne03
    // blockIdx.y: idx of ne01*scale_factor, aka ne1
    // blockIdx.x: idx of ne00*scale_factor / BLOCK_SIZE
    // ne00xne01: ne00 * ne01
    int ne0 = ne00 * scale_factor;
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    // operation: map each destination index back to its nearest-neighbor source index
    int i00 = nidx / scale_factor;
    int i01 = blockIdx.y / scale_factor;
    int offset_src =
        i00 +
        i01 * ne00 +
        blockIdx.z * ne00xne01;
    int offset_dst =
        nidx +
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;
    dst[offset_dst] = x[offset_src];
}
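
// The launcher below maps one thread to each destination element along dim 0
// (CUDA_UPSCALE_BLOCK_SIZE threads per block), one block row per upscaled output row
// (gridDim.y = ne01*scale_factor), and one grid plane per ne02*ne03 slice. A sketch of
// the equivalent loop nest, with ne0 = ne00*scale_factor and ne1 = ne01*scale_factor:
//
//   for (int i2 = 0; i2 < ne02*ne03; i2++)
//       for (int i1 = 0; i1 < ne1; i1++)
//           for (int i0 = 0; i0 < ne0; i0++)
//               dst[i0 + i1*ne0 + i2*ne0*ne1] =
//                   x[i0/scale_factor + (i1/scale_factor)*ne00 + i2*ne00*ne01];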
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
                             const int scale_factor, cudaStream_t stream) {
    int ne0 = (ne00 * scale_factor);
    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
}
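
// Backend entry point, presumably invoked by the CUDA backend's op dispatch for
// GGML_OP_UPSCALE: checks types and shapes, reads the integer scale factor from
// dst->op_params[0], and launches the kernel on the context's stream.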
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    const int scale_factor = dst->op_params[0];

    upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
}