| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- #include "pool2d.cuh"
// 2D pooling (average or max) over the two innermost dimensions of a tensor
// that is addressed as densely packed planes of ih*iw (input) and oh*ow
// (output) elements.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used), one
// thread per output element; threads whose index is >= parallel_elements exit
// immediately. No shared memory is used.
//
//   ih, iw             input plane height / width
//   oh, ow             output plane height / width
//   kh, kw             pooling window height / width
//   sh, sw             stride along height / width
//   ph, pw             padding along height / width
//   parallel_elements  total number of output elements (planes * oh * ow)
//   src, dst           input / output buffers
//   op                 GGML_OP_POOL_AVG or GGML_OP_POOL_MAX
template <typename Ti, typename To>
static __global__ void pool2d_nchw_kernel(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const Ti* src, To* dst, const enum ggml_op_pool op) {
    // Build the global index in 64 bits: the grid is rounded up to a whole
    // block, so a 32-bit index could wrap negative and slip past the guard
    // when parallel_elements is close to INT_MAX.
    const int64_t gid = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= parallel_elements) {
        return;
    }
    const int idx = (int) gid; // safe: gid < parallel_elements, which is an int

    // The input plane size and the plane offset are kept in 64 bits because
    // the input may hold more than 2^31 elements even when the output does not.
    const int64_t I_HW = (int64_t) ih * iw;
    const int O_HW = oh * ow;
    const int nc = idx / O_HW;       // which (batch, channel) plane
    const int in_plane = idx % O_HW; // position inside the output plane
    const int cur_oh = in_plane / ow;
    const int cur_ow = in_plane % ow;
    const Ti* i_ptr = src + nc * I_HW;

    // Clip the pooling window to the valid input area; padded positions are
    // simply never read.
    const int start_h = cur_oh * sh - ph;
    const int bh = max(0, start_h);
    const int eh = min(ih, start_h + kh);
    const int start_w = cur_ow * sw - pw;
    const int bw = max(0, start_w);
    const int ew = min(iw, start_w + kw);

    // Averaging divides by the full window area (kh * kw), not by the number
    // of elements left after clipping.
    const To scale = 1. / (kh * kw);

    // Identity element of the reduction. If the clipped window is empty the
    // result stays at this value (0 for average, -FLT_MAX for max).
    To res = 0;
    switch (op) {
        case GGML_OP_POOL_AVG: res = 0; break;
        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
        default: assert(false);
    }

    for (int i = bh; i < eh; i += 1) {
        const Ti* row = i_ptr + (int64_t) i * iw;
        for (int j = bw; j < ew; j += 1) {
#if __CUDA_ARCH__ >= 350
            Ti cur = __ldg(row + j);
#else
            Ti cur = row[j];
#endif
            switch (op) {
                case GGML_OP_POOL_AVG: res += cur * scale; break;
                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
                default: assert(false);
            }
        }
    }

    // nc * O_HW + cur_oh * ow + cur_ow == idx by construction.
    dst[idx] = res;
}
// Launches pool2d_nchw_kernel for float input and float output on `stream`.
//
// Uses a 1D grid of CUDA_POOL2D_BLOCK_SIZE-thread blocks with enough blocks to
// give every one of the `parallel_elements` output elements its own thread.
// The launch is asynchronous; no synchronization or error check is done here.
static void pool2d_nchw_kernel_f32_f32_cuda(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const float * src, float * dst, const enum ggml_op_pool op,
        cudaStream_t stream) {
    // Nothing to compute; a grid with zero blocks is an invalid launch
    // configuration, so do not launch at all.
    if (parallel_elements <= 0) {
        return;
    }

    // Ceil-division in 64 bits so that adding the rounding term cannot
    // overflow when parallel_elements is close to INT_MAX.
    const int num_blocks = (int) (((int64_t) parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE);
    dim3 block_nums(num_blocks);
    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
}
// Runs the 2D pooling operation described by `dst` on the stream of `ctx`.
//
// The input is dst->src[0]; both tensors must be F32. The parameters are read
// from dst->op_params as int32 values in this order:
//   [0] pooling op, [1] k0, [2] k1, [3] s0, [4] s1, [5] p0, [6] p1
// where index 0 refers to dimension ne[0] (width) and index 1 to ne[1]
// (height), as fixed by how they are forwarded to the kernel launcher below.
//
// NOTE(review): the kernel addresses both tensors as densely packed planes;
// nothing here checks contiguity — confirm that callers guarantee it.
void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int32_t * opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int k1 = opts[2];
    const int s0 = opts[3];
    const int s1 = opts[4];
    const int p0 = opts[5];
    const int p1 = opts[6];

    const int64_t IH = src0->ne[1];
    const int64_t IW = src0->ne[0];

    const int64_t N = dst->ne[3];
    const int64_t OC = dst->ne[2];
    const int64_t OH = dst->ne[1];
    const int64_t OW = dst->ne[0];

    // The kernel indexes output elements with an int, so refuse sizes that
    // would be silently truncated by the narrowing below.
    const int64_t total_elements = N * OC * OH * OW;
    GGML_ASSERT(total_elements <= INT32_MAX);
    const int parallel_elements = (int) total_elements;

    // The launcher takes height before width, hence the (k1, k0) ordering.
    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
}
|