// count-equal.cpp (2.5 KB)

  1. #include "count-equal.hpp"
  2. #include <cstdint>
  3. template <typename T>
  4. static void count_equal(const T *__restrict__ x, const T *__restrict__ y,
  5. int64_t *__restrict__ dst, const int64_t dk,
  6. const int64_t k) {
  7. auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
  8. const int64_t i0 = (int64_t)item_ct1.get_group(2) * dk;
  9. const int64_t i1 = sycl::min(i0 + dk, k);
  10. int nequal = 0;
  11. for (int64_t i = i0 + item_ct1.get_local_id(2); i < i1; i += WARP_SIZE) {
  12. const T xi = x[i];
  13. const T yi = y[i];
  14. nequal += xi == yi;
  15. }
  16. nequal = warp_reduce_sum(nequal);
  17. if (item_ct1.get_local_id(2) != 0) {
  18. return;
  19. }
  20. dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
  21. (int *)dst, nequal);
  22. }
  23. void ggml_sycl_count_equal(ggml_backend_sycl_context &ctx, ggml_tensor *dst) {
  24. scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
  25. const ggml_tensor * src0 = dst->src[0];
  26. const ggml_tensor * src1 = dst->src[1];
  27. GGML_ASSERT(src0->type == src1->type);
  28. GGML_ASSERT( dst->type == GGML_TYPE_I64);
  29. GGML_ASSERT(ggml_are_same_shape(src0, src1));
  30. GGML_ASSERT(ggml_is_contiguous(src0));
  31. GGML_ASSERT(ggml_is_contiguous(src1));
  32. GGML_ASSERT(ggml_is_contiguous(dst));
  33. int64_t * dst_d = (int64_t *) dst->data;
  34. dpct::queue_ptr stream = ctx.stream();
  35. const int id = get_current_device_id();
  36. const int nsm = ggml_sycl_info().devices[id].nsm;
  37. const int64_t ne = ggml_nelements(src0);
  38. GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
  39. const int64_t dne =
  40. GGML_PAD((ne + 4 * nsm - 1) / (4 * nsm), SYCL_COUNT_EQUAL_CHUNK_SIZE);
  41. SYCL_CHECK(CHECK_TRY_ERROR(stream->memset(dst_d, 0, ggml_nbytes(dst))));
  42. const dpct::dim3 block_dims(WARP_SIZE, 1, 1);
  43. const dpct::dim3 block_nums(
  44. std::min((int64_t)4 * nsm, (ne + SYCL_COUNT_EQUAL_CHUNK_SIZE - 1) /
  45. SYCL_COUNT_EQUAL_CHUNK_SIZE),
  46. 1, 1);
  47. switch (src0->type) {
  48. case GGML_TYPE_I32: {
  49. const int *src0_d = (const int *)src0->data;
  50. const int *src1_d = (const int *)src1->data;
  51. stream->parallel_for(
  52. sycl::nd_range<3>(block_nums * block_dims, block_dims),
  53. [=](sycl::nd_item<3> item_ct1) {
  54. count_equal(src0_d, src1_d, dst_d, dne, ne);
  55. GGML_UNUSED(item_ct1);
  56. });
  57. } break;
  58. default:
  59. GGML_ASSERT(false);
  60. break;
  61. }
  62. }