common.hpp

//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_COMMON_HPP
#define GGML_SYCL_COMMON_HPP

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

#include "dpct/helper.hpp"
#include "ggml-sycl.h"
#include "presets.hpp"
#include "sycl_hw.hpp"

#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
#endif

#define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL
/* suppress warning spam */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wnested-anon-types"
#include "ggml-common.h"
#pragma clang diagnostic pop
#include "ggml-impl.h"

void * ggml_sycl_host_malloc(size_t size);
void   ggml_sycl_host_free(void * ptr);

extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;
extern int g_ggml_sycl_prioritize_dmmv;

#if defined(__clang__) && __has_builtin(__builtin_expect)
// Hint the optimizer to pipeline the more likely following instruction in branches
#    define LIKELY(expr)   __builtin_expect(expr, true)
#    define UNLIKELY(expr) __builtin_expect(expr, false)
#else
#    define LIKELY(expr)   (expr)
#    define UNLIKELY(expr) (expr)
#endif

#define GGML_SYCL_DEBUG(...)              \
    do {                                  \
        if (UNLIKELY(g_ggml_sycl_debug))  \
            fprintf(stderr, __VA_ARGS__); \
    } while (0)
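
// Usage sketch (illustrative, not taken from this header; `device` is a placeholder):
//   GGML_SYCL_DEBUG("[SYCL] %s: device=%d\n", __func__, device);
// The message is printed only when g_ggml_sycl_debug is non-zero, which is typically enabled
// through the GGML_SYCL_DEBUG environment variable at backend initialization.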

#define CHECK_TRY_ERROR(expr)                                                  \
    [&]() {                                                                    \
        try {                                                                  \
            expr;                                                              \
            return dpct::success;                                              \
        } catch (std::exception const& e) {                                    \
            std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
                      << ", line:" << __LINE__ << ", func:" << __func__        \
                      << std::endl;                                            \
            return dpct::default_error;                                        \
        }                                                                      \
    }()
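
// Note: CHECK_TRY_ERROR evaluates `expr` inside an immediately-invoked lambda and converts any
// thrown std::exception into dpct::default_error, so exception-throwing SYCL/dpct calls can be
// used in code paths that expect an error code instead (see SYCL_CHECK below).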

#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC  610                  // TODO: tune for hardware.
#define VER_GEN9  700                  // TODO: tune for hardware.
#define VER_GEN12 1000000              // TODO: tune for hardware.
#define VER_GEN13 (VER_GEN12 + 1030)   // TODO: tune for hardware.

#define GGML_SYCL_MAX_NODES 8192       // TODO: adapt to hardware

// define for XMX in Intel GPU
// TODO: XMX is not actually used yet.
#if !defined(GGML_SYCL_FORCE_MMQ)
    #define SYCL_USE_XMX
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X
#define GGML_SYCL_DMMV_X 32
#endif
#ifndef GGML_SYCL_MMV_Y
#define GGML_SYCL_MMV_Y 1
#endif

typedef sycl::queue *queue_ptr;

enum ggml_sycl_backend_gpu_mode {
    SYCL_UNSET_GPU_MODE = -1,
    SYCL_SINGLE_GPU_MODE = 0,
    SYCL_MUL_GPU_MODE
};

static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

static void crash() {
    int * ptr = NULL;
    *ptr = 0;
}

[[noreturn]] static void ggml_sycl_error(
    const char* stmt,
    const char* func,
    const char* file,
    const int line,
    const char* msg) {
    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
    GGML_ABORT("SYCL error");
}

#define SYCL_CHECK(err)                                                                                        \
    do {                                                                                                       \
        auto err_ = (err);                                                                                     \
        if (err_ != 0)                                                                                         \
            ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code.");     \
    } while (0)
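
// Usage sketch (illustrative only; `stream`, `dst`, `src` and `nbytes` are placeholders):
//   SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst, src, nbytes)));
// A thrown exception becomes a non-zero dpct error code, which SYCL_CHECK then reports via
// ggml_sycl_error() together with the failing statement, function, file and line.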

#if DPCT_COMPAT_RT_VERSION >= 11100
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
#else
#define GGML_SYCL_ASSUME(x)
#endif // DPCT_COMPAT_RT_VERSION >= 11100

#ifdef GGML_SYCL_F16
typedef sycl::half   dfloat; // dequantize float
typedef sycl::half2  dfloat2;
#else
typedef float        dfloat; // dequantize float
typedef sycl::float2 dfloat2;
#endif // GGML_SYCL_F16

#define MMVQ_MAX_BATCH_SIZE 8

static int  g_all_sycl_device_count = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
    SYCL_UNSET_GPU_MODE;

static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size   = 0; // disabled by default
static size_t g_scratch_offset = 0;

[[noreturn]] static inline void bad_arch(const sycl::stream & stream_ct1) {
    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
                  "current GPU architecture.\n";
    // __trap();
    std::exit(1);

    (void) bad_arch; // suppress unused function warning
}

int get_current_device_id();

inline dpct::err0 ggml_sycl_set_device(const int device) try {
    int current_device_id;
    SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
    // current_device_id=%d\n", device, current_device);
    if (device == current_device_id) {
        return 0;
    }

    return CHECK_TRY_ERROR(dpct::select_device(device));
} catch (sycl::exception const & exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
              << ", line:" << __LINE__ << std::endl;
    crash();
    std::exit(1);
}

//////////////////////

struct optimize_feature {
    bool reorder = false;
};

struct sycl_device_info {
    int     cc;       // compute capability
    int     nsm;      // number of streaming multiprocessors (CUDA); maps to the maximum
                      // number of compute units on a SYCL device.
    // size_t smpb;   // max. shared memory per block
    size_t  smpbo;    // max. shared memory per block (with opt-in)
    bool    vmm;      // virtual memory support
    size_t  total_vram;
    // sycl_hw_info hw_info;  // device id and arch, currently not used
    optimize_feature opt_feature;
};

struct ggml_sycl_device_info {
    int device_count;

    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};

    int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
};

const ggml_sycl_device_info & ggml_sycl_info();

struct ggml_sycl_pool {
    virtual ~ggml_sycl_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void   free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_sycl_pool_alloc {
    ggml_sycl_pool * pool = nullptr;
    T *              ptr  = nullptr;
    size_t           actual_size = 0;

    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
    }

    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_sycl_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    T * realloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        if (ptr)
            pool->free(ptr, actual_size);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_sycl_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_sycl_pool_alloc() = default;
    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
    ggml_sycl_pool_alloc & operator=(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc & operator=(ggml_sycl_pool_alloc &&) = delete;
};
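
// Usage sketch (illustrative only; `ctx` and `n` are placeholders):
//   ggml_sycl_pool_alloc<float> tmp(ctx.pool(), n); // requests at least n * sizeof(float) bytes
//   float * buf = tmp.get();                        // valid until `tmp` goes out of scope
// The destructor returns the buffer to the pool, so no explicit free is needed.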

// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
                                               // tensors
    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
    optimize_feature optimized_feature;
};

void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams = {});

namespace sycl_ex = sycl::ext::oneapi::experimental;

struct ggml_backend_sycl_context {
    int              device;
    std::string      name;
    optimize_feature opt_feature;

    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

    explicit ggml_backend_sycl_context(int device) :
        device(device),
        name(GGML_SYCL_NAME + std::to_string(device)) {
        opt_feature = ggml_sycl_info().devices[device].opt_feature;
    }

    queue_ptr stream(int device, int stream) {
        if (qptrs[device][stream] == nullptr) {
            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
        }
        return qptrs[device][stream];
    }

    queue_ptr stream() {
        return stream(device, 0);
    }

#if GGML_SYCL_DNNL
    dnnl::engine make_engine(sycl::queue * q) {
        // Get the device associated with the queue
        sycl::device dev = q->get_device();
        // Get the context associated with the queue
        sycl::context ctx = q->get_context();
        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
        return eng;
    }

    std::unordered_map<sycl::queue *, dnnl::stream> stream_map;
    std::unordered_map<sycl::queue *, dnnl::engine> engine_map;

    dnnl::stream stream_dnnl(int device, int _stream) {
        auto q = stream(device, _stream);
        return stream_dnnl(q);
    }

    dnnl::engine engine_dnnl(sycl::queue * qptr) {
        auto it = engine_map.find(qptr);
        if (it == engine_map.end()) {
            auto eng = make_engine(qptr);
            engine_map[qptr] = eng;
            return eng;
        } else {
            return it->second;
        }
    }

    dnnl::stream stream_dnnl(sycl::queue * qptr) {
        auto it = stream_map.find(qptr);
        if (it == stream_map.end()) {
            auto eng = engine_dnnl(qptr);
            auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
            stream_map[qptr] = stream;
            return stream;
        } else {
            return it->second;
        }
    }

    dnnl::stream stream_dnnl() {
        return stream_dnnl(device, 0);
    }

    dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md,
                                    const dnnl::engine & eng, const queue_ptr q) {
        ggml_sycl_pool_alloc<uint8_t> * pool;
        auto it = scratchpad_map.find(q);
        if (it == scratchpad_map.end()) {
            scratchpad_map[q] = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(this->pool());
            pool = scratchpad_map[q].get();
        } else {
            pool = it->second.get();
        }

        size_t scratchpad_size = scratchpad_md.get_size();
        if (scratchpad_size > pool->actual_size) {
            pool->realloc(scratchpad_size);
        }
        void * mem_ptr = pool->get();
        return dnnl::memory(scratchpad_md, eng, mem_ptr);
    }
#endif

    // pool
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device, 0), device);
        }
        return *pools[device];
    }

    ggml_sycl_pool & pool() {
        return pool(device);
    }

#ifdef GGML_SYCL_GRAPH
    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
#endif

    ggml_sycl_pool & host_pool(int device) {
        if (host_pools[device] == nullptr) {
            host_pools[device] = new_pool_for_host(stream(device, 0), device);
        }
        return *host_pools[device];
    }

    ggml_sycl_pool & host_pool() { return host_pool(device); }
};

// common device functions

static __dpct_inline__ float warp_reduce_sum(float x,
                                             const sycl::nd_item<3> & item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
    }
    return x;
}

static __dpct_inline__ sycl::float2
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> & item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
                                                mask);
        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
                                                mask);
    }
    return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ int warp_reduce_sum(int x) {
    return sycl::reduce_over_group(
        sycl::ext::oneapi::this_work_item::get_sub_group(), x, sycl::plus<>());
}

template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        x += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), x, offset, width);
    }
    return x;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        a.x() += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), a.x(), offset,
            width);
        a.y() += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), a.y(), offset,
            width);
    }
    return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::half2 warp_reduce_sum(sycl::half2 a) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        a = a + dpct::permute_sub_group_by_xor(
                    sycl::ext::oneapi::this_work_item::get_sub_group(), a, offset,
                    width);
    }
    return a;
}

static constexpr int ggml_sycl_get_physical_warp_size() {
    // todo: for old iGPU + dGPU case, need to be changed.
    return WARP_SIZE;
}

template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
                              sycl::ext::oneapi::this_work_item::get_sub_group(), x,
                              offset, width));
    }
    return x;
}

static __dpct_inline__ float warp_reduce_max(float x,
                                             const sycl::nd_item<3> & item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
                              item_ct1.get_sub_group(), x, mask));
    }
    return x;
}
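
// How the reductions above work (informal note): permute_sub_group_by_xor exchanges values
// between lanes whose ids differ by `offset`, so each iteration halves the number of distinct
// partial results. For width = 4 and lane values {1, 2, 3, 4}:
//   offset 2: lanes hold {1+3, 2+4, 3+1, 4+2} = {4, 6, 4, 6}
//   offset 1: lanes hold {4+6, 6+4, 4+6, 6+4} = {10, 10, 10, 10}
// After log2(width) steps every lane holds the full sum (or the maximum, for warp_reduce_max).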

/* Helper for computing the linear offset of a ggml_tensor given
   per-dimension sizes, strides, and indices */
template<int N>
__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
    size_t offset = 0;
#pragma unroll
    for (int i = 0; i < N; i++) {
        auto index_i = indices[i];
        offset += strides[i] * index_i;
    }
    return offset;
}
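
// Worked example (illustrative): for a 3-D view with byte strides {4, 64, 4096} and indices
// {2, 5, 1}, calculate_offset<3> returns 2*4 + 5*64 + 1*4096 = 4424.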

// Helper for vec loading aligned data
template <typename Tp, int n>
inline sycl::vec<Tp, n> vec_aligned_load(const Tp * aligned_ptr) {
    return *reinterpret_cast<const sycl::vec<Tp, n> *>(aligned_ptr);
}

// Helper for accessing pointers with no warnings
template <typename Tp, int dim>
static __dpct_inline__ Tp * get_pointer(sycl::local_accessor<Tp, dim> acc) {
    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}

int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);

constexpr size_t ceil_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}
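
// e.g. ceil_div(10, 4) == 3: the number of fixed-size blocks needed to cover all elements.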

bool gpu_has_xmx(sycl::device & dev);

template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
    if (LIKELY(!g_ggml_sycl_debug)) {
        return "";
    }
    std::stringstream ss;
    ss << prefix << "=[";
    for (std::size_t i = 0; i < N - 1; ++i) {
        ss << array[i] << ", ";
    }
    if constexpr (N > 0) {
        ss << array[N - 1];
    }
    ss << "]";
    return ss.str();
}

inline std::string debug_get_tensor_str(const std::string & prefix,
                                        const ggml_tensor * tensor, const std::string & suffix = "") {
    std::stringstream ss;
    if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
    ss << prefix.c_str() << "=";
    if (tensor) {
        ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
        ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
        ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);
        if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
        if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
    } else {
        ss << "nullptr";
    }
    ss << suffix;
    return ss.str();
}

// Use scope_op_debug_print to log operations coming from running a model
struct scope_op_debug_print {
    // Use string_views to avoid the cost of creating and concatenating strings.
    // The string_views must stay alive for as long as the object is alive.
    // In practice scope_op_debug_print is constructed from string literals, which are stored in
    // constant space and are therefore always accessible.
    scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
                         std::size_t num_src, const std::string_view & suffix = "") :
        func(func),
        func_suffix(func_suffix) {
        if (LIKELY(!g_ggml_sycl_debug)) {
            return;
        }
        GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
        GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
        if (dst) {
            for (std::size_t i = 0; i < num_src; ++i) {
                GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
            }
        }
        GGML_SYCL_DEBUG("%s\n", suffix.data());
    }

    scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
                         const std::string_view & suffix = "") :
        scope_op_debug_print(func, "", dst, num_src, suffix) {}

    ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }

  private:
    std::string_view func;
    std::string_view func_suffix;
};
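
// Usage sketch (illustrative; the function name is a placeholder for a typical SYCL op handler):
//   static void ggml_sycl_op_example(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
//       scope_op_debug_print scope(__func__, dst, /*num_src=*/1);
//       // ... launch kernels; entry/exit lines are printed only when g_ggml_sycl_debug is set
//   }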

static __dpct_inline__ float get_alibi_slope(const float max_bias,
                                             const uint32_t h,
                                             const uint32_t n_head_log2,
                                             const float m0,
                                             const float m1) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return dpct::pow(base, exph);
}
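
// Informal note: with max_bias > 0 this returns base^exph, i.e. m0^(h+1) for heads below
// n_head_log2 and m1^(2*(h - n_head_log2) + 1) for the remaining heads, matching the per-head
// geometric slope sequence used by ALiBi-style position biases.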

static const sycl::uint3 init_fastdiv_values(uint32_t d) {
    GGML_ASSERT(d != 0);

    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    return sycl::uint3(mp, L, d);
}

static __dpct_inline__ uint32_t fastdiv(uint32_t n, const sycl::uint3 fastdiv_values) {
    const uint32_t hi = sycl::mul_hi<unsigned>(n, fastdiv_values.x());
    return (hi + n) >> fastdiv_values.y();
}

static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3 fastdiv_values) {
    const uint32_t div_val = fastdiv(n, fastdiv_values);
    const uint32_t mod_val = n - div_val * fastdiv_values.z();
    return sycl::uint2(div_val, mod_val);
}
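
// Informal note on the fastdiv trick above: for a fixed divisor d, init_fastdiv_values picks the
// smallest L with 2^L >= d and a magic multiplier mp = floor(2^32 * (2^L - d) / d) + 1, so that
// (mul_hi(n, mp) + n) >> L reproduces n / d for 32-bit n (the standard unsigned "magic number"
// division). Worked example (d = 3): L = 2, mp = floor(2^32 / 3) + 1 = 1431655766; for n = 10,
// mul_hi(10, mp) = 3 and (3 + 10) >> 2 = 3. fast_div_modulo then recovers the remainder as
// n - (n / d) * d.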

static __dpct_inline__ int ggml_sycl_dp4a(const int a, const int b, int c) {
    return dpct::dp4a(a, b, c);
}

static __dpct_inline__ float ggml_sycl_e8m0_to_fp32(uint8_t x) {
    uint32_t bits;
    if (x == 0) {
        bits = 0x00400000;
    } else {
        bits = (uint32_t) x << 23;
    }

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
}
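
// Informal note: E8M0 is an 8-bit exponent-only encoding where x represents 2^(x - 127). For
// x != 0 the conversion above places x directly in the FP32 exponent field (x << 23); x == 0 is
// mapped to the subnormal bit pattern 0x00400000 (= 2^-127), since an all-zero exponent field
// with a zero mantissa would decode as 0.0f instead.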

#endif // GGML_SYCL_COMMON_HPP