common.hpp

//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_COMMON_HPP
#define GGML_SYCL_COMMON_HPP

#include <fstream>
#include <iostream>

#include "dpct/helper.hpp"
#include "ggml-sycl.h"
#include "presets.hpp"
#include "sycl_hw.hpp"

#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
#endif

#define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL
/* suppress warning spam */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wnested-anon-types"
#include "ggml-common.h"
#pragma clang diagnostic pop
#include "ggml-impl.h"
void* ggml_sycl_host_malloc(size_t size);
void ggml_sycl_host_free(void* ptr);

extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;
extern int g_ggml_sycl_prioritize_dmmv;

#define GGML_SYCL_DEBUG(...)              \
    do {                                  \
        if (g_ggml_sycl_debug)            \
            fprintf(stderr, __VA_ARGS__); \
    } while (0)
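
// Illustrative use of GGML_SYCL_DEBUG (a sketch, not part of the original
// header; `device_id` is a hypothetical variable): the macro forwards
// printf-style arguments to stderr only when g_ggml_sycl_debug is non-zero.
//
//     GGML_SYCL_DEBUG("call %s: device=%d\n", __func__, device_id);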

#define CHECK_TRY_ERROR(expr)                                                  \
    [&]() {                                                                    \
        try {                                                                  \
            expr;                                                              \
            return dpct::success;                                              \
        } catch (std::exception const& e) {                                    \
            std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
                      << ", line:" << __LINE__ << ", func:" << __func__        \
                      << std::endl;                                            \
            return dpct::default_error;                                        \
        }                                                                      \
    }()
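
// Illustrative use of CHECK_TRY_ERROR (a sketch; `q`, `dst`, `src` and
// `n_bytes` are assumed, not defined in this header): the macro wraps an
// expression that may throw and yields dpct::success or dpct::default_error
// instead of propagating the exception.
//
//     dpct::err0 err = CHECK_TRY_ERROR(q->memcpy(dst, src, n_bytes).wait());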

#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC  610                // todo for hardware optimization.
#define VER_GEN9  700                // todo for hardware optimization.
#define VER_GEN12 1000000            // todo for hardware optimization.
#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardware optimization.

#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardware

// define for XMX in Intel GPU
// TODO: currently, it's not used for XMX really.
#if !defined(GGML_SYCL_FORCE_MMQ)
    #define SYCL_USE_XMX
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X
#define GGML_SYCL_DMMV_X 32
#endif
#ifndef GGML_SYCL_MMV_Y
#define GGML_SYCL_MMV_Y 1
#endif

typedef sycl::queue *queue_ptr;

enum ggml_sycl_backend_gpu_mode {
    SYCL_UNSET_GPU_MODE = -1,
    SYCL_SINGLE_GPU_MODE = 0,
    SYCL_MUL_GPU_MODE
};

static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

// deliberately write through a null pointer so the failure site is visible to
// debuggers and crash handlers
static void crash() {
    int* ptr = NULL;
    *ptr = 0;
}

[[noreturn]] static void ggml_sycl_error(
    const char* stmt,
    const char* func,
    const char* file,
    const int line,
    const char* msg) {
    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
    fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
    GGML_ABORT("SYCL error");
}

#define SYCL_CHECK(err)                                                                                     \
    do {                                                                                                    \
        auto err_ = (err);                                                                                  \
        if (err_ != 0)                                                                                      \
            ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code.");  \
    } while (0)
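
// Illustrative combination of SYCL_CHECK and CHECK_TRY_ERROR (a sketch;
// `stream` is assumed to be a queue_ptr obtained elsewhere): any non-zero
// error code aborts through ggml_sycl_error with file/line/function context.
//
//     SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));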

#if DPCT_COMPAT_RT_VERSION >= 11100
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
#else
#define GGML_SYCL_ASSUME(x)
#endif // DPCT_COMPAT_RT_VERSION >= 11100

#ifdef GGML_SYCL_F16
typedef sycl::half  dfloat; // dequantize float
typedef sycl::half2 dfloat2;
#else
typedef float        dfloat; // dequantize float
typedef sycl::float2 dfloat2;
#endif // GGML_SYCL_F16

#define MMVQ_MAX_BATCH_SIZE 8

static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

static int  g_all_sycl_device_count = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;

static void*  g_scratch_buffer = nullptr;
static size_t g_scratch_size   = 0; // disabled by default
static size_t g_scratch_offset = 0;

[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
                  "current GPU architecture.\n";
    // __trap();
    std::exit(1);

    (void) bad_arch; // suppress unused function warning
}

int get_current_device_id();

inline dpct::err0 ggml_sycl_set_device(const int device) try {
    int current_device_id;
    SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
    // current_device_id=%d\n", device, current_device);
    if (device == current_device_id) {
        return 0;
    }

    return CHECK_TRY_ERROR(dpct::select_device(device));
} catch (sycl::exception const& exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
              << ", line:" << __LINE__ << std::endl;
    crash();
    std::exit(1);
}
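
// Illustrative call pattern (a sketch; `device_id` is a hypothetical device
// index): switch the current device before submitting work, routing the
// returned error code through SYCL_CHECK.
//
//     SYCL_CHECK(ggml_sycl_set_device(device_id));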

//////////////////////

struct optimize_feature {
    bool reorder = false;
};

struct sycl_device_info {
    int cc;             // compute capability
    // int nsm;         // number of streaming multiprocessors
    // size_t smpb;     // max. shared memory per block
    bool vmm;           // virtual memory support
    size_t total_vram;
    sycl_hw_info hw_info;
    optimize_feature opt_feature;
};

struct ggml_sycl_device_info {
    int device_count;

    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};

    int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
};

const ggml_sycl_device_info & ggml_sycl_info();

struct ggml_sycl_pool {
    virtual ~ggml_sycl_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_sycl_pool_alloc {
    ggml_sycl_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
    }

    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_sycl_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    T * realloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        if (ptr)
            pool->free(ptr, actual_size);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_sycl_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_sycl_pool_alloc() = default;
    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
};
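
// Illustrative use of ggml_sycl_pool_alloc (a sketch; `ctx` is a hypothetical
// ggml_backend_sycl_context * and `n_elements` an assumed element count): the
// RAII wrapper borrows device memory from a ggml_sycl_pool and returns it to
// the pool when the wrapper goes out of scope.
//
//     {
//         ggml_sycl_pool_alloc<float> tmp(ctx->pool(), n_elements);
//         float * buf = tmp.get();
//         // ... use buf ...
//     } // buffer released back to the pool here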

// backend interface

struct ggml_tensor_extra_gpu {
    void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
                                              // tensors
    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
    optimize_feature optimized_feature;
};

void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams = {});

inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
    optimize_feature opt;

    opt.reorder =
        (arch == syclex::architecture::intel_gpu_dg1 ||
         arch == syclex::architecture::intel_gpu_acm_g10 ||
         arch == syclex::architecture::intel_gpu_acm_g11 ||
         arch == syclex::architecture::intel_gpu_acm_g12 ||
         arch == syclex::architecture::intel_gpu_pvc ||
         arch == syclex::architecture::intel_gpu_pvc_vg ||
         arch == syclex::architecture::intel_gpu_mtl_u ||
         arch == syclex::architecture::intel_gpu_mtl_s ||
         arch == syclex::architecture::intel_gpu_mtl_h ||
         arch == syclex::architecture::intel_gpu_arl_u ||
         arch == syclex::architecture::intel_gpu_arl_s ||
         arch == syclex::architecture::intel_gpu_arl_h ||
         arch == syclex::architecture::intel_gpu_bmg_g21 ||
         arch == syclex::architecture::intel_gpu_lnl_m);

    return opt;
}

namespace sycl_ex = sycl::ext::oneapi::experimental;

struct ggml_backend_sycl_context {
    int device;
    std::string name;
    optimize_feature opt_feature;

    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

    explicit ggml_backend_sycl_context(int device) :
        device(device),
        name(GGML_SYCL_NAME + std::to_string(device)) {
        opt_feature = ggml_sycl_info().devices[device].opt_feature;
    }

    queue_ptr stream(int device, int stream) {
        if (qptrs[device][stream] == nullptr) {
            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
        }
        return qptrs[device][stream];
    }

    queue_ptr stream() {
        return stream(device, 0);
    }

#if GGML_SYCL_DNNL
    dnnl::engine make_engine(sycl::queue* q) {
        // Get the device associated with the queue
        sycl::device dev = q->get_device();
        // Get the context associated with the queue
        sycl::context ctx = q->get_context();
        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
        return eng;
    }

    std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
    std::unordered_map<sycl::queue*, dnnl::engine> engine_map;

    dnnl::stream stream_dnnl(int device, int _stream) {
        auto q = stream(device, _stream);
        return stream_dnnl(q);
    }

    dnnl::engine engine_dnnl(sycl::queue* qptr) {
        auto it = engine_map.find(qptr);
        if (it == engine_map.end()) {
            auto eng = make_engine(qptr);
            engine_map[qptr] = eng;
            return eng;
        } else {
            return it->second;
        }
    }

    dnnl::stream stream_dnnl(sycl::queue* qptr) {
        auto it = stream_map.find(qptr);
        if (it == stream_map.end()) {
            auto eng = engine_dnnl(qptr);
            auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
            stream_map[qptr] = stream;
            return stream;
        } else {
            return it->second;
        }
    }

    dnnl::stream stream_dnnl() {
        return stream_dnnl(device, 0);
    }

    dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md,
                                    const dnnl::engine & eng, const queue_ptr q) {
        ggml_sycl_pool_alloc<uint8_t> * pool;
        auto it = scratchpad_map.find(q);
        if (it == scratchpad_map.end()) {
            scratchpad_map[q] = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(this->pool());
            pool = scratchpad_map[q].get();
        } else {
            pool = it->second.get();
        }

        size_t scratchpad_size = scratchpad_md.get_size();
        if (scratchpad_size > pool->actual_size) {
            pool->realloc(scratchpad_size);
        }
        void * mem_ptr = pool->get();
        return dnnl::memory(scratchpad_md, eng, mem_ptr);
    }
#endif

    // pool
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device, 0), device);
        }
        return *pools[device];
    }

    ggml_sycl_pool & pool() {
        return pool(device);
    }

#ifdef GGML_SYCL_GRAPH
    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
#endif

    ggml_sycl_pool & host_pool(int device) {
        if (host_pools[device] == nullptr) {
            host_pools[device] = new_pool_for_host(stream(device, 0), device);
        }
        return *host_pools[device];
    }

    ggml_sycl_pool & host_pool() { return host_pool(device); }
};
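
// Illustrative use of ggml_backend_sycl_context (a sketch; device index 0 is
// an assumption): queues and memory pools are created lazily on first use and
// cached inside the context.
//
//     ggml_backend_sycl_context ctx(0);     // context bound to device 0
//     queue_ptr q        = ctx.stream();    // default queue for device 0
//     ggml_sycl_pool & p = ctx.pool();      // lazily-created device pool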

// common device functions

static __dpct_inline__ float warp_reduce_sum(float x,
                                             const sycl::nd_item<3>& item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        /*
        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
        kernel that calls this function may be less than "32". The function
        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
        CPU device. Modify the size of the work-group to ensure that the value
        of the right-most dimension is a multiple of "32".
        */
        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
    }
    return x;
}

static __dpct_inline__ sycl::float2
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
                                                mask);
        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
                                                mask);
    }
    return a;
}

static __dpct_inline__ float warp_reduce_max(float x,
                                             const sycl::nd_item<3>& item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        /*
        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
        kernel that calls this function may be less than "32". The function
        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
        CPU device. Modify the size of the work-group to ensure that the value
        of the right-most dimension is a multiple of "32".
        */
        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
                              item_ct1.get_sub_group(), x, mask));
    }
    return x;
}
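
// Illustrative call site for warp_reduce_sum (a sketch; `partial` is a
// hypothetical per-work-item value, and the enclosing nd_range is assumed to
// use a right-most work-group dimension that is a multiple of WARP_SIZE, as
// the DPCT1096 notes above require):
//
//     float partial = /* per-work-item contribution */;
//     partial = warp_reduce_sum(partial, item_ct1);
//     // every work-item in the sub-group now holds the full sub-group sum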

// Helper for vec loading aligned data
template <typename Tp, int n>
inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
    return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
}

// Helper for accessing pointers with no warnings
template <typename Tp, int dim>
static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}
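
// Illustrative use of get_pointer (a sketch; `acc` is a hypothetical
// sycl::local_accessor<float, 1> declared in the enclosing command group): it
// unwraps the accessor into a raw pointer without address-space decoration
// warnings.
//
//     float * smem = get_pointer(acc);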

int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);

constexpr size_t ceil_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}
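
// Illustrative use of ceil_div (a sketch; `n_elements` and `block_size` are
// assumed values): round up to a whole number of blocks, e.g. 10 elements in
// blocks of 4 gives ceil_div(10, 4) == 3.
//
//     const size_t n_blocks = ceil_div(n_elements, block_size);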

bool gpu_has_xmx(sycl::device &dev);

#endif // GGML_SYCL_COMMON_HPP