//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_COMMON_HPP
#define GGML_SYCL_COMMON_HPP

#include <array>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>

#include "dpct/helper.hpp"
#include "presets.hpp"

#define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL
#include "ggml-common.h"

void* ggml_sycl_host_malloc(size_t size);
void ggml_sycl_host_free(void* ptr);

static int g_ggml_sycl_debug = 0;
#define GGML_SYCL_DEBUG(...)          \
  do {                                \
    if (g_ggml_sycl_debug)            \
      fprintf(stderr, __VA_ARGS__);   \
  } while (0)
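
// Example (illustrative): debug logging is compiled in unconditionally and
// gated at runtime by g_ggml_sycl_debug:
//
//   GGML_SYCL_DEBUG("call %s: device=%d\n", __func__, device);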

#define CHECK_TRY_ERROR(expr)                                            \
  [&]() {                                                                \
    try {                                                                \
      expr;                                                              \
      return dpct::success;                                              \
    } catch (std::exception const& e) {                                  \
      std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
                << ", line:" << __LINE__ << ", func:" << __func__        \
                << std::endl;                                            \
      return dpct::default_error;                                        \
    }                                                                    \
  }()
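
// Example (illustrative): wrap a SYCL runtime call so a thrown exception is
// reported and converted into a dpct error code; q is a hypothetical
// queue_ptr and dst/src/n_bytes are hypothetical copy arguments:
//
//   dpct::err0 err = CHECK_TRY_ERROR(q->memcpy(dst, src, n_bytes).wait());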

// #define DEBUG_SYCL_MALLOC

static int g_work_group_size = 0;
// typedef sycl::half ggml_fp16_t;

#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC 610                 // TODO: for hardware optimization.
#define VER_GEN9 700                 // TODO: for hardware optimization.
#define VER_GEN12 1000000            // TODO: for hardware optimization.
#define VER_GEN13 (VER_GEN12 + 1030) // TODO: for hardware optimization.

#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardware

// define for XMX on Intel GPUs
// TODO: currently it is not really used for XMX.
#if !defined(GGML_SYCL_FORCE_MMQ)
#define SYCL_USE_XMX
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

#if defined(_MSC_VER)
#pragma warning(disable : 4244 4267) // possible loss of data
#endif

// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X
#define GGML_SYCL_DMMV_X 32
#endif
#ifndef GGML_SYCL_MMV_Y
#define GGML_SYCL_MMV_Y 1
#endif

typedef sycl::queue *queue_ptr;

enum ggml_sycl_backend_gpu_mode {
  SYCL_UNSET_GPU_MODE = -1,
  SYCL_SINGLE_GPU_MODE = 0,
  SYCL_MUL_GPU_MODE
};

static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

// Deliberately dereference a null pointer to abort with a fault that a
// debugger can catch.
static void crash() {
  int* ptr = NULL;
  *ptr = 0;
}

[[noreturn]] static void ggml_sycl_error(
    const char* stmt,
    const char* func,
    const char* file,
    const int line,
    const char* msg) {
  fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
  fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
  GGML_ASSERT(!"SYCL error");
}

#define SYCL_CHECK(err)                           \
  do {                                            \
    auto err_ = (err);                            \
    if (err_ != 0)                                \
      ggml_sycl_error(                            \
          #err,                                   \
          __func__,                               \
          __FILE__,                               \
          __LINE__,                               \
          "error occurred in this line of code"); \
  } while (0)
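
// Example (illustrative): combine SYCL_CHECK with CHECK_TRY_ERROR to abort
// on any failing SYCL call, as ggml_sycl_set_device below does:
//
//   SYCL_CHECK(CHECK_TRY_ERROR(dpct::select_device(device)));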

#if DPCT_COMPAT_RT_VERSION >= 11100
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
#else
#define GGML_SYCL_ASSUME(x)
#endif // DPCT_COMPAT_RT_VERSION >= 11100

#ifdef GGML_SYCL_F16
typedef sycl::half dfloat; // dequantize float
typedef sycl::half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef sycl::float2 dfloat2;
#endif // GGML_SYCL_F16

#define MMVQ_MAX_BATCH_SIZE 8

static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

static int g_all_sycl_device_count = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
    SYCL_UNSET_GPU_MODE;

static void* g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;

[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
  stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
                "current GPU architecture.\n";
  // __trap();
  std::exit(1);

  (void)bad_arch; // suppress unused function warning
}

int get_current_device_id();

inline dpct::err0 ggml_sycl_set_device(const int device) try {
  int current_device_id;
  SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

  // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
  // current_device_id=%d\n", device, current_device);
  if (device == current_device_id) {
    return 0;
  }

  return CHECK_TRY_ERROR(dpct::select_device(device));
} catch (sycl::exception const& exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
            << ", line:" << __LINE__ << std::endl;
  crash();
  std::exit(1);
}
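
// Example (illustrative): switch the current device before launching work,
// assuming device_id is a hypothetical valid device index:
//
//   SYCL_CHECK(ggml_sycl_set_device(device_id));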

//////////////////////

struct ggml_sycl_device_info {
  int device_count;

  struct sycl_device_info {
    int cc; // compute capability
    // int nsm; // number of streaming multiprocessors
    // size_t smpb; // max. shared memory per block
    bool vmm; // virtual memory support
    size_t total_vram;
  };

  sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

  std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
};

const ggml_sycl_device_info & ggml_sycl_info();

struct ggml_sycl_pool {
  virtual ~ggml_sycl_pool() = default;

  virtual void * alloc(size_t size, size_t * actual_size) = 0;
  virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_sycl_pool_alloc {
  ggml_sycl_pool * pool = nullptr;
  T * ptr = nullptr;
  size_t actual_size = 0;

  explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
  }

  ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
    alloc(size);
  }

  ~ggml_sycl_pool_alloc() {
    if (ptr != nullptr) {
      pool->free(ptr, actual_size);
    }
  }

  // size is in number of elements
  T * alloc(size_t size) {
    GGML_ASSERT(pool != nullptr);
    GGML_ASSERT(ptr == nullptr);
    ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
    return ptr;
  }

  T * alloc(ggml_sycl_pool & pool, size_t size) {
    this->pool = &pool;
    return alloc(size);
  }

  T * get() {
    return ptr;
  }

  ggml_sycl_pool_alloc() = default;
  ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
  ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
  ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
  ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
};
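
// Example (illustrative): RAII allocation of 256 floats from a pool; the
// buffer is returned to the pool when tmp goes out of scope (ctx is a
// hypothetical ggml_backend_sycl_context, defined below):
//
//   ggml_sycl_pool_alloc<float> tmp(ctx.pool(), 256);
//   float * data = tmp.get();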

// backend interface

struct ggml_tensor_extra_gpu {
  void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for
                                            // split tensors
  dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                        [GGML_SYCL_MAX_STREAMS]; // events for synchronizing
                                                 // multiple GPUs
};

struct ggml_backend_sycl_context {
  int device;
  std::string name;

  queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

  explicit ggml_backend_sycl_context(int device) :
      device(device),
      name(GGML_SYCL_NAME + std::to_string(device)) {
  }

  queue_ptr stream(int device, int stream) {
    if (qptrs[device][stream] == nullptr) {
      qptrs[device][stream] = &(dpct::get_current_device().default_queue());
    }
    return qptrs[device][stream];
  }

  queue_ptr stream() {
    return stream(device, 0);
  }

  // pool
  std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

  static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

  ggml_sycl_pool & pool(int device) {
    if (pools[device] == nullptr) {
      pools[device] = new_pool_for_device(stream(device, 0), device);
    }
    return *pools[device];
  }

  ggml_sycl_pool & pool() {
    return pool(device);
  }
};
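
// Example (illustrative): streams and pools are created lazily on first use:
//
//   ggml_backend_sycl_context ctx(0);   // context bound to device 0
//   queue_ptr q = ctx.stream();         // default stream (device 0, stream 0)
//   ggml_sycl_pool & p = ctx.pool();    // per-device pool, created on demand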

#endif // GGML_SYCL_COMMON_HPP