//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_COMMON_HPP
#define GGML_SYCL_COMMON_HPP

#include <array>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

#include "dpct/helper.hpp"
#include "ggml-sycl.h"
#include "presets.hpp"
#include "sycl_hw.hpp"

#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
#endif
#define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL
/* suppress warning spam */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wnested-anon-types"
#include "ggml-common.h"
#pragma clang diagnostic pop
#include "ggml-impl.h"

void * ggml_sycl_host_malloc(size_t size);
void   ggml_sycl_host_free(void * ptr);

extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;
extern int g_ggml_sycl_prioritize_dmmv;
#if defined(__clang__) && __has_builtin(__builtin_expect)
// Hint to the optimizer which side of a branch is the likely one
# define LIKELY(expr) __builtin_expect(expr, true)
# define UNLIKELY(expr) __builtin_expect(expr, false)
#else
# define LIKELY(expr) (expr)
# define UNLIKELY(expr) (expr)
#endif
#define GGML_SYCL_DEBUG(...)                  \
    do {                                      \
        if (UNLIKELY(g_ggml_sycl_debug))      \
            fprintf(stderr, __VA_ARGS__);     \
    } while (0)
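
// Illustrative usage (hypothetical call site): the macro forwards printf-style
// arguments to stderr, and the UNLIKELY hint keeps the disabled-logging path
// cheap:
//
//   GGML_SYCL_DEBUG("[SYCL] alloc %zu bytes on device %d\n", size, device_id);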
#define CHECK_TRY_ERROR(expr)                                                  \
    [&]() {                                                                    \
        try {                                                                  \
            expr;                                                              \
            return dpct::success;                                              \
        } catch (std::exception const & e) {                                   \
            std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
                      << ", line:" << __LINE__ << ", func:" << __func__        \
                      << std::endl;                                            \
            return dpct::default_error;                                        \
        }                                                                      \
    }()
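
// Illustrative usage (hypothetical call): CHECK_TRY_ERROR wraps a throwing
// SYCL/dpct expression in an immediately-invoked lambda and converts any
// std::exception into a dpct error code:
//
//   auto err = CHECK_TRY_ERROR(q->memcpy(dst, src, n_bytes).wait());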
#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC 610                 // TODO: hardware-specific optimization
#define VER_GEN9 700                 // TODO: hardware-specific optimization
#define VER_GEN12 1000000            // TODO: hardware-specific optimization
#define VER_GEN13 (VER_GEN12 + 1030) // TODO: hardware-specific optimization

#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardware

// define for XMX in Intel GPU
// TODO: currently, XMX is not actually used here.
#if !defined(GGML_SYCL_FORCE_MMQ)
#define SYCL_USE_XMX
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X
#define GGML_SYCL_DMMV_X 32
#endif
#ifndef GGML_SYCL_MMV_Y
#define GGML_SYCL_MMV_Y 1
#endif
typedef sycl::queue * queue_ptr;

enum ggml_sycl_backend_gpu_mode {
    SYCL_UNSET_GPU_MODE = -1,
    SYCL_SINGLE_GPU_MODE = 0,
    SYCL_MUL_GPU_MODE
};

static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
// Intentionally dereference a null pointer so a debugger gets a backtrace at
// the point of failure.
static void crash() {
    int * ptr = NULL;
    *ptr = 0;
}
[[noreturn]] static void ggml_sycl_error(
    const char * stmt,
    const char * func,
    const char * file,
    const int    line,
    const char * msg) {
    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
    GGML_ABORT("SYCL error");
}

#define SYCL_CHECK(err)                                                                                    \
    do {                                                                                                   \
        auto err_ = (err);                                                                                 \
        if (err_ != 0)                                                                                     \
            ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \
    } while (0)
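
// The two macros compose: CHECK_TRY_ERROR turns an exception into an error
// code, and SYCL_CHECK aborts on any non-zero code. Hypothetical example:
//
//   SYCL_CHECK(CHECK_TRY_ERROR(q->memset(ptr, 0, n_bytes).wait()));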
#if DPCT_COMPAT_RT_VERSION >= 11100
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
#else
#define GGML_SYCL_ASSUME(x)
#endif // DPCT_COMPAT_RT_VERSION >= 11100

#ifdef GGML_SYCL_F16
typedef sycl::half  dfloat; // dequantize float
typedef sycl::half2 dfloat2;
#else
typedef float        dfloat; // dequantize float
typedef sycl::float2 dfloat2;
#endif // GGML_SYCL_F16

#define MMVQ_MAX_BATCH_SIZE 8

static int g_all_sycl_device_count = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;

static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;
[[noreturn]] static inline void bad_arch(const sycl::stream & stream_ct1) {
    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
                  "current GPU architecture.\n";
    // __trap();
    std::exit(1);

    (void) bad_arch; // suppress unused function warning
}
int get_current_device_id();

inline dpct::err0 ggml_sycl_set_device(const int device) try {
    int current_device_id;
    SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, current_device_id=%d\n", device, current_device_id);
    if (device == current_device_id) {
        return 0;
    }

    return CHECK_TRY_ERROR(dpct::select_device(device));
} catch (sycl::exception const & exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
              << ", line:" << __LINE__ << std::endl;
    crash();
    std::exit(1);
}
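
// Sketch of the expected call pattern (illustrative): make the device current
// before using its queues; the call is a no-op if it is already current.
//
//   SYCL_CHECK(ggml_sycl_set_device(ctx.device));
//   queue_ptr q = ctx.stream();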
//////////////////////

struct optimize_feature {
    bool reorder = false;
};

struct sycl_device_info {
    int cc;       // compute capability
    int nsm;      // number of streaming multiprocessors (CUDA); maps to the maximum
                  // number of compute units on a SYCL device.
    // size_t smpb;  // max. shared memory per block
    size_t smpbo; // max. shared memory per block (with opt-in)
    bool vmm;     // virtual memory support
    size_t total_vram;
    // sycl_hw_info hw_info; // device id and arch, currently not used
    optimize_feature opt_feature;
};
struct ggml_sycl_device_info {
    int device_count;

    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};

    int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = { 0 };
};

const ggml_sycl_device_info & ggml_sycl_info();

struct ggml_sycl_pool {
    virtual ~ggml_sycl_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};
template<typename T>
struct ggml_sycl_pool_alloc {
    ggml_sycl_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
    }

    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_sycl_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    T * realloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        if (ptr)
            pool->free(ptr, actual_size);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_sycl_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_sycl_pool_alloc() = default;
    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
    ggml_sycl_pool_alloc & operator=(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc & operator=(ggml_sycl_pool_alloc &&) = delete;
};
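
// Illustrative RAII usage (hypothetical element count): the buffer is returned
// to the pool when the wrapper goes out of scope, so temporary device storage
// needs no manual cleanup:
//
//   {
//       ggml_sycl_pool_alloc<float> tmp(ctx.pool(), n_elements);
//       float * buf = tmp.get();
//       // ... launch kernels that use buf ...
//   } // buf is released back to the pool here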
// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split tensors
    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
    optimize_feature optimized_feature;
};

void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams = {});

namespace sycl_ex = sycl::ext::oneapi::experimental;
struct ggml_backend_sycl_context {
    int device;
    std::string name;
    optimize_feature opt_feature;

    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

    explicit ggml_backend_sycl_context(int device) :
        device(device),
        name(GGML_SYCL_NAME + std::to_string(device)) {
        opt_feature = ggml_sycl_info().devices[device].opt_feature;
    }

    queue_ptr stream(int device, int stream) {
        if (qptrs[device][stream] == nullptr) {
            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
        }
        return qptrs[device][stream];
    }

    queue_ptr stream() {
        return stream(device, 0);
    }

#if GGML_SYCL_DNNL
    dnnl::engine make_engine(sycl::queue * q) {
        // Get the device and context associated with the queue
        sycl::device dev = q->get_device();
        sycl::context ctx = q->get_context();
        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
        return eng;
    }

    std::unordered_map<sycl::queue *, dnnl::stream> stream_map;
    std::unordered_map<sycl::queue *, dnnl::engine> engine_map;

    dnnl::stream stream_dnnl(int device, int _stream) {
        auto q = stream(device, _stream);
        return stream_dnnl(q);
    }

    dnnl::engine engine_dnnl(sycl::queue * qptr) {
        auto it = engine_map.find(qptr);
        if (it == engine_map.end()) {
            auto eng = make_engine(qptr);
            engine_map[qptr] = eng;
            return eng;
        } else {
            return it->second;
        }
    }

    dnnl::stream stream_dnnl(sycl::queue * qptr) {
        auto it = stream_map.find(qptr);
        if (it == stream_map.end()) {
            auto eng = engine_dnnl(qptr);
            auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
            stream_map[qptr] = stream;
            return stream;
        } else {
            return it->second;
        }
    }

    dnnl::stream stream_dnnl() {
        return stream_dnnl(device, 0);
    }

    dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md,
                                    const dnnl::engine & eng, const queue_ptr q) {
        ggml_sycl_pool_alloc<uint8_t> * pool;
        auto it = scratchpad_map.find(q);
        if (it == scratchpad_map.end()) {
            scratchpad_map[q] = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(this->pool());
            pool = scratchpad_map[q].get();
        } else {
            pool = it->second.get();
        }

        size_t scratchpad_size = scratchpad_md.get_size();
        if (scratchpad_size > pool->actual_size) {
            pool->realloc(scratchpad_size);
        }

        void * mem_ptr = pool->get();
        return dnnl::memory(scratchpad_md, eng, mem_ptr);
    }
#endif

    // pool
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device, 0), device);
        }
        return *pools[device];
    }

    ggml_sycl_pool & pool() {
        return pool(device);
    }

#ifdef GGML_SYCL_GRAPH
    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
#endif

    ggml_sycl_pool & host_pool(int device) {
        if (host_pools[device] == nullptr) {
            host_pools[device] = new_pool_for_host(stream(device, 0), device);
        }
        return *host_pools[device];
    }

    ggml_sycl_pool & host_pool() { return host_pool(device); }
};
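
// Illustrative lifecycle (hypothetical device id): one context per backend
// instance; queues, pools, and (with GGML_SYCL_DNNL) oneDNN engines/streams are
// all created lazily on first use and cached in the maps above:
//
//   ggml_backend_sycl_context ctx(0);
//   queue_ptr q = ctx.stream();       // default queue of device 0
//   ggml_sycl_pool & p = ctx.pool();  // lazily created device pool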
// common device functions

static __dpct_inline__ float warp_reduce_sum(float x,
                                             const sycl::nd_item<3> & item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
    }
    return x;
}
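
// The loop above is a butterfly (XOR-shuffle) reduction: with WARP_SIZE = 32 it
// takes 5 steps (masks 16, 8, 4, 2, 1); at each step lane i adds the value held
// by lane i ^ mask, so afterwards every lane of the sub-group holds the full sum.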
static __dpct_inline__ sycl::float2
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> & item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), mask);
        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), mask);
    }
    return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ int warp_reduce_sum(int x) {
    return sycl::reduce_over_group(
        sycl::ext::oneapi::this_work_item::get_sub_group(), x, sycl::plus<>());
}

template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        x += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), x, offset, width);
    }
    return x;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        a.x() += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), a.x(), offset, width);
        a.y() += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), a.y(), offset, width);
    }
    return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::half2 warp_reduce_sum(sycl::half2 a) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        a = a + dpct::permute_sub_group_by_xor(
                    sycl::ext::oneapi::this_work_item::get_sub_group(), a, offset, width);
    }
    return a;
}
static constexpr int ggml_sycl_get_physical_warp_size() {
    // TODO: needs revisiting for systems mixing an older iGPU with a dGPU.
    return WARP_SIZE;
}
template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
                              sycl::ext::oneapi::this_work_item::get_sub_group(), x, offset, width));
    }
    return x;
}

static __dpct_inline__ float warp_reduce_max(float x,
                                             const sycl::nd_item<3> & item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask));
    }
    return x;
}
/* Helper for computing the linear offset of a ggml_tensor given
   per-dimension sizes, strides, and indices */
template<int N>
__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
    size_t offset = 0;
#pragma unroll
    for (int i = 0; i < N; i++) {
        auto index_i = indices[i];
        offset += strides[i] * index_i;
    }
    return offset;
}
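
// Illustrative use (hypothetical variables): for a 3-D tensor with strides
// {s0, s1, s2} and indices {i0, i1, i2}, the linear offset is
// s0*i0 + s1*i1 + s2*i2:
//
//   const size_t off = calculate_offset<3>({s0, s1, s2}, {i0, i1, i2});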
// Helper for loading a vector from aligned data
template <typename Tp, int n>
inline sycl::vec<Tp, n> vec_aligned_load(const Tp * aligned_ptr) {
    return *reinterpret_cast<const sycl::vec<Tp, n> *>(aligned_ptr);
}
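
// Illustrative use (hypothetical pointer): load four contiguous halves in a
// single transaction instead of four scalar loads. The caller must guarantee
// that aligned_ptr is aligned to sizeof(sycl::vec<Tp, n>); otherwise the
// reinterpret_cast is undefined behaviour.
//
//   const sycl::vec<sycl::half, 4> v = vec_aligned_load<sycl::half, 4>(ptr);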
// Helper for accessing pointers with no warnings
template <typename Tp, int dim>
static __dpct_inline__ Tp * get_pointer(sycl::local_accessor<Tp, dim> acc) {
    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}

int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);

constexpr size_t ceil_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}
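
// Typical use (illustrative): round a problem size up to a whole number of
// work-groups, e.g. ceil_div(1000, 256) == 4 groups cover 1000 elements.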
bool gpu_has_xmx(sycl::device & dev);
template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
    if (LIKELY(!g_ggml_sycl_debug)) {
        return "";
    }
    std::stringstream ss;
    ss << prefix << "=[";
    // guard with N > 0 so the i < N - 1 bound cannot wrap around for empty arrays
    if constexpr (N > 0) {
        for (int i = 0; i < N - 1; ++i) {
            ss << array[i] << ", ";
        }
        ss << array[N - 1];
    }
    ss << "]";
    return ss.str();
}
inline std::string debug_get_tensor_str(const std::string & prefix,
                                        const ggml_tensor * tensor, const std::string & suffix = "") {
    std::stringstream ss;
    if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
    ss << prefix.c_str() << "=";
    if (tensor) {
        ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
        ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
        ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);

        if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
        if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
    } else {
        ss << "nullptr";
    }
    ss << suffix;
    return ss.str();
}
// Use scope_op_debug_print to log operations coming from running a model
struct scope_op_debug_print {
    // Use string_views to avoid the cost of creating and concatenating strings.
    // The string_views must stay alive for as long as the object is alive.
    // In practice, scope_op_debug_print is used with string literals, which live
    // in constant storage and are therefore always accessible.
    scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
                         std::size_t num_src, const std::string_view & suffix = "") :
        func(func),
        func_suffix(func_suffix) {
        if (LIKELY(!g_ggml_sycl_debug)) {
            return;
        }
        GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
        GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
        if (dst) {
            for (std::size_t i = 0; i < num_src; ++i) {
                GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
            }
        }
        GGML_SYCL_DEBUG("%s\n", suffix.data());
    }

    scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
                         const std::string_view & suffix = "") :
        scope_op_debug_print(func, "", dst, num_src, suffix) {}

    ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }

  private:
    std::string_view func;
    std::string_view func_suffix;
};
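
// Illustrative use at the top of an op implementation (hypothetical function
// name): the constructor logs the call and its tensors, the destructor logs
// completion, and both are near-free when debug logging is off:
//
//   static void ggml_sycl_op_example(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
//       scope_op_debug_print scope(__func__, dst, /*num_src=*/2);
//       // ... do the work ...
//   }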
static __dpct_inline__ float get_alibi_slope(const float max_bias,
                                             const uint32_t h,
                                             const uint32_t n_head_log2,
                                             const float m0,
                                             const float m1) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2 * (h - n_head_log2) + 1;

    return dpct::pow(base, exph);
}
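
// ALiBi slope per attention head (see "Train Short, Test Long", Press et al.):
//   slope(h) = m0^(h + 1)                    for h <  n_head_log2
//   slope(h) = m1^(2*(h - n_head_log2) + 1)  otherwise
// where callers typically pass m0 = 2^(-max_bias / n_head_log2) and
// m1 = 2^(-max_bias / (2 * n_head_log2)).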
// Precompute the magic multiplier and shift for fast division by the invariant
// divisor d (see fastdiv below).
static const sycl::uint3 init_fastdiv_values(uint32_t d) {
    GGML_ASSERT(d != 0);

    // compute L = ceil(log2(d))
    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    // pack the magic multiplier, the shift, and the divisor itself
    return sycl::uint3(mp, L, d);
}
// Compute n / d using the precomputed (mp, L, d) triple from init_fastdiv_values,
// replacing a hardware divide with a multiply-high, an add, and a shift.
static __dpct_inline__ uint32_t fastdiv(uint32_t n, const sycl::uint3 fastdiv_values) {
    // expects fastdiv_values to hold (mp, L, d) in (x, y, z)
    const uint32_t hi = sycl::mul_hi<unsigned>(n, fastdiv_values.x());
    return (hi + n) >> fastdiv_values.y();
}

// Compute both n / d and n % d from the same precomputed values.
static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3 fastdiv_values) {
    const uint32_t div_val = fastdiv(n, fastdiv_values);
    const uint32_t mod_val = n - div_val * fastdiv_values.z();
    return sycl::uint2(div_val, mod_val);
}
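
// Why this works (sketch): with L = ceil(log2(d)) and
// mp = floor(2^32 * (2^L - d) / d) + 1, the identity
//   n / d == (mul_hi(n, mp) + n) >> L
// holds for all 32-bit n (division by invariant integers, Granlund &
// Montgomery). Example: d = 3 gives L = 2, and fastdiv(7, init_fastdiv_values(3))
// returns 2, with fast_div_modulo(7, ...) returning (2, 1).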
static __dpct_inline__ int ggml_sycl_dp4a(const int a, const int b, int c) {
    return dpct::dp4a(a, b, c);
}
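
// dp4a treats a and b as packed int8x4 and returns c + sum(a[i] * b[i]).
// Illustrative example: a = 0x01010101, b = 0x02020202, c = 0 -> result 8.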
// Convert an E8M0 (8-bit, exponent-only) scale to fp32 by placing x in the
// fp32 exponent field; x == 0 would otherwise map to +0.0f, so it is
// special-cased to the bit pattern of 2^-127.
static __dpct_inline__ float ggml_sycl_e8m0_to_fp32(uint8_t x) {
    uint32_t bits;
    if (x == 0) {
        bits = 0x00400000; // fp32 bit pattern of 2^-127
    } else {
        bits = (uint32_t) x << 23;
    }

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
}
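
// Worked example (illustrative): x = 127 -> bits = 0x3F800000 = 1.0f, matching
// 2^(127 - 127); x = 128 -> 2.0f; and x = 0 -> 2^-127 via the subnormal bit
// pattern 0x00400000.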
#endif // GGML_SYCL_COMMON_HPP
|