#ifndef MAKARNA_CUDA_COMMON_CUH
#define MAKARNA_CUDA_COMMON_CUH

#include "kernels.h"

// Requests per-thread default-stream semantics from the CUDA runtime headers.
// NOTE(review): the CUDA documentation states this macro must be defined
// before any CUDA header is included, and that nvcc implicitly includes
// cuda_runtime.h at the top of each translation unit. When this header is
// compiled by nvcc the define may therefore have no effect; the documented
// alternative is the `--default-stream per-thread` compiler flag. Confirm
// against the build flags.
#define CUDA_API_PER_THREAD_DEFAULT_STREAM 1

// NOTE(review): the header names of the system includes were missing from the
// reviewed text. The list below is reconstructed from what this header uses
// (cudaError_t / cudaGetErrorString, fprintf) plus the most likely companions;
// confirm against the original file.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <stdint.h>
#include <stdio.h>

// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source location and the CUDA error string to stderr and makes the
// *enclosing function* `return 1`, so it is only usable inside functions whose
// return type accepts the integer 1.
//
// The argument is parenthesized and the temporary uses a macro-private name so
// that an expression mentioning a caller variable called `err` still refers to
// the caller's variable rather than to the macro's own local.
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t makarna_cuda_err_ = (call);                               \
        if (makarna_cuda_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(makarna_cuda_err_));                   \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// ============================================================
// FP16 -> FP32 conversion (device function)
// ============================================================

// Converts a raw IEEE 754 binary16 bit pattern to a float.
//
// h: the 16 bits of a half-precision value (1 sign, 5 exponent, 10 mantissa).
//
// Returns the float with the same value:
//   - normal numbers: exponent re-biased, mantissa widened from 10 to 23 bits;
//   - signed zero:    preserved;
//   - subnormals:     computed arithmetically as (mant / 1024) * 2^-14;
//   - infinity:       signed float infinity;
//   - NaN:            a quiet NaN carrying the input sign; the half mantissa
//                     payload is not propagated.
__device__ __forceinline__ float fp16_to_fp32(unsigned short h) {
    // All field extraction is done on unsigned values so that moving the sign
    // bit into bit 31 never shifts into the sign bit of a signed int.
    const unsigned int half_bits = h;
    const unsigned int sign = (half_bits & 0x8000u) << 16;
    const unsigned int exp  = (half_bits & 0x7C00u) >> 10;
    const unsigned int mant = half_bits & 0x03FFu;

    if (exp == 0x1Fu) {
        // All-ones exponent: infinity when the mantissa is zero, NaN otherwise.
        const unsigned int special = (mant == 0u) ? 0x7F800000u : 0x7FC00000u;
        return __uint_as_float(sign | special);
    }

    if (exp != 0u) {
        // Normal number: 112 = 127 - 15 converts the half exponent bias to the
        // float bias; the mantissa is left-aligned into the 23-bit field.
        return __uint_as_float(sign | ((exp + 112u) << 23) | (mant << 13));
    }

    if (mant == 0u) {
        // Signed zero.
        return __uint_as_float(sign);
    }

    // Subnormal half (rare): value = (mant / 2^10) * 2^-14.
    const float fraction  = (float)mant / 1024.0f;
    const float magnitude = fraction * 6.103515625e-05f;  // 2^-14
    return sign ? -magnitude : magnitude;
}

#endif