| 12345678910111213141516171819202122232425262728293031323334353637383940414243 |
- #ifndef MAKARNA_CUDA_COMMON_CUH
- #define MAKARNA_CUDA_COMMON_CUH
- #include "kernels.h"
- #define CUDA_API_PER_THREAD_DEFAULT_STREAM 1
- #include <cuda_runtime.h>
- #include <math.h>
- #include <stdio.h>
// Evaluates a CUDA runtime call and checks the cudaError_t it yields.
// On any result other than cudaSuccess, prints the file, line and the
// cudaGetErrorString() text to stderr and executes `return 1;` in the
// enclosing function, so it is only usable inside functions whose return
// type accepts the integer 1.
// The argument is parenthesized and the temporary uses a macro-private name,
// so a call expression that mentions a caller variable named `err` is not
// shadowed by the macro's own local.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t check_cuda_err_ = (call); \
        if (check_cuda_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(check_cuda_err_)); \
            return 1; \
        } \
    } while (0)
- // ============================================================
- // FP16 -> FP32 conversion (device function)
- // ============================================================
// Decodes a raw 16-bit half-precision bit pattern into a float.
//   h: bit pattern split below as 1 sign bit, 5 exponent bits, 10 mantissa bits.
// Returns the decoded value. Any NaN input maps to the fixed pattern
// 0x7FC00000 combined with the input's sign bit; the input's mantissa bits
// are not carried over.
__device__ __forceinline__ float fp16_to_fp32(unsigned short h) {
    const unsigned int sign_bit = (h & 0x8000) << 16;
    const unsigned int exponent = (h & 0x7C00) >> 10;
    const unsigned int fraction = h & 0x03FF;

    // All-ones exponent: infinity when the mantissa is zero, NaN otherwise.
    if (exponent == 0x1F) {
        const unsigned int magnitude_bits = (fraction == 0) ? 0x7F800000 : 0x7FC00000;
        return __int_as_float(sign_bit | magnitude_bits);
    }

    // Normalized: rebias the exponent from 15 to 127 (+112) and left-align
    // the 10-bit mantissa inside the 23-bit float mantissa field.
    if (exponent != 0) {
        const unsigned int rebased = (exponent + 112) << 23;
        return __int_as_float(sign_bit | rebased | (fraction << 13));
    }

    // Zero exponent with zero mantissa: signed zero.
    if (fraction == 0) {
        return __int_as_float(sign_bit);
    }

    // Subnormal: value is (mantissa / 2^10) * 2^-14.
    const float scaled = (float)fraction / 1024.0f;
    const float magnitude = scaled * 6.103515625e-05f;
    if (sign_bit) {
        return -magnitude;
    }
    return magnitude;
}
- #endif
|