cuda_common.cuh 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. #ifndef MAKARNA_CUDA_COMMON_CUH
  2. #define MAKARNA_CUDA_COMMON_CUH
  3. #include "kernels.h"
  4. #define CUDA_API_PER_THREAD_DEFAULT_STREAM 1
  5. #include <cuda_runtime.h>
  6. #include <math.h>
  7. #include <stdio.h>
  8. #define CHECK_CUDA(call) \
  9. do { \
  10. cudaError_t err = call; \
  11. if (err != cudaSuccess) { \
  12. fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
  13. return 1; \
  14. } \
  15. } while (0)
  16. // ============================================================
  17. // FP16 -> FP32 conversion (device function)
  18. // ============================================================
  19. __device__ __forceinline__ float fp16_to_fp32(unsigned short h) {
  20. unsigned int sign = (h & 0x8000) << 16;
  21. unsigned int exp = (h & 0x7C00) >> 10;
  22. unsigned int mant = h & 0x03FF;
  23. if (exp > 0 && exp < 0x1F) {
  24. // Normalized
  25. unsigned int bits = sign | ((exp + 112) << 23) | (mant << 13);
  26. return __int_as_float(bits);
  27. }
  28. if (exp == 0) {
  29. if (mant == 0) return __int_as_float(sign); // Zero
  30. // Denorm - rare case, simplified
  31. float m = (float)mant / 1024.0f;
  32. float val = m * 6.103515625e-05f; // 2^-14
  33. return sign ? -val : val;
  34. }
  35. // Inf/NaN
  36. return mant == 0 ? __int_as_float(sign | 0x7F800000) : __int_as_float(sign | 0x7FC00000);
  37. }
  38. #endif