// ggml-cuda.cu — CUDA dequantization kernels and device-buffer pool for ggml
  1. #include <stdint.h>
  2. #include <stdio.h>
  3. #include <cuda_fp16.h>
  4. #include <atomic>
  5. #include "ggml-cuda.h"
// ggml stores fp16 values as raw uint16_t bit patterns; device code
// reinterprets them as CUDA __half, so the two types must match in size.
typedef uint16_t ggml_fp16_t;
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define QK4_0 32 // number of weights per q4_0 quantization block
// q4_0: QK4_0 weights stored as unsigned 4-bit quants sharing one float scale.
// Dequantized value (see dequantize_block_q4_0): (quant - 8) * d.
typedef struct {
    float d; // delta (scale)
    uint8_t qs[QK4_0 / 2]; // nibbles / quants — two 4-bit values per byte
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32 // number of weights per q4_1 quantization block
// q4_1: QK4_1 weights as unsigned 4-bit quants with a float scale and minimum.
// Dequantized value (see dequantize_block_q4_1): quant * d + m.
typedef struct {
    float d; // delta (scale)
    float m; // min
    uint8_t qs[QK4_1 / 2]; // nibbles / quants — two 4-bit values per byte
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 16 // number of weights per q4_2 quantization block
// q4_2: like q4_0 but half-size blocks and a half-precision scale.
// Dequantized value (see dequantize_block_q4_2): (quant - 8) * d.
typedef struct {
    __half d; // delta (scale), fp16
    uint8_t qs[QK4_2 / 2]; // nibbles / quants — two 4-bit values per byte
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK4_3 16 // number of weights per q4_3 quantization block
// q4_3: like q4_1 but half-size blocks and half-precision scale/min.
// Dequantized value (see dequantize_block_q4_3): quant * d + m.
typedef struct {
    __half d; // delta (scale), fp16
    __half m; // min, fp16
    uint8_t qs[QK4_3 / 2]; // nibbles / quants — two 4-bit values per byte
} block_q4_3;
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
  34. static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
  35. const block_q4_0 * x = (const block_q4_0 *) vx;
  36. const int i = blockIdx.x;
  37. const float d = x[i].d;
  38. const uint8_t * pp = x[i].qs;
  39. for (int l = 0; l < QK4_0; l += 2) {
  40. const uint8_t vi = pp[l/2];
  41. const int8_t vi0 = vi & 0xf;
  42. const int8_t vi1 = vi >> 4;
  43. const float v0 = (vi0 - 8)*d;
  44. const float v1 = (vi1 - 8)*d;
  45. y[i*QK4_0 + l + 0] = v0;
  46. y[i*QK4_0 + l + 1] = v1;
  47. }
  48. }
  49. static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
  50. const block_q4_1 * x = (const block_q4_1 *) vx;
  51. const int i = blockIdx.x;
  52. const float d = x[i].d;
  53. const float m = x[i].m;
  54. const uint8_t * pp = x[i].qs;
  55. for (int l = 0; l < QK4_1; l += 2) {
  56. const uint8_t vi = pp[l/2];
  57. const int8_t vi0 = vi & 0xf;
  58. const int8_t vi1 = vi >> 4;
  59. const float v0 = vi0*d + m;
  60. const float v1 = vi1*d + m;
  61. y[i*QK4_1 + l + 0] = v0;
  62. y[i*QK4_1 + l + 1] = v1;
  63. }
  64. }
  65. static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
  66. const block_q4_2 * x = (const block_q4_2 *) vx;
  67. const int i = blockIdx.x;
  68. const float d = x[i].d;
  69. const uint8_t * pp = x[i].qs;
  70. for (int l = 0; l < QK4_2; l += 2) {
  71. const uint8_t vi = pp[l/2];
  72. const int8_t vi0 = vi & 0xf;
  73. const int8_t vi1 = vi >> 4;
  74. const float v0 = (vi0 - 8)*d;
  75. const float v1 = (vi1 - 8)*d;
  76. y[i*QK4_2 + l + 0] = v0;
  77. y[i*QK4_2 + l + 1] = v1;
  78. }
  79. }
  80. static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
  81. const block_q4_3 * x = (const block_q4_3 *) vx;
  82. const int i = blockIdx.x;
  83. const float d = x[i].d;
  84. const float m = x[i].m;
  85. const uint8_t * pp = x[i].qs;
  86. for (int l = 0; l < QK4_3; l += 2) {
  87. const uint8_t vi = pp[l/2];
  88. const int8_t vi0 = vi & 0xf;
  89. const int8_t vi1 = vi >> 4;
  90. const float v0 = vi0*d + m;
  91. const float v1 = vi1*d + m;
  92. y[i*QK4_3 + l + 0] = v0;
  93. y[i*QK4_3 + l + 1] = v1;
  94. }
  95. }
  96. void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
  97. const int nb = k / QK4_0;
  98. dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
  99. }
  100. void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
  101. const int nb = k / QK4_1;
  102. dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
  103. }
  104. void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
  105. const int nb = k / QK4_2;
  106. dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
  107. }
  108. void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
  109. const int nb = k / QK4_3;
  110. dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
  111. }
  112. // buffer pool for cuda
  113. #define MAX_CUDA_BUFFERS 16
  114. struct scoped_spin_lock {
  115. std::atomic_flag& lock;
  116. scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
  117. while (lock.test_and_set(std::memory_order_acquire)) {
  118. ; // spin
  119. }
  120. }
  121. ~scoped_spin_lock() {
  122. lock.clear(std::memory_order_release);
  123. }
  124. scoped_spin_lock(const scoped_spin_lock&) = delete;
  125. scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
  126. };
// One pooled device allocation; a slot is free when ptr == nullptr.
struct cuda_buffer {
    void * ptr = nullptr; // device pointer (from cudaMalloc)
    size_t size = 0;      // allocation size in bytes
};

// Fixed-capacity pool of idle device buffers, guarded by g_cuda_pool_lock
// (a spin lock taken via scoped_spin_lock).
static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
  133. void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
  134. scoped_spin_lock lock(g_cuda_pool_lock);
  135. for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
  136. cuda_buffer& b = g_cuda_buffer_pool[i];
  137. if (b.size >= size && b.ptr != nullptr) {
  138. void * ptr = b.ptr;
  139. *actual_size = b.size;
  140. b.ptr = nullptr;
  141. b.size = 0;
  142. return ptr;
  143. }
  144. }
  145. void * ptr;
  146. CUDA_CHECK(cudaMalloc((void **) &ptr, size));
  147. *actual_size = size;
  148. return ptr;
  149. }
  150. void ggml_cuda_pool_free(void * ptr, size_t size) {
  151. scoped_spin_lock lock(g_cuda_pool_lock);
  152. for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
  153. cuda_buffer& b = g_cuda_buffer_pool[i];
  154. if (b.ptr == nullptr) {
  155. b.ptr = ptr;
  156. b.size = size;
  157. return;
  158. }
  159. }
  160. fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
  161. CUDA_CHECK(cudaFree(ptr));
  162. }
// Global cuBLAS handle and the non-blocking stream it is bound to;
// both are created lazily by ggml_init_cublas().
cublasHandle_t g_cublasH = NULL;
cudaStream_t g_cudaStream = NULL;
  165. void ggml_init_cublas(void) {
  166. if (g_cublasH == NULL) {
  167. // create cublas handle, bind a stream
  168. CUBLAS_CHECK(cublasCreate(&g_cublasH));
  169. CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
  170. CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
  171. // configure logging to stdout
  172. // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
  173. }
  174. }