| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- #include "cuda_common.cuh"
- // ============================================================
- // Q6_K Dequantization Kernel
- // 256 elements, lower 4 bits + upper 2 bits
- // ============================================================
- __global__ void dequant_q6k_kernel(const BlockQ6_K* blocks, float* out, int numBlocks) {
- int blockIdx_q = blockIdx.x;
- int elemIdx = threadIdx.x; // 0-255
-
- if (blockIdx_q >= numBlocks) return;
-
- const BlockQ6_K* b = &blocks[blockIdx_q];
- float d = fp16_to_fp32(b->d);
-
- // Position within 128-element halves
- int half = elemIdx / 128;
- int pos = elemIdx % 128;
- const int is = elemIdx / 32;
- const int iq = elemIdx % 32;
-
- int qlIdx = (is / 4) * 64 + (is % 2) * 32 + iq;
- int qhIdx = (is / 4) * 32 + iq;
- int scIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16);
-
- unsigned char ql = b->ql[qlIdx];
- unsigned char qh = b->qh[qhIdx];
-
- int shift_ql = ((is % 4) < 2) ? 0 : 4;
- int shift_qh = (is % 4) * 2;
-
- int q = ((ql >> shift_ql) & 0xF) | (((qh >> shift_qh) & 3) << 4);
- q -= 32;
-
- out[blockIdx_q * 256 + elemIdx] = d * (float)b->scales[scIdx] * (float)q;
- }
- int cuda_dequant_q6k(const void* blocks, float* out, int numBlocks) {
- if (numBlocks == 0) return 0;
- dequant_q6k_kernel<<<numBlocks, 256>>>((const BlockQ6_K*)blocks, out, numBlocks);
- CHECK_CUDA(cudaGetLastError());
- return 0;
- }
- // ============================================================
- // Q3_K Dequantization Kernel
- // ============================================================
- __device__ __forceinline__ signed char unpack_q3_scale(const unsigned char* packed, int idx) {
- unsigned char sc;
- if (idx < 8) {
- sc = packed[idx] & 0xF;
- } else {
- sc = packed[idx - 8] >> 4;
- }
- sc |= ((packed[8 + (idx % 4)] >> (2 * (idx / 4))) & 0x3) << 4;
- return (signed char)sc - 32;
- }
- __global__ void dequant_q3k_kernel(const BlockQ3_K* blocks, float* out, int numBlocks) {
- int blockIdx_q = blockIdx.x;
- int elemIdx = threadIdx.x;
-
- if (blockIdx_q >= numBlocks) return;
-
- const BlockQ3_K* b = &blocks[blockIdx_q];
- float d = fp16_to_fp32(b->d);
-
- const int is = elemIdx / 32;
- const int iq = elemIdx % 32;
-
- int qsIdx = (is / 4) * 32 + iq;
- int hmaskIdx = iq;
-
- int scaleIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16);
-
- int shift = (is % 4) * 2;
- unsigned char m = 1 << ((is / 4) * 4 + (is % 4));
-
- signed char scale = unpack_q3_scale(b->scales, scaleIdx);
- int qv = (b->qs[qsIdx] >> shift) & 0x3;
- if ((b->hmask[hmaskIdx] & m) == 0) {
- qv -= 4;
- }
-
- out[blockIdx_q * 256 + elemIdx] = d * (float)scale * (float)qv;
- }
- int cuda_dequant_q3k(const void* blocks, float* out, int numBlocks) {
- if (numBlocks == 0) return 0;
- dequant_q3k_kernel<<<numBlocks, 256>>>((const BlockQ3_K*)blocks, out, numBlocks);
- CHECK_CUDA(cudaGetLastError());
- return 0;
- }
- // ============================================================
- // Q2_K Dequantization Kernel
- // ============================================================
- __global__ void dequant_q2k_kernel(const BlockQ2_K* blocks, float* out, int numBlocks) {
- int blockIdx_q = blockIdx.x;
- int elemIdx = threadIdx.x;
-
- if (blockIdx_q >= numBlocks) return;
-
- const BlockQ2_K* b = &blocks[blockIdx_q];
- float d = fp16_to_fp32(b->d);
- float dmin = fp16_to_fp32(b->dmin);
-
- const int is = elemIdx / 32;
- const int iq = elemIdx % 32;
-
- int scIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16);
- int qsIdx = (is / 4) * 32 + iq;
- int shift = (is % 4) * 2;
-
- unsigned char sc = b->scales[scIdx];
- float dl = d * (float)(sc & 0xF);
- float ml = dmin * (float)(sc >> 4);
-
- int val = (b->qs[qsIdx] >> shift) & 3;
-
- out[blockIdx_q * 256 + elemIdx] = dl * (float)val - ml;
- }
- int cuda_dequant_q2k(const void* blocks, float* out, int numBlocks) {
- if (numBlocks == 0) return 0;
- dequant_q2k_kernel<<<numBlocks, 256>>>((const BlockQ2_K*)blocks, out, numBlocks);
- CHECK_CUDA(cudaGetLastError());
- return 0;
- }
|