#include "cuda_common.cuh" // ============================================================ // Q6_K Dequantization Kernel // 256 elements, lower 4 bits + upper 2 bits // ============================================================ __global__ void dequant_q6k_kernel(const BlockQ6_K* blocks, float* out, int numBlocks) { int blockIdx_q = blockIdx.x; int elemIdx = threadIdx.x; // 0-255 if (blockIdx_q >= numBlocks) return; const BlockQ6_K* b = &blocks[blockIdx_q]; float d = fp16_to_fp32(b->d); // Position within 128-element halves int half = elemIdx / 128; int pos = elemIdx % 128; const int is = elemIdx / 32; const int iq = elemIdx % 32; int qlIdx = (is / 4) * 64 + (is % 2) * 32 + iq; int qhIdx = (is / 4) * 32 + iq; int scIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16); unsigned char ql = b->ql[qlIdx]; unsigned char qh = b->qh[qhIdx]; int shift_ql = ((is % 4) < 2) ? 0 : 4; int shift_qh = (is % 4) * 2; int q = ((ql >> shift_ql) & 0xF) | (((qh >> shift_qh) & 3) << 4); q -= 32; out[blockIdx_q * 256 + elemIdx] = d * (float)b->scales[scIdx] * (float)q; } int cuda_dequant_q6k(const void* blocks, float* out, int numBlocks) { if (numBlocks == 0) return 0; dequant_q6k_kernel<<>>((const BlockQ6_K*)blocks, out, numBlocks); CHECK_CUDA(cudaGetLastError()); return 0; } // ============================================================ // Q3_K Dequantization Kernel // ============================================================ __device__ __forceinline__ signed char unpack_q3_scale(const unsigned char* packed, int idx) { unsigned char sc; if (idx < 8) { sc = packed[idx] & 0xF; } else { sc = packed[idx - 8] >> 4; } sc |= ((packed[8 + (idx % 4)] >> (2 * (idx / 4))) & 0x3) << 4; return (signed char)sc - 32; } __global__ void dequant_q3k_kernel(const BlockQ3_K* blocks, float* out, int numBlocks) { int blockIdx_q = blockIdx.x; int elemIdx = threadIdx.x; if (blockIdx_q >= numBlocks) return; const BlockQ3_K* b = &blocks[blockIdx_q]; float d = fp16_to_fp32(b->d); const int is = elemIdx / 32; const int iq = elemIdx % 32; int qsIdx = (is / 4) * 32 + iq; int hmaskIdx = iq; int scaleIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16); int shift = (is % 4) * 2; unsigned char m = 1 << ((is / 4) * 4 + (is % 4)); signed char scale = unpack_q3_scale(b->scales, scaleIdx); int qv = (b->qs[qsIdx] >> shift) & 0x3; if ((b->hmask[hmaskIdx] & m) == 0) { qv -= 4; } out[blockIdx_q * 256 + elemIdx] = d * (float)scale * (float)qv; } int cuda_dequant_q3k(const void* blocks, float* out, int numBlocks) { if (numBlocks == 0) return 0; dequant_q3k_kernel<<>>((const BlockQ3_K*)blocks, out, numBlocks); CHECK_CUDA(cudaGetLastError()); return 0; } // ============================================================ // Q2_K Dequantization Kernel // ============================================================ __global__ void dequant_q2k_kernel(const BlockQ2_K* blocks, float* out, int numBlocks) { int blockIdx_q = blockIdx.x; int elemIdx = threadIdx.x; if (blockIdx_q >= numBlocks) return; const BlockQ2_K* b = &blocks[blockIdx_q]; float d = fp16_to_fp32(b->d); float dmin = fp16_to_fp32(b->dmin); const int is = elemIdx / 32; const int iq = elemIdx % 32; int scIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16); int qsIdx = (is / 4) * 32 + iq; int shift = (is % 4) * 2; unsigned char sc = b->scales[scIdx]; float dl = d * (float)(sc & 0xF); float ml = dmin * (float)(sc >> 4); int val = (b->qs[qsIdx] >> shift) & 3; out[blockIdx_q * 256 + elemIdx] = dl * (float)val - ml; } int cuda_dequant_q2k(const void* blocks, float* out, int numBlocks) { if (numBlocks == 0) return 0; dequant_q2k_kernel<<>>((const BlockQ2_K*)blocks, out, numBlocks); CHECK_CUDA(cudaGetLastError()); return 0; }