#include "cuda_common.cuh" __global__ void dequant_q5k_kernel(const BlockQ5_K* blocks, float* out, int numBlocks) { int blockIdx_q = blockIdx.x; int elemIdx = threadIdx.x; if (blockIdx_q >= numBlocks) return; const BlockQ5_K* b = &blocks[blockIdx_q]; float d = fp16_to_fp32(b->d); float dmin = fp16_to_fp32(b->dmin); unsigned char sc[8], m[8]; #pragma unroll for (int j = 0; j < 4; j++) { sc[j] = b->scales[j] & 63; m[j] = b->scales[j + 4] & 63; } #pragma unroll for (int j = 4; j < 8; j++) { sc[j] = (b->scales[j + 4] & 0xF) | ((b->scales[j - 4] >> 6) << 4); m[j] = (b->scales[j + 4] >> 4) | ((b->scales[j] >> 6) << 4); } int subBlock = elemIdx / 32; int subPos = elemIdx % 32; int chunk = elemIdx / 64; int posInChunk = elemIdx % 64; int qsIdx = chunk * 32 + (posInChunk & 31); unsigned char qs = b->qs[qsIdx]; int val; unsigned char hb = b->qh[posInChunk & 31]; if (posInChunk < 32) { val = (qs & 0xF); val += ((hb >> (2 * chunk)) & 1) << 4; } else { val = (qs >> 4); val += ((hb >> (2 * chunk + 1)) & 1) << 4; } float scale = d * (float)sc[subBlock]; float minVal = dmin * (float)m[subBlock]; out[blockIdx_q * 256 + elemIdx] = (float)val * scale - minVal; } int cuda_dequant_q5k(const void* blocks, float* out, int numBlocks) { if (numBlocks == 0) return 0; dequant_q5k_kernel<<>>((const BlockQ5_K*)blocks, out, numBlocks); CHECK_CUDA(cudaGetLastError()); return 0; }