#include "cuda_common.cuh"

// ============================================================
// Q6_K Dequantization Kernel
// 256 elements, lower 4 bits + upper 2 bits
// ============================================================
__global__ void dequant_q6k_kernel(const BlockQ6_K* blocks, float* out, int numBlocks) {
    int blockIdx_q = blockIdx.x;
    int elemIdx = threadIdx.x; // 0-255
    
    if (blockIdx_q >= numBlocks) return;
    
    const BlockQ6_K* b = &blocks[blockIdx_q];
    float d = fp16_to_fp32(b->d);
    
    // Position within 128-element halves
    int half = elemIdx / 128;
    int pos = elemIdx % 128;
    const int is = elemIdx / 32;
    const int iq = elemIdx % 32;
    
    int qlIdx = (is / 4) * 64 + (is % 2) * 32 + iq;
    int qhIdx = (is / 4) * 32 + iq;
    int scIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16);
    
    unsigned char ql = b->ql[qlIdx];
    unsigned char qh = b->qh[qhIdx];
    
    int shift_ql = ((is % 4) < 2) ? 0 : 4;
    int shift_qh = (is % 4) * 2;
    
    int q = ((ql >> shift_ql) & 0xF) | (((qh >> shift_qh) & 3) << 4);
    q -= 32;
    
    out[blockIdx_q * 256 + elemIdx] = d * (float)b->scales[scIdx] * (float)q;
}

int cuda_dequant_q6k(const void* blocks, float* out, int numBlocks) {
    if (numBlocks == 0) return 0;
    dequant_q6k_kernel<<<numBlocks, 256>>>((const BlockQ6_K*)blocks, out, numBlocks);
    CHECK_CUDA(cudaGetLastError());
    return 0;
}

// ============================================================
// Q3_K Dequantization Kernel
// ============================================================
__device__ __forceinline__ signed char unpack_q3_scale(const unsigned char* packed, int idx) {
    unsigned char sc;
    if (idx < 8) {
        sc = packed[idx] & 0xF;
    } else {
        sc = packed[idx - 8] >> 4;
    }
    sc |= ((packed[8 + (idx % 4)] >> (2 * (idx / 4))) & 0x3) << 4;
    return (signed char)sc - 32;
}

__global__ void dequant_q3k_kernel(const BlockQ3_K* blocks, float* out, int numBlocks) {
    int blockIdx_q = blockIdx.x;
    int elemIdx = threadIdx.x;
    
    if (blockIdx_q >= numBlocks) return;
    
    const BlockQ3_K* b = &blocks[blockIdx_q];
    float d = fp16_to_fp32(b->d);
    
    const int is = elemIdx / 32;
    const int iq = elemIdx % 32;
    
    int qsIdx = (is / 4) * 32 + iq;
    int hmaskIdx = iq;
    
    int scaleIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16);
    
    int shift = (is % 4) * 2;
    unsigned char m = 1 << ((is / 4) * 4 + (is % 4));
    
    signed char scale = unpack_q3_scale(b->scales, scaleIdx);
    int qv = (b->qs[qsIdx] >> shift) & 0x3;
    if ((b->hmask[hmaskIdx] & m) == 0) {
        qv -= 4;
    }
    
    out[blockIdx_q * 256 + elemIdx] = d * (float)scale * (float)qv;
}

int cuda_dequant_q3k(const void* blocks, float* out, int numBlocks) {
    if (numBlocks == 0) return 0;
    dequant_q3k_kernel<<<numBlocks, 256>>>((const BlockQ3_K*)blocks, out, numBlocks);
    CHECK_CUDA(cudaGetLastError());
    return 0;
}

// ============================================================
// Q2_K Dequantization Kernel
// ============================================================
__global__ void dequant_q2k_kernel(const BlockQ2_K* blocks, float* out, int numBlocks) {
    int blockIdx_q = blockIdx.x;
    int elemIdx = threadIdx.x;
    
    if (blockIdx_q >= numBlocks) return;
    
    const BlockQ2_K* b = &blocks[blockIdx_q];
    float d = fp16_to_fp32(b->d);
    float dmin = fp16_to_fp32(b->dmin);
    
    const int is = elemIdx / 32;
    const int iq = elemIdx % 32;
    
    int scIdx = (is / 4) * 8 + (is % 4) * 2 + (iq / 16);
    int qsIdx = (is / 4) * 32 + iq;
    int shift = (is % 4) * 2;
    
    unsigned char sc = b->scales[scIdx];
    float dl = d * (float)(sc & 0xF);
    float ml = dmin * (float)(sc >> 4);
    
    int val = (b->qs[qsIdx] >> shift) & 3;
    
    out[blockIdx_q * 256 + elemIdx] = dl * (float)val - ml;
}

int cuda_dequant_q2k(const void* blocks, float* out, int numBlocks) {
    if (numBlocks == 0) return 0;
    dequant_q2k_kernel<<<numBlocks, 256>>>((const BlockQ2_K*)blocks, out, numBlocks);
    CHECK_CUDA(cudaGetLastError());
    return 0;
}