| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344 |
- #ifndef MAKARNA_CUDA_H
- #define MAKARNA_CUDA_H
- #include <stddef.h>
- #ifdef __cplusplus
- extern "C" {
- #endif
- // Memory Management: device selection, raw allocation, copies and queries. Functions returning int report a status code; only cuda_topk_logits_f32 documents "0 on success" in this header, so treat that as the presumed convention (TODO confirm).
- int cuda_set_device(int id); // id: device index; presumably selects the device used by subsequent calls - confirm in the implementation
- void* cuda_malloc(size_t size); // size: amount requested (presumably bytes); returns the allocation as void* (presumably a device pointer, NULL on failure - TODO confirm)
- void cuda_free(void* ptr); // ptr: pointer to release, presumably one obtained from cuda_malloc; no status is returned
- // Synchronization entry point: takes no arguments and returns an int status (presumably 0 on success, and presumably blocks until previously queued GPU work has finished - confirm against the implementation).
- // Declared with an explicit (void) so C callers see a real prototype; an empty () parameter list in C leaves the arguments unspecified and disables argument checking. In C++ the two spellings are identical, so the definition is unaffected.
- int cuda_synchronize(void);
- int cuda_memcpy_h2d(void* dst, void* src, size_t size); // copies size (presumably bytes) from src to dst; the h2d suffix suggests host -> device. NOTE(review): src is not const-qualified although it is presumably only read
- int cuda_memcpy_d2h(void* dst, void* src, size_t size); // same shape as above; the d2h suffix suggests device -> host
- int cuda_memcpy_d2d(void* dst, void* src, size_t size); // same shape as above; the d2d suffix suggests device -> device
- int cuda_mem_info(size_t* free_bytes, size_t* total_bytes); // out-params: free and total memory in bytes, per the parameter names; which device is queried is not visible here
- int cuda_device_count(int* count); // out-param: *count receives the number of devices (presumably CUDA devices visible to the process - TODO confirm)
- // Math Operations (Float32, plus one float16-input matmul)
- // Launches kernels on the default stream; whether the call waits for completion is not stated in this header.
- int cuda_add_f32(float* a, float* b, size_t n); // element-wise over n floats; which operand receives the result is not documented (presumably a, in place - TODO confirm)
- int cuda_mul_f32(float* a, float* b, size_t n); // element-wise over n floats; destination presumably a, as for cuda_add_f32 - TODO confirm
- int cuda_matmul_f32(float* A, float* B, float* C, int M, int K, int N); // presumably C[M, N] = A[M, K] * B[K, N]; operand layouts are not documented here - confirm
- // MatMul where B is row-major [N, K] (no host transpose needed), i.e. B is supplied already transposed relative to the product.
- int cuda_matmul_f32_nt(float* A, float* B, float* C, int M, int K, int N); // presumably C[M, N] = A[M, K] * B^T with B given as [N, K]
- // MatMul where A and B are float16 (IEEE half stored as uint16); the output C is float32.
- // B is row-major [N, K] and interpreted as column-major [K, N] (the same memory, read as the transposed operand).
- int cuda_matmul_f16_nt(const unsigned short* A, const unsigned short* B, float* C, int M, int K, int N); // A presumably [M, K]; C presumably [M, N]
- // ============================================================
- // Neural Network Operations (all operate in-place on float32 buffers, per the comments below)
- // ============================================================
- // RMSNorm: x = x * rsqrt(mean(x^2) + eps) * weight (mean presumably taken over the dim axis of each row)
- // x: [seqLen, dim], w: [dim] -> modifies x in-place; eps is the constant added under the rsqrt
- int cuda_rmsnorm_f32(float* x, const float* w, int seqLen, int dim, float eps);
- // RoPE: Apply rotary positional embeddings in-place
- // x: [seqLen, numHeads * headDim]
- // positions: [seqLen] - position indices, one per row of x
- int cuda_rope_f32(float* x, const int* positions, int seqLen, int numHeads, int headDim, float theta); // theta: presumably the RoPE frequency base - TODO confirm
- int cuda_rope_f32_single(float* x, int pos, int numHeads, int headDim, float theta); // variant taking one scalar pos instead of a positions array; x presumably [numHeads * headDim] - TODO confirm
- // Softmax: Apply softmax along last dimension (cols)
- // x: [rows, cols] -> in-place
- int cuda_softmax_f32(float* x, int rows, int cols);
- // Top-K selection on logits with optional repetition penalty.
- // logits: [vocab] float32 scores, read-only
- // rep_ids: [rep_count] token ids to penalize (rep_penalty is the penalty strength; its exact formula is not visible here)
- // out_ids/out_scores: [numBlocks * k] - NOTE(review): numBlocks is not a parameter; presumably the number of thread blocks the implementation launches, each emitting k candidates. Confirm how callers must size these buffers.
- // Returns 0 on success.
- int cuda_topk_logits_f32(
- const float* logits, int vocab,
- const int* rep_ids, int rep_count, float rep_penalty,
- int k, // number of candidates kept (per block, going by the output shape above)
- int* out_ids, float* out_scores); // parallel output arrays: token ids and their scores
- // Causal Attention: Full attention computation over contiguous K/V buffers
- // Q: [seqLen, numHeads * headDim]
- // K: [kvLen, numKVHeads * headDim]
- // V: [kvLen, numKVHeads * headDim]
- // out: [seqLen, numHeads * headDim]
- // scale: typically 1/sqrt(headDim)
- // startPos: for causal mask offset (KV cache); presumably the absolute position of the first query row - TODO confirm
- int cuda_attention_f32(
- const float* Q, const float* K, const float* V, float* out,
- int seqLen, int kvLen, int numHeads, int numKVHeads, int headDim, // numKVHeads may differ from numHeads; the head-sharing rule is not documented here
- float scale, int startPos);
- int cuda_paged_attention_f32( // attention variant taking K/V as arrays of block pointers instead of contiguous buffers; returns an int status
- const float* Q, // float32 queries; presumably [seqLen, numHeads * headDim] as in cuda_attention_f32 - TODO confirm
- const float* const* KBlocks, // array of pointers to float32 K blocks; block contents/shape not documented here
- const float* const* VBlocks, // array of pointers to float32 V blocks, parallel to KBlocks
- float* out, // float32 output; presumably the same shape as Q
- int seqLen, int kvLen, int numHeads, int numKVHeads, int headDim,
- int blockSize, // presumably the number of KV positions stored per block - TODO confirm
- float scale, int startPos); // same names as in cuda_attention_f32 (score scale, causal offset)
- int cuda_paged_attention_batch_f32( // batched form: numTokens query tokens, each with its own KV length and query position
- const float* Q, // float32 queries; presumably [numTokens, numHeads * headDim] - TODO confirm
- const float* const* KBlocksFlat, // block-pointer array for all tokens/sequences concatenated ("flat"), indexed via blockOffsets
- const float* const* VBlocksFlat, // V counterpart of KBlocksFlat
- const int* blockOffsets, // presumably per-token start offsets into the flat block arrays; length not documented - TODO confirm
- const int* kvLens, // presumably per-token KV lengths
- const int* queryPos, // presumably per-token query positions used for causal masking
- float* out, // float32 output; presumably the same shape as Q
- int numTokens,
- int numHeads, int numKVHeads, int headDim,
- int blockSize, // presumably KV positions per block, as in the non-batch variant
- float scale,
- int maxKvLen); // presumably an upper bound on the kvLens entries (e.g. for sizing scratch space) - confirm
- // Paged attention where KV blocks are float16 (IEEE half stored as uint16).
- // Q and out are float32. Accumulation is float32. Parameter lists mirror the float32 paged variants except for the K/V element type.
- int cuda_paged_attention_f32_f16kv(
- const float* Q, // float32 queries
- const unsigned short* const* KBlocks, // array of pointers to K blocks holding half-precision bit patterns
- const unsigned short* const* VBlocks, // array of pointers to V blocks holding half-precision bit patterns
- float* out, // float32 output
- int seqLen, int kvLen, int numHeads, int numKVHeads, int headDim,
- int blockSize, // presumably KV positions per block - TODO confirm
- float scale, int startPos);
- int cuda_paged_attention_batch_f32_f16kv( // batched form with per-token metadata arrays
- const float* Q, // float32 queries
- const unsigned short* const* KBlocksFlat, // flat array of float16 K block pointers, indexed via blockOffsets
- const unsigned short* const* VBlocksFlat, // flat array of float16 V block pointers
- const int* blockOffsets, // presumably per-token offsets into the flat block arrays - TODO confirm
- const int* kvLens, // presumably per-token KV lengths
- const int* queryPos, // presumably per-token query positions
- float* out, // float32 output
- int numTokens,
- int numHeads, int numKVHeads, int headDim,
- int blockSize,
- float scale,
- int maxKvLen); // presumably an upper bound on the kvLens entries - confirm
- // Fused RoPE + paged attention where KV blocks are float16 (IEEE half stored as uint16).
- // Expects un-rotated Q and un-rotated K blocks; RoPE is applied on-the-fly in the attention kernel, so callers must not apply cuda_rope_f32 beforehand.
- int cuda_paged_attention_rope_f32_f16kv(
- const float* Q, // float32 queries, NOT yet rotated
- const unsigned short* const* KBlocks, // float16 K block pointers, NOT yet rotated
- const unsigned short* const* VBlocks, // float16 V block pointers
- float* out, // float32 output
- int seqLen, int kvLen, int numHeads, int numKVHeads, int headDim,
- int blockSize, // presumably KV positions per block - TODO confirm
- float scale, int startPos,
- float theta); // RoPE parameter, same name as in cuda_rope_f32 (presumably the frequency base)
- int cuda_paged_attention_rope_batch_f32_f16kv( // batched form with per-token metadata arrays
- const float* Q, // float32 queries, NOT yet rotated
- const unsigned short* const* KBlocksFlat, // flat array of un-rotated float16 K block pointers
- const unsigned short* const* VBlocksFlat, // flat array of float16 V block pointers
- const int* blockOffsets, // presumably per-token offsets into the flat block arrays - TODO confirm
- const int* kvLens, // presumably per-token KV lengths
- const int* queryPos, // presumably per-token positions, used for masking and for the rotation angle - confirm
- float* out, // float32 output
- int numTokens,
- int numHeads, int numKVHeads, int headDim,
- int blockSize,
- float scale,
- int maxKvLen, // presumably an upper bound on the kvLens entries - confirm
- float theta); // RoPE parameter (presumably the frequency base)
- // Cast float32 -> float16 (stored as uint16) on GPU; n is the element count (note: int here, while the element-wise ops below take size_t).
- int cuda_cast_f32_to_f16(const float* src, unsigned short* dst, int n);
- int cuda_attention_f32_timed( // same parameter list as cuda_attention_f32 plus a trailing ms out-param
- const float* Q, const float* K, const float* V, float* out,
- int seqLen, int kvLen, int numHeads, int numKVHeads, int headDim,
- float scale, int startPos, float* ms); // ms: presumably receives the measured kernel time in milliseconds - TODO confirm what interval is timed
- // SiLU activation: x = x * sigmoid(x), in-place over n floats
- int cuda_silu_f32(float* x, size_t n);
- // Element-wise multiply: a = a * b, in-place (a is overwritten, b is read-only) over n floats
- int cuda_mul_inplace_f32(float* a, const float* b, size_t n);
- // Copy: dst = src, n float32 elements (n counts floats, presumably not bytes, given the typed pointers - confirm)
- int cuda_copy_f32(float* dst, const float* src, size_t n);
- int cuda_kda_causal_short_conv1d_f32( // by its name, a causal short 1-D convolution for the "kda" family; KDA is not expanded anywhere in this header. Returns an int status.
- float* x, // non-const: presumably the activations, updated in place; presumably [tokens, projSize] - TODO confirm
- float* state, // non-const: presumably the carried convolution history, updated by the call - TODO confirm shape
- const float* w, // read-only convolution weights; presumably projSize * kernel values - TODO confirm layout
- int tokens,
- int projSize,
- int kernel); // presumably the convolution kernel width (number of taps)
- int cuda_l2norm_heads_f32( // by its name, L2-normalizes per head; both q and k are non-const, so presumably both are normalized in place. Returns an int status.
- float* q, // presumably [tokens, numHeads * headDim] - TODO confirm
- float* k, // presumably the same shape as q
- int tokens,
- int numHeads,
- int headDim,
- float eps); // presumably a small constant guarding the division by the norm - confirm where it is added
- int cuda_kda_gate_f32( // computes a gate tensor into out from three read-only inputs; the formula is not visible in this header. Returns an int status.
- const float* g, // read-only gate input; presumably [tokens, numHeads * headDim] - TODO confirm
- const float* aLog, // read-only parameter; name suggests a log-space coefficient - shape not documented
- const float* dtBias, // read-only parameter; name suggests a bias term - shape not documented
- float* out, // destination buffer; presumably the same shape as g
- int tokens,
- int numHeads,
- int headDim);
- int cuda_kda_recurrent_f32( // by its name, the recurrent step(s) of the "kda" operator over tokens; returns an int status
- const float* q, // read-only; presumably [tokens, numHeads * headDim] - TODO confirm
- const float* k, // read-only; presumably the same shape as q
- float* v, // non-const: presumably overwritten with the result (in/out) - TODO confirm
- const float* g, // read-only gate values; presumably produced by cuda_kda_gate_f32 - confirm
- // beta is a device pointer: [tokens, numHeads] (row-major), i.e. one value per token and head.
- const float* beta,
- float* state, // non-const: presumably the recurrent state carried across calls and updated here - shape not documented
- int tokens,
- int numHeads,
- int headDim);
- int cuda_rmsnorm_gated_f32( // by its name, an RMSNorm combined with a gate g; the exact formula is not visible in this header. Returns an int status.
- float* out, // non-const: presumably both the input being normalized and the destination (in place) - TODO confirm
- const float* g, // read-only gate values
- const float* weight, // read-only norm weights; presumably [headDim]
- int n, // presumably the number of rows (vectors of headDim elements) to normalize - confirm
- int headDim,
- float eps); // presumably added under the rsqrt, as in cuda_rmsnorm_f32
- int cuda_sigmoid_f32(float* x, int n); // x is non-const: presumably sigmoid applied in place over n floats - TODO confirm
- int cuda_softmax_rows_f32(float* x, int rows, int cols); // same signature as cuda_softmax_f32; how the two differ is not visible in this header
- int cuda_topk_per_row_f32( // selects k entries from each row of scores; returns an int status
- const float* scores, // read-only input; presumably [rows, cols]
- int* indices, // output; presumably [rows, k] column indices of the selected entries - TODO confirm ordering
- float* values, // output; presumably [rows, k] scores matching indices
- int rows,
- int cols,
- int k);
- // ============================================================
- // Dequantization Kernels
- // These convert quantized blocks to float32 on GPU
- // ============================================================
- // Number of values held by one K-quantization block (every Block*_K struct below covers 256 values)
- #define QK_K 256
- // BlockQ8_K: one Q8_K block covering 256 values, 292 bytes per block (4 + 256 + 32). Field order and sizes presumably mirror the raw quantized data passed as `const void*`, so they must not be reordered.
- // - D (4 bytes): float32 scale for the block
- // - QS (256 bytes): 256 int8 quants, one per value
- // - BSums (32 bytes): 16 int16 entries; unused in dequant
- typedef struct {
- float d; // block scale
- signed char qs[256]; // signed 8-bit quantized values
- short bsums[16]; // not read by the dequantization path (presumably partial sums of qs - TODO confirm)
- } BlockQ8_K;
- // BlockQ4_K: one Q4_K block covering 256 values, 144 bytes per block (2 + 2 + 12 + 128)
- // - D (2 bytes): float16 super-scale (raw half bits)
- // - DMin (2 bytes): float16 super-min (raw half bits)
- // - Scales (12 bytes): packed 6-bit scales/mins (96 bits, i.e. room for sixteen 6-bit fields)
- // - QS (128 bytes): 256 4-bit quants, two per byte
- typedef struct {
- unsigned short d; // half-precision bit pattern, not an integer value
- unsigned short dmin; // half-precision bit pattern, not an integer value
- unsigned char scales[12]; // bit-packed; the packing order is defined by the kernels, not visible here
- unsigned char qs[128]; // two 4-bit quants per byte; nibble order not visible here
- } BlockQ4_K;
- typedef struct { // BlockQ5_K: by field sizes 2 + 2 + 12 + 32 + 128 = 176 bytes per block; layout parallels BlockQ4_K with an extra qh array
- unsigned short d; // presumably float16 super-scale (raw half bits), as in BlockQ4_K
- unsigned short dmin; // presumably float16 super-min (raw half bits), as in BlockQ4_K
- unsigned char scales[12]; // presumably packed 6-bit scales/mins, as in BlockQ4_K
- unsigned char qh[32]; // 256 bits: presumably one high (5th) bit per value - TODO confirm
- unsigned char qs[128]; // presumably the low 4 bits of 256 quants, two per byte
- } BlockQ5_K;
- // BlockQ6_K: one Q6_K block covering 256 values, 210 bytes per block (128 + 64 + 16 + 2)
- // - QL (128 bytes): lower 4 bits of each quant, two per byte
- // - QH (64 bytes): upper 2 bits of each quant, four per byte
- // - Scales (16 bytes): 8-bit scales (signed)
- // - D (2 bytes): float16 super-scale (raw half bits)
- typedef struct {
- unsigned char ql[128]; // low nibbles
- unsigned char qh[64]; // high 2-bit fields
- signed char scales[16]; // signed 8-bit scales; presumably one per 16 values (256 / 16) - TODO confirm
- unsigned short d; // half-precision bit pattern; note it sits at the END of this struct, unlike Q4_K/Q5_K
- } BlockQ6_K;
- // BlockQ3_K: one Q3_K block covering 256 values, 110 bytes per block (32 + 64 + 12 + 2)
- // - HMask (32 bytes): high bits, 256 bits = one per value
- // - QS (64 bytes): low 2 bits of each quant, four per byte
- // - Scales (12 bytes): packed 6-bit scales
- // - D (2 bytes): float16 super-scale (raw half bits)
- typedef struct {
- unsigned char hmask[32]; // one high bit per value
- unsigned char qs[64]; // 2-bit fields
- unsigned char scales[12]; // bit-packed; packing order not visible here
- unsigned short d; // half-precision bit pattern, stored last
- } BlockQ3_K;
- // BlockQ2_K: one Q2_K block covering 256 values, 84 bytes per block (16 + 64 + 2 + 2)
- // - Scales (16 bytes): packed 4-bit scales/mins, two fields per byte
- // - QS (64 bytes): 256 2-bit quants, four per byte
- // - D (2 bytes): float16 super-scale (raw half bits)
- // - DMin (2 bytes): float16 super-min (raw half bits)
- typedef struct {
- unsigned char scales[16]; // which nibble holds the scale vs. the min is not visible here
- unsigned char qs[64]; // 2-bit fields
- unsigned short d; // half-precision bit pattern
- unsigned short dmin; // half-precision bit pattern
- } BlockQ2_K;
- // Dequantize a row of Q8_K blocks: numBlocks * 256 values -> out (out must therefore hold numBlocks * 256 floats)
- int cuda_dequant_q8k(const void* blocks, float* out, int numBlocks); // blocks: presumably a device pointer to numBlocks BlockQ8_K structs - TODO confirm
- // Dequantize a row of Q4_K blocks (presumably numBlocks * 256 values, as for Q8_K)
- int cuda_dequant_q4k(const void* blocks, float* out, int numBlocks);
- int cuda_dequant_q5k(const void* blocks, float* out, int numBlocks); // Q5_K counterpart; same signature as the others
- // Dequantize a row of Q6_K blocks (presumably numBlocks * 256 values, as for Q8_K)
- int cuda_dequant_q6k(const void* blocks, float* out, int numBlocks);
- // Dequantize a row of Q3_K blocks (presumably numBlocks * 256 values, as for Q8_K)
- int cuda_dequant_q3k(const void* blocks, float* out, int numBlocks);
- // Dequantize a row of Q2_K blocks (presumably numBlocks * 256 values, as for Q8_K)
- int cuda_dequant_q2k(const void* blocks, float* out, int numBlocks);
- // Fused Dequant + MatMul (for maximum performance): no intermediate float32 copy of B is needed by the caller
- // A: [M, K] float32 input
- // B: quantized weight blocks [N rows, K/256 blocks per row] (so K is presumably required to be a multiple of 256 - TODO confirm)
- // C: [M, N] float32 output
- // This dequantizes B on-the-fly and computes C = A @ B.T
- int cuda_matmul_f32_q8k(float* A, const void* B, float* C, int M, int K, int N); // B holds Q8_K blocks. NOTE(review): A is non-const although documented as input
- int cuda_matmul_f32_q5k(float* A, const void* B, float* C, int M, int K, int N); // B holds Q5_K blocks
- int cuda_matmul_f32_q6k(float* A, const void* B, float* C, int M, int K, int N); // B holds Q6_K blocks
- int cuda_matmul_f32_q4k(float* A, const void* B, float* C, int M, int K, int N); // B holds Q4_K blocks
- int cuda_matmul_f32_q3k(float* A, const void* B, float* C, int M, int K, int N); // B holds Q3_K blocks
- int cuda_matmul_f32_q2k(float* A, const void* B, float* C, int M, int K, int N); // B holds Q2_K blocks
- int cuda_matmul_f32_q8k_timed(float* A, const void* B, float* C, int M, int K, int N, float* ms); // as cuda_matmul_f32_q8k plus ms out-param (presumably elapsed milliseconds - confirm)
- int cuda_matmul_f32_q4k_timed(float* A, const void* B, float* C, int M, int K, int N, float* ms); // as cuda_matmul_f32_q4k plus ms out-param (presumably elapsed milliseconds - confirm)
- // FP16 Input Variants: activations are float16, i.e. half the bytes of float32 per element (the stated benefit is activation memory bandwidth)
- // A: [M, K] float16 input (passed as const void*), B: quantized, C: [M, N] float32 output
- int cuda_matmul_f16_q8k(const void* A, const void* B, float* C, int M, int K, int N); // B holds Q8_K blocks; presumably C = A @ B.T as in the float32 variants - confirm
- int cuda_matmul_f16_q4k(const void* A, const void* B, float* C, int M, int K, int N); // B holds Q4_K blocks
- int cuda_matmul_f16_q5k(const void* A, const void* B, float* C, int M, int K, int N); // B holds Q5_K blocks
- int cuda_matmul_f16_q2k(const void* A, const void* B, float* C, int M, int K, int N); // B holds Q2_K blocks
- int cuda_matmul_f16_q3k(const void* A, const void* B, float* C, int M, int K, int N); // B holds Q3_K blocks
- int cuda_matmul_f16_q6k(const void* A, const void* B, float* C, int M, int K, int N); // B holds Q6_K blocks
- // Debug helper: takes no arguments and returns an int status. Judging by the name it prints the sizes of the Block* structs declared in this header, which is useful for checking them against the byte counts in the struct comments; the output destination is not visible here - confirm in the implementation.
- // The explicit (void) makes this a true prototype for C callers; an empty () parameter list in C leaves the arguments unspecified. In C++ the two spellings are identical, so the definition is unaffected.
- int cuda_print_struct_sizes(void);
- #ifdef __cplusplus
- }
- #endif
- #endif // MAKARNA_CUDA_H
|