//go:build cuda

package cuda

import (
	"testing"
	"unsafe"

	"makarna/pkg/quant"
	"makarna/pkg/tensor"
)

func TestDequantQ8K_CUDA(t *testing.T) {
	// Create a simple Q8_K block.
	// Block layout: 4 bytes d (float32) + 256 bytes qs (int8) + 32 bytes bsums (16 x int16).
	blockSize := 292
	hostBlock := make([]byte, blockSize)

	// Set d = 0.5 (as float32 bytes).
	d := float32(0.5)
	dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
	copy(hostBlock[0:4], dBytes)

	// Set qs: values -128, -127, ..., 127 (as int8).
	for i := 0; i < 256; i++ {
		hostBlock[4+i] = byte(int8(i - 128))
	}

	// Upload block to GPU.
	gpu := 0
	devBlocks, err := UploadQ8K(hostBlock, 1, gpu)
	if err != nil {
		t.Fatalf("UploadQ8K failed: %v", err)
	}
	defer FreeDevicePtr(devBlocks)

	// Allocate output on GPU.
	outTensor, err := NewTensor(tensor.Shape{256}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor failed: %v", err)
	}

	// Dequantize.
	err = DequantQ8K(devBlocks, outTensor.ptr, 1, gpu)
	if err != nil {
		t.Fatalf("DequantQ8K failed: %v", err)
	}

	// Copy back and verify.
	hostOut := make([]float32, 256)
	if err := outTensor.CopyToHost(hostOut); err != nil {
		t.Fatalf("CopyToHost failed: %v", err)
	}

	// Check the first few values: out[i] should be d * qs[i] = 0.5 * (i - 128).
	for i := 0; i < 10; i++ {
		expected := float32(0.5) * float32(int8(i-128))
		if diff := hostOut[i] - expected; diff < -0.001 || diff > 0.001 {
			t.Errorf("out[%d] = %f, expected %f", i, hostOut[i], expected)
		}
	}
	t.Logf("Q8_K CUDA dequant test passed, sample outputs: %.4f, %.4f, %.4f",
		hostOut[0], hostOut[128], hostOut[255])
}

func TestMatMulQ8K_CUDA(t *testing.T) {
	// Fused matmul test: A [M, K] @ dequant(B) [K, N] = C [M, N].
	// Q8_K requires K to be a multiple of 256, so use M=2, K=256, N=2.
	M, K, N := 2, 256, 2
	gpu := 0

	// Create input A on GPU [2, 256].
	aTensor, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor A failed: %v", err)
	}

	// Fill A with 1.0.
	hostA := make([]float32, M*K)
	for i := range hostA {
		hostA[i] = 1.0
	}
	if err := aTensor.CopyFrom(hostA); err != nil {
		t.Fatalf("CopyFrom A failed: %v", err)
	}

	// Create Q8_K weight B: N rows, each with K/256 = 1 block.
	// Each block: d=1.0, qs=all 1s -> dequantizes to 1.0 everywhere.
	blockSize := 292
	numBlocks := N * (K / 256) // 2 * 1 = 2 blocks
	hostB := make([]byte, numBlocks*blockSize)
	d := float32(1.0)
	dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
	for blk := 0; blk < numBlocks; blk++ {
		offset := blk * blockSize
		copy(hostB[offset:offset+4], dBytes)
		// qs = all 1s
		for i := 0; i < 256; i++ {
			hostB[offset+4+i] = 1
		}
	}
	devB, err := UploadQ8K(hostB, numBlocks, gpu)
	if err != nil {
		t.Fatalf("UploadQ8K B failed: %v", err)
	}
	defer FreeDevicePtr(devB)

	// Create output C on GPU [2, 2].
	cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor C failed: %v", err)
	}

	// Run fused matmul.
	err = MatMulQ8K(aTensor.ptr, devB, cTensor.ptr, M, K, N, gpu)
	if err != nil {
		t.Fatalf("MatMulQ8K failed: %v", err)
	}

	// Copy back and verify.
	// C = A @ dequant(B) = [1,1,...] @ [1,1,...]^T = 256.0 per element.
	hostC := make([]float32, M*N)
	if err := cTensor.CopyToHost(hostC); err != nil {
		t.Fatalf("CopyToHost C failed: %v", err)
	}
	expected := float32(256.0) // sum of 256 ones
	for i, v := range hostC {
		if diff := v - expected; diff < -1.0 || diff > 1.0 {
			t.Errorf("C[%d] = %f, expected %f", i, v, expected)
		}
	}
	t.Logf("MatMulQ8K CUDA test passed, outputs: %v", hostC)
}

func TestMatMulF16Q8K_CUDA(t *testing.T) {
	// Same as TestMatMulQ8K_CUDA but uses the FP16-input kernel.
	M, K, N := 2, 256, 2
	gpu := 0

	// Create input A on GPU [2, 256] as FP32, then cast to FP16 on GPU.
	aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F32) failed: %v", err)
	}
	aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F16) failed: %v", err)
	}

	// Fill A with 1.0.
	hostA := make([]float32, M*K)
	for i := range hostA {
		hostA[i] = 1.0
	}
	if err := aF32.CopyFrom(hostA); err != nil {
		t.Fatalf("CopyFrom A(F32) failed: %v", err)
	}
	if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
		t.Fatalf("CastF32ToF16 failed: %v", err)
	}

	// Create Q8_K weight B: N rows, each with K/256 = 1 block.
	blockSize := 292
	numBlocks := N * (K / 256)
	hostB := make([]byte, numBlocks*blockSize)
	d := float32(1.0)
	dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
	for blk := 0; blk < numBlocks; blk++ {
		offset := blk * blockSize
		copy(hostB[offset:offset+4], dBytes)
		for i := 0; i < 256; i++ {
			hostB[offset+4+i] = 1
		}
	}
	devB, err := UploadQ8K(hostB, numBlocks, gpu)
	if err != nil {
		t.Fatalf("UploadQ8K B failed: %v", err)
	}
	defer FreeDevicePtr(devB)

	// Create output C on GPU [2, 2].
	cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor C failed: %v", err)
	}

	// Run fused matmul (FP16 input).
	err = MatMulF16Q8K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
	if err != nil {
		t.Fatalf("MatMulF16Q8K failed: %v", err)
	}

	// Copy back and verify.
	hostC := make([]float32, M*N)
	if err := cTensor.CopyToHost(hostC); err != nil {
		t.Fatalf("CopyToHost C failed: %v", err)
	}
	expected := float32(256.0)
	for i, v := range hostC {
		if diff := v - expected; diff < -1.0 || diff > 1.0 {
			t.Errorf("C[%d] = %f, expected %f", i, v, expected)
		}
	}
	t.Logf("MatMulF16Q8K CUDA test passed, outputs: %v", hostC)
}

func TestMatMulF16Q4K_CUDA(t *testing.T) {
	M, K, N := 2, 256, 2
	gpu := 0

	aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F32) failed: %v", err)
	}
	aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F16) failed: %v", err)
	}
	hostA := make([]float32, M*K)
	for i := range hostA {
		hostA[i] = 1.0
	}
	if err := aF32.CopyFrom(hostA); err != nil {
		t.Fatalf("CopyFrom A(F32) failed: %v", err)
	}
	if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
		t.Fatalf("CastF32ToF16 failed: %v", err)
	}

	row := make([]float32, K)
	for i := range row {
		row[i] = 1.0
	}
	hostB := make([]byte, 0, N*144)
	for i := 0; i < N; i++ {
		hostB = append(hostB, quant.QuantizeQ4K(row)...)
	}
	devB, err := UploadQ4K(hostB, N*(K/256), gpu)
	if err != nil {
		t.Fatalf("UploadQ4K B failed: %v", err)
	}
	defer FreeDevicePtr(devB)

	cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor C failed: %v", err)
	}
	err = MatMulF16Q4K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
	if err != nil {
		t.Fatalf("MatMulF16Q4K failed: %v", err)
	}

	hostC := make([]float32, M*N)
	if err := cTensor.CopyToHost(hostC); err != nil {
		t.Fatalf("CopyToHost C failed: %v", err)
	}

	// Quantization may introduce small error; allow a bit more tolerance.
	expected := float32(256.0)
	for i, v := range hostC {
		if diff := v - expected; diff < -4.0 || diff > 4.0 {
			t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
		}
	}
}

func TestMatMulF16Q5K_CUDA(t *testing.T) {
	M, K, N := 2, 256, 2
	gpu := 0

	aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F32) failed: %v", err)
	}
	aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F16) failed: %v", err)
	}
	hostA := make([]float32, M*K)
	for i := range hostA {
		hostA[i] = 1.0
	}
	if err := aF32.CopyFrom(hostA); err != nil {
		t.Fatalf("CopyFrom A(F32) failed: %v", err)
	}
	if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
		t.Fatalf("CastF32ToF16 failed: %v", err)
	}

	row := make([]float32, K)
	for i := range row {
		row[i] = 1.0
	}
	hostB := make([]byte, 0, N*176)
	for i := 0; i < N; i++ {
		hostB = append(hostB, quant.QuantizeQ5K(row)...)
	}
	devB, err := UploadQ5K(hostB, N*(K/256), gpu)
	if err != nil {
		t.Fatalf("UploadQ5K B failed: %v", err)
	}
	defer FreeDevicePtr(devB)

	cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor C failed: %v", err)
	}
	err = MatMulF16Q5K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
	if err != nil {
		t.Fatalf("MatMulF16Q5K failed: %v", err)
	}

	hostC := make([]float32, M*N)
	if err := cTensor.CopyToHost(hostC); err != nil {
		t.Fatalf("CopyToHost C failed: %v", err)
	}
	expected := float32(256.0)
	for i, v := range hostC {
		if diff := v - expected; diff < -4.0 || diff > 4.0 {
			t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
		}
	}
}

func TestMatMulF16Q2K_CUDA(t *testing.T) {
	M, K, N := 2, 256, 2
	gpu := 0

	aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F32) failed: %v", err)
	}
	aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F16) failed: %v", err)
	}
	hostA := make([]float32, M*K)
	for i := range hostA {
		hostA[i] = 1.0
	}
	if err := aF32.CopyFrom(hostA); err != nil {
		t.Fatalf("CopyFrom A(F32) failed: %v", err)
	}
	if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
		t.Fatalf("CastF32ToF16 failed: %v", err)
	}

	row := make([]float32, K)
	for i := range row {
		row[i] = 1.0
	}
	hostB := make([]byte, 0, N*84)
	for i := 0; i < N; i++ {
		hostB = append(hostB, quant.QuantizeQ2K(row)...)
	}
	devB, err := UploadQ2K(hostB, N*(K/256), gpu)
	if err != nil {
		t.Fatalf("UploadQ2K B failed: %v", err)
	}
	defer FreeDevicePtr(devB)

	cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor C failed: %v", err)
	}
	err = MatMulF16Q2K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
	if err != nil {
		t.Fatalf("MatMulF16Q2K failed: %v", err)
	}

	hostC := make([]float32, M*N)
	if err := cTensor.CopyToHost(hostC); err != nil {
		t.Fatalf("CopyToHost C failed: %v", err)
	}
	expected := float32(256.0)
	for i, v := range hostC {
		if diff := v - expected; diff < -12.0 || diff > 12.0 {
			t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
		}
	}
}

func TestMatMulF16Q3K_CUDA(t *testing.T) {
	M, K, N := 2, 256, 2
	gpu := 0

	aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F32) failed: %v", err)
	}
	aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F16) failed: %v", err)
	}
	hostA := make([]float32, M*K)
	for i := range hostA {
		hostA[i] = 1.0
	}
	if err := aF32.CopyFrom(hostA); err != nil {
		t.Fatalf("CopyFrom A(F32) failed: %v", err)
	}
	if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
		t.Fatalf("CastF32ToF16 failed: %v", err)
	}

	row := make([]float32, K)
	for i := range row {
		row[i] = 1.0
	}
	hostB := make([]byte, 0, N*110)
	for i := 0; i < N; i++ {
		hostB = append(hostB, quant.QuantizeQ3K(row)...)
	}
	devB, err := UploadQ3K(hostB, N*(K/256), gpu)
	if err != nil {
		t.Fatalf("UploadQ3K B failed: %v", err)
	}
	defer FreeDevicePtr(devB)

	cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor C failed: %v", err)
	}
	err = MatMulF16Q3K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
	if err != nil {
		t.Fatalf("MatMulF16Q3K failed: %v", err)
	}

	hostC := make([]float32, M*N)
	if err := cTensor.CopyToHost(hostC); err != nil {
		t.Fatalf("CopyToHost C failed: %v", err)
	}
	expected := float32(256.0)
	for i, v := range hostC {
		if diff := v - expected; diff < -12.0 || diff > 12.0 {
			t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
		}
	}
}

func TestMatMulF16Q6K_CUDA(t *testing.T) {
	M, K, N := 2, 256, 2
	gpu := 0

	aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F32) failed: %v", err)
	}
	aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
	if err != nil {
		t.Fatalf("NewTensor A(F16) failed: %v", err)
	}
	hostA := make([]float32, M*K)
	for i := range hostA {
		hostA[i] = 1.0
	}
	if err := aF32.CopyFrom(hostA); err != nil {
		t.Fatalf("CopyFrom A(F32) failed: %v", err)
	}
	if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
		t.Fatalf("CastF32ToF16 failed: %v", err)
	}

	row := make([]float32, K)
	for i := range row {
		row[i] = 1.0
	}
	hostB := make([]byte, 0, N*210)
	for i := 0; i < N; i++ {
		hostB = append(hostB, quant.QuantizeQ6K(row)...)
	}
	devB, err := UploadQ6K(hostB, N*(K/256), gpu)
	if err != nil {
		t.Fatalf("UploadQ6K B failed: %v", err)
	}
	defer FreeDevicePtr(devB)

	cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("NewTensor C failed: %v", err)
	}
	err = MatMulF16Q6K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
	if err != nil {
		t.Fatalf("MatMulF16Q6K failed: %v", err)
	}

	hostC := make([]float32, M*N)
	if err := cTensor.CopyToHost(hostC); err != nil {
		t.Fatalf("CopyToHost C failed: %v", err)
	}
	expected := float32(256.0)
	for i, v := range hostC {
		if diff := v - expected; diff < -8.0 || diff > 8.0 {
			t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
		}
	}
}
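
// blockQ8KRef is a reference sketch of the 292-byte Q8_K block that
// TestDequantQ8K_CUDA and the Q8_K matmul tests build by hand above: a 4-byte
// scale, 256 int8 quantized values, and 16 int16 group sums. The layout is
// inferred from the byte offsets used in these tests; the type name is
// illustrative only and is not used by the quant package or the CUDA kernels.
type blockQ8KRef struct {
	d     float32   // block scale; element i dequantizes to d * float32(qs[i])
	qs    [256]int8 // quantized values
	bsums [16]int16 // sums of each group of 16 qs values (32 bytes total)
}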