| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527 |
- //go:build cuda
- package cuda
- import (
- "testing"
- "unsafe"
- "makarna/pkg/quant"
- "makarna/pkg/tensor"
- )
- func TestDequantQ8K_CUDA(t *testing.T) {
- // Create a simple Q8_K block
- // Block layout: 4 bytes D (float32) + 256 bytes qs (int8) + 32 bytes bsums
- blockSize := 292
- hostBlock := make([]byte, blockSize)
-
- // Set D = 0.5 (as float32 bytes)
- d := float32(0.5)
- dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
- copy(hostBlock[0:4], dBytes)
-
- // Set qs: values 0, 1, 2, 3, ... (as int8)
- for i := 0; i < 256; i++ {
- hostBlock[4+i] = byte(int8(i - 128)) // Range -128 to 127
- }
-
- // Upload block to GPU
- gpu := 0
- devBlocks, err := UploadQ8K(hostBlock, 1, gpu)
- if err != nil {
- t.Fatalf("UploadQ8K failed: %v", err)
- }
- defer FreeDevicePtr(devBlocks)
-
- // Allocate output on GPU
- outTensor, err := NewTensor(tensor.Shape{256}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor failed: %v", err)
- }
-
- // Dequantize
- err = DequantQ8K(devBlocks, outTensor.ptr, 1, gpu)
- if err != nil {
- t.Fatalf("DequantQ8K failed: %v", err)
- }
-
- // Copy back and verify
- hostOut := make([]float32, 256)
- if err := outTensor.CopyToHost(hostOut); err != nil {
- t.Fatalf("CopyToHost failed: %v", err)
- }
-
- // Check first few values
- for i := 0; i < 10; i++ {
- expected := float32(0.5) * float32(int8(i-128))
- if diff := hostOut[i] - expected; diff < -0.001 || diff > 0.001 {
- t.Errorf("out[%d] = %f, expected %f", i, hostOut[i], expected)
- }
- }
-
- t.Logf("Q8_K CUDA dequant test passed, sample outputs: %.4f, %.4f, %.4f",
- hostOut[0], hostOut[128], hostOut[255])
- }
- func TestMatMulQ8K_CUDA(t *testing.T) {
- // Simple 2x4 @ Q8K(4x4) = 2x4 test
- // But Q8K needs K to be multiple of 256, so we use M=2, K=256, N=2
- M, K, N := 2, 256, 2
- gpu := 0
-
- // Create input A on GPU [2, 256]
- aTensor, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor A failed: %v", err)
- }
-
- // Fill A with 1.0
- hostA := make([]float32, M*K)
- for i := range hostA {
- hostA[i] = 1.0
- }
- if err := aTensor.CopyFrom(hostA); err != nil {
- t.Fatalf("CopyFrom A failed: %v", err)
- }
-
- // Create Q8_K weight B: N rows, each with K/256 = 1 block
- // Each block: d=1.0, qs=all 1s -> dequant = 1.0 for all
- blockSize := 292
- numBlocks := N * (K / 256) // 2 * 1 = 2 blocks
- hostB := make([]byte, numBlocks*blockSize)
-
- d := float32(1.0)
- dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
-
- for blk := 0; blk < numBlocks; blk++ {
- offset := blk * blockSize
- copy(hostB[offset:offset+4], dBytes)
- // qs = all 1s
- for i := 0; i < 256; i++ {
- hostB[offset+4+i] = 1
- }
- }
-
- devB, err := UploadQ8K(hostB, numBlocks, gpu)
- if err != nil {
- t.Fatalf("UploadQ8K B failed: %v", err)
- }
- defer FreeDevicePtr(devB)
-
- // Create output C on GPU [2, 2]
- cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor C failed: %v", err)
- }
-
- // Run fused matmul
- err = MatMulQ8K(aTensor.ptr, devB, cTensor.ptr, M, K, N, gpu)
- if err != nil {
- t.Fatalf("MatMulQ8K failed: %v", err)
- }
-
- // Copy back and verify
- // C = A @ dequant(B) = [1,1,...] @ [1,1,...].T = 256.0 per element
- hostC := make([]float32, M*N)
- if err := cTensor.CopyToHost(hostC); err != nil {
- t.Fatalf("CopyToHost C failed: %v", err)
- }
-
- expected := float32(256.0) // Sum of 256 1s
- for i, v := range hostC {
- if diff := v - expected; diff < -1.0 || diff > 1.0 {
- t.Errorf("C[%d] = %f, expected %f", i, v, expected)
- }
- }
-
- t.Logf("MatMulQ8K CUDA test passed, outputs: %v", hostC)
- }
- func TestMatMulF16Q8K_CUDA(t *testing.T) {
- // Same as TestMatMulQ8K_CUDA but uses FP16 input kernel.
- M, K, N := 2, 256, 2
- gpu := 0
- // Create input A on GPU [2, 256] as FP32 then cast to FP16 on GPU.
- aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F32) failed: %v", err)
- }
- aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F16) failed: %v", err)
- }
- // Fill A with 1.0
- hostA := make([]float32, M*K)
- for i := range hostA {
- hostA[i] = 1.0
- }
- if err := aF32.CopyFrom(hostA); err != nil {
- t.Fatalf("CopyFrom A(F32) failed: %v", err)
- }
- if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
- t.Fatalf("CastF32ToF16 failed: %v", err)
- }
- // Create Q8_K weight B: N rows, each with K/256 = 1 block
- blockSize := 292
- numBlocks := N * (K / 256)
- hostB := make([]byte, numBlocks*blockSize)
- d := float32(1.0)
- dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
- for blk := 0; blk < numBlocks; blk++ {
- offset := blk * blockSize
- copy(hostB[offset:offset+4], dBytes)
- for i := 0; i < 256; i++ {
- hostB[offset+4+i] = 1
- }
- }
- devB, err := UploadQ8K(hostB, numBlocks, gpu)
- if err != nil {
- t.Fatalf("UploadQ8K B failed: %v", err)
- }
- defer FreeDevicePtr(devB)
- // Create output C on GPU [2, 2]
- cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor C failed: %v", err)
- }
- // Run fused matmul (FP16 input)
- err = MatMulF16Q8K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
- if err != nil {
- t.Fatalf("MatMulF16Q8K failed: %v", err)
- }
- // Copy back and verify
- hostC := make([]float32, M*N)
- if err := cTensor.CopyToHost(hostC); err != nil {
- t.Fatalf("CopyToHost C failed: %v", err)
- }
- expected := float32(256.0)
- for i, v := range hostC {
- if diff := v - expected; diff < -1.0 || diff > 1.0 {
- t.Errorf("C[%d] = %f, expected %f", i, v, expected)
- }
- }
- t.Logf("MatMulF16Q8K CUDA test passed, outputs: %v", hostC)
- }
- func TestMatMulF16Q4K_CUDA(t *testing.T) {
- M, K, N := 2, 256, 2
- gpu := 0
- aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F32) failed: %v", err)
- }
- aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F16) failed: %v", err)
- }
- hostA := make([]float32, M*K)
- for i := range hostA {
- hostA[i] = 1.0
- }
- if err := aF32.CopyFrom(hostA); err != nil {
- t.Fatalf("CopyFrom A(F32) failed: %v", err)
- }
- if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
- t.Fatalf("CastF32ToF16 failed: %v", err)
- }
- row := make([]float32, K)
- for i := range row {
- row[i] = 1.0
- }
- hostB := make([]byte, 0, N*144)
- for i := 0; i < N; i++ {
- hostB = append(hostB, quant.QuantizeQ4K(row)...)
- }
- devB, err := UploadQ4K(hostB, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("UploadQ4K B failed: %v", err)
- }
- defer FreeDevicePtr(devB)
- cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor C failed: %v", err)
- }
- err = MatMulF16Q4K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
- if err != nil {
- t.Fatalf("MatMulF16Q4K failed: %v", err)
- }
- hostC := make([]float32, M*N)
- if err := cTensor.CopyToHost(hostC); err != nil {
- t.Fatalf("CopyToHost C failed: %v", err)
- }
- // Quantization may introduce small error; allow a bit more tolerance.
- expected := float32(256.0)
- for i, v := range hostC {
- if diff := v - expected; diff < -4.0 || diff > 4.0 {
- t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
- }
- }
- }
- func TestMatMulF16Q5K_CUDA(t *testing.T) {
- M, K, N := 2, 256, 2
- gpu := 0
- aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F32) failed: %v", err)
- }
- aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F16) failed: %v", err)
- }
- hostA := make([]float32, M*K)
- for i := range hostA {
- hostA[i] = 1.0
- }
- if err := aF32.CopyFrom(hostA); err != nil {
- t.Fatalf("CopyFrom A(F32) failed: %v", err)
- }
- if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
- t.Fatalf("CastF32ToF16 failed: %v", err)
- }
- row := make([]float32, K)
- for i := range row {
- row[i] = 1.0
- }
- hostB := make([]byte, 0, N*176)
- for i := 0; i < N; i++ {
- hostB = append(hostB, quant.QuantizeQ5K(row)...)
- }
- devB, err := UploadQ5K(hostB, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("UploadQ5K B failed: %v", err)
- }
- defer FreeDevicePtr(devB)
- cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor C failed: %v", err)
- }
- err = MatMulF16Q5K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
- if err != nil {
- t.Fatalf("MatMulF16Q5K failed: %v", err)
- }
- hostC := make([]float32, M*N)
- if err := cTensor.CopyToHost(hostC); err != nil {
- t.Fatalf("CopyToHost C failed: %v", err)
- }
- expected := float32(256.0)
- for i, v := range hostC {
- if diff := v - expected; diff < -4.0 || diff > 4.0 {
- t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
- }
- }
- }
- func TestMatMulF16Q2K_CUDA(t *testing.T) {
- M, K, N := 2, 256, 2
- gpu := 0
- aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F32) failed: %v", err)
- }
- aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F16) failed: %v", err)
- }
- hostA := make([]float32, M*K)
- for i := range hostA {
- hostA[i] = 1.0
- }
- if err := aF32.CopyFrom(hostA); err != nil {
- t.Fatalf("CopyFrom A(F32) failed: %v", err)
- }
- if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
- t.Fatalf("CastF32ToF16 failed: %v", err)
- }
- row := make([]float32, K)
- for i := range row {
- row[i] = 1.0
- }
- hostB := make([]byte, 0, N*84)
- for i := 0; i < N; i++ {
- hostB = append(hostB, quant.QuantizeQ2K(row)...)
- }
- devB, err := UploadQ2K(hostB, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("UploadQ2K B failed: %v", err)
- }
- defer FreeDevicePtr(devB)
- cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor C failed: %v", err)
- }
- err = MatMulF16Q2K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
- if err != nil {
- t.Fatalf("MatMulF16Q2K failed: %v", err)
- }
- hostC := make([]float32, M*N)
- if err := cTensor.CopyToHost(hostC); err != nil {
- t.Fatalf("CopyToHost C failed: %v", err)
- }
- expected := float32(256.0)
- for i, v := range hostC {
- if diff := v - expected; diff < -12.0 || diff > 12.0 {
- t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
- }
- }
- }
- func TestMatMulF16Q3K_CUDA(t *testing.T) {
- M, K, N := 2, 256, 2
- gpu := 0
- aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F32) failed: %v", err)
- }
- aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F16) failed: %v", err)
- }
- hostA := make([]float32, M*K)
- for i := range hostA {
- hostA[i] = 1.0
- }
- if err := aF32.CopyFrom(hostA); err != nil {
- t.Fatalf("CopyFrom A(F32) failed: %v", err)
- }
- if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
- t.Fatalf("CastF32ToF16 failed: %v", err)
- }
- row := make([]float32, K)
- for i := range row {
- row[i] = 1.0
- }
- hostB := make([]byte, 0, N*110)
- for i := 0; i < N; i++ {
- hostB = append(hostB, quant.QuantizeQ3K(row)...)
- }
- devB, err := UploadQ3K(hostB, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("UploadQ3K B failed: %v", err)
- }
- defer FreeDevicePtr(devB)
- cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor C failed: %v", err)
- }
- err = MatMulF16Q3K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
- if err != nil {
- t.Fatalf("MatMulF16Q3K failed: %v", err)
- }
- hostC := make([]float32, M*N)
- if err := cTensor.CopyToHost(hostC); err != nil {
- t.Fatalf("CopyToHost C failed: %v", err)
- }
- expected := float32(256.0)
- for i, v := range hostC {
- if diff := v - expected; diff < -12.0 || diff > 12.0 {
- t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
- }
- }
- }
- func TestMatMulF16Q6K_CUDA(t *testing.T) {
- M, K, N := 2, 256, 2
- gpu := 0
- aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F32) failed: %v", err)
- }
- aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
- if err != nil {
- t.Fatalf("NewTensor A(F16) failed: %v", err)
- }
- hostA := make([]float32, M*K)
- for i := range hostA {
- hostA[i] = 1.0
- }
- if err := aF32.CopyFrom(hostA); err != nil {
- t.Fatalf("CopyFrom A(F32) failed: %v", err)
- }
- if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
- t.Fatalf("CastF32ToF16 failed: %v", err)
- }
- row := make([]float32, K)
- for i := range row {
- row[i] = 1.0
- }
- hostB := make([]byte, 0, N*210)
- for i := 0; i < N; i++ {
- hostB = append(hostB, quant.QuantizeQ6K(row)...)
- }
- devB, err := UploadQ6K(hostB, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("UploadQ6K B failed: %v", err)
- }
- defer FreeDevicePtr(devB)
- cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
- if err != nil {
- t.Fatalf("NewTensor C failed: %v", err)
- }
- err = MatMulF16Q6K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
- if err != nil {
- t.Fatalf("MatMulF16Q6K failed: %v", err)
- }
- hostC := make([]float32, M*N)
- if err := cTensor.CopyToHost(hostC); err != nil {
- t.Fatalf("CopyToHost C failed: %v", err)
- }
- expected := float32(256.0)
- for i, v := range hostC {
- if diff := v - expected; diff < -8.0 || diff > 8.0 {
- t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
- }
- }
- }
|