1
0

dequant_test.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
//go:build cuda

package cuda

import (
	"testing"
	"unsafe"

	"makarna/pkg/quant"
	"makarna/pkg/tensor"
)
  9. func TestDequantQ8K_CUDA(t *testing.T) {
  10. // Create a simple Q8_K block
  11. // Block layout: 4 bytes D (float32) + 256 bytes qs (int8) + 32 bytes bsums
  12. blockSize := 292
  13. hostBlock := make([]byte, blockSize)
  14. // Set D = 0.5 (as float32 bytes)
  15. d := float32(0.5)
  16. dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
  17. copy(hostBlock[0:4], dBytes)
  18. // Set qs: values 0, 1, 2, 3, ... (as int8)
  19. for i := 0; i < 256; i++ {
  20. hostBlock[4+i] = byte(int8(i - 128)) // Range -128 to 127
  21. }
  22. // Upload block to GPU
  23. gpu := 0
  24. devBlocks, err := UploadQ8K(hostBlock, 1, gpu)
  25. if err != nil {
  26. t.Fatalf("UploadQ8K failed: %v", err)
  27. }
  28. defer FreeDevicePtr(devBlocks)
  29. // Allocate output on GPU
  30. outTensor, err := NewTensor(tensor.Shape{256}, tensor.Float32, gpu)
  31. if err != nil {
  32. t.Fatalf("NewTensor failed: %v", err)
  33. }
  34. // Dequantize
  35. err = DequantQ8K(devBlocks, outTensor.ptr, 1, gpu)
  36. if err != nil {
  37. t.Fatalf("DequantQ8K failed: %v", err)
  38. }
  39. // Copy back and verify
  40. hostOut := make([]float32, 256)
  41. if err := outTensor.CopyToHost(hostOut); err != nil {
  42. t.Fatalf("CopyToHost failed: %v", err)
  43. }
  44. // Check first few values
  45. for i := 0; i < 10; i++ {
  46. expected := float32(0.5) * float32(int8(i-128))
  47. if diff := hostOut[i] - expected; diff < -0.001 || diff > 0.001 {
  48. t.Errorf("out[%d] = %f, expected %f", i, hostOut[i], expected)
  49. }
  50. }
  51. t.Logf("Q8_K CUDA dequant test passed, sample outputs: %.4f, %.4f, %.4f",
  52. hostOut[0], hostOut[128], hostOut[255])
  53. }
  54. func TestMatMulQ8K_CUDA(t *testing.T) {
  55. // Simple 2x4 @ Q8K(4x4) = 2x4 test
  56. // But Q8K needs K to be multiple of 256, so we use M=2, K=256, N=2
  57. M, K, N := 2, 256, 2
  58. gpu := 0
  59. // Create input A on GPU [2, 256]
  60. aTensor, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
  61. if err != nil {
  62. t.Fatalf("NewTensor A failed: %v", err)
  63. }
  64. // Fill A with 1.0
  65. hostA := make([]float32, M*K)
  66. for i := range hostA {
  67. hostA[i] = 1.0
  68. }
  69. if err := aTensor.CopyFrom(hostA); err != nil {
  70. t.Fatalf("CopyFrom A failed: %v", err)
  71. }
  72. // Create Q8_K weight B: N rows, each with K/256 = 1 block
  73. // Each block: d=1.0, qs=all 1s -> dequant = 1.0 for all
  74. blockSize := 292
  75. numBlocks := N * (K / 256) // 2 * 1 = 2 blocks
  76. hostB := make([]byte, numBlocks*blockSize)
  77. d := float32(1.0)
  78. dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
  79. for blk := 0; blk < numBlocks; blk++ {
  80. offset := blk * blockSize
  81. copy(hostB[offset:offset+4], dBytes)
  82. // qs = all 1s
  83. for i := 0; i < 256; i++ {
  84. hostB[offset+4+i] = 1
  85. }
  86. }
  87. devB, err := UploadQ8K(hostB, numBlocks, gpu)
  88. if err != nil {
  89. t.Fatalf("UploadQ8K B failed: %v", err)
  90. }
  91. defer FreeDevicePtr(devB)
  92. // Create output C on GPU [2, 2]
  93. cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
  94. if err != nil {
  95. t.Fatalf("NewTensor C failed: %v", err)
  96. }
  97. // Run fused matmul
  98. err = MatMulQ8K(aTensor.ptr, devB, cTensor.ptr, M, K, N, gpu)
  99. if err != nil {
  100. t.Fatalf("MatMulQ8K failed: %v", err)
  101. }
  102. // Copy back and verify
  103. // C = A @ dequant(B) = [1,1,...] @ [1,1,...].T = 256.0 per element
  104. hostC := make([]float32, M*N)
  105. if err := cTensor.CopyToHost(hostC); err != nil {
  106. t.Fatalf("CopyToHost C failed: %v", err)
  107. }
  108. expected := float32(256.0) // Sum of 256 1s
  109. for i, v := range hostC {
  110. if diff := v - expected; diff < -1.0 || diff > 1.0 {
  111. t.Errorf("C[%d] = %f, expected %f", i, v, expected)
  112. }
  113. }
  114. t.Logf("MatMulQ8K CUDA test passed, outputs: %v", hostC)
  115. }
  116. func TestMatMulF16Q8K_CUDA(t *testing.T) {
  117. // Same as TestMatMulQ8K_CUDA but uses FP16 input kernel.
  118. M, K, N := 2, 256, 2
  119. gpu := 0
  120. // Create input A on GPU [2, 256] as FP32 then cast to FP16 on GPU.
  121. aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
  122. if err != nil {
  123. t.Fatalf("NewTensor A(F32) failed: %v", err)
  124. }
  125. aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
  126. if err != nil {
  127. t.Fatalf("NewTensor A(F16) failed: %v", err)
  128. }
  129. // Fill A with 1.0
  130. hostA := make([]float32, M*K)
  131. for i := range hostA {
  132. hostA[i] = 1.0
  133. }
  134. if err := aF32.CopyFrom(hostA); err != nil {
  135. t.Fatalf("CopyFrom A(F32) failed: %v", err)
  136. }
  137. if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
  138. t.Fatalf("CastF32ToF16 failed: %v", err)
  139. }
  140. // Create Q8_K weight B: N rows, each with K/256 = 1 block
  141. blockSize := 292
  142. numBlocks := N * (K / 256)
  143. hostB := make([]byte, numBlocks*blockSize)
  144. d := float32(1.0)
  145. dBytes := (*[4]byte)(unsafe.Pointer(&d))[:]
  146. for blk := 0; blk < numBlocks; blk++ {
  147. offset := blk * blockSize
  148. copy(hostB[offset:offset+4], dBytes)
  149. for i := 0; i < 256; i++ {
  150. hostB[offset+4+i] = 1
  151. }
  152. }
  153. devB, err := UploadQ8K(hostB, numBlocks, gpu)
  154. if err != nil {
  155. t.Fatalf("UploadQ8K B failed: %v", err)
  156. }
  157. defer FreeDevicePtr(devB)
  158. // Create output C on GPU [2, 2]
  159. cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
  160. if err != nil {
  161. t.Fatalf("NewTensor C failed: %v", err)
  162. }
  163. // Run fused matmul (FP16 input)
  164. err = MatMulF16Q8K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
  165. if err != nil {
  166. t.Fatalf("MatMulF16Q8K failed: %v", err)
  167. }
  168. // Copy back and verify
  169. hostC := make([]float32, M*N)
  170. if err := cTensor.CopyToHost(hostC); err != nil {
  171. t.Fatalf("CopyToHost C failed: %v", err)
  172. }
  173. expected := float32(256.0)
  174. for i, v := range hostC {
  175. if diff := v - expected; diff < -1.0 || diff > 1.0 {
  176. t.Errorf("C[%d] = %f, expected %f", i, v, expected)
  177. }
  178. }
  179. t.Logf("MatMulF16Q8K CUDA test passed, outputs: %v", hostC)
  180. }
  181. func TestMatMulF16Q4K_CUDA(t *testing.T) {
  182. M, K, N := 2, 256, 2
  183. gpu := 0
  184. aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
  185. if err != nil {
  186. t.Fatalf("NewTensor A(F32) failed: %v", err)
  187. }
  188. aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
  189. if err != nil {
  190. t.Fatalf("NewTensor A(F16) failed: %v", err)
  191. }
  192. hostA := make([]float32, M*K)
  193. for i := range hostA {
  194. hostA[i] = 1.0
  195. }
  196. if err := aF32.CopyFrom(hostA); err != nil {
  197. t.Fatalf("CopyFrom A(F32) failed: %v", err)
  198. }
  199. if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
  200. t.Fatalf("CastF32ToF16 failed: %v", err)
  201. }
  202. row := make([]float32, K)
  203. for i := range row {
  204. row[i] = 1.0
  205. }
  206. hostB := make([]byte, 0, N*144)
  207. for i := 0; i < N; i++ {
  208. hostB = append(hostB, quant.QuantizeQ4K(row)...)
  209. }
  210. devB, err := UploadQ4K(hostB, N*(K/256), gpu)
  211. if err != nil {
  212. t.Fatalf("UploadQ4K B failed: %v", err)
  213. }
  214. defer FreeDevicePtr(devB)
  215. cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
  216. if err != nil {
  217. t.Fatalf("NewTensor C failed: %v", err)
  218. }
  219. err = MatMulF16Q4K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
  220. if err != nil {
  221. t.Fatalf("MatMulF16Q4K failed: %v", err)
  222. }
  223. hostC := make([]float32, M*N)
  224. if err := cTensor.CopyToHost(hostC); err != nil {
  225. t.Fatalf("CopyToHost C failed: %v", err)
  226. }
  227. // Quantization may introduce small error; allow a bit more tolerance.
  228. expected := float32(256.0)
  229. for i, v := range hostC {
  230. if diff := v - expected; diff < -4.0 || diff > 4.0 {
  231. t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
  232. }
  233. }
  234. }
  235. func TestMatMulF16Q5K_CUDA(t *testing.T) {
  236. M, K, N := 2, 256, 2
  237. gpu := 0
  238. aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
  239. if err != nil {
  240. t.Fatalf("NewTensor A(F32) failed: %v", err)
  241. }
  242. aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
  243. if err != nil {
  244. t.Fatalf("NewTensor A(F16) failed: %v", err)
  245. }
  246. hostA := make([]float32, M*K)
  247. for i := range hostA {
  248. hostA[i] = 1.0
  249. }
  250. if err := aF32.CopyFrom(hostA); err != nil {
  251. t.Fatalf("CopyFrom A(F32) failed: %v", err)
  252. }
  253. if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
  254. t.Fatalf("CastF32ToF16 failed: %v", err)
  255. }
  256. row := make([]float32, K)
  257. for i := range row {
  258. row[i] = 1.0
  259. }
  260. hostB := make([]byte, 0, N*176)
  261. for i := 0; i < N; i++ {
  262. hostB = append(hostB, quant.QuantizeQ5K(row)...)
  263. }
  264. devB, err := UploadQ5K(hostB, N*(K/256), gpu)
  265. if err != nil {
  266. t.Fatalf("UploadQ5K B failed: %v", err)
  267. }
  268. defer FreeDevicePtr(devB)
  269. cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
  270. if err != nil {
  271. t.Fatalf("NewTensor C failed: %v", err)
  272. }
  273. err = MatMulF16Q5K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
  274. if err != nil {
  275. t.Fatalf("MatMulF16Q5K failed: %v", err)
  276. }
  277. hostC := make([]float32, M*N)
  278. if err := cTensor.CopyToHost(hostC); err != nil {
  279. t.Fatalf("CopyToHost C failed: %v", err)
  280. }
  281. expected := float32(256.0)
  282. for i, v := range hostC {
  283. if diff := v - expected; diff < -4.0 || diff > 4.0 {
  284. t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
  285. }
  286. }
  287. }
  288. func TestMatMulF16Q2K_CUDA(t *testing.T) {
  289. M, K, N := 2, 256, 2
  290. gpu := 0
  291. aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
  292. if err != nil {
  293. t.Fatalf("NewTensor A(F32) failed: %v", err)
  294. }
  295. aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
  296. if err != nil {
  297. t.Fatalf("NewTensor A(F16) failed: %v", err)
  298. }
  299. hostA := make([]float32, M*K)
  300. for i := range hostA {
  301. hostA[i] = 1.0
  302. }
  303. if err := aF32.CopyFrom(hostA); err != nil {
  304. t.Fatalf("CopyFrom A(F32) failed: %v", err)
  305. }
  306. if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
  307. t.Fatalf("CastF32ToF16 failed: %v", err)
  308. }
  309. row := make([]float32, K)
  310. for i := range row {
  311. row[i] = 1.0
  312. }
  313. hostB := make([]byte, 0, N*84)
  314. for i := 0; i < N; i++ {
  315. hostB = append(hostB, quant.QuantizeQ2K(row)...)
  316. }
  317. devB, err := UploadQ2K(hostB, N*(K/256), gpu)
  318. if err != nil {
  319. t.Fatalf("UploadQ2K B failed: %v", err)
  320. }
  321. defer FreeDevicePtr(devB)
  322. cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
  323. if err != nil {
  324. t.Fatalf("NewTensor C failed: %v", err)
  325. }
  326. err = MatMulF16Q2K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
  327. if err != nil {
  328. t.Fatalf("MatMulF16Q2K failed: %v", err)
  329. }
  330. hostC := make([]float32, M*N)
  331. if err := cTensor.CopyToHost(hostC); err != nil {
  332. t.Fatalf("CopyToHost C failed: %v", err)
  333. }
  334. expected := float32(256.0)
  335. for i, v := range hostC {
  336. if diff := v - expected; diff < -12.0 || diff > 12.0 {
  337. t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
  338. }
  339. }
  340. }
  341. func TestMatMulF16Q3K_CUDA(t *testing.T) {
  342. M, K, N := 2, 256, 2
  343. gpu := 0
  344. aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
  345. if err != nil {
  346. t.Fatalf("NewTensor A(F32) failed: %v", err)
  347. }
  348. aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
  349. if err != nil {
  350. t.Fatalf("NewTensor A(F16) failed: %v", err)
  351. }
  352. hostA := make([]float32, M*K)
  353. for i := range hostA {
  354. hostA[i] = 1.0
  355. }
  356. if err := aF32.CopyFrom(hostA); err != nil {
  357. t.Fatalf("CopyFrom A(F32) failed: %v", err)
  358. }
  359. if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
  360. t.Fatalf("CastF32ToF16 failed: %v", err)
  361. }
  362. row := make([]float32, K)
  363. for i := range row {
  364. row[i] = 1.0
  365. }
  366. hostB := make([]byte, 0, N*110)
  367. for i := 0; i < N; i++ {
  368. hostB = append(hostB, quant.QuantizeQ3K(row)...)
  369. }
  370. devB, err := UploadQ3K(hostB, N*(K/256), gpu)
  371. if err != nil {
  372. t.Fatalf("UploadQ3K B failed: %v", err)
  373. }
  374. defer FreeDevicePtr(devB)
  375. cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
  376. if err != nil {
  377. t.Fatalf("NewTensor C failed: %v", err)
  378. }
  379. err = MatMulF16Q3K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
  380. if err != nil {
  381. t.Fatalf("MatMulF16Q3K failed: %v", err)
  382. }
  383. hostC := make([]float32, M*N)
  384. if err := cTensor.CopyToHost(hostC); err != nil {
  385. t.Fatalf("CopyToHost C failed: %v", err)
  386. }
  387. expected := float32(256.0)
  388. for i, v := range hostC {
  389. if diff := v - expected; diff < -12.0 || diff > 12.0 {
  390. t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
  391. }
  392. }
  393. }
  394. func TestMatMulF16Q6K_CUDA(t *testing.T) {
  395. M, K, N := 2, 256, 2
  396. gpu := 0
  397. aF32, err := NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
  398. if err != nil {
  399. t.Fatalf("NewTensor A(F32) failed: %v", err)
  400. }
  401. aF16, err := NewTensor(tensor.Shape{M, K}, tensor.Float16, gpu)
  402. if err != nil {
  403. t.Fatalf("NewTensor A(F16) failed: %v", err)
  404. }
  405. hostA := make([]float32, M*K)
  406. for i := range hostA {
  407. hostA[i] = 1.0
  408. }
  409. if err := aF32.CopyFrom(hostA); err != nil {
  410. t.Fatalf("CopyFrom A(F32) failed: %v", err)
  411. }
  412. if err := CastF32ToF16(aF32.ptr, aF16.ptr, M*K, gpu); err != nil {
  413. t.Fatalf("CastF32ToF16 failed: %v", err)
  414. }
  415. row := make([]float32, K)
  416. for i := range row {
  417. row[i] = 1.0
  418. }
  419. hostB := make([]byte, 0, N*210)
  420. for i := 0; i < N; i++ {
  421. hostB = append(hostB, quant.QuantizeQ6K(row)...)
  422. }
  423. devB, err := UploadQ6K(hostB, N*(K/256), gpu)
  424. if err != nil {
  425. t.Fatalf("UploadQ6K B failed: %v", err)
  426. }
  427. defer FreeDevicePtr(devB)
  428. cTensor, err := NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
  429. if err != nil {
  430. t.Fatalf("NewTensor C failed: %v", err)
  431. }
  432. err = MatMulF16Q6K(aF16.ptr, devB, cTensor.ptr, M, K, N, gpu)
  433. if err != nil {
  434. t.Fatalf("MatMulF16Q6K failed: %v", err)
  435. }
  436. hostC := make([]float32, M*N)
  437. if err := cTensor.CopyToHost(hostC); err != nil {
  438. t.Fatalf("CopyToHost C failed: %v", err)
  439. }
  440. expected := float32(256.0)
  441. for i, v := range hostC {
  442. if diff := v - expected; diff < -8.0 || diff > 8.0 {
  443. t.Errorf("C[%d] = %f, expected ~%f", i, v, expected)
  444. }
  445. }
  446. }