simd_dequant_test.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
  1. package tensor
import (
	"math"
	"testing"
)
  3. func TestDequantQ8KSimdMatchesScalar(t *testing.T) {
  4. var b BlockQ8_K
  5. for i := 0; i < 256; i++ {
  6. b.QS[i] = int8(i%17 - 8)
  7. }
  8. b.D = 0.125
  9. var outSimd, outScalar [256]float32
  10. if !dequantQ8KSimd(&b, outSimd[:]) {
  11. t.Skip("SIMD path not available on this CPU")
  12. }
  13. // Force scalar path
  14. origAVX2, origAVX512 := hasAVX2, hasAVX512
  15. hasAVX2, hasAVX512 = false, false
  16. DequantizeQ8_K(&b, outScalar[:])
  17. hasAVX2, hasAVX512 = origAVX2, origAVX512
  18. for i := range outScalar {
  19. if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-6 {
  20. t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i])
  21. }
  22. }
  23. }
  24. func TestDequantQ4KSimdMatchesScalar(t *testing.T) {
  25. var b BlockQ4_K
  26. b.D = 0x3C00 // float16(1.0)
  27. b.DMin = 0x3800 // float16(0.5)
  28. for i := range b.Scales {
  29. b.Scales[i] = uint8((i % 4) + 1)
  30. }
  31. for i := range b.QS {
  32. b.QS[i] = uint8(i % 256)
  33. }
  34. var outSimd, outScalar [256]float32
  35. if !dequantQ4KSimd(&b, outSimd[:]) {
  36. t.Skip("SIMD path not available on this CPU")
  37. }
  38. origAVX2, origAVX512 := hasAVX2, hasAVX512
  39. hasAVX2, hasAVX512 = false, false
  40. DequantizeQ4_K(&b, outScalar[:])
  41. hasAVX2, hasAVX512 = origAVX2, origAVX512
  42. for i := range outScalar {
  43. if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-5 {
  44. t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i])
  45. }
  46. }
  47. }
  48. func TestDotQ4KSimdMatchesScalar(t *testing.T) {
  49. var b BlockQ4_K
  50. b.D = 0x3C00
  51. b.DMin = 0x3800
  52. for i := range b.Scales {
  53. b.Scales[i] = uint8((i % 4) + 1)
  54. }
  55. for i := range b.QS {
  56. b.QS[i] = uint8(i % 251)
  57. }
  58. x := make([]float32, 256)
  59. for i := range x {
  60. x[i] = float32((i%23)-11) * 0.25
  61. }
  62. // SIMD path
  63. if !hasAVX2 {
  64. t.Skip("SIMD path not available on this CPU")
  65. }
  66. simd := DotQ4_K(&b, x)
  67. // Force scalar
  68. origAVX2, origAVX512 := hasAVX2, hasAVX512
  69. hasAVX2, hasAVX512 = false, false
  70. scalar := DotQ4_K(&b, x)
  71. hasAVX2, hasAVX512 = origAVX2, origAVX512
  72. if diff := absDiff(simd, scalar); diff > 1e-4 {
  73. t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar)
  74. }
  75. }
  76. func TestDotQ5KParamsMatchesScalar(t *testing.T) {
  77. if !hasAVX2 && !hasAVX512 {
  78. t.Skip("SIMD path not available on this CPU")
  79. }
  80. var b BlockQ5_K
  81. b.D = 0x3C00
  82. b.DMin = 0x3800
  83. for i := range b.Scales {
  84. b.Scales[i] = uint8(i*7 + 3)
  85. }
  86. for i := range b.QH {
  87. b.QH[i] = uint8(i * 13)
  88. }
  89. for i := range b.QS {
  90. b.QS[i] = uint8(i * 5)
  91. }
  92. x := make([]float32, 256)
  93. for i := range x {
  94. x[i] = float32((i%23)-11) * 0.25
  95. }
  96. params := GetQ5KDotParams([]BlockQ5_K{b})
  97. got := DotQ5_K_Params(&b, &params[0], x)
  98. origAVX2, origAVX512 := hasAVX2, hasAVX512
  99. hasAVX2, hasAVX512 = false, false
  100. want := DotQ5_K_Params(&b, &params[0], x)
  101. hasAVX2, hasAVX512 = origAVX2, origAVX512
  102. if diff := absDiff(got, want); diff > 1e-4 {
  103. t.Fatalf("mismatch: got=%f want=%f", got, want)
  104. }
  105. }
  106. func TestDotQ5KTile8MatchesScalar(t *testing.T) {
  107. var blocks [8]BlockQ5_K
  108. for bi := range blocks {
  109. b := &blocks[bi]
  110. b.D = 0x3C00
  111. b.DMin = 0x3800
  112. for i := range b.Scales {
  113. b.Scales[i] = uint8((i + bi) * 3)
  114. }
  115. for i := range b.QH {
  116. b.QH[i] = uint8(i*5 + bi)
  117. }
  118. for i := range b.QS {
  119. b.QS[i] = uint8(i*7 + bi)
  120. }
  121. }
  122. x := make([]float32, 256)
  123. for i := range x {
  124. x[i] = float32((i%19)-9) * 0.125
  125. }
  126. wp := GetQ5KDotParams(blocks[:])
  127. var sums [8]float32
  128. DotQ5KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  129. for i := 0; i < 8; i++ {
  130. want := DotQ5_K_Params(&blocks[i], &wp[i], x)
  131. if diff := absDiff(sums[i], want); diff > 1e-4 {
  132. t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want)
  133. }
  134. }
  135. }
  136. func TestDotQ6KParamsMatchesScalar(t *testing.T) {
  137. var b BlockQ6_K
  138. b.D = 0x3C00
  139. for i := range b.Scales {
  140. b.Scales[i] = int8((i%11) - 5)
  141. }
  142. for i := range b.QL {
  143. b.QL[i] = uint8(i * 7)
  144. }
  145. for i := range b.QH {
  146. b.QH[i] = uint8(i * 13)
  147. }
  148. x := make([]float32, 256)
  149. for i := range x {
  150. x[i] = float32((i%23)-11) * 0.25
  151. }
  152. params := GetQ6KDotParams([]BlockQ6_K{b})
  153. got := DotQ6_K_Params(&b, &params[0], x)
  154. origAVX2, origAVX512 := hasAVX2, hasAVX512
  155. hasAVX2, hasAVX512 = false, false
  156. want := DotQ6_K(&b, x)
  157. hasAVX2, hasAVX512 = origAVX2, origAVX512
  158. if diff := absDiff(got, want); diff > 1e-4 {
  159. t.Fatalf("mismatch: got=%f want=%f", got, want)
  160. }
  161. }
  162. func TestDotQ6KTile8MatchesScalar(t *testing.T) {
  163. var blocks [8]BlockQ6_K
  164. for bi := range blocks {
  165. b := &blocks[bi]
  166. b.D = 0x3C00
  167. for i := range b.Scales {
  168. b.Scales[i] = int8(((i + bi) % 13) - 6)
  169. }
  170. for i := range b.QL {
  171. b.QL[i] = uint8(i*3 + bi)
  172. }
  173. for i := range b.QH {
  174. b.QH[i] = uint8(i*5 + bi)
  175. }
  176. }
  177. x := make([]float32, 256)
  178. for i := range x {
  179. x[i] = float32((i%19)-9) * 0.125
  180. }
  181. wp := GetQ6KDotParams(blocks[:])
  182. var sums [8]float32
  183. DotQ6KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  184. for i := 0; i < 8; i++ {
  185. want := DotQ6_K_Params(&blocks[i], &wp[i], x)
  186. if diff := absDiff(sums[i], want); diff > 1e-4 {
  187. t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want)
  188. }
  189. }
  190. }
  191. func TestDotQ8KSimdMatchesScalar(t *testing.T) {
  192. var b BlockQ8_K
  193. for i := 0; i < 256; i++ {
  194. b.QS[i] = int8((i%31)-15)
  195. }
  196. b.D = 0.125
  197. x := make([]float32, 256)
  198. for i := range x {
  199. x[i] = float32((i%23)-11) * 0.25
  200. }
  201. if !hasAVX2 && !hasAVX512 {
  202. t.Skip("SIMD path not available on this CPU")
  203. }
  204. simd := DotQ8_K(&b, x)
  205. origAVX2, origAVX512 := hasAVX2, hasAVX512
  206. hasAVX2, hasAVX512 = false, false
  207. scalar := DotQ8_K(&b, x)
  208. hasAVX2, hasAVX512 = origAVX2, origAVX512
  209. if diff := absDiff(simd, scalar); diff > 1e-3 {
  210. t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar)
  211. }
  212. }
  213. func TestDotQ2KMatchesDequantDot(t *testing.T) {
  214. var b BlockQ2_K
  215. b.D = 0x3C00
  216. b.DMin = 0x3800
  217. for i := range b.Scales {
  218. b.Scales[i] = uint8((i%16)*16 + i%16)
  219. }
  220. for i := range b.QS {
  221. b.QS[i] = uint8(i * 3)
  222. }
  223. x := make([]float32, 256)
  224. for i := range x {
  225. x[i] = float32((i%19)-9) * 0.125
  226. }
  227. got := DotQ2_K(&b, x)
  228. var deq [256]float32
  229. DequantizeQ2_K(&b, deq[:])
  230. var want float32
  231. for i := 0; i < 256; i++ {
  232. want += x[i] * deq[i]
  233. }
  234. if diff := absDiff(got, want); diff > 1e-4 {
  235. t.Fatalf("mismatch: got=%f want=%f", got, want)
  236. }
  237. }
  238. func TestDotQ3KMatchesDequantDot(t *testing.T) {
  239. var b BlockQ3_K
  240. b.D = 0x3C00
  241. for i := range b.Scales {
  242. b.Scales[i] = uint8(i * 7)
  243. }
  244. for i := range b.QS {
  245. b.QS[i] = uint8(i)
  246. }
  247. for i := range b.HMask {
  248. b.HMask[i] = uint8(i * 5)
  249. }
  250. x := make([]float32, 256)
  251. for i := range x {
  252. x[i] = float32((i%17)-8) * 0.25
  253. }
  254. got := DotQ3_K(&b, x)
  255. var deq [256]float32
  256. DequantizeQ3_K(&b, deq[:])
  257. var want float32
  258. for i := 0; i < 256; i++ {
  259. want += x[i] * deq[i]
  260. }
  261. if diff := absDiff(got, want); diff > 1e-4 {
  262. t.Fatalf("mismatch: got=%f want=%f", got, want)
  263. }
  264. }
  265. func TestDotQ2KSimdMatchesScalar(t *testing.T) {
  266. if !hasAVX2 {
  267. t.Skip("SIMD path not available on this CPU")
  268. }
  269. var b BlockQ2_K
  270. b.D = 0x3C00
  271. b.DMin = 0x3800
  272. for i := range b.Scales {
  273. b.Scales[i] = uint8((i%16)*16 + i%16)
  274. }
  275. for i := range b.QS {
  276. b.QS[i] = uint8(i * 7)
  277. }
  278. x := make([]float32, 256)
  279. for i := range x {
  280. x[i] = float32((i%29)-14) * 0.125
  281. }
  282. simd := DotQ2_K(&b, x)
  283. origAVX2, origAVX512 := hasAVX2, hasAVX512
  284. hasAVX2, hasAVX512 = false, false
  285. scalar := DotQ2_K(&b, x)
  286. hasAVX2, hasAVX512 = origAVX2, origAVX512
  287. if diff := absDiff(simd, scalar); diff > 1e-4 {
  288. t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar)
  289. }
  290. }
  291. func TestDotQ3KSimdMatchesScalar(t *testing.T) {
  292. if !hasAVX2 {
  293. t.Skip("SIMD path not available on this CPU")
  294. }
  295. var b BlockQ3_K
  296. b.D = 0x3C00
  297. for i := range b.Scales {
  298. b.Scales[i] = uint8(i*11 + 3)
  299. }
  300. for i := range b.QS {
  301. b.QS[i] = uint8(i * 5)
  302. }
  303. for i := range b.HMask {
  304. b.HMask[i] = uint8(i * 13)
  305. }
  306. x := make([]float32, 256)
  307. for i := range x {
  308. x[i] = float32((i%23)-11) * 0.25
  309. }
  310. simd := DotQ3_K(&b, x)
  311. origAVX2, origAVX512 := hasAVX2, hasAVX512
  312. hasAVX2, hasAVX512 = false, false
  313. scalar := DotQ3_K(&b, x)
  314. hasAVX2, hasAVX512 = origAVX2, origAVX512
  315. if diff := absDiff(simd, scalar); diff > 1e-4 {
  316. t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar)
  317. }
  318. }
  319. func TestDotQ2KParamsMatchesScalar(t *testing.T) {
  320. var b BlockQ2_K
  321. b.D = 0x3C00
  322. b.DMin = 0x3800
  323. for i := range b.Scales {
  324. b.Scales[i] = uint8(i*7 + 3)
  325. }
  326. for i := range b.QS {
  327. b.QS[i] = uint8(i * 5)
  328. }
  329. x := make([]float32, 256)
  330. for i := range x {
  331. x[i] = float32((i%23)-11) * 0.25
  332. }
  333. params := GetQ2KDotParams([]BlockQ2_K{b})
  334. got := DotQ2_K_Params(&b, &params[0], x)
  335. origAVX2, origAVX512 := hasAVX2, hasAVX512
  336. hasAVX2, hasAVX512 = false, false
  337. want := DotQ2_K(&b, x)
  338. hasAVX2, hasAVX512 = origAVX2, origAVX512
  339. if diff := absDiff(got, want); diff > 1e-4 {
  340. t.Fatalf("mismatch: got=%f want=%f", got, want)
  341. }
  342. }
  343. func TestDotQ2KTile8MatchesScalar(t *testing.T) {
  344. var blocks [8]BlockQ2_K
  345. for bi := range blocks {
  346. b := &blocks[bi]
  347. b.D = 0x3C00
  348. b.DMin = 0x3800
  349. for i := range b.Scales {
  350. b.Scales[i] = uint8((i + bi) * 3)
  351. }
  352. for i := range b.QS {
  353. b.QS[i] = uint8(i*7 + bi)
  354. }
  355. }
  356. x := make([]float32, 256)
  357. for i := range x {
  358. x[i] = float32((i%19)-9) * 0.125
  359. }
  360. wp := GetQ2KDotParams(blocks[:])
  361. var sums [8]float32
  362. DotQ2KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  363. for i := 0; i < 8; i++ {
  364. want := DotQ2_K_Params(&blocks[i], &wp[i], x)
  365. if diff := absDiff(sums[i], want); diff > 1e-4 {
  366. t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want)
  367. }
  368. }
  369. }
  370. func TestDotQ8KTile8MatchesScalar(t *testing.T) {
  371. var blocks [8]BlockQ8_K
  372. for bi := range blocks {
  373. b := &blocks[bi]
  374. for i := 0; i < 256; i++ {
  375. b.QS[i] = int8((i+bi)%31 - 15)
  376. }
  377. b.D = 0.125
  378. }
  379. x := make([]float32, 256)
  380. for i := range x {
  381. x[i] = float32((i%23)-11) * 0.25
  382. }
  383. var sums [8]float32
  384. DotQ8KTile8(&sums, blocks[:], 0, 1, &x[0], 8)
  385. for i := 0; i < 8; i++ {
  386. want := DotQ8_K(&blocks[i], x)
  387. if diff := absDiff(sums[i], want); diff > 1e-3 {
  388. t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want)
  389. }
  390. }
  391. }
  392. func TestDotQ6KSimdMatchesScalar(t *testing.T) {
  393. if !hasAVX2 {
  394. t.Skip("SIMD path not available on this CPU")
  395. }
  396. if !q6kSimdReady() {
  397. t.Skip("Q6K SIMD gate disabled")
  398. }
  399. var b BlockQ6_K
  400. b.D = 0x3C00
  401. for i := range b.Scales {
  402. b.Scales[i] = int8((i % 16) - 8)
  403. }
  404. for i := range b.QL {
  405. b.QL[i] = uint8(i)
  406. }
  407. for i := range b.QH {
  408. b.QH[i] = uint8(i * 3)
  409. }
  410. x := make([]float32, 256)
  411. for i := range x {
  412. x[i] = float32((i%23)-11) * 0.25
  413. }
  414. simd := DotQ6_K(&b, x)
  415. origAVX2, origAVX512 := hasAVX2, hasAVX512
  416. hasAVX2, hasAVX512 = false, false
  417. scalar := DotQ6_K(&b, x)
  418. hasAVX2, hasAVX512 = origAVX2, origAVX512
  419. if diff := absDiff(simd, scalar); diff > 1e-3 {
  420. t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar)
  421. }
  422. }
  423. func TestDequantQ2KSimdMatchesScalar(t *testing.T) {
  424. var b BlockQ2_K
  425. b.D = 0x3C00 // float16(1.0)
  426. b.DMin = 0x3800 // float16(0.5)
  427. for i := range b.Scales {
  428. b.Scales[i] = uint8((i%16)*16 + i%16) // both scales and mins
  429. }
  430. for i := range b.QS {
  431. b.QS[i] = uint8(i)
  432. }
  433. var outSimd, outScalar [256]float32
  434. if !dequantQ2KSimd(&b, outSimd[:]) {
  435. t.Skip("SIMD path not available on this CPU")
  436. }
  437. origAVX2, origAVX512 := hasAVX2, hasAVX512
  438. hasAVX2, hasAVX512 = false, false
  439. DequantizeQ2_K(&b, outScalar[:])
  440. hasAVX2, hasAVX512 = origAVX2, origAVX512
  441. for i := range outScalar {
  442. if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-5 {
  443. t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i])
  444. }
  445. }
  446. }
  447. func TestDequantQ3KSimdMatchesScalar(t *testing.T) {
  448. var b BlockQ3_K
  449. b.D = 0x3C00 // float16(1.0)
  450. for i := range b.Scales {
  451. b.Scales[i] = uint8(i % 64)
  452. }
  453. for i := range b.QS {
  454. b.QS[i] = uint8(i)
  455. }
  456. for i := range b.HMask {
  457. b.HMask[i] = uint8(i * 7)
  458. }
  459. var outSimd, outScalar [256]float32
  460. if !dequantQ3KSimd(&b, outSimd[:]) {
  461. t.Skip("SIMD path not available on this CPU")
  462. }
  463. origAVX2, origAVX512 := hasAVX2, hasAVX512
  464. hasAVX2, hasAVX512 = false, false
  465. DequantizeQ3_K(&b, outScalar[:])
  466. hasAVX2, hasAVX512 = origAVX2, origAVX512
  467. for i := range outScalar {
  468. if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-5 {
  469. t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i])
  470. }
  471. }
  472. }
  473. func TestDequantQ6KSimdMatchesScalar(t *testing.T) {
  474. var b BlockQ6_K
  475. b.D = 0x3C00 // float16(1.0)
  476. for i := range b.Scales {
  477. b.Scales[i] = int8((i % 16) - 8)
  478. }
  479. for i := range b.QL {
  480. b.QL[i] = uint8(i)
  481. }
  482. for i := range b.QH {
  483. b.QH[i] = uint8(i * 3)
  484. }
  485. var outSimd, outScalar [256]float32
  486. if !dequantQ6KSimd(&b, outSimd[:]) {
  487. t.Skip("SIMD path not available on this CPU")
  488. }
  489. origAVX2, origAVX512 := hasAVX2, hasAVX512
  490. hasAVX2, hasAVX512 = false, false
  491. DequantizeQ6_K(&b, outScalar[:])
  492. hasAVX2, hasAVX512 = origAVX2, origAVX512
  493. for i := range outScalar {
  494. if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-4 {
  495. t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i])
  496. }
  497. }
  498. }
  499. func absDiff(a, b float32) float32 {
  500. if a > b {
  501. return a - b
  502. }
  503. return b - a
  504. }