simd_dequant_bench_test.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759
  1. package tensor
  2. import "testing"
  3. // BenchmarkDequantQ8K_SIMD benchmarks the SIMD dequantization path
  4. func BenchmarkDequantQ8K_SIMD(b *testing.B) {
  5. var block BlockQ8_K
  6. for i := 0; i < 256; i++ {
  7. block.QS[i] = int8(i%17 - 8)
  8. }
  9. block.D = 0.125
  10. var out [256]float32
  11. b.ResetTimer()
  12. for i := 0; i < b.N; i++ {
  13. DequantizeQ8_K(&block, out[:])
  14. }
  15. }
  16. // BenchmarkDequantQ8K_Scalar benchmarks the scalar fallback path
  17. func BenchmarkDequantQ8K_Scalar(b *testing.B) {
  18. var block BlockQ8_K
  19. for i := 0; i < 256; i++ {
  20. block.QS[i] = int8(i%17 - 8)
  21. }
  22. block.D = 0.125
  23. // Force scalar path
  24. origAVX2, origAVX512 := hasAVX2, hasAVX512
  25. hasAVX2, hasAVX512 = false, false
  26. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  27. var out [256]float32
  28. b.ResetTimer()
  29. for i := 0; i < b.N; i++ {
  30. DequantizeQ8_K(&block, out[:])
  31. }
  32. }
  33. // BenchmarkDequantQ4K_SIMD benchmarks the SIMD dequantization path
  34. func BenchmarkDequantQ4K_SIMD(b *testing.B) {
  35. var block BlockQ4_K
  36. block.D = 0x3C00
  37. block.DMin = 0x3800
  38. for i := range block.Scales {
  39. block.Scales[i] = uint8((i % 4) + 1)
  40. }
  41. for i := range block.QS {
  42. block.QS[i] = uint8(i)
  43. }
  44. var out [256]float32
  45. b.ResetTimer()
  46. for i := 0; i < b.N; i++ {
  47. DequantizeQ4_K(&block, out[:])
  48. }
  49. }
  50. // BenchmarkDequantQ4K_Scalar benchmarks the scalar fallback path
  51. func BenchmarkDequantQ4K_Scalar(b *testing.B) {
  52. var block BlockQ4_K
  53. block.D = 0x3C00
  54. block.DMin = 0x3800
  55. for i := range block.Scales {
  56. block.Scales[i] = uint8((i % 4) + 1)
  57. }
  58. for i := range block.QS {
  59. block.QS[i] = uint8(i)
  60. }
  61. origAVX2, origAVX512 := hasAVX2, hasAVX512
  62. hasAVX2, hasAVX512 = false, false
  63. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  64. var out [256]float32
  65. b.ResetTimer()
  66. for i := 0; i < b.N; i++ {
  67. DequantizeQ4_K(&block, out[:])
  68. }
  69. }
  70. // BenchmarkDequantQ2K_SIMD benchmarks the SIMD dequantization path
  71. func BenchmarkDequantQ2K_SIMD(b *testing.B) {
  72. var block BlockQ2_K
  73. block.D = 0x3C00
  74. block.DMin = 0x3800
  75. for i := range block.Scales {
  76. block.Scales[i] = uint8((i%16)*16 + i%16)
  77. }
  78. for i := range block.QS {
  79. block.QS[i] = uint8(i)
  80. }
  81. var out [256]float32
  82. b.ResetTimer()
  83. for i := 0; i < b.N; i++ {
  84. DequantizeQ2_K(&block, out[:])
  85. }
  86. }
  87. // BenchmarkDequantQ2K_Scalar benchmarks the scalar fallback path
  88. func BenchmarkDequantQ2K_Scalar(b *testing.B) {
  89. var block BlockQ2_K
  90. block.D = 0x3C00
  91. block.DMin = 0x3800
  92. for i := range block.Scales {
  93. block.Scales[i] = uint8((i%16)*16 + i%16)
  94. }
  95. for i := range block.QS {
  96. block.QS[i] = uint8(i)
  97. }
  98. origAVX2, origAVX512 := hasAVX2, hasAVX512
  99. hasAVX2, hasAVX512 = false, false
  100. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  101. var out [256]float32
  102. b.ResetTimer()
  103. for i := 0; i < b.N; i++ {
  104. DequantizeQ2_K(&block, out[:])
  105. }
  106. }
  107. // BenchmarkDequantQ3K_SIMD benchmarks the optimized Go path
  108. func BenchmarkDequantQ3K_SIMD(b *testing.B) {
  109. var block BlockQ3_K
  110. block.D = 0x3C00
  111. for i := range block.Scales {
  112. block.Scales[i] = uint8(i % 64)
  113. }
  114. for i := range block.QS {
  115. block.QS[i] = uint8(i)
  116. }
  117. for i := range block.HMask {
  118. block.HMask[i] = uint8(i * 7)
  119. }
  120. var out [256]float32
  121. b.ResetTimer()
  122. for i := 0; i < b.N; i++ {
  123. DequantizeQ3_K(&block, out[:])
  124. }
  125. }
  126. // BenchmarkDequantQ3K_Scalar benchmarks the scalar fallback path
  127. func BenchmarkDequantQ3K_Scalar(b *testing.B) {
  128. var block BlockQ3_K
  129. block.D = 0x3C00
  130. for i := range block.Scales {
  131. block.Scales[i] = uint8(i % 64)
  132. }
  133. for i := range block.QS {
  134. block.QS[i] = uint8(i)
  135. }
  136. for i := range block.HMask {
  137. block.HMask[i] = uint8(i * 7)
  138. }
  139. origAVX2, origAVX512 := hasAVX2, hasAVX512
  140. hasAVX2, hasAVX512 = false, false
  141. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  142. var out [256]float32
  143. b.ResetTimer()
  144. for i := 0; i < b.N; i++ {
  145. DequantizeQ3_K(&block, out[:])
  146. }
  147. }
  148. // BenchmarkDequantQ6K_SIMD benchmarks the optimized Go path
  149. func BenchmarkDequantQ6K_SIMD(b *testing.B) {
  150. var block BlockQ6_K
  151. block.D = 0x3C00
  152. for i := range block.Scales {
  153. block.Scales[i] = int8((i % 16) - 8)
  154. }
  155. for i := range block.QL {
  156. block.QL[i] = uint8(i)
  157. }
  158. for i := range block.QH {
  159. block.QH[i] = uint8(i * 3)
  160. }
  161. var out [256]float32
  162. b.ResetTimer()
  163. for i := 0; i < b.N; i++ {
  164. DequantizeQ6_K(&block, out[:])
  165. }
  166. }
  167. func BenchmarkDotQ5K_Params_SIMD(b *testing.B) {
  168. var block BlockQ5_K
  169. block.D = 0x3C00
  170. block.DMin = 0x3800
  171. for i := range block.Scales {
  172. block.Scales[i] = uint8((i%7)*9 + 3)
  173. }
  174. for i := range block.QH {
  175. block.QH[i] = uint8(i * 13)
  176. }
  177. for i := range block.QS {
  178. block.QS[i] = uint8(i * 5)
  179. }
  180. x := make([]float32, 256)
  181. for i := range x {
  182. x[i] = float32((i%23)-11) * 0.25
  183. }
  184. p := GetQ5KDotParams([]BlockQ5_K{block})
  185. var sum float32
  186. b.ResetTimer()
  187. for i := 0; i < b.N; i++ {
  188. sum += DotQ5_K_Params(&block, &p[0], x)
  189. }
  190. _ = sum
  191. }
  192. func BenchmarkDotQ5K_Params_Scalar(b *testing.B) {
  193. var block BlockQ5_K
  194. block.D = 0x3C00
  195. block.DMin = 0x3800
  196. for i := range block.Scales {
  197. block.Scales[i] = uint8((i%7)*9 + 3)
  198. }
  199. for i := range block.QH {
  200. block.QH[i] = uint8(i * 13)
  201. }
  202. for i := range block.QS {
  203. block.QS[i] = uint8(i * 5)
  204. }
  205. x := make([]float32, 256)
  206. for i := range x {
  207. x[i] = float32((i%23)-11) * 0.25
  208. }
  209. p := GetQ5KDotParams([]BlockQ5_K{block})
  210. origAVX2, origAVX512 := hasAVX2, hasAVX512
  211. hasAVX2, hasAVX512 = false, false
  212. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  213. var sum float32
  214. b.ResetTimer()
  215. for i := 0; i < b.N; i++ {
  216. sum += DotQ5_K_Params(&block, &p[0], x)
  217. }
  218. _ = sum
  219. }
  220. func BenchmarkDotQ5KTile8_SIMD(b *testing.B) {
  221. var blocks [8]BlockQ5_K
  222. for bi := range blocks {
  223. blk := &blocks[bi]
  224. blk.D = 0x3C00
  225. blk.DMin = 0x3800
  226. for i := range blk.Scales {
  227. blk.Scales[i] = uint8((i+bi)%11 + 1)
  228. }
  229. for i := range blk.QH {
  230. blk.QH[i] = uint8(i*5 + bi)
  231. }
  232. for i := range blk.QS {
  233. blk.QS[i] = uint8(i*7 + bi)
  234. }
  235. }
  236. x := make([]float32, 256)
  237. for i := range x {
  238. x[i] = float32((i%19)-9) * 0.125
  239. }
  240. wp := GetQ5KDotParams(blocks[:])
  241. var acc float32
  242. b.ResetTimer()
  243. for i := 0; i < b.N; i++ {
  244. var sums [8]float32
  245. DotQ5KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  246. acc += sums[0]
  247. }
  248. _ = acc
  249. }
  250. func BenchmarkDotQ6K_Params_SIMD(b *testing.B) {
  251. var block BlockQ6_K
  252. block.D = 0x3C00
  253. for i := range block.Scales {
  254. block.Scales[i] = int8((i%11) - 5)
  255. }
  256. for i := range block.QL {
  257. block.QL[i] = uint8(i * 7)
  258. }
  259. for i := range block.QH {
  260. block.QH[i] = uint8(i * 13)
  261. }
  262. x := make([]float32, 256)
  263. for i := range x {
  264. x[i] = float32((i%23)-11) * 0.25
  265. }
  266. p := GetQ6KDotParams([]BlockQ6_K{block})
  267. var sum float32
  268. b.ResetTimer()
  269. for i := 0; i < b.N; i++ {
  270. sum += DotQ6_K_Params(&block, &p[0], x)
  271. }
  272. _ = sum
  273. }
  274. func BenchmarkDotQ6K_Params_Scalar(b *testing.B) {
  275. var block BlockQ6_K
  276. block.D = 0x3C00
  277. for i := range block.Scales {
  278. block.Scales[i] = int8((i%11) - 5)
  279. }
  280. for i := range block.QL {
  281. block.QL[i] = uint8(i * 7)
  282. }
  283. for i := range block.QH {
  284. block.QH[i] = uint8(i * 13)
  285. }
  286. x := make([]float32, 256)
  287. for i := range x {
  288. x[i] = float32((i%23)-11) * 0.25
  289. }
  290. p := GetQ6KDotParams([]BlockQ6_K{block})
  291. origAVX2, origAVX512 := hasAVX2, hasAVX512
  292. hasAVX2, hasAVX512 = false, false
  293. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  294. var sum float32
  295. b.ResetTimer()
  296. for i := 0; i < b.N; i++ {
  297. sum += DotQ6_K_Params(&block, &p[0], x)
  298. }
  299. _ = sum
  300. }
  301. func BenchmarkDotQ6KTile8_SIMD(b *testing.B) {
  302. var blocks [8]BlockQ6_K
  303. for bi := range blocks {
  304. blk := &blocks[bi]
  305. blk.D = 0x3C00
  306. for i := range blk.Scales {
  307. blk.Scales[i] = int8(((i + bi) % 13) - 6)
  308. }
  309. for i := range blk.QL {
  310. blk.QL[i] = uint8(i*3 + bi)
  311. }
  312. for i := range blk.QH {
  313. blk.QH[i] = uint8(i*5 + bi)
  314. }
  315. }
  316. x := make([]float32, 256)
  317. for i := range x {
  318. x[i] = float32((i%19)-9) * 0.125
  319. }
  320. wp := GetQ6KDotParams(blocks[:])
  321. var acc float32
  322. b.ResetTimer()
  323. for i := 0; i < b.N; i++ {
  324. var sums [8]float32
  325. DotQ6KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  326. acc += sums[0]
  327. }
  328. _ = acc
  329. }
  330. func BenchmarkDotQ2K_Params_SIMD(b *testing.B) {
  331. var block BlockQ2_K
  332. block.D = 0x3C00
  333. block.DMin = 0x3800
  334. for i := range block.Scales {
  335. block.Scales[i] = uint8(i*7 + 3)
  336. }
  337. for i := range block.QS {
  338. block.QS[i] = uint8(i * 5)
  339. }
  340. x := make([]float32, 256)
  341. for i := range x {
  342. x[i] = float32((i%23)-11) * 0.25
  343. }
  344. p := GetQ2KDotParams([]BlockQ2_K{block})
  345. var sum float32
  346. b.ResetTimer()
  347. for i := 0; i < b.N; i++ {
  348. sum += DotQ2_K_Params(&block, &p[0], x)
  349. }
  350. _ = sum
  351. }
  352. func BenchmarkDotQ2K_Params_Scalar(b *testing.B) {
  353. var block BlockQ2_K
  354. block.D = 0x3C00
  355. block.DMin = 0x3800
  356. for i := range block.Scales {
  357. block.Scales[i] = uint8(i*7 + 3)
  358. }
  359. for i := range block.QS {
  360. block.QS[i] = uint8(i * 5)
  361. }
  362. x := make([]float32, 256)
  363. for i := range x {
  364. x[i] = float32((i%23)-11) * 0.25
  365. }
  366. p := GetQ2KDotParams([]BlockQ2_K{block})
  367. origAVX2, origAVX512 := hasAVX2, hasAVX512
  368. hasAVX2, hasAVX512 = false, false
  369. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  370. var sum float32
  371. b.ResetTimer()
  372. for i := 0; i < b.N; i++ {
  373. sum += DotQ2_K_Params(&block, &p[0], x)
  374. }
  375. _ = sum
  376. }
  377. func BenchmarkDotQ2KTile8_SIMD(b *testing.B) {
  378. var blocks [8]BlockQ2_K
  379. for bi := range blocks {
  380. blk := &blocks[bi]
  381. blk.D = 0x3C00
  382. blk.DMin = 0x3800
  383. for i := range blk.Scales {
  384. blk.Scales[i] = uint8((i+bi)%11 + 1)
  385. }
  386. for i := range blk.QS {
  387. blk.QS[i] = uint8(i*7 + bi)
  388. }
  389. }
  390. x := make([]float32, 256)
  391. for i := range x {
  392. x[i] = float32((i%19)-9) * 0.125
  393. }
  394. wp := GetQ2KDotParams(blocks[:])
  395. var acc float32
  396. b.ResetTimer()
  397. for i := 0; i < b.N; i++ {
  398. var sums [8]float32
  399. DotQ2KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  400. acc += sums[0]
  401. }
  402. _ = acc
  403. }
  404. func BenchmarkDotQ2KTile8_Scalar(b *testing.B) {
  405. var blocks [8]BlockQ2_K
  406. for bi := range blocks {
  407. blk := &blocks[bi]
  408. blk.D = 0x3C00
  409. blk.DMin = 0x3800
  410. for i := range blk.Scales {
  411. blk.Scales[i] = uint8((i+bi)%11 + 1)
  412. }
  413. for i := range blk.QS {
  414. blk.QS[i] = uint8(i*7 + bi)
  415. }
  416. }
  417. x := make([]float32, 256)
  418. for i := range x {
  419. x[i] = float32((i%19)-9) * 0.125
  420. }
  421. wp := GetQ2KDotParams(blocks[:])
  422. origAVX2, origAVX512 := hasAVX2, hasAVX512
  423. hasAVX2, hasAVX512 = false, false
  424. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  425. var acc float32
  426. b.ResetTimer()
  427. for i := 0; i < b.N; i++ {
  428. var sums [8]float32
  429. DotQ2KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  430. acc += sums[0]
  431. }
  432. _ = acc
  433. }
  434. func BenchmarkDotQ8KTile8_SIMD(b *testing.B) {
  435. var blocks [8]BlockQ8_K
  436. for bi := range blocks {
  437. blk := &blocks[bi]
  438. for i := 0; i < 256; i++ {
  439. blk.QS[i] = int8((i+bi)%31 - 15)
  440. }
  441. blk.D = 0.125
  442. }
  443. x := make([]float32, 256)
  444. for i := range x {
  445. x[i] = float32((i%23)-11) * 0.25
  446. }
  447. var acc float32
  448. b.ResetTimer()
  449. for i := 0; i < b.N; i++ {
  450. var sums [8]float32
  451. DotQ8KTile8(&sums, blocks[:], 0, 1, &x[0], 8)
  452. acc += sums[0]
  453. }
  454. _ = acc
  455. }
  456. func BenchmarkDotQ3K_Params_SIMD(b *testing.B) {
  457. var block BlockQ3_K
  458. block.D = 0x3C00
  459. for i := range block.Scales {
  460. block.Scales[i] = uint8(i*11 + 3)
  461. }
  462. for i := range block.QS {
  463. block.QS[i] = uint8(i * 5)
  464. }
  465. for i := range block.HMask {
  466. block.HMask[i] = uint8(i * 13)
  467. }
  468. x := make([]float32, 256)
  469. for i := range x {
  470. x[i] = float32((i%23)-11) * 0.25
  471. }
  472. p := GetQ3KDotParams([]BlockQ3_K{block})
  473. var sum float32
  474. b.ResetTimer()
  475. for i := 0; i < b.N; i++ {
  476. sum += DotQ3_K_Params(&block, &p[0], x)
  477. }
  478. _ = sum
  479. }
  480. func BenchmarkDotQ3K_Params_Scalar(b *testing.B) {
  481. var block BlockQ3_K
  482. block.D = 0x3C00
  483. for i := range block.Scales {
  484. block.Scales[i] = uint8(i*11 + 3)
  485. }
  486. for i := range block.QS {
  487. block.QS[i] = uint8(i * 5)
  488. }
  489. for i := range block.HMask {
  490. block.HMask[i] = uint8(i * 13)
  491. }
  492. x := make([]float32, 256)
  493. for i := range x {
  494. x[i] = float32((i%23)-11) * 0.25
  495. }
  496. p := GetQ3KDotParams([]BlockQ3_K{block})
  497. origAVX2, origAVX512 := hasAVX2, hasAVX512
  498. hasAVX2, hasAVX512 = false, false
  499. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  500. var sum float32
  501. b.ResetTimer()
  502. for i := 0; i < b.N; i++ {
  503. sum += DotQ3_K_Params(&block, &p[0], x)
  504. }
  505. _ = sum
  506. }
  507. func BenchmarkDotQ3KTile8_SIMD(b *testing.B) {
  508. var blocks [8]BlockQ3_K
  509. for bi := range blocks {
  510. blk := &blocks[bi]
  511. blk.D = 0x3C00
  512. for i := range blk.Scales {
  513. blk.Scales[i] = uint8((i+bi)%64)
  514. }
  515. for i := range blk.QS {
  516. blk.QS[i] = uint8(i*7 + bi)
  517. }
  518. for i := range blk.HMask {
  519. blk.HMask[i] = uint8(i*5 + bi)
  520. }
  521. }
  522. x := make([]float32, 256)
  523. for i := range x {
  524. x[i] = float32((i%19)-9) * 0.125
  525. }
  526. wp := GetQ3KDotParams(blocks[:])
  527. var acc float32
  528. b.ResetTimer()
  529. for i := 0; i < b.N; i++ {
  530. var sums [8]float32
  531. DotQ3KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  532. acc += sums[0]
  533. }
  534. _ = acc
  535. }
  536. func BenchmarkDotQ3KTile8_Scalar(b *testing.B) {
  537. var blocks [8]BlockQ3_K
  538. for bi := range blocks {
  539. blk := &blocks[bi]
  540. blk.D = 0x3C00
  541. for i := range blk.Scales {
  542. blk.Scales[i] = uint8((i+bi)%64)
  543. }
  544. for i := range blk.QS {
  545. blk.QS[i] = uint8(i*7 + bi)
  546. }
  547. for i := range blk.HMask {
  548. blk.HMask[i] = uint8(i*5 + bi)
  549. }
  550. }
  551. x := make([]float32, 256)
  552. for i := range x {
  553. x[i] = float32((i%19)-9) * 0.125
  554. }
  555. wp := GetQ3KDotParams(blocks[:])
  556. origAVX2, origAVX512 := hasAVX2, hasAVX512
  557. hasAVX2, hasAVX512 = false, false
  558. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  559. var acc float32
  560. b.ResetTimer()
  561. for i := 0; i < b.N; i++ {
  562. var sums [8]float32
  563. DotQ3KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  564. acc += sums[0]
  565. }
  566. _ = acc
  567. }
  568. func BenchmarkDotQ8KTile8_Scalar(b *testing.B) {
  569. var blocks [8]BlockQ8_K
  570. for bi := range blocks {
  571. blk := &blocks[bi]
  572. for i := 0; i < 256; i++ {
  573. blk.QS[i] = int8((i+bi)%31 - 15)
  574. }
  575. blk.D = 0.125
  576. }
  577. x := make([]float32, 256)
  578. for i := range x {
  579. x[i] = float32((i%23)-11) * 0.25
  580. }
  581. origAVX2, origAVX512 := hasAVX2, hasAVX512
  582. hasAVX2, hasAVX512 = false, false
  583. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  584. var acc float32
  585. b.ResetTimer()
  586. for i := 0; i < b.N; i++ {
  587. var sums [8]float32
  588. DotQ8KTile8(&sums, blocks[:], 0, 1, &x[0], 8)
  589. acc += sums[0]
  590. }
  591. _ = acc
  592. }
  593. func BenchmarkDotQ6KTile8_Scalar(b *testing.B) {
  594. var blocks [8]BlockQ6_K
  595. for bi := range blocks {
  596. blk := &blocks[bi]
  597. blk.D = 0x3C00
  598. for i := range blk.Scales {
  599. blk.Scales[i] = int8(((i + bi) % 13) - 6)
  600. }
  601. for i := range blk.QL {
  602. blk.QL[i] = uint8(i*3 + bi)
  603. }
  604. for i := range blk.QH {
  605. blk.QH[i] = uint8(i*5 + bi)
  606. }
  607. }
  608. x := make([]float32, 256)
  609. for i := range x {
  610. x[i] = float32((i%19)-9) * 0.125
  611. }
  612. wp := GetQ6KDotParams(blocks[:])
  613. origAVX2, origAVX512 := hasAVX2, hasAVX512
  614. hasAVX2, hasAVX512 = false, false
  615. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  616. var acc float32
  617. b.ResetTimer()
  618. for i := 0; i < b.N; i++ {
  619. var sums [8]float32
  620. DotQ6KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  621. acc += sums[0]
  622. }
  623. _ = acc
  624. }
  625. func BenchmarkDotQ5KTile8_Scalar(b *testing.B) {
  626. var blocks [8]BlockQ5_K
  627. for bi := range blocks {
  628. blk := &blocks[bi]
  629. blk.D = 0x3C00
  630. blk.DMin = 0x3800
  631. for i := range blk.Scales {
  632. blk.Scales[i] = uint8((i+bi)%11 + 1)
  633. }
  634. for i := range blk.QH {
  635. blk.QH[i] = uint8(i*5 + bi)
  636. }
  637. for i := range blk.QS {
  638. blk.QS[i] = uint8(i*7 + bi)
  639. }
  640. }
  641. x := make([]float32, 256)
  642. for i := range x {
  643. x[i] = float32((i%19)-9) * 0.125
  644. }
  645. wp := GetQ5KDotParams(blocks[:])
  646. origAVX2, origAVX512 := hasAVX2, hasAVX512
  647. hasAVX2, hasAVX512 = false, false
  648. defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
  649. var acc float32
  650. b.ResetTimer()
  651. for i := 0; i < b.N; i++ {
  652. var sums [8]float32
  653. DotQ5KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
  654. acc += sums[0]
  655. }
  656. _ = acc
  657. }