linear_shared.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688
  1. package matmul
  2. import (
  3. "fmt"
  4. "sync"
  5. "makarna/pkg/backend/cpu"
  6. "makarna/pkg/tensor"
  7. )
  8. // linearCPU contains the original CPU implementations for all supported
  9. // weight dtypes. Both CPU-only and CUDA-enabled builds reuse this.
  10. func linearCPU(input, weight, output *cpu.Tensor) error {
  11. inShape := input.Shape()
  12. wShape := weight.Shape()
  13. // Validate dimensions
  14. if len(inShape) != 2 || len(wShape) != 2 {
  15. return fmt.Errorf("linear: expected 2D inputs, got input %v, weight %v", inShape, wShape)
  16. }
  17. M := inShape[0]
  18. K := inShape[1]
  19. N := wShape[0]
  20. if wShape[1] != K {
  21. return fmt.Errorf("linear: shape mismatch: input [*, %d] vs weight [%d, %d]", K, N, wShape[1])
  22. }
  23. inData := input.DataFloat32()
  24. outData := output.DataFloat32()
  25. workers := cpu.MaxThreads()
  26. switch weight.DType() {
  27. case tensor.Float32:
  28. wData := weight.DataFloat32()
  29. gemmFloat32Blocked(outData, inData, wData, M, K, N, workers)
  30. case tensor.Q4_K:
  31. wData := weight.DataQ4_K()
  32. if K%256 != 0 {
  33. return fmt.Errorf("linear: Q4_K weight K dimension %d must be multiple of 256", K)
  34. }
  35. wParams := tensor.GetQ4KDotParams(wData)
  36. blocksPerRow := K / 256
  37. work := M * N * K
  38. use := chooseWorkers(work, workers)
  39. if use == 1 {
  40. if M == 1 {
  41. q4kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, 0, N)
  42. return nil
  43. }
  44. for m := 0; m < M; m++ {
  45. for n := 0; n < N; n++ {
  46. var sum float32
  47. for b := 0; b < blocksPerRow; b++ {
  48. inOffset := m*K + b*256
  49. wBlockIdx := n*blocksPerRow + b
  50. block := &wData[wBlockIdx]
  51. p := &wParams[wBlockIdx]
  52. sum += tensor.DotQ4_K_Params(block, p, inData[inOffset:inOffset+256])
  53. }
  54. outData[m*N+n] = sum
  55. }
  56. }
  57. return nil
  58. }
  59. var wg sync.WaitGroup
  60. if M == 1 {
  61. for _, r := range chunkRanges(N, use) {
  62. wg.Add(1)
  63. start, end := r[0], r[1]
  64. go func(s, e int) {
  65. defer wg.Done()
  66. q4kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, s, e)
  67. }(start, end)
  68. }
  69. wg.Wait()
  70. return nil
  71. }
  72. if M < use {
  73. for _, r := range chunkRanges(N, use) {
  74. wg.Add(1)
  75. start, end := r[0], r[1]
  76. go func(s, e int) {
  77. defer wg.Done()
  78. for n := s; n < e; n++ {
  79. for m := 0; m < M; m++ {
  80. var sum float32
  81. for b := 0; b < blocksPerRow; b++ {
  82. inOffset := m*K + b*256
  83. wBlockIdx := n*blocksPerRow + b
  84. block := &wData[wBlockIdx]
  85. p := &wParams[wBlockIdx]
  86. sum += tensor.DotQ4_K_Params(block, p, inData[inOffset:inOffset+256])
  87. }
  88. outData[m*N+n] = sum
  89. }
  90. }
  91. }(start, end)
  92. }
  93. wg.Wait()
  94. return nil
  95. }
  96. for _, r := range chunkRanges(M, use) {
  97. wg.Add(1)
  98. start, end := r[0], r[1]
  99. go func(s, e int) {
  100. defer wg.Done()
  101. for m := s; m < e; m++ {
  102. for n := 0; n < N; n++ {
  103. var sum float32
  104. for b := 0; b < blocksPerRow; b++ {
  105. inOffset := m*K + b*256
  106. wBlockIdx := n*blocksPerRow + b
  107. block := &wData[wBlockIdx]
  108. p := &wParams[wBlockIdx]
  109. sum += tensor.DotQ4_K_Params(block, p, inData[inOffset:inOffset+256])
  110. }
  111. outData[m*N+n] = sum
  112. }
  113. }
  114. }(start, end)
  115. }
  116. wg.Wait()
  117. case tensor.Q8_K:
  118. wData := weight.DataQ8_K()
  119. if K%256 != 0 {
  120. return fmt.Errorf("linear: Q8_K weight K dimension %d must be multiple of 256", K)
  121. }
  122. blocksPerRow := K / 256
  123. work := M * N * K
  124. use := chooseWorkers(work, workers)
  125. if use == 1 {
  126. if M == 1 {
  127. q8kGemvDecodeTiled(outData[:N], inData[:K], wData, N, blocksPerRow, 0, N)
  128. return nil
  129. }
  130. for m := 0; m < M; m++ {
  131. for n := 0; n < N; n++ {
  132. var sum float32
  133. for b := 0; b < blocksPerRow; b++ {
  134. inOffset := m*K + b*256
  135. wBlockIdx := n*blocksPerRow + b
  136. block := &wData[wBlockIdx]
  137. sum += tensor.DotQ8_K(block, inData[inOffset:inOffset+256])
  138. }
  139. outData[m*N+n] = sum
  140. }
  141. }
  142. return nil
  143. }
  144. var wg sync.WaitGroup
  145. if M == 1 {
  146. for _, r := range chunkRanges(N, use) {
  147. wg.Add(1)
  148. start, end := r[0], r[1]
  149. go func(s, e int) {
  150. defer wg.Done()
  151. q8kGemvDecodeTiled(outData[:N], inData[:K], wData, N, blocksPerRow, s, e)
  152. }(start, end)
  153. }
  154. wg.Wait()
  155. return nil
  156. }
  157. if M < use {
  158. for _, r := range chunkRanges(N, use) {
  159. wg.Add(1)
  160. start, end := r[0], r[1]
  161. go func(s, e int) {
  162. defer wg.Done()
  163. for n := s; n < e; n++ {
  164. for m := 0; m < M; m++ {
  165. var sum float32
  166. for b := 0; b < blocksPerRow; b++ {
  167. inOffset := m*K + b*256
  168. wBlockIdx := n*blocksPerRow + b
  169. block := &wData[wBlockIdx]
  170. sum += tensor.DotQ8_K(block, inData[inOffset:inOffset+256])
  171. }
  172. outData[m*N+n] = sum
  173. }
  174. }
  175. }(start, end)
  176. }
  177. wg.Wait()
  178. return nil
  179. }
  180. for _, r := range chunkRanges(M, use) {
  181. wg.Add(1)
  182. start, end := r[0], r[1]
  183. go func(s, e int) {
  184. defer wg.Done()
  185. for m := s; m < e; m++ {
  186. for n := 0; n < N; n++ {
  187. var sum float32
  188. for b := 0; b < blocksPerRow; b++ {
  189. inOffset := m*K + b*256
  190. wBlockIdx := n*blocksPerRow + b
  191. block := &wData[wBlockIdx]
  192. sum += tensor.DotQ8_K(block, inData[inOffset:inOffset+256])
  193. }
  194. outData[m*N+n] = sum
  195. }
  196. }
  197. }(start, end)
  198. }
  199. wg.Wait()
  200. case tensor.Q3_K:
  201. wData := weight.DataQ3_K()
  202. if K%256 != 0 {
  203. return fmt.Errorf("linear: Q3_K weight K dimension %d must be multiple of 256", K)
  204. }
  205. wParams := tensor.GetQ3KDotParams(wData)
  206. blocksPerRow := K / 256
  207. work := M * N * K
  208. use := chooseWorkers(work, workers)
  209. if use == 1 {
  210. if M == 1 {
  211. q3kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, 0, N)
  212. return nil
  213. }
  214. for m := 0; m < M; m++ {
  215. for n := 0; n < N; n++ {
  216. var sum float32
  217. for b := 0; b < blocksPerRow; b++ {
  218. inOffset := m*K + b*256
  219. wBlockIdx := n*blocksPerRow + b
  220. block := &wData[wBlockIdx]
  221. p := &wParams[wBlockIdx]
  222. sum += tensor.DotQ3_K_Params(block, p, inData[inOffset:inOffset+256])
  223. }
  224. outData[m*N+n] = sum
  225. }
  226. }
  227. return nil
  228. }
  229. var wg sync.WaitGroup
  230. if M == 1 {
  231. for _, r := range chunkRanges(N, use) {
  232. wg.Add(1)
  233. start, end := r[0], r[1]
  234. go func(s, e int) {
  235. defer wg.Done()
  236. q3kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, s, e)
  237. }(start, end)
  238. }
  239. wg.Wait()
  240. return nil
  241. }
  242. if M < use {
  243. for _, r := range chunkRanges(N, use) {
  244. wg.Add(1)
  245. start, end := r[0], r[1]
  246. go func(s, e int) {
  247. defer wg.Done()
  248. for n := s; n < e; n++ {
  249. for m := 0; m < M; m++ {
  250. var sum float32
  251. for b := 0; b < blocksPerRow; b++ {
  252. inOffset := m*K + b*256
  253. wBlockIdx := n*blocksPerRow + b
  254. block := &wData[wBlockIdx]
  255. p := &wParams[wBlockIdx]
  256. sum += tensor.DotQ3_K_Params(block, p, inData[inOffset:inOffset+256])
  257. }
  258. outData[m*N+n] = sum
  259. }
  260. }
  261. }(start, end)
  262. }
  263. wg.Wait()
  264. return nil
  265. }
  266. for _, r := range chunkRanges(M, use) {
  267. wg.Add(1)
  268. start, end := r[0], r[1]
  269. go func(s, e int) {
  270. defer wg.Done()
  271. for m := s; m < e; m++ {
  272. for n := 0; n < N; n++ {
  273. var sum float32
  274. for b := 0; b < blocksPerRow; b++ {
  275. inOffset := m*K + b*256
  276. wBlockIdx := n*blocksPerRow + b
  277. block := &wData[wBlockIdx]
  278. p := &wParams[wBlockIdx]
  279. sum += tensor.DotQ3_K_Params(block, p, inData[inOffset:inOffset+256])
  280. }
  281. outData[m*N+n] = sum
  282. }
  283. }
  284. }(start, end)
  285. }
  286. wg.Wait()
  287. case tensor.Q5_K:
  288. wData := weight.DataQ5_K()
  289. if K%256 != 0 {
  290. return fmt.Errorf("linear: Q5_K weight K dimension %d must be multiple of 256", K)
  291. }
  292. wParams := tensor.GetQ5KDotParams(wData)
  293. blocksPerRow := K / 256
  294. work := M * N * K
  295. use := chooseWorkers(work, workers)
  296. if use == 1 {
  297. if M == 1 {
  298. q5kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, 0, N)
  299. return nil
  300. }
  301. for m := 0; m < M; m++ {
  302. for n := 0; n < N; n++ {
  303. var sum float32
  304. for b := 0; b < blocksPerRow; b++ {
  305. inOffset := m*K + b*256
  306. wBlockIdx := n*blocksPerRow + b
  307. block := &wData[wBlockIdx]
  308. p := &wParams[wBlockIdx]
  309. sum += tensor.DotQ5_K_Params(block, p, inData[inOffset:inOffset+256])
  310. }
  311. outData[m*N+n] = sum
  312. }
  313. }
  314. return nil
  315. }
  316. var wg sync.WaitGroup
  317. if M == 1 {
  318. for _, r := range chunkRanges(N, use) {
  319. wg.Add(1)
  320. start, end := r[0], r[1]
  321. go func(s, e int) {
  322. defer wg.Done()
  323. q5kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, s, e)
  324. }(start, end)
  325. }
  326. wg.Wait()
  327. return nil
  328. }
  329. if M < use {
  330. for _, r := range chunkRanges(N, use) {
  331. wg.Add(1)
  332. start, end := r[0], r[1]
  333. go func(s, e int) {
  334. defer wg.Done()
  335. for n := s; n < e; n++ {
  336. for m := 0; m < M; m++ {
  337. var sum float32
  338. for b := 0; b < blocksPerRow; b++ {
  339. inOffset := m*K + b*256
  340. wBlockIdx := n*blocksPerRow + b
  341. block := &wData[wBlockIdx]
  342. p := &wParams[wBlockIdx]
  343. sum += tensor.DotQ5_K_Params(block, p, inData[inOffset:inOffset+256])
  344. }
  345. outData[m*N+n] = sum
  346. }
  347. }
  348. }(start, end)
  349. }
  350. wg.Wait()
  351. return nil
  352. }
  353. for _, r := range chunkRanges(M, use) {
  354. wg.Add(1)
  355. start, end := r[0], r[1]
  356. go func(s, e int) {
  357. defer wg.Done()
  358. for m := s; m < e; m++ {
  359. for n := 0; n < N; n++ {
  360. var sum float32
  361. for b := 0; b < blocksPerRow; b++ {
  362. inOffset := m*K + b*256
  363. wBlockIdx := n*blocksPerRow + b
  364. block := &wData[wBlockIdx]
  365. p := &wParams[wBlockIdx]
  366. sum += tensor.DotQ5_K_Params(block, p, inData[inOffset:inOffset+256])
  367. }
  368. outData[m*N+n] = sum
  369. }
  370. }
  371. }(start, end)
  372. }
  373. wg.Wait()
  374. case tensor.Q6_K:
  375. wData := weight.DataQ6_K()
  376. if K%256 != 0 {
  377. return fmt.Errorf("linear: Q6_K weight K dimension %d must be multiple of 256", K)
  378. }
  379. wParams := tensor.GetQ6KDotParams(wData)
  380. blocksPerRow := K / 256
  381. work := M * N * K
  382. use := chooseWorkers(work, workers)
  383. if use == 1 {
  384. if M == 1 {
  385. q6kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, 0, N)
  386. return nil
  387. }
  388. for m := 0; m < M; m++ {
  389. for n := 0; n < N; n++ {
  390. var sum float32
  391. for b := 0; b < blocksPerRow; b++ {
  392. inOffset := m*K + b*256
  393. wBlockIdx := n*blocksPerRow + b
  394. block := &wData[wBlockIdx]
  395. p := &wParams[wBlockIdx]
  396. sum += tensor.DotQ6_K_Params(block, p, inData[inOffset:inOffset+256])
  397. }
  398. outData[m*N+n] = sum
  399. }
  400. }
  401. return nil
  402. }
  403. var wg sync.WaitGroup
  404. if M == 1 {
  405. for _, r := range chunkRanges(N, use) {
  406. wg.Add(1)
  407. start, end := r[0], r[1]
  408. go func(s, e int) {
  409. defer wg.Done()
  410. q6kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, s, e)
  411. }(start, end)
  412. }
  413. wg.Wait()
  414. return nil
  415. }
  416. if M < use {
  417. for _, r := range chunkRanges(N, use) {
  418. wg.Add(1)
  419. start, end := r[0], r[1]
  420. go func(s, e int) {
  421. defer wg.Done()
  422. for n := s; n < e; n++ {
  423. for m := 0; m < M; m++ {
  424. var sum float32
  425. for b := 0; b < blocksPerRow; b++ {
  426. inOffset := m*K + b*256
  427. wBlockIdx := n*blocksPerRow + b
  428. block := &wData[wBlockIdx]
  429. p := &wParams[wBlockIdx]
  430. sum += tensor.DotQ6_K_Params(block, p, inData[inOffset:inOffset+256])
  431. }
  432. outData[m*N+n] = sum
  433. }
  434. }
  435. }(start, end)
  436. }
  437. wg.Wait()
  438. return nil
  439. }
  440. for _, r := range chunkRanges(M, use) {
  441. wg.Add(1)
  442. start, end := r[0], r[1]
  443. go func(s, e int) {
  444. defer wg.Done()
  445. for m := s; m < e; m++ {
  446. for n := 0; n < N; n++ {
  447. var sum float32
  448. for b := 0; b < blocksPerRow; b++ {
  449. inOffset := m*K + b*256
  450. wBlockIdx := n*blocksPerRow + b
  451. block := &wData[wBlockIdx]
  452. p := &wParams[wBlockIdx]
  453. sum += tensor.DotQ6_K_Params(block, p, inData[inOffset:inOffset+256])
  454. }
  455. outData[m*N+n] = sum
  456. }
  457. }
  458. }(start, end)
  459. }
  460. wg.Wait()
  461. case tensor.Q2_K:
  462. wData := weight.DataQ2_K()
  463. if K%256 != 0 {
  464. return fmt.Errorf("linear: Q2_K weight K dimension %d must be multiple of 256", K)
  465. }
  466. wParams := tensor.GetQ2KDotParams(wData)
  467. blocksPerRow := K / 256
  468. work := M * N * K
  469. use := chooseWorkers(work, workers)
  470. if use == 1 {
  471. if M == 1 {
  472. q2kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, 0, N)
  473. return nil
  474. }
  475. for m := 0; m < M; m++ {
  476. for n := 0; n < N; n++ {
  477. var sum float32
  478. for b := 0; b < blocksPerRow; b++ {
  479. inOffset := m*K + b*256
  480. wBlockIdx := n*blocksPerRow + b
  481. block := &wData[wBlockIdx]
  482. p := &wParams[wBlockIdx]
  483. sum += tensor.DotQ2_K_Params(block, p, inData[inOffset:inOffset+256])
  484. }
  485. outData[m*N+n] = sum
  486. }
  487. }
  488. return nil
  489. }
  490. var wg sync.WaitGroup
  491. if M == 1 {
  492. for _, r := range chunkRanges(N, use) {
  493. wg.Add(1)
  494. start, end := r[0], r[1]
  495. go func(s, e int) {
  496. defer wg.Done()
  497. q2kGemvDecodeTiled(outData[:N], inData[:K], wData, wParams, N, blocksPerRow, s, e)
  498. }(start, end)
  499. }
  500. wg.Wait()
  501. return nil
  502. }
  503. if M < use {
  504. for _, r := range chunkRanges(N, use) {
  505. wg.Add(1)
  506. start, end := r[0], r[1]
  507. go func(s, e int) {
  508. defer wg.Done()
  509. for n := s; n < e; n++ {
  510. for m := 0; m < M; m++ {
  511. var sum float32
  512. for b := 0; b < blocksPerRow; b++ {
  513. inOffset := m*K + b*256
  514. wBlockIdx := n*blocksPerRow + b
  515. block := &wData[wBlockIdx]
  516. p := &wParams[wBlockIdx]
  517. sum += tensor.DotQ2_K_Params(block, p, inData[inOffset:inOffset+256])
  518. }
  519. outData[m*N+n] = sum
  520. }
  521. }
  522. }(start, end)
  523. }
  524. wg.Wait()
  525. return nil
  526. }
  527. for _, r := range chunkRanges(M, use) {
  528. wg.Add(1)
  529. start, end := r[0], r[1]
  530. go func(s, e int) {
  531. defer wg.Done()
  532. for m := s; m < e; m++ {
  533. for n := 0; n < N; n++ {
  534. var sum float32
  535. for b := 0; b < blocksPerRow; b++ {
  536. inOffset := m*K + b*256
  537. wBlockIdx := n*blocksPerRow + b
  538. block := &wData[wBlockIdx]
  539. p := &wParams[wBlockIdx]
  540. sum += tensor.DotQ2_K_Params(block, p, inData[inOffset:inOffset+256])
  541. }
  542. outData[m*N+n] = sum
  543. }
  544. }
  545. }(start, end)
  546. }
  547. wg.Wait()
  548. default:
  549. return fmt.Errorf("linear: unsupported weight dtype %v", weight.DType())
  550. }
  551. return nil
  552. }
  553. func q4kGemvDecodeTiled(out []float32, x []float32, w []tensor.BlockQ4_K, wp []tensor.Q4KDotParams, N, blocksPerRow, startN, endN int) {
  554. const tile = 8
  555. for n := startN; n < endN; n += tile {
  556. tn := endN - n
  557. if tn > tile {
  558. tn = tile
  559. }
  560. var sums [tile]float32
  561. for b := 0; b < blocksPerRow; b++ {
  562. xBlock := &x[b*256]
  563. base := n*blocksPerRow + b
  564. tensor.DotQ4KTile8(&sums, w, wp, base, blocksPerRow, xBlock, tn)
  565. }
  566. for t := 0; t < tn; t++ {
  567. out[n+t] = sums[t]
  568. }
  569. }
  570. }
  571. func q5kGemvDecodeTiled(out []float32, x []float32, w []tensor.BlockQ5_K, wp []tensor.Q5KDotParams, N, blocksPerRow, startN, endN int) {
  572. const tile = 8
  573. for n := startN; n < endN; n += tile {
  574. tn := endN - n
  575. if tn > tile {
  576. tn = tile
  577. }
  578. var sums [tile]float32
  579. for b := 0; b < blocksPerRow; b++ {
  580. xBlock := &x[b*256]
  581. base := n*blocksPerRow + b
  582. tensor.DotQ5KTile8(&sums, w, wp, base, blocksPerRow, xBlock, tn)
  583. }
  584. for t := 0; t < tn; t++ {
  585. out[n+t] = sums[t]
  586. }
  587. }
  588. }
  589. func q6kGemvDecodeTiled(out []float32, x []float32, w []tensor.BlockQ6_K, wp []tensor.Q6KDotParams, N, blocksPerRow, startN, endN int) {
  590. const tile = 8
  591. for n := startN; n < endN; n += tile {
  592. tn := endN - n
  593. if tn > tile {
  594. tn = tile
  595. }
  596. var sums [tile]float32
  597. for b := 0; b < blocksPerRow; b++ {
  598. xBlock := &x[b*256]
  599. base := n*blocksPerRow + b
  600. tensor.DotQ6KTile8(&sums, w, wp, base, blocksPerRow, xBlock, tn)
  601. }
  602. for t := 0; t < tn; t++ {
  603. out[n+t] = sums[t]
  604. }
  605. }
  606. }
  607. func q3kGemvDecodeTiled(out []float32, x []float32, w []tensor.BlockQ3_K, wp []tensor.Q3KDotParams, N, blocksPerRow, startN, endN int) {
  608. const tile = 8
  609. for n := startN; n < endN; n += tile {
  610. tn := endN - n
  611. if tn > tile {
  612. tn = tile
  613. }
  614. var sums [tile]float32
  615. for b := 0; b < blocksPerRow; b++ {
  616. xBlock := &x[b*256]
  617. base := n*blocksPerRow + b
  618. tensor.DotQ3KTile8(&sums, w, wp, base, blocksPerRow, xBlock, tn)
  619. }
  620. for t := 0; t < tn; t++ {
  621. out[n+t] = sums[t]
  622. }
  623. }
  624. }
  625. func q2kGemvDecodeTiled(out []float32, x []float32, w []tensor.BlockQ2_K, wp []tensor.Q2KDotParams, N, blocksPerRow, startN, endN int) {
  626. const tile = 8
  627. for n := startN; n < endN; n += tile {
  628. tn := endN - n
  629. if tn > tile {
  630. tn = tile
  631. }
  632. var sums [tile]float32
  633. for b := 0; b < blocksPerRow; b++ {
  634. xBlock := &x[b*256]
  635. base := n*blocksPerRow + b
  636. tensor.DotQ2KTile8(&sums, w, wp, base, blocksPerRow, xBlock, tn)
  637. }
  638. for t := 0; t < tn; t++ {
  639. out[n+t] = sums[t]
  640. }
  641. }
  642. }
  643. func q8kGemvDecodeTiled(out []float32, x []float32, w []tensor.BlockQ8_K, N, blocksPerRow, startN, endN int) {
  644. const tile = 8
  645. for n := startN; n < endN; n += tile {
  646. tn := endN - n
  647. if tn > tile {
  648. tn = tile
  649. }
  650. var sums [tile]float32
  651. for b := 0; b < blocksPerRow; b++ {
  652. xBlock := &x[b*256]
  653. base := n*blocksPerRow + b
  654. tensor.DotQ8KTile8(&sums, w, base, blocksPerRow, xBlock, tn)
  655. }
  656. for t := 0; t < tn; t++ {
  657. out[n+t] = sums[t]
  658. }
  659. }
  660. }