- #define GGML_COMMON_IMPL_CPP
- #define GGML_COMMON_DECL_CPP
- #include "ggml-common.h"
- #include "ggml-backend-impl.h"
- #include "ggml-impl.h"
- #include "ggml-cpu.h"
- #include "ggml-cpu-impl.h"
- #include "simd-mappings.h"
- #include "traits.h"
- #include "arch-fallback.h"
- #include <cmath>
- #include <cstring>
- #include <cassert>
- #include <cstdlib> // for qsort
- #include <cstdio> // for GGML_ASSERT
- #include "repack.h"
- #if defined(__GNUC__)
- #pragma GCC diagnostic ignored "-Woverlength-strings"
- #endif
- #define UNUSED GGML_UNUSED
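- // nearest_int() below uses the float "magic number" rounding trick: adding 12582912.0f
- // (= 1.5 * 2^23) leaves the rounded integer in the low 23 mantissa bits of the float,
- // so masking with 0x007fffff and subtracting 0x00400000 (= 0.5 * 2^23) recovers the
- // signed nearest integer. This is only valid for |fval| <= 4194303, hence the assert.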
- static inline int nearest_int(float fval) {
- assert(fabsf(fval) <= 4194303.f);
- float val = fval + 12582912.f;
- int i; memcpy(&i, &val, sizeof(int));
- return (i & 0x007fffff) - 0x00400000;
- }
- // Functions to create the interleaved data layout formats
- // interleave 4 block_q4_0s in blocks of blck_size_interleave
- // returns an interleaved block_q4_0x4
- // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
- // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
- //
- // - in : an array of block_q4_0 pointers
- // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
- // blck_size_interleave bytes
- // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
- // from bias offset form to pure sign form (this saves subtract
- //                          operations during unpacking)
- //
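- // Resulting layout for blck_size_interleave = 4 (see make_block_q4_0x4() below):
- //   out.d  = { d0, d1, d2, d3 }   -- one delta per source block
- //   out.qs = { b0[0..3], b1[0..3], b2[0..3], b3[0..3], b0[4..7], b1[4..7], ... }
- // with every quant byte XORed with 0x88; for blck_size_interleave = 8 the same pattern
- // is applied in 8-byte chunks.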
- extern "C" {
- void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
- assert(QK8_0 == 32);
- assert(k % QK8_0 == 0);
- const int nb = k / QK8_0;
- block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
- // scalar
- const int blck_size_interleave = 4;
- float srcv[4][QK8_0];
- float id[4];
- for (int i = 0; i < nb; i++) {
- for (int row_iter = 0; row_iter < 4; row_iter++) {
- float amax = 0.0f; // absolute max
- for (int j = 0; j < QK8_0; j++) {
- srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
- amax = MAX(amax, fabsf(srcv[row_iter][j]));
- }
- const float d = amax / ((1 << 7) - 1);
- id[row_iter] = d ? 1.0f / d : 0.0f;
- y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
- }
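- // Interleave the four quantized rows in blck_size_interleave-byte chunks: output byte j
- // comes from row (j % 16) / 4 at offset (j / 16) * 4 + (j % 4), i.e. 4 bytes of row 0,
- // 4 bytes of row 1, ..., then the next 4 bytes of row 0, and so on.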
- for (int j = 0; j < QK8_0 * 4; j++) {
- int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
- int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
- src_offset += (j % blck_size_interleave);
- float x0 = srcv[src_id][src_offset] * id[src_id];
- y[i].qs[j] = roundf(x0);
- }
- }
- }
- void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
- assert(QK8_0 == 32);
- assert(k % QK8_0 == 0);
- const int nb = k / QK8_0;
- block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
- // scalar
- const int blck_size_interleave = 8;
- float srcv[4][QK8_0];
- float id[4];
- for (int i = 0; i < nb; i++) {
- for (int row_iter = 0; row_iter < 4; row_iter++) {
- float amax = 0.0f; // absolute max
- for (int j = 0; j < QK8_0; j++) {
- srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
- amax = MAX(amax, fabsf(srcv[row_iter][j]));
- }
- const float d = amax / ((1 << 7) - 1);
- id[row_iter] = d ? 1.0f / d : 0.0f;
- y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
- }
- for (int j = 0; j < QK8_0 * 4; j++) {
- int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
- int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
- src_offset += (j % blck_size_interleave);
- float x0 = srcv[src_id][src_offset] * id[src_id];
- y[i].qs[j] = roundf(x0);
- }
- }
- }
- void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
- assert(QK_K == 256);
- assert(k % QK_K == 0);
- const int nb = k / QK_K;
- block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
- // scalar
- const int blck_size_interleave = 8;
- float srcv[4][QK_K];
- float iscale[4];
- for (int i = 0; i < nb; i++) {
- for (int row_iter = 0; row_iter < 4; row_iter++) {
- float amax = 0.0f; // absolute max
- float max = 0;
- for (int j = 0; j < QK_K; j++) {
- srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
- // Update the maximum value of the corresponding super block
- if(amax < fabsf(srcv[row_iter][j])) {
- amax = fabsf(srcv[row_iter][j]);
- max = srcv[row_iter][j];
- }
- }
- iscale[row_iter] = amax ? -127.f/max : 0;
- y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
- }
- for (int j = 0; j < QK_K / 4; j++) {
- y[i].bsums[j] = 0;
- }
- // Quant values are interleaved in sequences of eight bytes from the corresponding super blocks
- // Bsums values are interleaved in sequences of four bsums from each super block taken for interleaving
- // i.e. the first four bsums from the first super block, followed by the first four bsums from the second super block, and so on
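- // Index math: the element written at output byte j is element src_offset of row src_id
- // (computed below); its per-row bsum index is src_offset / 16 == j >> 6, so the
- // interleaved bsum slot is ((j >> 6) / 4) * 16 + src_id * 4 + ((j >> 6) % 4), which is
- // exactly the 'index' expression below.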
- for (int j = 0; j < QK_K * 4; j++) {
- int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
- int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
- src_offset += (j % blck_size_interleave);
- int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
- float x0 = srcv[src_id][src_offset] * iscale[src_id];
- y[i].qs[j] = nearest_int(x0);
- y[i].bsums[index] += y[i].qs[j];
- }
- }
- }
- } // extern "C"
- template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
- void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
- template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
- assert(nrow == 4);
- UNUSED(nrow);
- ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
- }
- template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
- assert(nrow == 4);
- UNUSED(nrow);
- ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
- }
- template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
- assert(nrow == 4);
- UNUSED(nrow);
- ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
- }
- extern "C" {
- void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 4;
- const int blocklen = 4;
- assert (n % qk == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- float sumf[4];
- int sumi;
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
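- // The repacked q4_0 nibbles are already in sign form (XORed with 0x88 during repacking),
- // so (int8_t)(byte << 4) and (int8_t)(byte & 0xF0) give the two signed 4-bit values
- // scaled by 16, and the >> 4 after the multiply-accumulate removes that factor without
- // a per-nibble subtraction.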
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
- }
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
- }
- }
- }
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
- }
- }
- void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 4;
- const int blocklen = 8;
- assert (n % qk == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- float sumf[4];
- int sumi;
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
- }
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
- }
- }
- }
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
- }
- }
- void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 8;
- const int blocklen = 8;
- assert (n % qk == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- {
- float sumf[8];
- int sumi;
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
- }
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
- }
- }
- }
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
- }
- }
- }
- void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK_K;
- const int nb = n / qk;
- const int ncols_interleaved = 8;
- const int blocklen = 8;
- static const uint32_t kmask1 = 0x3f3f3f3f;
- static const uint32_t kmask2 = 0x0f0f0f0f;
- static const uint32_t kmask3 = 0x03030303;
- assert (n % qk == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- float sumf[8];
- float sum_minf[8];
- uint32_t utmp[32];
- int sumi1;
- int sumi2;
- int sumi;
- const block_q8_K * a_ptr = (const block_q8_K *) vy;
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
- for (int j = 0; j < ncols_interleaved; j++) {
- sumf[j] = 0.0;
- sum_minf[j] = 0.0;
- }
- for (int l = 0; l < nb; l++) {
- for (int sb = 0; sb < 8; sb++) {
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
- utmp[sb * 4 + 2] = uaux_0;
- utmp[sb * 4 + 0] &= kmask1;
- }
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
- uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi1 = 0;
- sumi2 = 0;
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
- sumi1 = sumi1 * scales_0[j];
- sumi2 = sumi2 * scales_1[j];
- sumi += sumi1 + sumi2;
- }
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
- }
- }
- for (int sb = 0; sb < 8; sb++) {
- uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
- for (int j = 0; j < ncols_interleaved; j++) {
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
- }
- }
- }
- for (int j = 0; j < ncols_interleaved; j++) {
- s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
- }
- }
- }
- void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 4;
- const int blocklen = 4;
- assert (n % qk == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- {
- float sumf[4];
- int sumi;
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
- }
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
- }
- }
- }
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
- }
- }
- }
- void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 4;
- const int blocklen = 4;
- assert (n % qk == 0);
- assert (nr % 4 == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- {
- float sumf[4][4];
- int sumi;
- for (int y = 0; y < nr / 4; y++) {
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
- }
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
- }
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
- }
- }
- }
- }
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++)
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
- }
- }
- }
- }
- }
- void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 4;
- const int blocklen = 8;
- assert (n % qk == 0);
- assert (nr % 4 == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- float sumf[4][4];
- int sumi;
- for (int y = 0; y < nr / 4; y++) {
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
- }
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
- }
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
- }
- }
- }
- }
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++)
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
- }
- }
- }
- }
- void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 8;
- const int blocklen = 8;
- assert (n % qk == 0);
- assert (nr % 4 == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- float sumf[4][8];
- int sumi;
- for (int y = 0; y < nr / 4; y++) {
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
- }
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
- }
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
- }
- }
- }
- }
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++)
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
- }
- }
- }
- }
- void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK_K;
- const int nb = n / qk;
- const int ncols_interleaved = 8;
- const int blocklen = 8;
- static const uint32_t kmask1 = 0x3f3f3f3f;
- static const uint32_t kmask2 = 0x0f0f0f0f;
- static const uint32_t kmask3 = 0x03030303;
- assert (n % qk == 0);
- assert (nr % 4 == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- float sumf[4][8];
- float sum_minf[4][8];
- uint32_t utmp[32];
- int sumi1;
- int sumi2;
- int sumi;
- for (int y = 0; y < nr / 4; y++) {
- const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumf[m][j] = 0.0;
- sum_minf[m][j] = 0.0;
- }
- }
- for (int l = 0; l < nb; l++) {
- for (int sb = 0; sb < 8; sb++) {
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
- utmp[sb * 4 + 2] = uaux_0;
- utmp[sb * 4 + 0] &= kmask1;
- }
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
- uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi1 = 0;
- sumi2 = 0;
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
- sumi1 = sumi1 * scales_0[j];
- sumi2 = sumi2 * scales_1[j];
- sumi += sumi1 + sumi2;
- }
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
- }
- }
- }
- for (int sb = 0; sb < 8; sb++) {
- uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
- for(int m = 0; m < 4; m++) {
- const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
- for(int j = 0; j < ncols_interleaved; j++) {
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
- }
- }
- }
- }
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
- }
- }
- }
- }
- }
- void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
- const int qk = QK8_0;
- const int nb = n / qk;
- const int ncols_interleaved = 4;
- const int blocklen = 4;
- assert (n % qk == 0);
- assert (nr % 4 == 0);
- assert (nc % ncols_interleaved == 0);
- UNUSED(s);
- UNUSED(bs);
- UNUSED(vx);
- UNUSED(vy);
- UNUSED(nr);
- UNUSED(nc);
- UNUSED(nb);
- UNUSED(ncols_interleaved);
- UNUSED(blocklen);
- {
- float sumf[4][4];
- int sumi;
- for (int y = 0; y < nr / 4; y++) {
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
- }
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
- }
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
- }
- }
- }
- }
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++)
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
- }
- }
- }
- }
- }
- } // extern "C"
- static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
- block_q4_0x4 out;
- for (int i = 0; i < 4; i++) {
- out.d[i] = in[i].d;
- }
- const int end = QK4_0 * 2 / blck_size_interleave;
- if (blck_size_interleave == 8) {
- const uint64_t xor_mask = 0x8888888888888888ULL;
- for (int i = 0; i < end; ++i) {
- int src_id = i % 4;
- int src_offset = (i / 4) * blck_size_interleave;
- int dst_offset = i * blck_size_interleave;
- uint64_t elems;
- // Using memcpy to avoid unaligned memory accesses
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
- elems ^= xor_mask;
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
- }
- } else if (blck_size_interleave == 4) {
- const uint32_t xor_mask = 0x88888888;
- for (int i = 0; i < end; ++i) {
- int src_id = i % 4;
- int src_offset = (i / 4) * blck_size_interleave;
- int dst_offset = i * blck_size_interleave;
- uint32_t elems;
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
- elems ^= xor_mask;
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
- }
- } else {
- GGML_ASSERT(false);
- }
- return out;
- }
- // interleave 8 block_q4_0s in blocks of blck_size_interleave
- // returns an interleaved block_q4_0x8
- // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
- // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
- static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
- block_q4_0x8 out;
- for (int i = 0; i < 8; i++) {
- out.d[i] = in[i].d;
- }
- const int end = QK4_0 * 4 / blck_size_interleave;
- const uint64_t xor_mask = 0x8888888888888888ULL;
- for (int i = 0; i < end; ++i) {
- int src_id = i % 8;
- int src_offset = (i / 8) * blck_size_interleave;
- int dst_offset = i * blck_size_interleave;
- uint64_t elems;
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
- elems ^= xor_mask;
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
- }
- return out;
- }
- static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
- block_q4_Kx8 out;
- // Delta (scale) and dmin values of the eight Q4_K structures are copied into the output interleaved structure
- for (int i = 0; i < 8; i++) {
- out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
- }
- for (int i = 0; i < 8; i++) {
- out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
- }
- const int end = QK_K * 4 / blck_size_interleave;
- // Interleave Q4_K quants by taking 8 bytes at a time
- for (int i = 0; i < end; ++i) {
- int src_id = i % 8;
- int src_offset = (i / 8) * blck_size_interleave;
- int dst_offset = i * blck_size_interleave;
- uint64_t elems;
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
- }
- // The logic below unpacks and rearranges the scales and mins values of Q4_K
- // Each Q4_K structure packs its 8 scales and 8 mins into 12 bytes (6 bits per value)
- // The output Q4_Kx8 structure stores these in a 96-byte scales array
- // Each 12-byte group holds the scales and mins of the corresponding sub-block taken from all eight Q4_K structures
- // e.g. the first 12 bytes contain the 8 scales and 8 mins of the first sub-block, one from each Q4_K structure
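- // In a source block_q4_K, sub-blocks 0..3 keep their 6-bit scale in scales[i] & 63 and
- // their 6-bit min in scales[i + 4] & 63; sub-blocks 4..7 are reassembled from the top two
- // bits of scales[0..7] and the nibbles of scales[8..11] (second loop below). Each group of
- // 8 extracted scales and mins is then re-packed into a 12-byte group using the same 6-bit
- // scheme, now spanning the 8 interleaved structures instead of the 8 sub-blocks.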
- uint8_t s[8], m[8];
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 8; j++) {
- s[j] = in[j].scales[i] & 63;
- m[j] = in[j].scales[i + 4] & 63;
- }
- out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
- out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
- out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
- out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
- out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
- out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
- out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
- out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
- out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
- out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
- out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
- out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
- }
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 8; j++) {
- s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
- m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
- }
- out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
- out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
- out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
- out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
- out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
- out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
- out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
- out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
- out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
- out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
- out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
- out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
- }
- return out;
- }
- static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
- GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
- GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
- constexpr int nrows_interleaved = 4;
- block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
- const block_q4_0 * src = (const block_q4_0 *)data;
- block_q4_0 dst_tmp[4];
- int nrow = ggml_nrows(t);
- int nblocks = t->ne[0] / QK4_0;
- GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
- return -1;
- }
- for (int b = 0; b < nrow; b += nrows_interleaved) {
- for (int64_t x = 0; x < nblocks; x++) {
- for (int i = 0; i < nrows_interleaved; i++) {
- dst_tmp[i] = src[x + i * nblocks];
- }
- *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
- }
- src += nrows_interleaved * nblocks;
- }
- return 0;
- GGML_UNUSED(data_size);
- }
- static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
- GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
- GGML_ASSERT(interleave_block == 8);
- constexpr int nrows_interleaved = 8;
- block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
- const block_q4_K * src = (const block_q4_K*) data;
- block_q4_K dst_tmp[8];
- int nrow = ggml_nrows(t);
- int nblocks = t->ne[0] / QK_K;
- GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
- return -1;
- }
- for (int b = 0; b < nrow; b += nrows_interleaved) {
- for (int64_t x = 0; x < nblocks; x++) {
- for (int i = 0; i < nrows_interleaved; i++ ) {
- dst_tmp[i] = src[x + i * nblocks];
- }
- *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
- }
- src += nrows_interleaved * nblocks;
- }
- return 0;
- GGML_UNUSED(data_size);
- }
- static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
- GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
- GGML_ASSERT(interleave_block == 8);
- constexpr int nrows_interleaved = 8;
- block_q4_0x8 * dst = (block_q4_0x8*)t->data;
- const block_q4_0 * src = (const block_q4_0*) data;
- block_q4_0 dst_tmp[8];
- int nrow = ggml_nrows(t);
- int nblocks = t->ne[0] / QK4_0;
- GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
- return -1;
- }
- for (int b = 0; b < nrow; b += nrows_interleaved) {
- for (int64_t x = 0; x < nblocks; x++) {
- for (int i = 0; i < nrows_interleaved; i++ ) {
- dst_tmp[i] = src[x + i * nblocks];
- }
- *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
- }
- src += nrows_interleaved * nblocks;
- }
- return 0;
- GGML_UNUSED(data_size);
- }
- static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
- block_iq4_nlx4 out;
- for (int i = 0; i < 4; i++) {
- out.d[i] = in[i].d;
- }
- const int end = QK4_NL * 2 / blck_size_interleave;
- // TODO: this branch seems wrong
- //if (blck_size_interleave == 8) {
- // for (int i = 0; i < end; ++i) {
- // int src_id = i % 4;
- // int src_offset = (i / 4) * blck_size_interleave;
- // int dst_offset = i * blck_size_interleave;
- // // Using memcpy to avoid unaligned memory accesses
- // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
- // }
- //} else
- if (blck_size_interleave == 4) {
- for (int i = 0; i < end; ++i) {
- int src_id = i % 4;
- int src_offset = (i / 4) * blck_size_interleave;
- int dst_offset = i * blck_size_interleave;
- memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
- }
- } else {
- GGML_ASSERT(false);
- }
- return out;
- }
- static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
- GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
- //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
- GGML_ASSERT(interleave_block == 4);
- block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
- const block_iq4_nl * src = (const block_iq4_nl *)data;
- block_iq4_nl dst_tmp[4];
- int nrow = ggml_nrows(t);
- int nrows_interleaved = 4;
- int nblocks = t->ne[0] / QK4_0;
- GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
- return -1;
- }
- for (int b = 0; b < nrow; b += nrows_interleaved) {
- for (int64_t x = 0; x < nblocks; x++) {
- for (int i = 0; i < nrows_interleaved; i++) {
- dst_tmp[i] = src[x + i * nblocks];
- }
- *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
- }
- src += nrows_interleaved * nblocks;
- }
- return 0;
- GGML_UNUSED(data_size);
- }
- namespace ggml::cpu::repack {
- // repack
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
- int repack(struct ggml_tensor *, const void *, size_t);
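- // BLOC_TYPE is the source quant block, NB_COLS the number of interleaved rows/columns and
- // INTER_SIZE the interleave chunk size; e.g. repack<block_q4_0, 8, 4> produces the
- // q4_0_4x8 layout consumed by ggml_gemv_q4_0_4x8_q8_0 / ggml_gemm_q4_0_4x8_q8_0.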
- // TODO: generalise.
- template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
- return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
- }
- template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
- return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
- }
- template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
- return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
- }
- template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
- return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
- }
- template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
- return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
- }
- // TODO: needs to be revisited
- //template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
- // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
- //}
- // gemv
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
- void gemv(int, float *, size_t, const void *, const void *, int, int);
- template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- // gemm
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
- void gemm(int, float *, size_t, const void *, const void *, int, int);
- template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
- }
- template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
- ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
- }
- class tensor_traits_base : public ggml::cpu::tensor_traits {
- public:
- virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
- };
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
- bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
- // not really a GGML_TYPE_Q8_0 but same size.
- switch (op->op) {
- case GGML_OP_MUL_MAT:
- {
- size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
- return true;
- }
- case GGML_OP_MUL_MAT_ID:
- {
- size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
- size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
- const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
- const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
- const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
- size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
- return true;
- }
- default:
- // GGML_ABORT("fatal error");
- break;
- }
- return false;
- }
- bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
- switch (op->op) {
- case GGML_OP_MUL_MAT:
- forward_mul_mat(params, op);
- return true;
- case GGML_OP_MUL_MAT_ID:
- forward_mul_mat_id(params, op);
- return true;
- default:
- // GGML_ABORT("fatal error");
- break;
- }
- return false;
- }
- void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
- const ggml_tensor * src0 = op->src[0];
- const ggml_tensor * src1 = op->src[1];
- ggml_tensor * dst = op;
- GGML_TENSOR_BINARY_OP_LOCALS
- const int ith = params->ith;
- const int nth = params->nth;
- GGML_ASSERT(ne0 == ne01);
- GGML_ASSERT(ne1 == ne11);
- GGML_ASSERT(ne2 == ne12);
- GGML_ASSERT(ne3 == ne13);
- // dst cannot be transposed or permuted
- GGML_ASSERT(nb0 == sizeof(float));
- GGML_ASSERT(nb0 <= nb1);
- GGML_ASSERT(nb1 <= nb2);
- GGML_ASSERT(nb2 <= nb3);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
- // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
- char * wdata = static_cast<char *>(params->wdata);
- const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
- assert(params->wsize >= nbw1 * ne11);
- const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
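- // Quantize src1 into wdata: rows are processed in groups of 4 with ggml_quantize_mat_t so
- // they end up in the interleaved layout expected by the gemm kernel; any leftover rows are
- // quantized individually with from_float and handled later by the gemv path.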
- int64_t i11_processed = 0;
- for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
- ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
- }
- i11_processed = ne11 - ne11 % 4;
- for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
- from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
- }
- ggml_barrier(params->threadpool);
- const void * src1_wdata = params->wdata;
- const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
- int64_t src0_start = (ith * ne01) / nth;
- int64_t src0_end = ((ith + 1) * ne01) / nth;
- src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
- src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
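- // Round both ends of this thread's row range up to a multiple of NB_COLS: the repacked
- // src0 stores NB_COLS rows per interleaved block, so the work must be split at block-group
- // boundaries; rounding both ends the same way keeps the per-thread ranges disjoint.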
- if (src0_start >= src0_end) {
- return;
- }
- // If there are more than three rows in src1, use gemm; otherwise, use gemv.
- if (ne11 > 3) {
- gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
- (float *) ((char *) dst->data) + src0_start, ne01,
- (const char *) src0->data + src0_start * nb01,
- (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
- }
- for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
- (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
- (const char *) src0->data + src0_start * nb01,
- (const char *) src1_wdata + (src1_col_stride * iter), 1,
- src0_end - src0_start);
- }
- }
- void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
- const ggml_tensor * src0 = op->src[0];
- const ggml_tensor * src1 = op->src[1];
- const ggml_tensor * ids = op->src[2];
- ggml_tensor * dst = op;
- GGML_TENSOR_BINARY_OP_LOCALS
- const int ith = params->ith;
- const int nth = params->nth;
- const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
- // we don't support permuted src0 or src1
- GGML_ASSERT(nb00 == ggml_type_size(src0->type));
- GGML_ASSERT(nb10 == ggml_type_size(src1->type));
- // dst cannot be transposed or permuted
- GGML_ASSERT(nb0 == sizeof(float));
- GGML_ASSERT(nb0 <= nb1);
- GGML_ASSERT(nb1 <= nb2);
- GGML_ASSERT(nb2 <= nb3);
- GGML_ASSERT(ne03 == 1);
- GGML_ASSERT(ne13 == 1);
- GGML_ASSERT(ne3 == 1);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- // row groups
- const int n_ids = ids->ne[0]; // n_expert_used
- const int n_as = ne02; // n_expert
- const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
- const size_t nbw2 = nbw1*ne11;
- const size_t nbw3 = nbw2*ne12;
- struct mmid_row_mapping {
- int32_t i1;
- int32_t i2;
- };
- GGML_ASSERT(params->wsize >=
- (GGML_PAD(nbw3, sizeof(int64_t)) +
- n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
- );
- auto * wdata = (char *)params->wdata;
- auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
- // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
- auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
- struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
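- // wdata layout (matches work_size() above):
- //   [ quantized src1: ne12 * ne11 rows of nbw1 bytes, padded to sizeof(int64_t) ]
- //   [ matrix_row_counts: n_as int64_t counters                                  ]
- //   [ matrix_rows: n_as * ne12 mmid_row_mapping entries                         ]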
- // src1: float32 => param type
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
- from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
- (void *) (wdata + i12 * nbw2 + i11 * nbw1),
- ne10);
- }
- }
- #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
- if (ith == 0) {
- // initialize matrix_row_counts
- memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
- // group rows by src0 matrix
- for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
- for (int32_t id = 0; id < n_ids; ++id) {
- const int32_t i02 =
- *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
- GGML_ASSERT(i02 >= 0 && i02 < n_as);
- MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
- matrix_row_counts[i02] += 1;
- }
- }
- }
- ggml_barrier(params->threadpool);
- // compute each matrix multiplication in sequence
- for (int cur_a = 0; cur_a < n_as; ++cur_a) {
- const int64_t cne1 = matrix_row_counts[cur_a];
- if (cne1 == 0) {
- continue;
- }
- const auto * src0_cur = (const char *) src0->data + cur_a*nb02;
- //const int64_t nr0 = ne01; // src0 rows
- const int64_t nr1 = cne1; // src1 rows
- int64_t src0_cur_start = (ith * ne01) / nth;
- int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
- src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
- src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
- if (src0_cur_start >= src0_cur_end) {
- return;
- }
- for (int ir1 = 0; ir1 < nr1; ir1++) {
- struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
- const int id = row_mapping.i1; // selected expert index
- const int64_t i11 = id % ne11;
- const int64_t i12 = row_mapping.i2; // row index in src1
- const int64_t i1 = id; // selected expert index
- const int64_t i2 = i12; // row
- const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
- (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
- src0_cur + src0_cur_start * nb01,
- src1_col, 1, src0_cur_end - src0_cur_start);
- }
- }
- #undef MMID_MATRIX_ROW
- }
- int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
- GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
- (int) NB_COLS, (int) INTER_SIZE);
- return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
- }
- };
- } // namespace ggml::cpu::repack
- static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
- // instance for Q4
- static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
- static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
- static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
- static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
- // instance for IQ4
- static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
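- // Pick the widest layout the current CPU can exploit:
- //   q4_0  : 8x8 with AVX2, or SVE int8 matmul when the SVE count equals QK8_0;
- //           4x8 with NEON int8 matmul; 4x4 with NEON dotprod
- //   q4_K  : 8x8 with AVX2
- //   iq4_nl: 4x4 with NEON dotprod
- // and require ne[1] to be a multiple of the interleave width (8 or 4); otherwise return
- // nullptr and leave the tensor unpacked.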
- if (cur->type == GGML_TYPE_Q4_0) {
- if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
- if (cur->ne[1] % 8 == 0) {
- return &q4_0_8x8_q8_0;
- }
- }
- if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
- if (cur->ne[1] % 4 == 0) {
- return &q4_0_4x8_q8_0;
- }
- }
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
- if (cur->ne[1] % 4 == 0) {
- return &q4_0_4x4_q8_0;
- }
- }
- } else if (cur->type == GGML_TYPE_Q4_K) {
- if (ggml_cpu_has_avx2()) {
- if (cur->ne[1] % 8 == 0) {
- return &q4_K_8x8_q8_K;
- }
- }
- } else if (cur->type == GGML_TYPE_IQ4_NL) {
- if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
- if (cur->ne[1] % 4 == 0) {
- return &iq4_nl_4x4_q8_0;
- }
- }
- }
- return nullptr;
- }
- static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
- tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
- GGML_UNUSED(buffer);
- return GGML_STATUS_SUCCESS;
- }
- static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
- const void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset == 0);
- GGML_ASSERT(size == ggml_nbytes(tensor));
- auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
- auto OK = tensor_traits->repack(tensor, data, size);
- GGML_ASSERT(OK == 0);
- GGML_UNUSED(buffer);
- }
- static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
- return "CPU_REPACK";
- GGML_UNUSED(buft);
- }
- static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
- if (buffer == nullptr) {
- return nullptr;
- }
- buffer->buft = buft;
- buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
- buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor;
- buffer->iface.get_tensor = nullptr;
- buffer->iface.cpy_tensor = nullptr;
- return buffer;
- }
- static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
- return TENSOR_ALIGNMENT;
- GGML_UNUSED(buft);
- }
- namespace ggml::cpu::repack {
- class extra_buffer_type : ggml::cpu::extra_buffer_type {
- bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
- if ( op->op == GGML_OP_MUL_MAT &&
- op->src[0]->buffer &&
- (ggml_n_dims(op->src[0]) == 2) &&
- op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
- ggml_repack_get_optimal_repack_type(op->src[0])
- ) {
- if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
- return false;
- }
- if (op->src[1]->type == GGML_TYPE_F32) {
- return true;
- }
- //if (op->src[1]->type == GGML_TYPE_Q8_0) {
- // return true;
- //}
- // may be possible if Q8_0 packed...
- } else if (op->op == GGML_OP_MUL_MAT_ID
- && op->src[0]->buffer
- && (ggml_n_dims(op->src[0]) == 3)
- && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
- && ggml_repack_get_optimal_repack_type(op->src[0])
- ) {
- if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
- return false;
- }
- if (op->src[1]->type == GGML_TYPE_F32) {
- return true;
- }
- //if (op->src[1]->type == GGML_TYPE_Q8_0) {
- // return true;
- //}
- }
- return false;
- }
- ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
- if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
- if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
- return (ggml::cpu::tensor_traits *) op->src[0]->extra;
- }
- }
- return nullptr;
- }
- };
- } // namespace ggml::cpu::repack
- ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
- /* .iface = */ {
- /* .get_name = */ ggml_backend_cpu_repack_buffer_type_get_name,
- /* .alloc_buffer = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cpu_repack_buffer_type_get_alignment,
- /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
- /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
- /* .is_host = */ nullptr,
- },
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
- /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
- };
- return &ggml_backend_cpu_buffer_type_repack;
- }