| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134 |
- #include "mmvq.hpp"
- #include "ggml.h"
- #include "common.hpp"
- #include "quants.hpp"
- #include "vecdotq.hpp"
- template <typename reorder_vec_dot_q_sycl>
- static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols, const int nrows, const sycl::nd_item<3> & nd_item) {
- using block_type = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
- using block_traits = typename block_type::traits;
- const auto sg = nd_item.get_sub_group();
- const int sg_range = sg.get_group_linear_range();
- const int workgroup_id = nd_item.get_group_linear_id();
- const int sg_id = sg.get_group_linear_id();
- const int row = workgroup_id * sg_range + sg_id;
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / block_traits::qk;
- constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
- constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
- const int nblocks = nrows * (ncols / block_traits::qk);
- static_assert(blocks_per_subgroup > 0);
- static_assert(block_elements_per_subgroup > 0);
- float partial_sum = 0.0f;
- for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
- const int ibx = row * blocks_per_row + i; // x block index
- const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
- const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx);
- // Y block index that aligns with ibx
- const int iby = i * block_type::block_to_q8_1_ratio();
- const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1;
- const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2));
- #pragma unroll
- for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
- // x block quant index when casting the quants to int
- const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
- partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
- }
- }
- auto sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum, std::plus<>());
- if (sg.leader()) {
- dst[row] = sum;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
- static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // Ensuring blocks_per_warp > 0
- assert(blocks_per_warp > 0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
- const int ibx = row * blocks_per_row + i; // x block index
- const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
- for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
- const int iqs = elem + vdr * (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
- }
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
- const void *__restrict__ vy,
- float *__restrict__ dst, const int ncols,
- const int nrows,
- const sycl::nd_item<3> &item_ct1) {
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
- item_ct1.get_local_id(1);
- if (row >= nrows) {
- return;
- }
- const int blocks_per_row = ncols / qk;
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
- assert(blocks_per_warp>0);
- // partial sum for each thread
- float tmp = 0.0f;
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
- i += blocks_per_warp) {
- const int ibx = row*blocks_per_row + i; // x block index
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
- const int iqs =
- vdr *
- (item_ct1.get_local_id(2) %
- (qi / vdr)); // x block quant index when casting the quants to int
- tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
- }
- // sum up partial sums and write back result
- #pragma unroll
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
- tmp +=
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
- }
- if (item_ct1.get_local_id(2) == 0) {
- dst[row] = tmp;
- }
- }
- static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
- const int nrows, dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK4_0 == 0);
- const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
- constexpr size_t num_subgroups = 16;
- GGML_ASSERT(block_num_y % num_subgroups == 0);
- const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
- const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
- stream->submit([&](sycl::handler & cgh) {
- cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
- [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
- nd_item);
- });
- });
- }
- static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK4_0 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler & cgh) {
- cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK4_1 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
- VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK5_0 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
- VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK5_1 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
- VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK8_0 == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
- VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
- VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
- VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
- VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
- const int nrows, dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
- constexpr size_t num_subgroups = 16;
- GGML_ASSERT(block_num_y % num_subgroups == 0);
- const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
- const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
- stream->submit([&](sycl::handler & cgh) {
- cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
- [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
- nrows, nd_item);
- });
- });
- }
- static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
- VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
- const int nrows, dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
- constexpr size_t num_subgroups = 16;
- GGML_ASSERT(block_num_y % num_subgroups == 0);
- const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
- const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
- stream->submit([&](sycl::handler & cgh) {
- cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
- [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
- nd_item);
- });
- });
- }
- static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
- VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler & cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK4_NL == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
- float *dst, const int ncols,
- const int nrows,
- dpct::queue_ptr stream) {
- GGML_ASSERT(ncols % QK_K == 0);
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
- const sycl::range<3> block_nums(1, 1, block_num_y);
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
- {
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1)
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
- vx, vy, dst, ncols, nrows, item_ct1);
- });
- });
- }
- }
- void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
- ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
- const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_col_size,
- const dpct::queue_ptr & stream) {
- const int64_t ne10 = src1->ne[0];
- GGML_ASSERT(ne10 % QK8_1 == 0);
- const int64_t ne00 = src0->ne[0];
- const int64_t row_diff = row_high - row_low;
- int id;
- SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id()));
- const size_t q8_1_ts = sizeof(block_q8_1);
- const size_t q8_1_bs = QK8_1;
- // the main device has a larger memory buffer to hold the results from all GPUs
- // nrows_dst == nrows of the matrix that the kernel writes into
- for (int i = 0; i < src1_ncols; i++) {
- const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
- const char * src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
- float * dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
- switch (src0->type) {
- case GGML_TYPE_Q4_0:
- if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
- ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
- GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n");
- reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- } else {
- GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n");
- mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- }
- break;
- case GGML_TYPE_Q4_1:
- mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_Q5_0:
- mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_Q5_1:
- mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_Q8_0:
- mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_Q2_K:
- mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_Q3_K:
- mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_Q4_K:
- if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
- ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
- GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
- reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- } else {
- GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
- mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- }
- break;
- case GGML_TYPE_Q5_K:
- mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_Q6_K:
- if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
- ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
- GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n");
- reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- } else {
- GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n");
- mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- }
- break;
- case GGML_TYPE_IQ1_S:
- mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ1_M:
- mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ2_XXS:
- mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ2_XS:
- mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ2_S:
- mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ3_XXS:
- mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ3_S:
- mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ4_NL:
- mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- case GGML_TYPE_IQ4_XS:
- mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
- break;
- default:
- GGML_ABORT("fatal error");
- }
- }
- GGML_UNUSED(src1);
- GGML_UNUSED(dst);
- GGML_UNUSED(src1_ddf_i);
- GGML_UNUSED(ctx);
- }
|