|
@@ -692,6 +692,100 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
|
|
|
|
|
+ float * GGML_RESTRICT s,
|
|
|
|
|
+ size_t bs,
|
|
|
|
|
+ const void * GGML_RESTRICT vx,
|
|
|
|
|
+ const void * GGML_RESTRICT vy,
|
|
|
|
|
+ int nr,
|
|
|
|
|
+ int nc) {
|
|
|
|
|
+ const int qk = QK8_0;
|
|
|
|
|
+ const int nb = n / qk;
|
|
|
|
|
+ const int ncols_interleaved = 4;
|
|
|
|
|
+ const int blocklen = 4;
|
|
|
|
|
+
|
|
|
|
|
+ assert(nr == 1);
|
|
|
|
|
+ assert(n % qk == 0);
|
|
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
|
|
+
|
|
|
|
|
+ UNUSED(bs);
|
|
|
|
|
+ UNUSED(nr);
|
|
|
|
|
+
|
|
|
|
|
+ float sumf[4];
|
|
|
|
|
+ int sumi;
|
|
|
|
|
+
|
|
|
|
|
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
|
|
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
|
|
|
+
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumf[j] = 0.0;
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
|
|
+ for (int k = 0; k < (qk / blocklen); k++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumi = 0;
|
|
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
|
|
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
|
|
|
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
|
|
|
+ }
|
|
|
|
|
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ s[x * ncols_interleaved + j] = sumf[j];
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
|
|
|
|
|
+ float * GGML_RESTRICT s,
|
|
|
|
|
+ size_t bs,
|
|
|
|
|
+ const void * GGML_RESTRICT vx,
|
|
|
|
|
+ const void * GGML_RESTRICT vy,
|
|
|
|
|
+ int nr,
|
|
|
|
|
+ int nc) {
|
|
|
|
|
+ const int qk = QK8_0;
|
|
|
|
|
+ const int nb = n / qk;
|
|
|
|
|
+ const int ncols_interleaved = 4;
|
|
|
|
|
+ const int blocklen = 8;
|
|
|
|
|
+
|
|
|
|
|
+ assert(nr == 1);
|
|
|
|
|
+ assert(n % qk == 0);
|
|
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
|
|
+
|
|
|
|
|
+ UNUSED(bs);
|
|
|
|
|
+ UNUSED(nr);
|
|
|
|
|
+
|
|
|
|
|
+ float sumf[4];
|
|
|
|
|
+ int sumi;
|
|
|
|
|
+
|
|
|
|
|
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
|
|
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
|
|
|
+
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumf[j] = 0.0;
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
|
|
+ for (int k = 0; k < (qk / blocklen); k++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumi = 0;
|
|
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
|
|
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
|
|
|
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
|
|
|
|
+ }
|
|
|
|
|
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ s[x * ncols_interleaved + j] = sumf[j];
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
|
const int qk = QK8_0;
|
|
const int qk = QK8_0;
|
|
|
const int nb = n / qk;
|
|
const int nb = n / qk;
|
|
@@ -1219,8 +1313,129 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
|
|
|
|
+ float * GGML_RESTRICT s,
|
|
|
|
|
+ size_t bs,
|
|
|
|
|
+ const void * GGML_RESTRICT vx,
|
|
|
|
|
+ const void * GGML_RESTRICT vy,
|
|
|
|
|
+ int nr,
|
|
|
|
|
+ int nc) {
|
|
|
|
|
+ const int qk = QK8_0;
|
|
|
|
|
+ const int nb = n / qk;
|
|
|
|
|
+ const int ncols_interleaved = 4;
|
|
|
|
|
+ const int blocklen = 4;
|
|
|
|
|
+
|
|
|
|
|
+ assert(n % qk == 0);
|
|
|
|
|
+ assert(nr % 4 == 0);
|
|
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
|
|
+
|
|
|
|
|
+ float sumf[4][4];
|
|
|
|
|
+ int sumi;
|
|
|
|
|
+
|
|
|
|
|
+ for (int y = 0; y < nr / 4; y++) {
|
|
|
|
|
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
|
|
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumf[m][j] = 0.0;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
|
|
+ for (int k = 0; k < (qk / blocklen); k++) {
|
|
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumi = 0;
|
|
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
|
|
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
|
|
|
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
|
|
|
+ }
|
|
|
|
|
+ sumf[m][j] +=
|
|
|
|
|
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
|
|
|
|
+ float * GGML_RESTRICT s,
|
|
|
|
|
+ size_t bs,
|
|
|
|
|
+ const void * GGML_RESTRICT vx,
|
|
|
|
|
+ const void * GGML_RESTRICT vy,
|
|
|
|
|
+ int nr,
|
|
|
|
|
+ int nc) {
|
|
|
|
|
+ const int qk = QK8_0;
|
|
|
|
|
+ const int nb = n / qk;
|
|
|
|
|
+ const int ncols_interleaved = 4;
|
|
|
|
|
+ const int blocklen = 8;
|
|
|
|
|
+
|
|
|
|
|
+ assert(n % qk == 0);
|
|
|
|
|
+ assert(nr % 4 == 0);
|
|
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
|
|
+
|
|
|
|
|
+ float sumf[4][4];
|
|
|
|
|
+ int sumi;
|
|
|
|
|
+
|
|
|
|
|
+ for (int y = 0; y < nr / 4; y++) {
|
|
|
|
|
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
|
|
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
|
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumf[m][j] = 0.0;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
|
|
+ for (int k = 0; k < (qk / blocklen); k++) {
|
|
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ sumi = 0;
|
|
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
|
|
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
|
|
|
|
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
|
|
|
|
+ }
|
|
|
|
|
+ sumf[m][j] +=
|
|
|
|
|
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
|
|
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
} // extern "C"
|
|
} // extern "C"
|
|
|
|
|
|
|
|
|
|
+static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
|
|
|
|
|
+ block_q8_0x4 out;
|
|
|
|
|
+
|
|
|
|
|
+ for (int i = 0; i < 4; i++) {
|
|
|
|
|
+ out.d[i] = in[i].d;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ const int end = QK8_0 * 4 / blck_size_interleave;
|
|
|
|
|
+ for (int i = 0; i < end; ++i) {
|
|
|
|
|
+ int src_id = i % 4;
|
|
|
|
|
+ int src_offset = (i / 4) * blck_size_interleave;
|
|
|
|
|
+ int dst_offset = i * blck_size_interleave;
|
|
|
|
|
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
|
|
|
|
|
+ }
|
|
|
|
|
+ return out;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
|
block_q4_0x4 out;
|
|
block_q4_0x4 out;
|
|
|
|
|
|
|
@@ -1534,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
|
|
|
GGML_UNUSED(data_size);
|
|
GGML_UNUSED(data_size);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
|
|
|
|
+ int interleave_block,
|
|
|
|
|
+ const void * GGML_RESTRICT data,
|
|
|
|
|
+ size_t data_size) {
|
|
|
|
|
+ GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
|
|
|
|
|
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
|
|
|
+ constexpr int nrows_interleaved = 4;
|
|
|
|
|
+
|
|
|
|
|
+ block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
|
|
|
|
+ const block_q8_0 * src = (const block_q8_0 *) data;
|
|
|
|
|
+ block_q8_0 dst_tmp[4];
|
|
|
|
|
+ int nrow = ggml_nrows(t);
|
|
|
|
|
+ int nblocks = t->ne[0] / QK8_0;
|
|
|
|
|
+
|
|
|
|
|
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
|
|
|
|
|
+
|
|
|
|
|
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
|
|
|
|
+ return -1;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
|
|
|
+ for (int64_t x = 0; x < nblocks; x++) {
|
|
|
|
|
+ for (int i = 0; i < nrows_interleaved; i++) {
|
|
|
|
|
+ dst_tmp[i] = src[x + i * nblocks];
|
|
|
|
|
+ }
|
|
|
|
|
+ *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
|
|
|
|
|
+ }
|
|
|
|
|
+ src += nrows_interleaved * nblocks;
|
|
|
|
|
+ }
|
|
|
|
|
+ return 0;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
|
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
|
|
block_iq4_nlx4 out;
|
|
block_iq4_nlx4 out;
|
|
|
|
|
|
|
@@ -1702,6 +1949,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
|
|
|
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
|
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
// Q8_0 -> 4-column tiles, 4-byte interleave.
template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
}
|
|
|
|
|
+
|
|
|
|
|
// Q8_0 -> 4-column tiles, 8-byte interleave.
template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
}
|
|
|
|
|
+
|
|
|
// gemv
|
|
// gemv
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
|
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
|
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -1738,6 +1993,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
// Forwards to ggml_gemv_q8_0_4x4_q8_0 (4-byte interleave variant).
template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
|
|
|
|
|
+
|
|
|
|
|
// Forwards to ggml_gemv_q8_0_4x8_q8_0 (8-byte interleave variant).
template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
|
|
|
|
|
+
|
|
|
// gemm
|
|
// gemm
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
|
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
|
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
|
@@ -1774,6 +2037,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
// Forwards to ggml_gemm_q8_0_4x4_q8_0 (4-byte interleave variant).
template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
|
|
|
|
|
+
|
|
|
|
|
// Forwards to ggml_gemm_q8_0_4x8_q8_0 (8-byte interleave variant).
template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
|
|
|
|
|
+
|
|
|
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
|
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
|
|
public:
|
|
public:
|
|
|
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
|
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
|
@@ -2168,6 +2439,10 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
|
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
|
|
|
|
|
|
|
|
|
+ // instance for Q8_0
|
|
|
|
|
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
|
|
|
|
|
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
|
|
|
|
|
+
|
|
|
if (cur->type == GGML_TYPE_Q4_0) {
|
|
if (cur->type == GGML_TYPE_Q4_0) {
|
|
|
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
|
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
|
|
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
|
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
|
@@ -2218,6 +2493,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
return &iq4_nl_4x4_q8_0;
|
|
return &iq4_nl_4x4_q8_0;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
+ } else if (cur->type == GGML_TYPE_Q8_0) {
|
|
|
|
|
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
|
|
|
|
+ if (cur->ne[1] % 4 == 0) {
|
|
|
|
|
+ return &q8_0_4x8_q8_0;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
|
|
|
+ if (cur->ne[1] % 4 == 0) {
|
|
|
|
|
+ return &q8_0_4x4_q8_0;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
return nullptr;
|
|
return nullptr;
|