|
|
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
const int ncols_interleaved = 4;
|
|
|
const int blocklen = 4;
|
|
|
|
|
|
- assert (n % qk == 0);
|
|
|
- assert (nc % ncols_interleaved == 0);
|
|
|
+ assert(nr == 1);
|
|
|
+ assert(n % qk == 0);
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
|
|
|
UNUSED(s);
|
|
|
UNUSED(bs);
|
|
|
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|
|
UNUSED(ncols_interleaved);
|
|
|
UNUSED(blocklen);
|
|
|
|
|
|
- {
|
|
|
- float sumf[8];
|
|
|
- int sumi;
|
|
|
+ float sumf[8];
|
|
|
+ int sumi;
|
|
|
|
|
|
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
|
- for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
|
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
|
|
|
|
|
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
|
- for (int l = 0; l < nb; l++) {
|
|
|
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
|
- for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
- sumi = 0;
|
|
|
- for (int i = 0; i < blocklen; ++i) {
|
|
|
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
|
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
|
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
|
- }
|
|
|
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
+ sumi = 0;
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
|
|
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
|
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
|
}
|
|
|
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
|
}
|
|
|
}
|
|
|
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
|
}
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
const int ncols_interleaved = 4;
|
|
|
const int blocklen = 4;
|
|
|
|
|
|
- assert (n % qk == 0);
|
|
|
- assert (nc % ncols_interleaved == 0);
|
|
|
+ assert(nr == 1);
|
|
|
+ assert(n % qk == 0);
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
|
|
|
- UNUSED(s);
|
|
|
UNUSED(bs);
|
|
|
- UNUSED(vx);
|
|
|
- UNUSED(vy);
|
|
|
UNUSED(nr);
|
|
|
- UNUSED(nc);
|
|
|
- UNUSED(nb);
|
|
|
- UNUSED(ncols_interleaved);
|
|
|
- UNUSED(blocklen);
|
|
|
|
|
|
- {
|
|
|
- float sumf[4];
|
|
|
- int sumi;
|
|
|
+ float sumf[4];
|
|
|
+ int sumi;
|
|
|
|
|
|
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
|
- for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
|
|
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
|
|
|
|
|
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
|
- for (int l = 0; l < nb; l++) {
|
|
|
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
|
- for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
- sumi = 0;
|
|
|
- for (int i = 0; i < blocklen; ++i) {
|
|
|
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
|
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
|
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
|
- }
|
|
|
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
+ sumi = 0;
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
|
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
|
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
|
}
|
|
|
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
|
}
|
|
|
}
|
|
|
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
|
}
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
|
+ const int qk = QK8_0;
|
|
|
+ const int nb = n / qk;
|
|
|
+ const int ncols_interleaved = 8;
|
|
|
+ const int blocklen = 8;
|
|
|
+
|
|
|
+ assert(nr == 1);
|
|
|
+ assert(n % qk == 0);
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
+
|
|
|
+ UNUSED(bs);
|
|
|
+ UNUSED(nr);
|
|
|
+
|
|
|
+ float sumf[8];
|
|
|
+ int sumi;
|
|
|
+
|
|
|
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
|
+
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
+ sumi = 0;
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
|
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
|
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
|
|
+ }
|
|
|
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
|
|
+ const int qk = QK8_0;
|
|
|
+ const int nb = n / qk;
|
|
|
+ const int ncols_interleaved = 8;
|
|
|
+ const int blocklen = 8;
|
|
|
+
|
|
|
+ assert(n % qk == 0);
|
|
|
+ assert(nr % 4 == 0);
|
|
|
+ assert(nc % ncols_interleaved == 0);
|
|
|
+
|
|
|
+ float sumf[4][8];
|
|
|
+ int sumi;
|
|
|
+
|
|
|
+ for (int y = 0; y < nr / 4; y++) {
|
|
|
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
|
|
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
|
|
|
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
|
|
+ }
|
|
|
+ for (int l = 0; l < nb; l++) {
|
|
|
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++) {
|
|
|
+ sumi = 0;
|
|
|
+ for (int i = 0; i < blocklen; ++i) {
|
|
|
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
|
|
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
|
|
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
|
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
|
|
+ }
|
|
|
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for (int m = 0; m < 4; m++) {
|
|
|
+ for (int j = 0; j < ncols_interleaved; j++)
|
|
|
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
} // extern "C"
|
|
|
|
|
|
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
|
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
|
|
|
|
|
|
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
|
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
|
|
- //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
|
|
GGML_ASSERT(interleave_block == 4);
|
|
|
|
|
|
- block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
|
|
|
- const block_iq4_nl * src = (const block_iq4_nl *)data;
|
|
|
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
|
|
|
+ block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
|
|
|
+
|
|
|
block_iq4_nl dst_tmp[4];
|
|
|
+
|
|
|
int nrow = ggml_nrows(t);
|
|
|
int nrows_interleaved = 4;
|
|
|
- int nblocks = t->ne[0] / QK4_0;
|
|
|
+ int nblocks = t->ne[0] / QK4_NL;
|
|
|
|
|
|
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
|
|
|
|
|
@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
|
|
|
GGML_UNUSED(data_size);
|
|
|
}
|
|
|
|
|
|
+static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
|
|
+ block_iq4_nlx8 out;
|
|
|
+
|
|
|
+ for (int i = 0; i < 8; i++) {
|
|
|
+ out.d[i] = in[i].d;
|
|
|
+ }
|
|
|
+
|
|
|
+ const int end = QK4_NL * 4 / blck_size_interleave;
|
|
|
+
|
|
|
+ if (blck_size_interleave == 8) {
|
|
|
+ for (int i = 0; i < end; ++i) {
|
|
|
+ int src_id = i % 8;
|
|
|
+ int src_offset = (i / 8) * blck_size_interleave;
|
|
|
+ int dst_offset = i * blck_size_interleave;
|
|
|
+
|
|
|
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ GGML_ASSERT(false);
|
|
|
+ }
|
|
|
+
|
|
|
+ return out;
|
|
|
+}
|
|
|
+
|
|
|
+static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
|
|
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
|
|
+ GGML_ASSERT(interleave_block == 8);
|
|
|
+
|
|
|
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
|
|
|
+ block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
|
|
|
+
|
|
|
+ block_iq4_nl dst_tmp[8];
|
|
|
+
|
|
|
+ int nrow = ggml_nrows(t);
|
|
|
+ int nrows_interleaved = 8;
|
|
|
+ int nblocks = t->ne[0] / QK4_NL;
|
|
|
+
|
|
|
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
|
|
+
|
|
|
+ if (t->ne[1] % nrows_interleaved != 0) {
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
|
|
|
+ for (int64_t x = 0; x < nblocks; x++) {
|
|
|
+ for (int i = 0; i < nrows_interleaved; i++) {
|
|
|
+ dst_tmp[i] = src[x + i * nblocks];
|
|
|
+ }
|
|
|
+ *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
|
|
|
+ }
|
|
|
+ src += nrows_interleaved * nblocks;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ GGML_UNUSED(data_size);
|
|
|
+}
|
|
|
+
|
|
|
namespace ggml::cpu::repack {
|
|
|
// repack
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
|
|
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
|
|
|
// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
|
|
|
//}
|
|
|
|
|
|
+template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
|
|
+ return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
|
|
+}
|
|
|
+
|
|
|
// gemv
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
|
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
|
|
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
|
}
|
|
|
|
|
|
+template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
|
+ ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
|
+}
|
|
|
+
|
|
|
// gemm
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
|
|
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
|
|
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
|
|
|
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
|
}
|
|
|
|
|
|
+template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
|
|
+ ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
|
|
+}
|
|
|
+
|
|
|
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
|
|
public:
|
|
|
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
|
|
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
|
|
|
// instance for IQ4
|
|
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
|
|
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
|
|
|
|
|
if (cur->type == GGML_TYPE_Q4_0) {
|
|
|
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
|
|
|
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
|
|
}
|
|
|
}
|
|
|
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
|
|
+ if (ggml_cpu_has_avx2()) {
|
|
|
+ if (cur->ne[1] % 8 == 0) {
|
|
|
+ return &iq4_nl_8x8_q8_0;
|
|
|
+ }
|
|
|
+ }
|
|
|
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
|
|
if (cur->ne[1] % 4 == 0) {
|
|
|
return &iq4_nl_4x4_q8_0;
|