@@ -306,11 +306,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half    d[2];              // super-block scales/mins
+    half    dm[2];             // super-block scales/mins
     uint8_t scales[2];         // 4-bit block scales/mins
     uint8_t qs[QK_K/2];        // 4-bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm;                  // super-block scale for quantized scales/mins
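
For context: renaming `d[2]` to `dm[2]` gives the GGML_QKK_64 layout the same member name as the `half2 dm` used when QK_K == 256, so downstream code can say `dm` in both configurations. The new static_assert is unchanged in substance, since `sizeof(half2)` and `2*sizeof(ggml_fp16_t)` are both 4 bytes. A minimal host-side sketch of the same size arithmetic (stand-in types, not the real headers; assumes QK_K == 64 as in GGML_QKK_64 builds):

    #include <cstdint>
    #include <cstdio>

    typedef uint16_t half_t;   // stand-in for CUDA's half: 2-byte storage
    #define QK_K 64            // GGML_QKK_64 builds use 64-element super-blocks

    typedef struct {
        half_t  dm[2];         // dm[0] = scale d, dm[1] = min m
        uint8_t scales[2];     // 4-bit block scales/mins
        uint8_t qs[QK_K/2];    // 4-bit quants, two per byte
    } block_q4_K_sketch;

    // Same bound the patch asserts: 2 halves + 2 scale bytes + QK_K/2 quant bytes.
    static_assert(sizeof(block_q4_K_sketch) == 2*sizeof(half_t) + QK_K/2 + 2,
                  "wrong q4_K block size/padding");

    int main() { printf("%zu bytes\n", sizeof(block_q4_K_sketch)); } // prints: 38 bytes
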
@@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].d[0];
-    const float m = (float)x[i].d[1];
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
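
The two store lines in this hunk are easier to read with the packing spelled out: each of 32 threads handles one quant byte, whose low nibble feeds `y[tid+0]` and whose high nibble feeds `y[tid+32]`; `scales[b]` packs a 4-bit scale in its low nibble and a 4-bit min in its high nibble. A plain C++ reference of the same arithmetic (a sketch, not kernel code; `d` and `m` stand for the floats read from `dm[0]` and `dm[1]`):

    // One QKK_64 q4_K block: qs[i] holds two 4-bit quants,
    // scales[b] = (min_b << 4) | scale_b.
    void dequantize_q4_K_64_ref(const uint8_t scales[2], const uint8_t qs[32],
                                float d, float m, float y[64]) {
        for (int i = 0; i < 32; ++i) {
            y[i +  0] = d * (scales[0] & 0xF) * (qs[i] & 0xF) - m * (scales[0] >> 4);
            y[i + 32] = d * (scales[1] & 0xF) * (qs[i] >> 4)  - m * (scales[1] >> 4);
        }
    }
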
@@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].d[0];
-    const float m = (float)x[i].d[1];
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
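
The `aux16` lines just above the renamed reads use a small bit trick: the two packed scale bytes are viewed as one `uint16_t`, so a single mask extracts both 4-bit scales at once, and a shift plus the same mask extracts both 4-bit mins. A standalone illustration (a sketch; relies on little-endian layout, which holds on CUDA GPUs):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        const uint8_t scales[2] = {0x4A, 0x7C}; // low nibble = scale, high = min
        uint16_t a;
        memcpy(&a, scales, 2);                  // a = 0x7C4A on little-endian
        const uint16_t lo = a & 0x0f0f;         // 0x0C0A: both scales (0xA, 0xC)
        const uint16_t hi = (a >> 4) & 0x0f0f;  // 0x0704: both mins   (0x4, 0x7)
        printf("%04x %04x\n", lo, hi);
    }

The bit that crosses the byte boundary during `>> 4` is removed by the mask, so the trick is safe.
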
@@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
 
-    const float dall = bq4_K->d[0];
-    const float dmin = bq4_K->d[1];
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
 
     const float d8_1 = __low2float(bq8_1[0].ds);
     const float d8_2 = __low2float(bq8_1[1].ds);
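
For the unchanged `d8_1`/`d8_2` lines: `ds` on a `block_q8_1` is a `half2` whose low half is the scale d and whose high half is the sum s of the block's quants, and `__low2float` (from cuda_fp16.h) converts just the low half to float. A one-line device sketch of the equivalent two-step form:

    // Equivalent to __low2float(ds): take the low half, then widen to float.
    __device__ float low_of(half2 ds) { return __half2float(__low2half(ds)); }
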
@@ -2929,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }
 
 #pragma unroll
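
The guard in this hunk exists because the tile buffer `x_dm` holds `half2` values: with QK_K == 256 the block's `dm` is already a `half2` and can be stored directly, while under GGML_QKK_64 it is the `half dm[2]` introduced above, so a `half2` has to be built from its two elements. The q5_K hunk below gets only the `#if` with no `#else`, since the QKK_64 q5_K block stores its scale differently and has no `dm` to copy. A minimal sketch of the QKK_64 branch (assuming CUDA's half/half2 types):

    // Build a half2 from a half[2]; the patch uses brace-init,
    // and make_half2(dm2[0], dm2[1]) would be equivalent.
    __device__ void store_dm(half2 * x_dm, int idx, const half * dm2) {
        x_dm[idx] = {dm2[0], dm2[1]};
    }
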
@@ -3119,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }
 
 #pragma unroll
@@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }
 
 static void ggml_mul_mat_q4_K_q8_1_cuda(
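
The last two hunks bracket the entire body of `ggml_mul_mat_q3_K_q8_1_cuda` in `#if QK_K == 256 ... #endif`, so QKK_64 builds compile the launcher down to an empty stub rather than instantiating mmq kernels the 64-element layout does not support. One consequence (my reading, not stated in the patch): the stub's parameters go unused, which some compilers warn about; a common suppression, sketched here with a hypothetical launcher that the patch does not contain, is explicit void-casts:

    // Hypothetical shape of a guarded launcher; the patch does not add the casts.
    static void example_launcher(const void * vx, const void * vy, float * dst,
                                 int ncols_x, cudaStream_t stream) {
    #if QK_K == 256
        // ... pick tile sizes and launch the kernel ...
    #else
        (void) vx; (void) vy; (void) dst; (void) ncols_x; (void) stream;
    #endif
    }
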