
ggml : drop support for QK_K=64 (#7473)

* ggml : drop support for QK_K=64

ggml-ci

* opencl : restore QK_K=256 define
Georgi Gerganov, 1 year ago
commit e84b71c2c6
16 files changed, 335 additions, 4543 deletions
  1. CMakeLists.txt (+0 -5)
  2. Makefile (+0 -4)
  3. ci/run.sh (+2 -1)
  4. ggml-common.h (+0 -54)
  5. ggml-cuda/convert.cu (+0 -138)
  6. ggml-cuda/dmmv.cu (+0 -151)
  7. ggml-cuda/mmq.cu (+0 -6)
  8. ggml-cuda/vecdotq.cuh (+0 -126)
  9. ggml-metal.m (+0 -17)
  10. ggml-metal.metal (+7 -393)
  11. ggml-opencl.cpp (+1 -1)
  12. ggml-quants.c (+319 -3154)
  13. ggml-sycl.cpp (+2 -470)
  14. ggml.c (+0 -12)
  15. gguf-py/gguf/constants.py (+1 -2)
  16. llama.cpp (+3 -9)
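
For readers skimming the diff: QK_K is the number of weights in one k-quant super-block. Before this commit it could be compiled as either 64 (via the GGML_QKK_64 option) or 256; afterwards it is unconditionally 256, which is what lets every `GGML_QKK_64` / `QK_K == 256` branch below collapse. A minimal standalone sketch of the resulting invariant (illustrative C, not code from this commit):

```c
#include <assert.h>

/* After this commit the k-quant super-block size is a single
 * compile-time constant, no longer a GGML_QKK_64-dependent choice. */
#define QK_K         256
#define K_SCALE_SIZE 12

int main(void) {
    /* A super-block splits into fixed sub-blocks, e.g. q4_K uses
     * 8 blocks of 32 elements each. */
    assert(QK_K / 32 == 8);
    return 0;
}
```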

+ 0 - 5
CMakeLists.txt

@@ -124,7 +124,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
-option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
@@ -384,10 +383,6 @@ if (LLAMA_LLAMAFILE)
     set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
 endif()
 
-if (LLAMA_QKK_64)
-    add_compile_definitions(GGML_QKK_64)
-endif()
-
 if (LLAMA_CUBLAS)
     message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
     set(LLAMA_CUDA ON)

+ 0 - 4
Makefile

@@ -389,10 +389,6 @@ else
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifdef LLAMA_QKK_64
-	MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel

+ 2 - 1
ci/run.sh

@@ -606,7 +606,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run open_llama_3b_v2
+            #test $ret -eq 0 && gg_run open_llama_3b_v2
+            date # dummy
         else
             test $ret -eq 0 && gg_run open_llama_7b_v2
         fi

+ 0 - 54
ggml-common.h

@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
 // QK = number of values after dequantization
 // QK_K = super-block size
 
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
 #define QK_K 256
 #define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
 
 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
-#endif
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
     ggml_half d;           // super-block scale
 } block_q3_K;
 static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
 
 // 4-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2];     // super-block scales/mins
-    uint8_t scales[2];  // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -270,21 +242,11 @@ typedef struct {
     uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
 
 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d;             // super-block scale
-    int8_t  scales[QK_K/16]; // 8-bit block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -298,7 +260,6 @@ typedef struct {
     uint8_t qs[QK_K/2];           // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
 // 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
 #define IQ3S_N_SCALE QK_K/64
-#endif
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
 typedef struct {
     uint8_t  qs[QK_K/8];      // grid index, low 8 bits
     uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
     uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
 
 // Used by IQ1_M quants
 typedef union {
@@ -406,9 +356,6 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
 typedef struct {
     ggml_half d;
     uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
     uint8_t  qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
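
The "Effectively 4.5 bits per weight" comment for q4_K can be checked directly from the struct that survives the cleanup: two fp16 values, K_SCALE_SIZE packed scale bytes, and QK_K/2 quant bytes come to 144 bytes for 256 weights. A sketch of that arithmetic (illustrative; ggml_half modeled here as a 2-byte integer):

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define QK_K         256
#define K_SCALE_SIZE 12

typedef uint16_t ggml_half;  /* stand-in for ggml's 16-bit float type */

int main(void) {
    const size_t bytes = 2 * sizeof(ggml_half)  /* d, dmin            */
                       + K_SCALE_SIZE           /* packed scales/mins */
                       + QK_K / 2;              /* 4-bit quants       */
    /* 144 bytes for 256 weights -> 4.5 bits per weight */
    printf("block_q4_K: %zu bytes -> %.2f bpw\n",
           bytes, 8.0 * (double)bytes / QK_K);
    return 0;
}
```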

+ 0 - 138
ggml-cuda/convert.cu

@@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
     const block_q2_K * x = (const block_q2_K *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t n   = tid/32;
     const int64_t l   = tid - 32*n;
     const int64_t is  = 8*n + l/16;
@@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int64_t is = tid/16;  // 0 or 1
-    const int64_t il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }
 
 template<typename dst_t>
@@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
     const int64_t i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
 
-#if QK_K == 256
     const int64_t r = threadIdx.x/4;
     const int64_t tid = r/2;
     const int64_t is0 = r%2;
@@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
     const uint8_t * hm = x[i].hmask;
 
     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int64_t tid = threadIdx.x;
-    const int64_t is  = tid/16;  // 0 or 1
-    const int64_t il  = tid%16;  // 0...15
-    const int64_t im  = il/8;    // 0...1
-    const int64_t in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }
 
-#if QK_K == 256
 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
         m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif
 
 template<typename dst_t>
 static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
 
     const int64_t i = blockIdx.x;
 
-#if QK_K == 256
     // assume 32 threads
     const int64_t tid = threadIdx.x;
     const int64_t il  = tid/8;
@@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
-#else
-    const int64_t tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }
 
 template<typename dst_t>
@@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
 
     const int64_t i = blockIdx.x;
 
-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int64_t tid = threadIdx.x;
     const int64_t il  = tid/16;   // il is in 0...3
@@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int64_t tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int64_t im = tid/8;  // 0...3
-    const int64_t in = tid%8;  // 0...7
-    const int64_t is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }
 
 template<typename dst_t>
@@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int64_t i = blockIdx.x;
-#if QK_K == 256
 
     // assume 64 threads - this is very slightly better than the one below
     const int64_t tid = threadIdx.x;
@@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t ip  = tid/16;         // 0 or 1
-    const int64_t il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }
 
 template<typename dst_t>
@@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
     const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
     const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
     const block_iq2_s * x = (const block_iq2_s *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
     const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
     const block_iq3_s * x = (const block_iq3_s *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
     const block_iq1_s * x = (const block_iq1_s  *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
 template<typename dst_t>
@@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
     const block_iq1_m * x = (const block_iq1_m  *) vx;
 
     const int64_t tid = threadIdx.x;
-#if QK_K == 256
     const int64_t il = tid/8; // 0...3
     const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }
 
-
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
         y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
         y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
     }
-
 }
 
-#if QK_K != 64
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const int64_t i   = blockIdx.x;
@@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
         y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
     }
 }
-#endif
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
@@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
 static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
@@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
 template<typename dst_t>
 static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
 static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template<typename dst_t>
@@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
 template<typename dst_t>
 static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
-#else
     dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }
 
 template <typename src_t, typename dst_t>
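
With the QK_K == 64 launch variants deleted, every k-quant dequantize launcher above follows one pattern: one CUDA thread block per super-block, with 64 threads for q2_K through q6_K (each thread emits 256/64 = 4 values) and 32 threads for the iq types. A host-side model of that geometry (illustrative C, not the actual launcher):

```c
#define QK_K 256

typedef struct { int grid, block; } launch_dims;

/* One thread block per 256-value super-block; each thread then
 * produces QK_K/threads output values. */
launch_dims k_quant_dequant_dims(long long k, int threads /* 64 or 32 */) {
    launch_dims d;
    d.grid  = (int)(k / QK_K);
    d.block = threads;
    return d;
}
```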

+ 0 - 151
ggml-cuda/dmmv.cu

@@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
 
@@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
         tmp += dall * sum1 - dmin * sum2;
 
     }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
 
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
@@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
-
     const uint16_t kmask1 = 0x0303;
     const uint16_t kmask2 = 0x0f0f;
 
@@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
         tmp += d * sum;
 
     }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
 
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
@@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 
     const block_q4_K * x = (const block_q4_K *)vx + ib0;
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 #endif
 
     }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
 
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
@@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
         tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
     }
 
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
 
@@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     const block_q6_K * x = (const block_q6_K *)vx + ib0;
 
-#if QK_K == 256
-
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
 
@@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 
     }
 
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
     // sum up partial sums and write back result
     tmp = warp_reduce_sum(tmp);
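
All the dmmv kernels now share the single warp-lane layout kept in the hunks above. A plain-C model of that assignment (illustrative; K_QUANTS_PER_ITERATION is 1 or 2 in ggml-cuda builds):

```c
#define K_QUANTS_PER_ITERATION 2  /* ggml-cuda builds with 1 or 2 */

/* Each of the 32 warp lanes gets a (tid, ix) pair: tid selects the
 * offset inside a super-block, ix strides across the super-blocks of
 * one matrix row; partial sums are then combined by warp_reduce_sum. */
void lane_assignment(int lane /* threadIdx.x, 0..31 */, int *tid, int *ix) {
    *tid = lane / K_QUANTS_PER_ITERATION;  /* 0...31 or 0...15 */
    *ix  = lane % K_QUANTS_PER_ITERATION;  /* 0      or 0,1    */
}
```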
 

+ 0 - 6
ggml-cuda/mmq.cu

@@ -826,11 +826,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
     }
 
 #pragma unroll
@@ -933,9 +929,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
     }
 
 #pragma unroll
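
The deleted branches here existed because at QK_K == 64 the q4_K block stored two separate half scales (dm[0], dm[1]) rather than one packed half2. With only the 256 layout left, the tile loader copies bxi->dm in one move. A model of the distinction (illustrative; halves modeled as float):

```c
/* Model (illustrative): scale and min for q4_K/q5_K now always travel
 * as one packed pair, so the mmq tile load is a single copy. */
typedef struct { float d, dmin; } dm_pair;

dm_pair load_dm(const dm_pair *bxi_dm) {
    return *bxi_dm;  /* the QK_K==64 path had to rebuild this pair */
}
```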

+ 0 - 126
ggml-cuda/vecdotq.cuh

@@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
-#ifndef GGML_QKK_64
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
     int    v[2];
@@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     }
 
     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
 }
 
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
-#ifndef GGML_QKK_64
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
     int   vl[2];
@@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     }
 
     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
 }
 
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 
 static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 
 #if QR2_XXS == 8
@@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
     }
     return d * (sumi1 + sumi2);
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }
 
 static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 
     const int ib32 = iqs;
@@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     GGML_UNUSED(ksigns64);
     NO_DEVICE_CODE;
 #endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
 }
 
 // TODO
 static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 
     const int ib32 = iqs;
@@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
     GGML_UNUSED(ksigns64);
     NO_DEVICE_CODE;
 #endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
 }
 
 static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 
     const int ib32 = iqs;
@@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #else
     NO_DEVICE_CODE;
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }
 
 // TODO: don't use lookup table for signs
 static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 
     const int ib32 = iqs;
@@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
 #else
     NO_DEVICE_CODE;
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }
 
 static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 
     const int ib32 = iqs;
@@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
     const float d = d1q * __low2float (bq8_1[ib32].ds);
     const float m = d1q * __high2float(bq8_1[ib32].ds);
     return d * sumi + m * delta;
-#else
-    NO_DEVICE_CODE;
-#endif
 }
 
 static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
 
     const int ib32 = iqs;
@@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    NO_DEVICE_CODE;
-#endif
 }
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
 static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
-#if QK_K == 256
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
     const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 
@@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
         sumi2 = __dp4a(v2, q8[j+4], sumi2);
     }
     return d * (sumi1 + sumi2);
-
-#else
-    NO_DEVICE_CODE;
-#endif
 #else
     return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
 #endif
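
The surviving vec_dot_* paths are all built on the dp4a intrinsic (guarded by MIN_CC_DP4A above). A plain-C model of its signed 8-bit semantics (illustrative, not the CUDA intrinsic itself):

```c
#include <stdint.h>

/* dp4a(a, b, c): treat a and b as four packed int8 lanes, multiply
 * lane-wise, and accumulate the four products into c. */
int32_t dp4a_model(int32_t a, int32_t b, int32_t c) {
    int32_t acc = c;
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)((uint32_t)a >> (8 * i));
        const int8_t bi = (int8_t)((uint32_t)b >> (8 * i));
        acc += (int32_t)ai * (int32_t)bi;
    }
    return acc;
}
```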

+ 0 - 17
ggml-metal.m

@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 // dictionary of preprocessor macros
                 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
-#ifdef GGML_QKK_64
-                prep[@"GGML_QKK_64"] = @(1);
-#endif
-
                 MTLCompileOptions* options = [MTLCompileOptions new];
                 options.preprocessorMacros = prep;
 
@@ -1773,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                             }
                             else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
                             }
                             else if (src0t == GGML_TYPE_Q5_K) {
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                     {
                                         nth0 = 4;
                                         nth1 = 16;
-                                    #if QK_K == 64
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
-                                    #else
                                         pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
-                                    #endif
-
                                     } break;
                                 default:
                                     {
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                             }
                             else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
                             }
                             else if (src0t == GGML_TYPE_Q5_K) {
                                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
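
After dropping the GGML_QKK_64 dispatch variants, the Q3_K path uses the same grid sizing as the other k-quants: four output rows per threadgroup. A one-line model of the arithmetic kept above (illustrative C):

```c
/* Grid x-dimension for the k-quant mat-vec kernels: round the number
 * of output rows (ne01) up to a multiple of 4 rows per threadgroup. */
long threadgroups_x(long ne01) {
    return (ne01 + 3) / 4;
}
```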

+ 7 - 393
ggml-metal.metal

@@ -3386,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl(
 
     const int step = sizeof(block_q2_K) * nb;
 
-#if QK_K == 256
     const int ix = tiisg/8;  // 0...3
     const int it = tiisg%8;  // 0...7
     const int iq = it/4;     // 0 or 1
@@ -3438,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl(
 
         y4 += 4 * QK_K;
     }
-#else
-    const int ix = tiisg/2;  // 0...15
-    const int it = tiisg%2;  // 0...1
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    for (int ib = ix; ib < nb; ib += 16) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
-        }
-
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
-
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-
-        y4 += 16 * QK_K;
-    }
-#endif
 
     for (int row = 0; row < N_DST; ++row) {
         all_sum = simd_sum(sumf[row]);
@@ -3526,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32(
     kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
-#if QK_K == 256
 void kernel_mul_mv_q3_K_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -3685,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl(
         }
     }
 }
-#else
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const int row = 2 * r0 + sgitg;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    const int ix = tiisg/4;
-    const int il = 4 * (tiisg%4);// 0, 4, 8, 12
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    float2 sum = {0.f, 0.f};
-
-    for (int i = ix; i < nb; i += 8) {
-
-        const float d_all = (float)(x[i].d);
-
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
-        device const uint16_t * s = (device const uint16_t *)(x[i].scales);
-        device const float    * y = yy + i * QK_K + il;
-
-        const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
-        const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
-        const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
-        const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
-
-        for (int l = 0; l < 4; l += 2) {
-            const uint16_t hm = h[l/2] >> iq;
-            sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 :  4))
-                    + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
-                    + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
-                    + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
-            sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
-                    + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
-                    + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
-                    + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
-        }
-
-    }
-    const float sumf = sum[0] + sum[1] * 1.f/256.f;
-
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-
-}
-#endif
 
 [[host_name("kernel_mul_mv_q3_K_f32")]]
 kernel void kernel_mul_mv_q3_K_f32(
@@ -3792,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32(
     kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }
 
-#if QK_K == 256
 void kernel_mul_mv_q4_K_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -3906,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl(
         }
     }
 }
-#else
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int ix = tiisg/4;  // 0...7
-    const int it = tiisg%4;  // 0...3
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[8];
-    float yh[8];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q4_K) * nb / 2;
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    uint16_t sc16[4];
-
-    for (int ib = ix; ib < nb; ib += 8) {
-
-        float2 sumy = {0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
-            yh[i] = y4[i+32]; sumy[1] += yh[i];
-        }
-
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            sc16[0] = sc[0] & 0x000f;
-            sc16[1] = sc[0] & 0x0f00;
-            sc16[2] = sc[0] & 0x00f0;
-            sc16[3] = sc[0] & 0xf000;
-
-            float2 acc1 = {0.f, 0.f};
-            float2 acc2 = {0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
-                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
-                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
-                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
-
-            qs += step;
-            sc += step;
-            dh += step;
-        }
-
-        y4 += 8 * QK_K;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#endif
 
 [[host_name("kernel_mul_mv_q4_K_f32")]]
 kernel void kernel_mul_mv_q4_K_f32(
@@ -4070,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl(
 
     const int step = sizeof(block_q5_K) * nb;
 
-#if QK_K == 256
-#
     float yl[16], yh[16];
 
     const uint16_t kmask1 = 0x3f3f;
@@ -4154,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl(
         y1 += 4 * QK_K;
 
     }
-#else
-    float yl[8], yh[8];
-
-    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
-    const int ix = tiisg%8;
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    device const float * y = yy + ix*QK_K + il;
-
-    for (int i = ix; i < nb; i += 8) {
-
-        for (int l = 0; l < 4; ++l) {
-            yl[l+0] = y[l+ 0];
-            yl[l+4] = y[l+16];
-            yh[l+0] = y[l+32];
-            yh[l+4] = y[l+48];
-        }
-
-        device const half * dh = &x[i].d;
-        device const uint8_t * q = x[i].qs + il;
-        device const uint8_t * h = x[i].qh + in;
-        device const int8_t  * s = x[i].scales;
-
-        for (int row = 0; row < 2; ++row) {
-
-            const float d = dh[0];
-
-            float2 acc = {0.f, 0.f};
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t hl = h[l] >> iq;
-                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
-                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
-                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
-                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
-            }
-            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
-
-            q += step;
-            h += step;
-            s += step;
-            dh += step/2;
-
-        }
-
-        y += 8 * QK_K;
-    }
-#endif
 
     for (int row = 0; row < 2; ++row) {
         const float tot = simd_sum(sumf[row]);
@@ -4280,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl(
 
     float sumf = 0;
 
-#if QK_K == 256
     const int tid  = tiisg/2;
     const int ix   = tiisg%2;
     const int ip   = tid/8;         // 0 or 1
@@ -4316,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl(
 
     }
 
-#else
-    const int ix  = tiisg/4;
-    const int il  = 4*(tiisg%4);
-
-    for (int i = ix; i < nb; i += 8) {
-        device const float * y = yy + i * QK_K + il;
-        device const uint8_t * ql = x[i].ql + il;
-        device const uint8_t * qh = x[i].qh + il;
-        device const int8_t  * s  = x[i].scales;
-
-        const float d = x[i].d;
-
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 4; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
-            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
-    }
-
-#endif
-
     const float tot = simd_sum(sumf);
     if (tiisg == 0) {
         dst[r1*ne0 + im*ne0*ne1 + row] = tot;
@@ -5173,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
 
     device const float * y4 = y + 32 * ix;
 
-#if QK_K != 64
     iq1m_scale_t scale;
-#endif
 
     for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
 
@@ -5196,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
         device const uint16_t * sc = (device const uint16_t *)xr->scales;
 
         for (int row = 0; row < N_DST; row++) {
-
-#if QK_K != 64
             scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-#endif
 
             constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
             constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
@@ -5215,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
             }
             const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
             const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
-#if QK_K == 64
-            const float d = (float) *((device const half *)(sc - 1));
-            sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
-                              (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
-#else
+
             sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
                                              (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
-#endif
 
             sc += nb*sizeof(block_iq1_m)/2;
             qs += nb*sizeof(block_iq1_m);
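The iq1_m scale handling above relies on a packing trick: the block scale is an fp16 value whose 16 bits are scattered across the top nibbles of the four 16-bit scale words. A sketch of the gather step (hypothetical helper; the fp16 reinterpretation through the iq1m_scale_t union is left out):

```c
#include <stdint.h>

// Gather the four 4-bit pieces of the fp16 block scale, exactly as in
// "scale.u16 = (sc[0] >> 12) | ..." above.
static inline uint16_t iq1m_pack_scale(const uint16_t sc[4]) {
    return (uint16_t)((sc[0] >> 12)               // bits  0..3
                    | ((sc[1] >>  8) & 0x00f0)    // bits  4..7
                    | ((sc[2] >>  4) & 0x0f00)    // bits  8..11
                    |  (sc[3]        & 0xf000));  // bits 12..15
}
```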
@@ -5334,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     }
 }
 
-#if QK_K != 64
 void kernel_mul_mv_iq4_xs_f32_impl(
         device const  void * src0,
         device const float * src1,
@@ -5429,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
         }
     }
 }
-#endif
 
 [[host_name("kernel_mul_mv_iq1_s_f32")]]
 kernel void kernel_mul_mv_iq1_s_f32(
@@ -5542,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
         uint tiisg[[thread_index_in_simdgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]]) {
 
-#if QK_K == 64
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#else
     kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#endif
 }
 
 //============================= templates and their specializations =============================
@@ -5672,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
     float dl, ml;
     uint8_t sc = xb->scales[il];
 
-#if QK_K == 256
     q = q + 32*(il/8) + 16*(il&1);
     il = (il/2)%4;
-#endif
+
     half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
     uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
     dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
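dequantize_q2_K now has a single code path. Each scale byte packs a 4-bit scale in the low nibble and a 4-bit min in the high nibble; a weight is the 2-bit quant times d*scale, shifted down by dmin*min. A standalone sketch (hypothetical helper) of that formula:

```c
#include <stdint.h>

// One q2_K weight: q2 holds the 2-bit quant already shifted into place.
static inline float dequant_one_q2_K(float d, float dmin, uint8_t sc, uint8_t q2) {
    return d * (float)(sc & 0xF) * (float)(q2 & 3) - dmin * (float)(sc >> 4);
}
```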
@@ -5691,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     device const uint8_t * h = (device const uint8_t *)xb->hmask;
     device const int8_t * scales = (device const int8_t *)xb->scales;
 
-#if QK_K == 256
     q = q + 32 * (il/8) + 16 * (il&1);
     h = h + 16 * (il&1);
     uint8_t m = 1 << (il/2);
@@ -5712,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
     }
-#else
-    float    kcoef = il&1 ? 1.f/16.f : 1.f;
-    uint16_t kmask = il&1 ? 0xF0     : 0x0F;
-    float    dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
-    float    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uint8_t  mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    uint8_t  m = 1<<(il*2);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
-    }
-#endif
 }
 
 static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
@@ -5734,7 +5401,6 @@ template <typename type4x4>
 void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
     device const uchar * q = xb->qs;
 
-#if QK_K == 256
     short is = (il/4) * 2;
     q = q + (il/4) * 32 + 16 * (il&1);
     il = il & 3;
@@ -5743,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
     const float min = xb->dmin;
     const float dl = d * sc[0];
     const float ml = min * sc[1];
-#else
-    (void) get_scale_min_k4_just2;
-
-    q = q + 16 * (il&1);
-    device const uint8_t * s = xb->scales;
-    device const half2 * dh = (device const half2 *)xb->d;
-    const float2 d = (float2)dh[0];
-    const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
-    const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
-#endif
+
     const ushort mask = il<2 ? 0x0F : 0xF0;
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - ml;
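get_scale_min_k4_just2 above (and get_scale_min_k4 in the ggml-sycl.cpp hunk later in this diff) unpacks the 12-byte q4_K/q5_K scale block, which holds eight 6-bit (scale, min) pairs: the first four pairs sit in the low 6 bits of bytes 0..7, while the last four are split between the low nibbles of bytes 8..11 and the top 2 bits of bytes 0..7. A plain-C version of the same unpacking:

```c
#include <stdint.h>

static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j] & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (uint8_t)((q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4));
        *m = (uint8_t)((q[j + 4] >>  4) | ((q[j - 0] >> 6) << 4));
    }
}
```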
@@ -5764,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     device const uint8_t * q  = xb->qs;
     device const uint8_t * qh = xb->qh;
 
-#if QK_K == 256
     short is = (il/4) * 2;
     q  = q + 32 * (il/4) + 16 * (il&1);
     qh = qh + 16 * (il&1);
@@ -5781,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
     }
-#else
-    q = q + 16 * (il&1);
-    device const int8_t * s = xb->scales;
-    const float dl = xb->d * s[il];
-    uint8_t m = 1<<(il*2);
-    const float  coef = il<2 ? 1.f  : 1.f/16.f;
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
-    }
-#endif
 }
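For q5_K the fifth bit lives in a separate qh bitplane and contributes +16 on top of the 4-bit quant before the scale and min are applied, as in the "(q[i] & mask) + (qh[i] & ul ? qh_val : 0)" expression above. A minimal sketch (hypothetical helper):

```c
#include <stdbool.h>
#include <stdint.h>

static inline float dequant_one_q5_K(float dl, float ml, uint8_t q4, bool hi_bit) {
    return dl * (float)((q4 & 0xF) + (hi_bit ? 16 : 0)) - ml;
}
```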
 
 template <typename type4x4>
@@ -5801,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
     device const uint8_t * qh = (device const uint8_t *)xb->qh;
     device const int8_t * scales = (device const int8_t *)xb->scales;
 
-#if QK_K == 256
     ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     qh = qh + 32*(il/8) + 16*(il&1);
     float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
-#else
-    ql = ql + 16 * (il&1);
-    float sc = scales[il];
-#endif
+
     const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
     const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
     const float       coef = il>1 ? 1.f/16.f          : 1.f;
@@ -5966,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
     const int ib32 = il/2;
     il = il%2;
     device const uint16_t * sc = (device const uint16_t *)xb->scales;
-#if QK_K == 64
-    const float d = xb->d;
-#else
+
     iq1m_scale_t scale;
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = scale.f16;
-#endif
+
     device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
     device const uint8_t * qh = xb->qh + 2*ib32 + il;
-#if QK_K == 64
-    const float dl  = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
-#else
+
     const float dl  = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
-#endif
     const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
     const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
     constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
@@ -6009,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
 
 template <typename type4x4>
 void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
-#if QK_K == 64
-    dequantize_iq4_nl(xb, il, reg);
-#else
     // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
     const int ib32 = il/2;
     il = il%2;
@@ -6028,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
         reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
         reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
     }
-#endif
 }
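iq4_nl and iq4_xs do not use the 4-bit index directly: it selects an entry from a fixed non-linear 16-value codebook, as in "reg[i][0] = d * kvalues_iq4nl_f[q8[0]]" above. A sketch of the lookup, with the table values reproduced from ggml-common.h for illustration:

```c
#include <stdint.h>

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

static inline float dequant_one_iq4(float d, uint8_t idx4) {
    return d * (float)kvalues_iq4nl[idx4 & 0xF];
}
```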
 
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
@@ -6533,11 +6165,7 @@ kernel void kernel_mul_mm_id(
         sgitg);
 }
 
-#if QK_K == 256
 #define QK_NL 16
-#else
-#define QK_NL 4
-#endif
 
 //
 // get rows
@@ -6577,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_r
 template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_get_rows_iq1_m")]]   kernel get_rows_t kernel_get_rows<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
 
 //
 // matrix-matrix multiplication
@@ -6609,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
 
 //
 // indirect matrix-matrix multiplication
@@ -6641,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel
 template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
 
 //
 // matrix-vector multiplication
@@ -6854,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
-#if QK_K != 64
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
-#endif
 

+ 1 - 1
ggml-opencl.cpp

@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml.h"
 #include "ggml-opencl.h"
 #include "ggml-backend-impl.h"
 

File diff suppressed because it is too large
+ 319 - 3154
ggml-quants.c


+ 2 - 470
ggml-sycl.cpp

@@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     const block_q2_K * x = (const block_q2_K *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int n   = tid/32;
     const int l   = tid - 32*n;
     const int is  = 8*n + l/16;
@@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }
 
 template<typename dst_t>
@@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const int i = item_ct1.get_group(2);
     const block_q3_K * x = (const block_q3_K *) vx;
 
-#if QK_K == 256
     const int r = item_ct1.get_local_id(2) / 4;
     const int tid = r/2;
     const int is0 = r%2;
@@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const uint8_t * hm = x[i].hmask;
 
     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }
 
-#if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
         m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif
 
 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
 
     const int i = item_ct1.get_group(2);
 
-#if QK_K == 256
     // assume 32 threads
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/8;
@@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }
 
 template<typename dst_t>
@@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
 
     const int i = item_ct1.get_group(2);
 
-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/16;   // il is in 0...3
@@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }
 
 template<typename dst_t>
@@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = item_ct1.get_group(2);
-#if QK_K == 256
 
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
@@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }
 
 template<typename dst_t>
@@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }
 
 template<typename dst_t>
@@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq2_s * x = (const block_iq2_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 #pragma unroll
-    for (int j = 0; j < 8; ++j)
+    for (int j = 0; j < 8; ++j) {
         y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-
-#endif
-
+    }
 }
 
 template<typename dst_t>
@@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq3_s * x = (const block_iq3_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_s * x = (const block_iq1_s  *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_m * x = (const block_iq1_m  *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
     const int ix =
@@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
         tmp += dall * sum1 - dmin * sum2;
 
     }
-#else
-    const int tid = item_ct1.get_local_id(2) /
-                    (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2) %
-                   (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const sycl::float2 dall =
-            x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x() * sum1 - dall.y() * sum2;
-    }
-
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
-
     const uint16_t kmask1 = 0x0303;
     const uint16_t kmask2 = 0x0f0f;
 
@@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
         tmp += d * sum;
 
     }
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 
     const block_q4_K * x = (const block_q4_K *)vx + ib0;
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 #endif
 
     }
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
                dmin * smin;
     }
 
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
     const block_q6_K * x = (const block_q6_K *)vx + ib0;
 
-#if QK_K == 256
-
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
     const int ix =
@@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
     }
 
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
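This "for (int mask = 16; ...)" loop, which closes each of the dmmv kernels above, performs a butterfly reduction across the 32 lanes of a sub-group: at every step each lane adds the value of the lane whose index differs in one bit (in the SYCL source the exchange is a dpct::permute_sub_group_by_xor call). A plain-C simulation of the pattern:

```c
// After the loop, every element of lane[] holds the sum of all 32 inputs.
static float subgroup_sum32(float lane[32]) {
    for (int mask = 16; mask > 0; mask >>= 1) {
        float prev[32];
        for (int i = 0; i < 32; ++i) prev[i] = lane[i];   // lanes exchange simultaneously
        for (int i = 0; i < 32; ++i) lane[i] = prev[i] + prev[i ^ mask];
    }
    return lane[0];
}
```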
@@ -6857,7 +6586,6 @@ static __dpct_inline__ float
 vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#ifndef GGML_QKK_64
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
     int    v[2];
@@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
     }
 
     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
-    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }
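vec_dot_q4_K_q8_1 and its siblings accumulate with dpct::dp4a, which computes c plus the dot product of the four bytes packed into each 32-bit operand. A scalar reference for the signed case (a sketch of the semantics, not the library implementation):

```c
#include <stdint.h>

static inline int32_t dp4a_s8(int32_t a, int32_t b, int32_t c) {
    for (int k = 0; k < 4; ++k) {
        const int8_t av = (int8_t)((uint32_t)a >> (8 * k));
        const int8_t bv = (int8_t)((uint32_t)b >> (8 * k));
        c += (int32_t)av * (int32_t)bv;
    }
    return c;
}
```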
 
 template <int mmq_y>
@@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
     }
 
 #pragma unroll
@@ -7050,7 +6728,6 @@ static __dpct_inline__ float
 vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#ifndef GGML_QKK_64
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
     int   vl[2];
@@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
     }
 
     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }
 
 template <int mmq_y>
@@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
     }
 
 #pragma unroll
@@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                      const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                      const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                      const uint8_t *kmask_iq2xs) {
-#if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 
 #if QR2_XXS == 8
@@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
     }
     return d * (sumi1 + sumi2);
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
@@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                     const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 
     const int ib32 = iqs;
@@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 
     const int ib32 = iqs;
@@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
     }
     const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
@@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                      const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 
     const int ib32 = iqs;
@@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq3s_grid) {
-#if QK_K == 256
     const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 
     const int ib32 = iqs;
@@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
         (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
         bq8_1[ib32].ds[0];
     return d * sumi;
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq1s_grid_gpu) {
-#if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 
     const int ib32 = iqs;
@@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
     const float d = d1q * bq8_1[ib32].ds[0];
     const float m = d1q * bq8_1[ib32].ds[1];
     return d * sumi + m * delta;
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
 
     const int ib32 = iqs;
@@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7720,7 +7322,6 @@ static __dpct_inline__ float
 vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#if QK_K == 256
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
     const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 
@@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
         sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
     }
     return d * (sumi1 + sumi2);
-#else
-    assert(false);
-#endif
 }
 
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -10203,7 +9801,6 @@ template <typename dst_t>
 static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q2_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
 static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q3_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#endif
 }
 
 template <typename dst_t>
@@ -10320,7 +9889,6 @@ template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q5_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q6_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
@@ -10529,9 +10068,6 @@ template <typename dst_t>
 static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
     const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
-#else
       {
             dpct::has_capability_or_fail(stream->get_device(),
                                          {sycl::aspect::fp16});
@@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                       });
             });
       }
-#endif
 }
 
 
@@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                         const int nrows_y, const int nrows_dst,
                                         dpct::queue_ptr stream) try {
 
-#if QK_K == 256
-
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
             });
         }
     }
-#endif
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__

+ 0 - 12
ggml.c

@@ -871,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     },
     [GGML_TYPE_IQ4_XS] = {
         .type_name                = "iq4_xs",
-#if QK_K == 64
-        .blck_size                = QK4_NL,
-#else
         .blck_size                = QK_K,
-#endif
         .type_size                = sizeof(block_iq4_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
         .from_float               = quantize_row_iq4_xs,
         .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_xs_reference,
         .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
-#if QK_K == 64
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#else
         .vec_dot_type             = GGML_TYPE_Q8_K,
-#endif
         .nrows                    = 1,
     },
     [GGML_TYPE_Q8_K] = {
@@ -22117,11 +22109,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#if QK_K == 64
-        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#else
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#endif
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
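The type_traits hunk above now gives IQ4_XS a fixed 256-wide block and Q8_K as its dot-product companion type, unconditionally. A small sketch of inspecting this, assuming the ggml_internal_get_type_traits accessor exposed by this revision's ggml.h:

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    const ggml_type_traits_t t = ggml_internal_get_type_traits(GGML_TYPE_IQ4_XS);
    printf("iq4_xs: blck_size=%d, vec_dot_type=%s\n",
           t.blck_size, ggml_type_name(t.vec_dot_type));
    return 0;
}
```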

+ 1 - 2
gguf-py/gguf/constants.py

@@ -905,9 +905,8 @@ class GGUFValueType(IntEnum):
             raise ValueError(f"Unknown type: {type(val)}")
 
 
-# Note: Does not support GGML_QKK_64
-QK_K = 256
 # Items here are (block size, type size)
+QK_K = 256
 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F32:     (1, 4),
     GGMLQuantizationType.F16:     (1, 2),
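The "(block size, type size)" pairs turn element counts into byte counts: a row of n elements takes n / block_size * type_size bytes. For example, Q6_K (block 256, 210 bytes) stores a 4096-wide row in 4096/256 * 210 = 3360 bytes. A one-line sketch of that arithmetic:

```c
#include <stddef.h>

// n_elems must be a multiple of blck_size, as ggml requires for k-quants.
static size_t row_size(size_t n_elems, size_t blck_size, size_t type_size) {
    return n_elems / blck_size * type_size;
}
```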

+ 3 - 9
llama.cpp

@@ -26,13 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
 
 #ifdef __has_include
     #if __has_include(<unistd.h>)
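The TODO introduced above points at replacing the hard-coded QK_K with a query into ggml; ggml_blck_size already returns the super-block size for any k-quant type, so one possible resolution (a sketch, not part of this patch) would be:

```c
#include "ggml.h"
#include <assert.h>

static int qk_k_from_ggml(void) {
    const int qk = ggml_blck_size(GGML_TYPE_Q6_K); // 256 for all k-quants after this change
    assert(qk == 256);
    return qk;
}
```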
@@ -14308,8 +14304,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with

Too many files changed in this diff; some files are not shown