|
|
@@ -48,11 +48,11 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
|
|
|
};
|
|
|
|
|
|
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
|
|
|
- return { block_index * (traits::qk / traits::qr), 0 };
|
|
|
+ return { block_index * (QK4_0 / QR4_0), 0 };
|
|
|
}
|
|
|
|
|
|
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
|
- return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
|
|
|
+ return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
|
|
|
}
|
|
|
|
|
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
|
@@ -71,14 +71,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
|
|
|
}
|
|
|
|
|
|
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
|
- auto nblocks = (nrows * (ncols / traits::qk));
|
|
|
- return { nblocks * (QK_K / 2),
|
|
|
+ auto nblocks = (nrows * (ncols / QK_K));
|
|
|
+ return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE),
|
|
|
(nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
|
|
|
}
|
|
|
|
|
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
|
-
|
|
|
- constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
|
|
|
};
|
|
|
|
|
|
template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
|
|
@@ -90,22 +88,23 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
|
|
};
|
|
|
|
|
|
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
|
|
|
- auto low_bits_index = block_index * (traits::qk / traits::qr);
|
|
|
+ auto low_bits_index = block_index * (QK_K / QR6_K);
|
|
|
// the high bits are stored after all the low bits
|
|
|
auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
|
|
|
return { low_bits_index, high_bits_index };
|
|
|
}
|
|
|
|
|
|
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
|
- auto nblocks = (nrows * (ncols / traits::qk));
|
|
|
+ auto nblocks = (nrows * (ncols / QK_K));
|
|
|
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
|
|
|
auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
|
|
|
- auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
|
|
|
+ auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
|
|
|
return { block_scales, sb_scale };
|
|
|
}
|
|
|
|
|
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
|
};
|
|
|
+
|
|
|
} // namespace ggml_sycl_reordered
|
|
|
|
|
|
#endif // GGML_SYCL_QUANTS_HPP
|