|
@@ -191,70 +191,74 @@ typedef struct {
|
|
|
} block_iq3_xxs;
|
|
} block_iq3_xxs;
|
|
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
|
|
|
|
|
|
|
|
|
+#ifdef __cplusplus
|
|
|
|
|
+extern "C" {
|
|
|
|
|
+#endif
|
|
|
|
|
+
|
|
|
// Quantization
|
|
// Quantization
|
|
|
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
|
|
|
|
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
|
|
|
|
|
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
|
|
|
|
|
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
|
|
|
|
|
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
|
|
|
|
|
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
|
|
|
|
|
-
|
|
|
|
|
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
|
|
|
|
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
|
|
|
|
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
|
|
|
|
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
|
|
|
|
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
|
|
|
|
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
|
|
|
|
-void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);
|
|
|
|
|
-
|
|
|
|
|
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-
|
|
|
|
|
-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
-void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
|
|
+void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
|
|
|
|
|
+
|
|
|
|
|
+void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
|
|
|
|
+
|
|
|
|
|
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+
|
|
|
|
|
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
+void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
|
|
|
|
// Dequantization
|
|
// Dequantization
|
|
|
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
|
|
|
|
|
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
|
|
|
|
|
-
|
|
|
|
|
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
|
|
|
|
|
-void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);
|
|
|
|
|
|
|
+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+
|
|
|
|
|
+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
|
|
|
|
// Dot product
|
|
// Dot product
|
|
|
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-
|
|
|
|
|
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
|
|
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+
|
|
|
|
|
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
|
|
|
|
|
|
|
//
|
|
//
|
|
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
|
|
|
void iq2xs_free_impl(int grid_size);
|
|
void iq2xs_free_impl(int grid_size);
|
|
|
void iq3xs_init_impl(int grid_size);
|
|
void iq3xs_init_impl(int grid_size);
|
|
|
void iq3xs_free_impl(int grid_size);
|
|
void iq3xs_free_impl(int grid_size);
|
|
|
|
|
+
|
|
|
|
|
+#ifdef __cplusplus
|
|
|
|
|
+}
|
|
|
|
|
+#endif
|
|
|
|
|
+
|