1 year ago · c6b395535a
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -19,6 +19,7 @@ extern "C" {
 
															 // fall back to the _Static_assert C11 keyword.
														
 
															 // if C99 - static_assert is noop
														
 
															 // ref: https://stackoverflow.com/a/53923785/4039976
														
 
															+#ifndef __cplusplus
														
 
															 #ifndef static_assert
														
 
															 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
														
 
															 #define static_assert(cond, msg) _Static_assert(cond, msg)
														
@@ -26,6 +27,7 @@ extern "C" {
 
															 #define static_assert(cond, msg) struct global_scope_noop_trick
														
 
															 #endif
														
 
															 #endif
														
 
															+#endif
														
 
															 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
														
 
															 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
														
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -191,70 +191,74 @@ typedef struct {
 
															 } block_iq3_xxs;
														
 
															 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
														
 
															+#ifdef __cplusplus
														
 
															+extern "C" {
														
 
															+#endif
														
 
															+
														
 
															 // Quantization
														
 
															-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
														
 
															-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
														
 
															-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
														
 
															-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
														
 
															-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
														
 
															-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
														
 
															-
														
 
															-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
														
 
															-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
														
 
															-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
														
 
															-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
														
 
															-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
														
 
															-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
														
 
															-void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);
														
 
															-
														
 
															-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
														
 
															-
														
 
															-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
														
 
															-void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);
														
 
															+void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
														
 
															+
														
 
															+void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
														
 
															+
														
 
															+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+
														
 
															+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															+void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
														
 
															 // Dequantization
														
 
															-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
														
 
															-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
														
 
															-
														
 
															-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_iq2_xs (const block_iq2_xs  * restrict x, float * restrict y, int k);
														
 
															-void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);
														
 
															+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+
														
 
															+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
														
 
															 // Dot product
														
 
															-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-
														
 
															-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
														
 
															+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+
														
 
															+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
														
 
															 //
														
 
															 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
														
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
 
															 void iq2xs_free_impl(int grid_size);
														
 
															 void iq3xs_init_impl(int grid_size);
														
 
															 void iq3xs_free_impl(int grid_size);
														
 
															+
														
 
															+#ifdef __cplusplus
														
 
															+}
														
 
															+#endif
														
 
															+