10 ماه پیش · 5bbe6a9fe9
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -7,6 +7,7 @@
 
				 #include <cstdio>
			
 
				 #include <fstream>
			
 
				 #include <thread>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
			
 
				                               std::vector<llama_token> & inp, int nnew, bool print_progress) {
			
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -4,6 +4,7 @@
 
				 
			
 
				 #include <cmath>
			
 
				 #include <unordered_map>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 // the ring buffer works similarly to std::deque, but with a fixed capacity
			
 
				 // TODO: deduplicate with llama-impl.h
			
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -5,6 +5,7 @@
 
				 #include "sampling.h"
			
 
				 
			
 
				 #include <cstring>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
			
 
				 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
			
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -4,6 +4,7 @@
 
				 #include "llama.h"
			
 
				 
			
 
				 #include <ctime>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 #if defined(_MSC_VER)
			
 
				 #pragma warning(disable: 4244 4267) // possible loss of data
			
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -7,6 +7,7 @@
 
				 #include <cstdio>
			
 
				 #include <string>
			
 
				 #include <vector>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 struct ngram_data {
			
 
				     bool active = false;
			
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -12,6 +12,7 @@
 
				 #include <string>
			
 
				 #include <vector>
			
 
				 #include <ctime>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 // trim whitespace from the beginning and end of a string
			
 
				 static std::string trim(const std::string & str) {
			
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -7,6 +7,7 @@
 
				 #include <cstdio>
			
 
				 #include <string>
			
 
				 #include <vector>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 static void print_usage(int, char ** argv) {
			
 
				     LOG("\nexample usage:\n");
			
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,6 +8,7 @@
 
				 #include <unordered_map>
			
 
				 #include <fstream>
			
 
				 #include <cmath>
			
 
				+#include <cctype>
			
 
				 
			
 
				 struct quant_option {
			
 
				     std::string name;
			
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -1,3 +1,5 @@
 
				+#define _USE_MATH_DEFINES // For M_PI on MSVC
			
 
				+
			
 
				 #include "arg.h"
			
 
				 #include "common.h"
			
 
				 #include "sampling.h"
			
@@ -5,8 +7,6 @@
 
				 #include "llama.h"
			
 
				 #include "json.hpp"
			
 
				 
			
 
				-#define _USE_MATH_DEFINES // For M_PI on MSVC
			
 
				-
			
 
				 #include <algorithm>
			
 
				 #include <cmath>
			
 
				 #include <cstdio>
			
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2140,7 +2140,11 @@ extern "C" {
 
				 #        define GGML_RESTRICT
			
 
				 #    endif
			
 
				 #else
			
 
				-#    define GGML_RESTRICT restrict
			
 
				+#    if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
			
 
				+#        define GGML_RESTRICT __restrict
			
 
				+#    else
			
 
				+#        define GGML_RESTRICT restrict
			
 
				+#    endif
			
 
				 #endif
			
 
				     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
			
 
				     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
			
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -8,6 +8,7 @@
 
				 #include <string>
			
 
				 #include <type_traits>
			
 
				 #include <vector>
			
 
				+#include <cctype>
			
 
				 
			
 
				 #ifdef _WIN32
			
 
				 #    define WIN32_LEAN_AND_MEAN
			
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -21,6 +21,7 @@
 
				 #include <string.h>
			
 
				 #include <string>
			
 
				 #include <vector>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 #ifdef __APPLE__
			
 
				 #include <sys/types.h>
			
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -247,9 +247,9 @@ typedef pthread_t ggml_thread_t;
 
				 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
			
 
				 
			
 
				 
			
 
				-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
			
 
				-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
			
 
				-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
			
 
				+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
			
 
				+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
			
 
				+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
			
 
				 
			
 
				 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
			
 
				     [GGML_TYPE_F32] = {
			
@@ -1451,7 +1451,7 @@ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp
 
				     }
			
 
				 }
			
 
				 
			
 
				-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
			
 
				+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
			
 
				    assert(nrc == 1);
			
 
				    UNUSED(nrc);
			
 
				    UNUSED(bx);
			
@@ -1494,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
 
				     *s = sumf;
			
 
				 }
			
 
				 
			
 
				-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
			
 
				+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
			
 
				     assert(nrc == 1);
			
 
				     UNUSED(nrc);
			
 
				     UNUSED(bx);
			
@@ -1562,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
 
				     *s = sumf;
			
 
				 }
			
 
				 
			
 
				-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
			
 
				+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
			
 
				     assert(nrc == 1);
			
 
				     UNUSED(nrc);
			
 
				     UNUSED(bx);
			
@@ -1606,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t *
 
				 
			
 
				 // compute GGML_VEC_DOT_UNROLL dot products at once
			
 
				 // xs - x row stride in bytes
			
 
				-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
			
 
				+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
			
 
				     ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
			
 
				 
			
 
				-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
			
 
				+    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
			
 
				 
			
 
				     for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
			
 
				         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
			
@@ -1659,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
 
				     }
			
 
				 }
			
 
				 
			
 
				-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
			
 
				+inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
			
 
				 #if defined(GGML_SIMD)
			
 
				     const int np = (n & ~(GGML_F32_STEP - 1));
			
 
				 
			
@@ -1690,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
			
 
				+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
			
 
				 #if defined(GGML_SIMD)
			
 
				     const int np = (n & ~(GGML_F16_STEP - 1));
			
 
				 
			
@@ -1722,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
 
				 }
			
 
				 
			
 
				 // xs and vs are byte strides of x and v
			
 
				-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
			
 
				+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
			
 
				 
			
 
				-    const float * restrict x[GGML_VEC_MAD_UNROLL];
			
 
				-    const float * restrict v[GGML_VEC_MAD_UNROLL];
			
 
				+    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
			
 
				+    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
			
 
				 
			
 
				     for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
			
 
				         x[i] = (const float *) ((const char *) xv + i*xs);
			
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -28,7 +28,7 @@
 
				 #define UNUSED GGML_UNUSED
			
 
				 
			
 
				 // reference implementation for deterministic creation of model files
			
 
				-void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
			
 
				+void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
			
 
				     static const int qk = QK4_0;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
			
 
				+void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) {
			
 
				     const int qk = QK4_1;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
			
 
				+void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) {
			
 
				     static const int qk = QK5_0;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
			
 
				+void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) {
			
 
				     const int qk = QK5_1;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in
 
				 }
			
 
				 
			
 
				 // reference implementation for deterministic creation of model files
			
 
				-void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
			
 
				+void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK8_0 == 0);
			
 
				     const int nb = k / QK8_0;
			
 
				 
			
@@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in
 
				 }
			
 
				 
			
 
				 // reference implementation for deterministic creation of model files
			
 
				-void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
			
 
				+void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(QK8_1 == 32);
			
 
				     assert(k % QK8_1 == 0);
			
 
				     const int nb = k / QK8_1;
			
@@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     static const int qk = QK4_0;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     static const int qk = QK4_1;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     static const int qk = QK5_0;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     static const int qk = QK5_1;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     static const int qk = QK8_0;
			
 
				 
			
 
				     assert(k % qk == 0);
			
@@ -376,8 +376,8 @@ static inline int nearest_int(float fval) {
 
				     return (i & 0x007fffff) - 0x00400000;
			
 
				 }
			
 
				 
			
 
				-static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
			
 
				-        const float * restrict qw) {
			
 
				+static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
			
 
				+        const float * GGML_RESTRICT qw) {
			
 
				     float max = 0;
			
 
				     float amax = 0;
			
 
				     for (int i = 0; i < n; ++i) {
			
@@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
 
				     return scale;
			
 
				 }
			
 
				 
			
 
				-static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
			
 
				+static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
			
 
				     float max = 0;
			
 
				     float amax = 0;
			
 
				     for (int i = 0; i < n; ++i) {
			
@@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
 
				     return 1/iscale;
			
 
				 }
			
 
				 
			
 
				-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
			
 
				+static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
			
 
				         int ntry, float alpha) {
			
 
				     float min = x[0];
			
 
				     float max = x[0];
			
@@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
 
				     return scale;
			
 
				 }
			
 
				 
			
 
				-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
			
 
				-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
			
 
				+static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
			
 
				+        uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
			
 
				         float rmin, float rdelta, int nstep, bool use_mad) {
			
 
				     float min = x[0];
			
 
				     float max = x[0];
			
@@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
 
				     return scale;
			
 
				 }
			
 
				 
			
 
				-static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
			
 
				+static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
			
 
				     if (j < 4) {
			
 
				         *d = q[j] & 63; *m = q[j + 4] & 63;
			
 
				     } else {
			
@@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
 
				 
			
 
				 //========================- 2-bit (de)-quantization
			
 
				 
			
 
				-void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
			
 
				+void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int nb = k / QK_K;
			
 
				 
			
@@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int nb = k / QK_K;
			
 
				 
			
@@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
			
 
				-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
			
 
				+static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
			
 
				+        uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
			
 
				         float rmin, float rdelta, int nstep, bool use_mad) {
			
 
				     float min = x[0];
			
 
				     float max = x[0];
			
@@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
 
				     return scale;
			
 
				 }
			
 
				 
			
 
				-static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
			
 
				+static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) {
			
 
				     float max = 0;
			
 
				     for (int i = 0; i < n; ++i) {
			
 
				         max = MAX(max, x[i]);
			
@@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
 
				     return sumlx/suml2;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
			
 
				+static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) {
			
 
				     GGML_ASSERT(quant_weights);
			
 
				     assert(k % QK_K == 0);
			
 
				     const int nb = k / QK_K;
			
@@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
 
				         for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
			
 
				         float sigma2 = sumx2/QK_K;
			
 
				         for (int j = 0; j < QK_K/16; ++j) {
			
 
				-            const float * restrict qw = quant_weights + QK_K * i + 16*j;
			
 
				+            const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
			
 
				             for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
			
 
				             for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
			
 
				             scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
			
@@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
			
@@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
 
				 
			
 
				 //========================= 3-bit (de)-quantization
			
 
				 
			
 
				-void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
			
 
				+void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int nb = k / QK_K;
			
 
				 
			
@@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int nb = k / QK_K;
			
 
				 
			
@@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
 
				 
			
 
				         const float d_all = GGML_FP16_TO_FP32(x[i].d);
			
 
				 
			
 
				-        const uint8_t * restrict q = x[i].qs;
			
 
				-        const uint8_t * restrict hm = x[i].hmask;
			
 
				+        const uint8_t * GGML_RESTRICT q = x[i].qs;
			
 
				+        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
			
 
				         uint8_t m = 1;
			
 
				 
			
 
				         memcpy(aux, x[i].scales, 12);
			
@@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
			
 
				+static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) {
			
 
				     assert(n_per_row % QK_K == 0);
			
 
				     const int nb = n_per_row / QK_K;
			
 
				 
			
@@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
			
@@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
 
				 
			
 
				 // ====================== 4-bit (de)-quantization
			
 
				 
			
 
				-void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
			
 
				+void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int nb = k / QK_K;
			
 
				 
			
@@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int nb = k / QK_K;
			
 
				 
			
@@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
			
 
				+static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
			
 
				     assert(n_per_row % QK_K == 0);
			
 
				     const int64_t nb = n_per_row / QK_K;
			
 
				 
			
@@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
			
@@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
 
				 
			
 
				 // ====================== 5-bit (de)-quantization
			
 
				 
			
 
				-void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
			
 
				+void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
 
				             }
			
 
				         }
			
 
				 
			
 
				-        uint8_t * restrict qh = y[i].qh;
			
 
				-        uint8_t * restrict ql = y[i].qs;
			
 
				+        uint8_t * GGML_RESTRICT qh = y[i].qh;
			
 
				+        uint8_t * GGML_RESTRICT ql = y[i].qs;
			
 
				         memset(qh, 0, QK_K/8);
			
 
				 
			
 
				         uint8_t m1 = 1, m2 = 2;
			
@@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
			
 
				+static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
			
 
				     assert(n_per_row % QK_K == 0);
			
 
				     const int64_t nb = n_per_row / QK_K;
			
 
				 
			
@@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
 
				             }
			
 
				         }
			
 
				 
			
 
				-        uint8_t * restrict qh = y[i].qh;
			
 
				-        uint8_t * restrict ql = y[i].qs;
			
 
				+        uint8_t * GGML_RESTRICT qh = y[i].qh;
			
 
				+        uint8_t * GGML_RESTRICT ql = y[i].qs;
			
 
				         memset(qh, 0, QK_K/8);
			
 
				 
			
 
				         uint8_t m1 = 1, m2 = 2;
			
@@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
			
@@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
 
				 
			
 
				 // ====================== 6-bit (de)-quantization
			
 
				 
			
 
				-void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
			
 
				+void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
 
				             }
			
 
				         }
			
 
				 
			
 
				-        uint8_t * restrict ql = y[i].ql;
			
 
				-        uint8_t * restrict qh = y[i].qh;
			
 
				+        uint8_t * GGML_RESTRICT ql = y[i].ql;
			
 
				+        uint8_t * GGML_RESTRICT qh = y[i].qh;
			
 
				         for (int j = 0; j < QK_K; j += 128) {
			
 
				             for (int l = 0; l < 32; ++l) {
			
 
				                 const uint8_t q1 = L[j + l +  0] & 0xF;
			
@@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
 
				     for (int i = 0; i < nb; i++) {
			
 
				         const float d = GGML_FP16_TO_FP32(x[i].d);
			
 
				 
			
 
				-        const uint8_t * restrict ql = x[i].ql;
			
 
				-        const uint8_t * restrict qh = x[i].qh;
			
 
				-        const int8_t  * restrict sc = x[i].scales;
			
 
				+        const uint8_t * GGML_RESTRICT ql = x[i].ql;
			
 
				+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
			
 
				+        const int8_t  * GGML_RESTRICT sc = x[i].scales;
			
 
				 
			
 
				         for (int n = 0; n < QK_K; n += 128) {
			
 
				             for (int l = 0; l < 32; ++l) {
			
@@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
 
				     }
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
			
 
				+static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
			
 
				     assert(n_per_row % QK_K == 0);
			
 
				     const int64_t nb = n_per_row / QK_K;
			
 
				 
			
@@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
 
				             }
			
 
				         }
			
 
				 
			
 
				-        uint8_t * restrict ql = y[i].ql;
			
 
				-        uint8_t * restrict qh = y[i].qh;
			
 
				+        uint8_t * GGML_RESTRICT ql = y[i].ql;
			
 
				+        uint8_t * GGML_RESTRICT qh = y[i].qh;
			
 
				         for (int j = 0; j < QK_K; j += 128) {
			
 
				             for (int l = 0; l < 32; ++l) {
			
 
				                 const uint8_t q1 = L[j + l +  0] & 0xF;
			
@@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
			
@@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
 
				     return nrow * row_size;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
			
 
				+static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
			
 
				     static_assert(QK4_0 == 32, "QK4_0 must be 32");
			
 
				 
			
 
				     if (!quant_weights) {
			
@@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
			
 
				         return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
			
@@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr
 
				     return nrow * row_size;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
			
 
				+static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
			
 
				     static_assert(QK4_1 == 32, "QK4_1 must be 32");
			
 
				 
			
 
				     if (!quant_weights) {
			
@@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
			
 
				         return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
			
@@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr
 
				     return nrow * row_size;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
			
 
				+static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
			
 
				     static_assert(QK5_0 == 32, "QK5_0 must be 32");
			
 
				 
			
 
				     if (!quant_weights) {
			
@@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
			
 
				         return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
			
@@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr
 
				     return nrow * row_size;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
			
 
				+static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
			
 
				     static_assert(QK5_1 == 32, "QK5_1 must be 32");
			
 
				 
			
 
				     if (!quant_weights) {
			
@@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     if (!quant_weights) {
			
 
				         quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
			
 
				         return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
			
@@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
 
				     return nrow * row_size;
			
 
				 }
			
 
				 
			
 
				-size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     (void)quant_weights; // not used
			
 
				     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
			
 
				     quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
			
@@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
 
				 
			
 
				 // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
			
 
				 
			
 
				-void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
			
 
				+void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
 
				     }
			
 
				 }
			
 
				 
			
 
				-void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
			
 
				+void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     (void)quant_weights; // not used
			
 
				     const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
			
 
				     quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
			
 
				     return nrow * row_size;
			
 
				 }
			
 
				 
			
 
				-size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     (void)quant_weights; // not used
			
 
				     const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
			
 
				     quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
			
 
				     return nrow * row_size;
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in
 
				 
			
 
				 // ====================== "True" 2-bit (de)-quantization
			
 
				 
			
 
				-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
 
				 
			
 
				 // ====================== 2.3125 bpw (de)-quantization
			
 
				 
			
 
				-void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
 
				 
			
 
				 // ====================== 2.5625 bpw (de)-quantization
			
 
				 
			
 
				-void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
 
				 
			
 
				 // ====================== 3.0625 bpw (de)-quantization
			
 
				 
			
 
				-void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
 
				 
			
 
				 // ====================== 3.3125 bpw (de)-quantization
			
 
				 
			
 
				-void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
 
				 
			
 
				 // ====================== 1.5625 bpw (de)-quantization
			
 
				 
			
 
				-void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
 
				 
			
 
				 static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
			
 
				 
			
 
				-void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK4_NL == 0);
			
 
				     const int64_t nb = k / QK4_NL;
			
 
				 
			
@@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
 
				 
			
 
				 //===================================== Q8_K ==============================================
			
 
				 
			
 
				-void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
			
 
				+void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in
 
				     }
			
 
				 }
			
 
				 
			
 
				-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
			
 
				+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     const int64_t nb = k / QK_K;
			
 
				 
			
@@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) {
 
				     }
			
 
				 }
			
 
				 
			
 
				-static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
			
 
				-        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
			
 
				+static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
			
 
				+        const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
			
 
				     int num_neighbors = neighbours[0];
			
 
				     GGML_ASSERT(num_neighbors > 0);
			
 
				     float best_d2 = FLT_MAX;
			
@@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
 
				     return grid_index;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
			
 
				+static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
			
 
				 
			
 
				     const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
			
 
				 
			
@@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
 
				     }
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
			
 
				+static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
			
 
				 
			
 
				     const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
			
 
				 
			
@@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     int64_t nblock = n_per_row/QK_K;
			
 
				     char * qrow = (char *)dst;
			
@@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
 
				     return nrow * nblock * sizeof(block_iq2_xxs);
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     int64_t nblock = n_per_row/QK_K;
			
 
				     char * qrow = (char *)dst;
			
@@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) {
 
				     }
			
 
				 }
			
 
				 
			
 
				-static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
			
 
				-        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
			
 
				+static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid,
			
 
				+        const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
			
 
				     int num_neighbors = neighbours[0];
			
 
				     GGML_ASSERT(num_neighbors > 0);
			
 
				     float best_d2 = FLT_MAX;
			
@@ -3545,8 +3545,8 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
 
				     return grid_index;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
			
 
				-        const float * restrict quant_weights) {
			
 
				+static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n,
			
 
				+        const float * GGML_RESTRICT quant_weights) {
			
 
				 
			
 
				     const int gindex = iq3_data_index(grid_size);
			
 
				 
			
@@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     int64_t nblock = n_per_row/QK_K;
			
 
				     char * qrow = (char *)dst;
			
@@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
 
				     return nrow * nblock * sizeof(block_iq3_xxs);
			
 
				 }
			
 
				 
			
 
				-void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
			
 
				+void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
			
 
				-        const float * restrict quant_weights,
			
 
				+static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n,
			
 
				+        const float * GGML_RESTRICT quant_weights,
			
 
				         float   * scales,
			
 
				         float   * weight,
			
 
				         float   * xval,
			
@@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
 
				 }
			
 
				 
			
 
				 #define IQ3S_BLOCK_SIZE 32
			
 
				-size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     int64_t nblock = n_per_row/QK_K;
			
 
				     float scales[QK_K/IQ3S_BLOCK_SIZE];
			
@@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
 
				     return nrow * nblock * sizeof(block_iq3_s);
			
 
				 }
			
 
				 
			
 
				-void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
			
 
				+void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     quantize_iq3_s(x, y, 1, k, NULL);
			
 
				 }
			
@@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y,
 
				 
			
 
				 // =================================== 1.5 bpw ===================================================
			
 
				 
			
 
				-static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
			
 
				-        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
			
 
				+static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
			
 
				+        const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) {
			
 
				     int num_neighbors = neighbours[0];
			
 
				     GGML_ASSERT(num_neighbors > 0);
			
 
				     float best_score = -FLT_MAX;
			
@@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
 
				     return grid_index;
			
 
				 }
			
 
				 
			
 
				-static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
			
 
				-        const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
			
 
				+static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
			
 
				+        const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) {
			
 
				     int num_neighbors = neighbours[0];
			
 
				     GGML_ASSERT(num_neighbors > 0);
			
 
				     float best_score = FLT_MAX;
			
@@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
 
				 
			
 
				 #define IQ1S_BLOCK_SIZE 32
			
 
				 #define IQ1M_BLOCK_SIZE 16
			
 
				-static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
			
 
				+static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
			
 
				         float    * scales,
			
 
				         float    * weight,
			
 
				         float    * sumx,
			
@@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     float  scales[QK_K/IQ1S_BLOCK_SIZE];
			
 
				     float  weight[IQ1S_BLOCK_SIZE];
			
@@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n
 
				     return nrow * nblock * sizeof(block_iq1_s);
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
			
 
				+static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
			
 
				         float    * scales,
			
 
				         float    * weight,
			
 
				         float    * pairs,
			
@@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     float  scales[QK_K/IQ1M_BLOCK_SIZE];
			
 
				     float  weight[IQ1M_BLOCK_SIZE];
			
@@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
 
				     return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
			
 
				 }
			
 
				 
			
 
				-static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
			
 
				+static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
			
 
				         ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
			
 
				         float * scales, float * weight, uint8_t * L,
			
 
				         const int8_t * values,
			
@@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK4_NL == 0);
			
 
				     int64_t nblock = n_per_row/QK4_NL;
			
 
				     char * qrow = (char *)dst;
			
@@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
 
				     return nrow * nblock * sizeof(block_iq4_nl);
			
 
				 }
			
 
				 
			
 
				-//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
			
 
				-void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
			
 
				+//void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
			
 
				+void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) {
			
 
				     GGML_ASSERT(k%QK4_NL == 0);
			
 
				     int64_t nblock = k/QK4_NL;
			
 
				     uint8_t L[QK4_NL];
			
@@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     int64_t nblock = n_per_row/QK_K;
			
 
				     char * qrow = (char *)dst;
			
@@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
 
				     return nrow * nblock * sizeof(block_iq4_xs);
			
 
				 }
			
 
				 
			
 
				-void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
			
 
				+void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     quantize_iq4_xs(x, y, 1, k, NULL);
			
 
				 }
			
 
				 
			
 
				 // =============================== 2.5625 bpw
			
 
				 
			
 
				-static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
			
 
				+static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
			
 
				 
			
 
				     const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
			
 
				 
			
@@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
 
				     }
			
 
				 }
			
 
				 
			
 
				-size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				+size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
			
 
				     GGML_ASSERT(n_per_row%QK_K == 0);
			
 
				     int64_t nblock = n_per_row/QK_K;
			
 
				     char * qrow = (char *)dst;
			
@@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
 
				     return nrow * nblock * sizeof(block_iq2_s);
			
 
				 }
			
 
				 
			
 
				-void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
			
 
				+void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
			
 
				     assert(k % QK_K == 0);
			
 
				     quantize_iq2_s(x, y, 1, k, NULL);
			
 
				 }
			
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -565,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
 
				 #endif
			
 
				 
			
 
				 }
			
 
				-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
			
 
				-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
			
 
				-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
			
 
				+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
			
 
				+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
			
 
				+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
			
 
				 
			
 
				 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
			
 
				     [GGML_TYPE_I8] = {
			
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -4,6 +4,7 @@
 
				 
			
 
				 #include <map>
			
 
				 #include <sstream>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 #if __cplusplus >= 202000L
			
 
				     #define LU8(x) (const char*)(u8##x)
			
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -6,6 +6,7 @@
 
				 
			
 
				 #include <set>
			
 
				 #include <vector>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 struct llama_kv_cell {
			
 
				     llama_pos pos   = -1;
			
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -8,6 +8,7 @@
 
				 #include <climits>
			
 
				 #include <stdexcept>
			
 
				 #include <cerrno>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 #ifdef __has_include
			
 
				     #if __has_include(<unistd.h>)
			
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -16,6 +16,7 @@
 
				 #include <queue>
			
 
				 #include <set>
			
 
				 #include <unordered_map>
			
 
				+#include <cctype>
			
 
				 
			
 
				 //
			
 
				 // helpers