@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
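
For reference, a minimal sketch of how a caller might drive the new helpers. This is illustrative only: roundtrip_fp16 is a hypothetical function, and it assumes the ggml_cpu_* conversion routines are declared in ggml-cpu.h alongside the other CPU-backend entry points.

#include <stdlib.h>
#include "ggml.h"      // ggml_fp16_t
#include "ggml-cpu.h"  // assumed home of the ggml_cpu_* conversion declarations

// Hypothetical round trip: fp32 -> fp16 -> fp32 over a buffer of n elements.
static void roundtrip_fp16(const float * src, float * dst, int64_t n) {
    ggml_fp16_t * tmp = malloc(n * sizeof(ggml_fp16_t));
    if (tmp == NULL) {
        return;
    }
    ggml_cpu_fp32_to_fp16(src, tmp, n); // takes the F16C/AVX512 fast paths when compiled in
    ggml_cpu_fp16_to_fp32(tmp, dst, n); // falls back to the scalar loop otherwise
    free(tmp);
}

Note that ggml_cpu_bf16_to_fp32 needs no rounding logic in the widening direction: bf16 is the upper 16 bits of an IEEE-754 binary32 value, so zero-extending each element to 32 bits and shifting left by 16 (the _mm512_slli_epi32/_mm256_slli_epi32 calls above) reconstructs the original float bit pattern exactly.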