@@ -42,18 +42,6 @@ cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
```

-- By default, NNPA is disabled by default. To enable it:
-
- ```bash
- cmake -S . -B build \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_BLAS=ON \
- -DGGML_BLAS_VENDOR=OpenBLAS \
- -DGGML_NNPA=ON
-
- cmake --build build --config Release -j $(nproc)
- ```
-
For debug builds:

```bash
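
With NNPA gone, the only accelerated CPU build path documented here is the OpenBLAS one. For reference, a minimal sketch of the configure-and-build commands that remain after this hunk (flags copied from the surrounding diff; adjust paths to your checkout):

```bash
# Configure an OpenBLAS-accelerated Release build (VXE stays on by default)
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS

# Build with all available cores
cmake --build build --config Release -j $(nproc)
```
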
@@ -164,15 +152,11 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.

-### 2. NNPA Vector Intrinsics Acceleration
-
-Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
-
-### 3. zDNN Accelerator (WIP)
+### 2. zDNN Accelerator (WIP)

Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.

-### 4. Spyre Accelerator
+### 3. Spyre Accelerator

_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._

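Whether the VXE/VXE2 path can actually engage depends on the machine generation described above. A quick way to check before building (assuming the standard Linux on Z `/proc/cpuinfo` feature flags; exact names may vary by kernel version):

```bash
# Print the vector-related facility flags the kernel reports (e.g. vx, vxd, vxe, vxe2)
grep -m1 '^features' /proc/cpuinfo | tr ' ' '\n' | grep '^vx'
```
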
@@ -230,10 +214,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```

-5. `-DGGML_NNPA=ON` generates gibberish output
-
- Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
-
## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**
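
The hunk headers above quote the requirement that "All models need to be converted to Big-Endian" on this platform. As a sketch of one conversion route (script location assumed from the gguf-py package layout in this repository; verify against your checkout):

```bash
# Byte-swap a little-endian GGUF model to big-endian in place
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model.gguf BIG
```
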
@@ -258,38 +238,38 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

## Appendix B: SIMD Support Matrix

-| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
-| ---------- | ----------- | ---- | ---- | ----- |
-| FP32 | ✅ | ✅ | ✅ | ❓ |
-| FP16 | ✅ | ✅ | ❓ | ❓ |
-| BF16 | 🚫 | 🚫 | ❓ | ❓ |
-| Q4_0 | ✅ | ✅ | ❓ | ❓ |
-| Q4_1 | ✅ | ✅ | ❓ | ❓ |
-| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
-| Q5_0 | ✅ | ✅ | ❓ | ❓ |
-| Q5_1 | ✅ | ✅ | ❓ | ❓ |
-| Q8_0 | ✅ | ✅ | ❓ | ❓ |
-| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
-| Q3_K | ✅ | ✅ | ❓ | ❓ |
-| Q4_K | ✅ | ✅ | ❓ | ❓ |
-| Q5_K | ✅ | ✅ | ❓ | ❓ |
-| Q6_K | ✅ | ✅ | ❓ | ❓ |
-| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
-| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
-| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
-| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
-| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
-| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
-| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
-| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
-| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
-| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
-| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
-| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
-| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
+| | VX/VXE/VXE2 | zDNN | Spyre |
+|------------|-------------|------|-------|
+| FP32 | ✅ | ✅ | ❓ |
+| FP16 | ✅ | ❓ | ❓ |
+| BF16 | 🚫 | ❓ | ❓ |
+| Q4_0 | ✅ | ❓ | ❓ |
+| Q4_1 | ✅ | ❓ | ❓ |
+| MXFP4 | 🚫 | ❓ | ❓ |
+| Q5_0 | ✅ | ❓ | ❓ |
+| Q5_1 | ✅ | ❓ | ❓ |
+| Q8_0 | ✅ | ❓ | ❓ |
+| Q2_K | 🚫 | ❓ | ❓ |
+| Q3_K | ✅ | ❓ | ❓ |
+| Q4_K | ✅ | ❓ | ❓ |
+| Q5_K | ✅ | ❓ | ❓ |
+| Q6_K | ✅ | ❓ | ❓ |
+| TQ1_0 | 🚫 | ❓ | ❓ |
+| TQ2_0 | 🚫 | ❓ | ❓ |
+| IQ2_XXS | 🚫 | ❓ | ❓ |
+| IQ2_XS | 🚫 | ❓ | ❓ |
+| IQ2_S | 🚫 | ❓ | ❓ |
+| IQ3_XXS | 🚫 | ❓ | ❓ |
+| IQ3_S | 🚫 | ❓ | ❓ |
+| IQ1_S | 🚫 | ❓ | ❓ |
+| IQ1_M | 🚫 | ❓ | ❓ |
+| IQ4_NL | ✅ | ❓ | ❓ |
+| IQ4_XS | ✅ | ❓ | ❓ |
+| FP32->FP16 | 🚫 | ❓ | ❓ |
+| FP16->FP32 | 🚫 | ❓ | ❓ |

✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

-Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
+Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 6, 2025.
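
The matrix above records which data types have SIMD kernels, not how much they help. One way to spot-check the effect on your own machine is to benchmark a converted model with the bundled `llama-bench` tool (binary path assumed from a default CMake build; the model name is a placeholder):

```bash
# Measure prompt-processing and generation throughput for one model
./build/bin/llama-bench -m /path/to/model-be.gguf -t $(nproc)
```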