1 vuosi sitten · 605fa66c50
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 
				 
			
 
				 ## News
			
 
				 
			
 
				+- 2024.11
			
 
				+  - Support F16 and F32 data type model for Ascend 310P NPU.
			
 
				 - 2024.8
			
 
				   - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
			
 
				 - 2024.7
			
@@ -43,6 +45,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 
				 | Ascend NPU                    | Status  |
			
 
				 |:-----------------------------:|:-------:|
			
 
				 | Atlas 300T A2                 | Support |
			
 
				+| Atlas 300I Duo                | Support |
			
 
				 
			
 
				 *Notes:*
			
 
				 
			
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
 
				     detect_ascend_soc_type(SOC_VERSION)
			
 
				     set(SOC_TYPE "${SOC_VERSION}")
			
 
				     message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
			
 
				-else()
			
 
				-    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
			
 
				 endif()
			
 
				 
			
 
				-# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
			
 
				+string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
			
 
				+
			
 
				+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
			
 
				 string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
			
 
				 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
			
 
				+string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
			
 
				 
			
 
				 if (CANN_INSTALL_DIR)
			
 
				     # Only Support Linux.
			
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC
 
				     ${SRC_FILES}
			
 
				 )
			
 
				 
			
 
				-message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
			
 
				+message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
			
 
				 ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
			
 
				 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
			
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -20,7 +20,6 @@ class DupByRows {
 
				         // Input has four dims.
			
 
				         int64_t op_block_num = GetBlockNum();
			
 
				         int64_t op_block_idx = GetBlockIdx();
			
 
				-        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
			
 
				 
			
 
				         // param
			
 
				         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
			
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -2,6 +2,15 @@
 
				 
			
 
				 // optimize me. Use template to avoid copy code.
			
 
				 using namespace AscendC;
			
 
				+#ifdef ASCEND_310P // 310P not support 4bit get row
			
 
				+    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
			
 
				+        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
			
 
				+        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
			
 
				+        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
			
 
				+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
			
 
				+        printf("Ascend310P not support 4bit get row.\n");
			
 
				+    }
			
 
				+#else
			
 
				 
			
 
				 #define BUFFER_NUM 2
			
 
				 
			
@@ -110,12 +119,9 @@ class GET_ROW_Q4_0 {
 
				         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
			
 
				 
			
 
				         // TODO: cast more data to speed up.
			
 
				-#ifdef ASCEND_310P
			
 
				-        // TODO: 310P support quantification
			
 
				-#else
			
 
				         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
			
 
				         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
			
 
				-#endif
			
 
				+
			
 
				         // Only mul need compile by group.
			
 
				         half scale = scale_gm.GetValue(scale_offset);
			
 
				 
			
@@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
 
				             indices_nb_ub, output_ne_ub, output_nb_ub);
			
 
				     op.calculate();
			
 
				 }
			
 
				+
			
 
				+#endif // #ifdef ASCEND_310P
			
--- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@@ -1,6 +1,14 @@
 
				 #include "kernel_operator.h"
			
 
				 
			
 
				 using namespace AscendC;
			
 
				+#ifdef ASCEND_310P
			
 
				+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
			
 
				+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
			
 
				+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
			
 
				+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
			
 
				+        printf("Ascend310P not support f16->8bit quantization.\n");
			
 
				+    }
			
 
				+#else
			
 
				 
			
 
				 #define BUFFER_NUM 2
			
 
				 #define QK8_0 32
			
@@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
 
				     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
			
 
				     op.calculate();
			
 
				 }
			
 
				+
			
 
				+#endif // #ifdef ASCEND_310P
			
--- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
@@ -1,6 +1,14 @@
 
				 #include "kernel_operator.h"
			
 
				 
			
 
				 using namespace AscendC;
			
 
				+#ifdef ASCEND_310P // 310P not support f32->8bit quantization
			
 
				+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
			
 
				+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
			
 
				+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
			
 
				+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
			
 
				+        printf("Ascend310P not support f32->8bit quantization.\n");
			
 
				+    }
			
 
				+#else
			
 
				 
			
 
				 #define BUFFER_NUM 2
			
 
				 #define QK8_0 32
			
@@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
 
				     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
			
 
				     op.calculate();
			
 
				 }
			
 
				+
			
 
				+#endif // #ifdef ASCEND_310P
			
--- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -1,6 +1,21 @@
 
				 #include "kernel_operator.h"
			
 
				 
			
 
				 using namespace AscendC;
			
 
				+#ifdef ASCEND_310P // 310P not support float->4bit quantization
			
 
				+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
			
 
				+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
			
 
				+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
			
 
				+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
			
 
				+        printf("Ascend310P not support f32->4bit quantization.\n");
			
 
				+    }
			
 
				+
			
 
				+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
			
 
				+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
			
 
				+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
			
 
				+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
			
 
				+        printf("Ascend310P not support f16->4bit quantization.\n");
			
 
				+    }
			
 
				+#else
			
 
				 
			
 
				 #define BUFFER_NUM 2
			
 
				 #define Group_Size 32
			
@@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
 
				     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
			
 
				     op.calculate();
			
 
				 }
			
 
				+
			
 
				+#endif // #ifdef ASCEND_310P