Przeglądaj źródła

CANN: format code using .clang-format (#15863)

This commit applies .clang-format rules to all source files under the
ggml-cann directory to ensure consistent coding style and readability.
The .clang-format option `SortIncludes: false` has been set to disable
automatic reordering of include directives.
No functional changes are introduced.

Co-authored-by: hipudding <huafengchun@gmail.com>
Chenguang Li 3 miesięcy temu
rodzic
commit
7a50cf388a

+ 46 - 43
ggml/src/ggml-cann/acl_tensor.cpp

@@ -51,28 +51,31 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
     return ACL_DT_UNDEFINED;
 }
 
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
-                                   size_t* nb, int64_t dims, aclFormat format,
-                                   size_t offset) {
+aclTensor * ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                    int64_t *           ne,
+                                    size_t *            nb,
+                                    int64_t             dims,
+                                    aclFormat           format,
+                                    size_t              offset) {
     // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be
     // added.
     int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
 
     if (ne == nullptr) {
         for (int i = 0; i < GGML_MAX_DIMS; i++) {
-            acl_ne[i] = tensor->ne[i];
+            acl_ne[i]     = tensor->ne[i];
             // The step size of acl is in elements.
             acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
         }
     } else {
         // With bcast
         for (int i = 0; i < dims; i++) {
-            acl_ne[i] = ne[i];
+            acl_ne[i]     = ne[i];
             acl_stride[i] = nb[i] / ggml_element_size(tensor);
         }
     }
 
-    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t final_dims      = (dims == 0 ? GGML_MAX_DIMS : dims);
     int64_t acl_storage_len = 1;
     for (int i = 0; i < final_dims; i++) {
         acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
@@ -84,15 +87,13 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     std::reverse(acl_ne, acl_ne + final_dims);
     std::reverse(acl_stride, acl_stride + final_dims);
 
-    aclTensor* acl_tensor = aclCreateTensor(
-        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
-        elem_offset, format, &acl_storage_len, 1,
-        tensor->data);
+    aclTensor * acl_tensor = aclCreateTensor(acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
+                                             elem_offset, format, &acl_storage_len, 1, tensor->data);
 
     return acl_tensor;
 }
 
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1) {
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
             return true;
@@ -101,15 +102,16 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
     return false;
 }
 
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
-                                  const ggml_tensor* src1,
-                                  int64_t* bcast_src0_ne,
-                                  int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
-                                  size_t* bcast_src1_nb) {
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_src0_ne,
+                                  int64_t *           bcast_src1_ne,
+                                  size_t *            bcast_src0_nb,
+                                  size_t *            bcast_src1_nb) {
     GGML_ASSERT(ggml_can_repeat(src1, src0));
     int bcast_dim_cnt = 0;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        int64_t nr = src0->ne[i] / src1->ne[i];
+        int64_t nr                   = src0->ne[i] / src1->ne[i];
         bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
         bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
         bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
@@ -119,21 +121,26 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
             // Need to add an extra dim.
             bcast_src0_ne[bcast_dim_cnt] = nr;
             bcast_src1_ne[bcast_dim_cnt] = 1;
-            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
-                                           bcast_src0_ne[bcast_dim_cnt - 1];
-            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
-                                           bcast_src1_ne[bcast_dim_cnt - 1];
+            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] * bcast_src0_ne[bcast_dim_cnt - 1];
+            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] * bcast_src1_ne[bcast_dim_cnt - 1];
             bcast_dim_cnt++;
         }
     }
     return bcast_dim_cnt;
 }
 
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb) {
     // input and dst should be in the same shape, except the first two dims.
     GGML_ASSERT(input_ne[2] == dst_ne[2]);
     GGML_ASSERT(input_ne[3] == dst_ne[3]);
@@ -148,34 +155,30 @@ int64_t ggml_cann_get_mulmat_bcast_shape(
         // Do not use bcast in the first two dimensions because we only support
         // the bcast batch dimension. Just copy them.
         if (i < 2 || nr == 1) {
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i];
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i];
             bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i];
 
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
             bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
             bcast_dim_cnt++;
         } else {
             // Need to add an extra dim.
-            bcast_input_ne[bcast_dim_cnt] = nr;
-            bcast_dst_ne[bcast_dim_cnt] = nr;
+            bcast_input_ne[bcast_dim_cnt]  = nr;
+            bcast_dst_ne[bcast_dim_cnt]    = nr;
             bcast_weight_ne[bcast_dim_cnt] = 1;
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
             bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
             bcast_dim_cnt++;
 
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i] / nr;
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i] / nr;
             bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
-                                            bcast_input_ne[bcast_dim_cnt - 1];
-            bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
-                                          bcast_dst_ne[bcast_dim_cnt - 1];
-            bcast_weight_nb[bcast_dim_cnt] =
-                bcast_weight_nb[bcast_dim_cnt - 1] *
-                bcast_weight_ne[bcast_dim_cnt - 1];
+            bcast_input_nb[bcast_dim_cnt]  = bcast_input_nb[bcast_dim_cnt - 1] * bcast_input_ne[bcast_dim_cnt - 1];
+            bcast_dst_nb[bcast_dim_cnt]    = bcast_dst_nb[bcast_dim_cnt - 1] * bcast_dst_ne[bcast_dim_cnt - 1];
+            bcast_weight_nb[bcast_dim_cnt] = bcast_weight_nb[bcast_dim_cnt - 1] * bcast_weight_ne[bcast_dim_cnt - 1];
             bcast_dim_cnt++;
         }
     }

+ 54 - 43
ggml/src/ggml-cann/acl_tensor.h

@@ -62,10 +62,12 @@ aclDataType ggml_cann_type_mapping(ggml_type type);
  * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
  * @return  Pointer to the created ACL tensor.
  */
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
-                             size_t* nb = nullptr, int64_t dims = 0,
-                             aclFormat format = ACL_FORMAT_ND,
-                             size_t offset = 0);
+aclTensor * ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                    int64_t *           ne     = nullptr,
+                                    size_t *            nb     = nullptr,
+                                    int64_t             dims   = 0,
+                                    aclFormat           format = ACL_FORMAT_ND,
+                                    size_t              offset = 0);
 
 /**
  * @brief   Template for creating an ACL tensor from provided parameters. typename TYPE
@@ -87,12 +89,15 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
  * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
  * @return  Pointer to the created ACL tensor.
  */
-template<typename TYPE>
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   TYPE type_size, int64_t* ne, TYPE* nb,
-                                   int64_t dims,
-                                   aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0) {
+template <typename TYPE>
+aclTensor * ggml_cann_create_tensor(void *      data_ptr,
+                                    aclDataType dtype,
+                                    TYPE        type_size,
+                                    int64_t *   ne,
+                                    TYPE *      nb,
+                                    int64_t     dims,
+                                    aclFormat   format = ACL_FORMAT_ND,
+                                    size_t      offset = 0) {
     int64_t tmp_ne[GGML_MAX_DIMS * 2];
     int64_t tmp_stride[GGML_MAX_DIMS * 2];
 
@@ -109,9 +114,8 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
     std::reverse(tmp_ne, tmp_ne + dims);
     std::reverse(tmp_stride, tmp_stride + dims);
 
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
+    aclTensor * acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, format, &acl_storage_len, 1, data_ptr);
 
     return acl_tensor;
 }
@@ -132,7 +136,7 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
  *          to 1. If such a dimension is found, broadcasting is required to align t1
  *          with t0 for element-wise operations.
  */
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1);
 
 /**
  * @brief   Computes broadcast shapes and strides for two ggml_tensors.
@@ -187,19 +191,21 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
 *  dim1 is an inserted dim, so add nb for dim1,
 *  and all other nb moves to next in order.
  */
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
-                        int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_ne_src0,
+                                  int64_t *           bcast_ne_src1,
+                                  size_t *            bcast_nb_src0,
+                                  size_t *            bcast_nb_src1);
 
 // Bcast macro to avoid duplicate code.
-#define BCAST_SHAPE(src0, src1)                                              \
-    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                            \
-    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                            \
-    size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2];                             \
-    size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2];                             \
-    int64_t bcast_dims = ggml_cann_get_bcast_shape(                          \
-        src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
-        bcast_##src1##_nb);
+#define BCAST_SHAPE(src0, src1)                                                                      \
+    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src0##_nb[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src1##_nb[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_dims = ggml_cann_get_bcast_shape(src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, \
+                                                   bcast_##src0##_nb, bcast_##src1##_nb);
 
 #define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
 
@@ -233,26 +239,31 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* sr
  *       before cast dim.
  * @sa ggml_cann_get_bcast_shape
  */
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb);
 
 // Bcast macro to avoid duplicate code.
-#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                         \
-    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                      \
-    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                     \
-    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                        \
-    size_t bcast_##input##_nb[GGML_MAX_DIMS * 2];                       \
-    size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2];                      \
-    size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2];                         \
-    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(              \
-        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
-        bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne,      \
-        bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
+#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                                                                  \
+    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                                                               \
+    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                                                              \
+    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                                                                 \
+    size_t  bcast_##input##_nb[GGML_MAX_DIMS * 2];                                                               \
+    size_t  bcast_##weight##_nb[GGML_MAX_DIMS * 2];                                                              \
+    size_t  bcast_##dst##_nb[GGML_MAX_DIMS * 2];                                                                 \
+    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(                                                       \
+        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, bcast_##input##_ne, bcast_##weight##_ne, \
+        bcast_##dst##_ne, bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
 
-#define BCAST_MUL_MAT_PARAM(tensor) \
-    bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
+#define BCAST_MUL_MAT_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
 
 #endif  // CANN_ACL_TENSOR_H

Plik diff jest za duży
+ 328 - 350
ggml/src/ggml-cann/aclnn_ops.cpp


+ 192 - 215
ggml/src/ggml-cann/aclnn_ops.h

@@ -62,7 +62,7 @@
  * @param   dst The ggml tensor representing the destination, which op is
  *              GGML_OP_REPEAT and specifies the desired dimensions.
  */
-void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
@@ -82,7 +82,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the result of the Leaky ReLU
  *            activation is stored, which op is `GGML_OP_LEAKY_RELU`
  */
-void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief    Concatenates multiple tensors along a specified dimension using the
@@ -97,7 +97,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @attention tensorList length should be 2 and the dimension using for concat
  *            default to 1.
  */
-void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Generates a sequence of evenly spaced values within a specified
@@ -113,7 +113,7 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  *            `start`, 'stop' and 'step' are in dst->op_params and dst->op is
  *            `GGML_OP_ARANGE`.
  */
-void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Applies a clamp operation to the elements of a ggml tensor using the
@@ -131,7 +131,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the clamped values will be stored.
  *            dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
  */
-void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Scales the elements of a ggml tensor by a constant factor using the
@@ -148,7 +148,7 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the scaled values will be stored.
  *            dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
  */
-void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Sorts the elements of a ggml tensor and returns the indices that
@@ -163,7 +163,7 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the sorted indices will be stored.
  *            dst->op is `GGML_OP_ARGSORT`.
  */
-void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the Layer Normalization for a ggml tensor using the CANN
@@ -185,7 +185,7 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the normalized values will be stored.
  * @attention `Var` defaults to dst->ne[0].
  */
-void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief  Computes the Group Normalization for a ggml tensor using the CANN
@@ -209,7 +209,7 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  *
  * @attention eps defaults to 1e-6f.
  */
-void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the accumulation of tensors using the CANN backend.
@@ -228,7 +228,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the accumulated values will be stored.
  *            `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
  */
-void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the sum of elements along the last dimension of a ggml tensor
@@ -244,7 +244,7 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  *
  * @attention `reduce_dims` defaults to 3, which means the last dimension.
  */
-void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the sum of elements in a ggml tensor.
@@ -258,7 +258,7 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  *
  */
 
-void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
@@ -274,8 +274,7 @@ void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the upsampled values will be stored.
  *            dst->op is `GGML_OP_UPSCALE`.
  */
-void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
-                                  ggml_tensor* dst);
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Pads a ggml tensor to match the dimensions of the destination tensor
@@ -290,7 +289,7 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
  * @param dst The destination tensor, which specifies the target dimensions for
  *            padding. dst->op is `GGML_OP_PAD`.
  */
-void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Executes a 2D pooling operation on a ggml tensor using the CANN
@@ -307,7 +306,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor on which the pooling operation is to be
  *            performed. dst->op is `GGML_OP_POOL_2D`.
  */
-void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Duplicates a ggml tensor using the CANN backend.
@@ -326,7 +325,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  *            different shape and dst is no-contiguous.
  * @note:     This func need to simplify.
  */
-void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the Root Mean Square (RMS) normalization of a ggml tensor
@@ -348,7 +347,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the normalized values will be stored.
  *            dst->op is `GGML_OP_RMS_NORM`.
  */
-void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Applies a diagonal mask to the tensor with a specified value.
@@ -363,7 +362,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  *            `GGML_OP_DIAG_MASK`
  * @param value The value to use for masking.
  */
-void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
+void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value);
 
 /**
  * @brief   Performs an image-to-column transformation on the input tensor.
@@ -378,7 +377,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float
  * @param dst The destination tensor that stores the result of the operation.
  *            dst->op is `GGML_OP_IM2COL`.
  */
-void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes time step embeddings using sine and cosine functions.
@@ -392,10 +391,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the result of the embedding operation
  *            will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
  */
-void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 // @see ggml_cann_dup.
-void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the softmax activation with optional masking.
@@ -417,7 +416,7 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the result will be stored. dst->op is
  *            `GGML_OP_SOFTMAX`.
  */
-void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Extracts specific rows from a tensor based on indices.
@@ -429,7 +428,7 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param ctx The backend CANN context for executing operations.
  * @param dst The destination tensor where the extracted rows will be stored.
  */
-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Writes specific rows into a tensor at positions specified by indices.
@@ -441,7 +440,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param ctx The backend CANN context for executing operations.
  * @param dst The destination tensor where the specified rows will be updated.
  */
-void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Executes matrix multiplication for the given tensor.
@@ -454,7 +453,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor for storing the result of the matrix
  *            multiplication. dst->op is `GGML_OP_MUL_MAT`.
  */
-void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
@@ -477,7 +476,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @note The function currently does not support cases where the freq_scale is
  *       not equal 1.
  */
-void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the index of the maximum value along the specified dimension
@@ -492,7 +491,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the indices of the maximum values will
  *            be stored. dst->op is `GGML_OP_ARGMAX`.
  */
-void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief Adds two tensors element-wise and stores the result in a destination
@@ -509,8 +508,10 @@ void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param acl_src1 The second source tensor.
  * @param acl_dst The destination tensor where the result will be stored.
  */
-void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+void aclnn_add(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src0,
+               aclTensor *                 acl_src1,
+               aclTensor *                 acl_dst = nullptr);
 
 /**
  * @brief Sub two tensors element-wise and stores the result in a destination
@@ -527,8 +528,10 @@ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
  * @param acl_src1 The second source tensor.
  * @param acl_dst The destination tensor where the result will be stored.
  */
-void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-    aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
+void aclnn_sub(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src0,
+               aclTensor *                 acl_src1,
+               aclTensor *                 acl_dst = nullptr);
 
 /**
  * @brief Performs element-wise multiplication of two tensors and stores the
@@ -546,8 +549,10 @@ void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
  * @param acl_other The second tensor for element-wise multiplication.
  * @param acl_dst The destination tensor where the result will be stored.
  */
-void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+void aclnn_mul(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src,
+               aclTensor *                 acl_other,
+               aclTensor *                 acl_dst = nullptr);
 
 /**
  * @brief Matrix division, optionally in-place.
@@ -567,8 +572,10 @@ void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  * @param inplace Flag indicating whether to perform the operation in-place on
  * `acl_src`.
  */
-void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_other, aclTensor* acl_dst = nullptr);
+void aclnn_div(ggml_backend_cann_context & ctx,
+               aclTensor *                 acl_src,
+               aclTensor *                 acl_other,
+               aclTensor *                 acl_dst = nullptr);
 
 /**
  * @brief Applies element-wise cosine function to the elements of a tensor.
@@ -584,8 +591,7 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  * @param acl_dst The destination tensor where the cosine results will be
  * stored.
  */
-void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_dst);
+void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
 
 /**
  * @brief Applies element-wise sine function to the elements of a tensor.
@@ -602,8 +608,7 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  * @param acl_src The source tensor on which the sine function will be applied.
  * @param acl_dst The destination tensor where the sine results will be stored.
  */
-void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_dst);
+void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
 
 /**
  * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
@@ -621,8 +626,12 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
  * @param acl_dst  Output pointer to the created ACL tensor corresponding to dst.
  */
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
-    aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
+void bcast_shape(ggml_tensor * src0,
+                 ggml_tensor * src1,
+                 ggml_tensor * dst,
+                 aclTensor **  acl_src0,
+                 aclTensor **  acl_src1,
+                 aclTensor **  acl_dst);
 
 /**
  * @brief   Computes the 1D transposed convolution (deconvolution) of a ggml
@@ -637,7 +646,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
  * @param dst The destination tensor where the transposed convolution result
  * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
  */
-void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
@@ -662,7 +671,7 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
  * @param dst The destination tensor where the ELU-activated result will be stored.
  *            dst->op is expected to be `GGML_OP_ELU`.
  */
-void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Computes the mean of a ggml tensor element-wise using the CANN backend.
@@ -677,7 +686,7 @@ void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the mean result will be stored.
  *            dst->op is expected to be `GGML_OP_MEAN`.
  */
-void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Applies 1D reflect padding to a ggml tensor using the CANN backend.
@@ -692,7 +701,7 @@ void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the padded result will be stored.
  *            dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
  */
-void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Counts the number of equal elements in two ggml tensors using the CANN backend.
@@ -708,7 +717,7 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the result will be stored.
  *            dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
  */
-void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Applies the Step activation function to a ggml tensor using the CANN backend.
@@ -723,7 +732,7 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the result will be stored.
  *            dst->op is expected to be `GGML_OP_STEP`.
  */
-void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Performs the Flash Attention extended operator using the CANN backend.
@@ -738,59 +747,46 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * @param dst The destination tensor where the result will be stored.
  *            dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
  */
-void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /*
  * @brief A generic wrapper for ACL resources with custom deleter support.
  */
-using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
+using any_acl_resource = std::unique_ptr<void, std::function<void(void *)>>;
 
 /**
  * @brief Trait structure used to define how to destroy a given ACL resource type.
  *
  * @tparam T ACL resource type.
  */
-template<typename T>
-struct acl_resource_traits;
+template <typename T> struct acl_resource_traits;
 
 /**
  * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
  */
-template<>
-struct acl_resource_traits<aclTensor> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
-    }
+template <> struct acl_resource_traits<aclTensor> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyTensor(static_cast<aclTensor *>(p))); }
 };
 
 /**
  * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
  */
-template<>
-struct acl_resource_traits<aclIntArray> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
-    }
+template <> struct acl_resource_traits<aclIntArray> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray *>(p))); }
 };
 
 /**
  * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
  */
-template<>
-struct acl_resource_traits<aclScalar> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
-    }
+template <> struct acl_resource_traits<aclScalar> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyScalar(static_cast<aclScalar *>(p))); }
 };
 
 /**
  * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
  */
-template<>
-struct acl_resource_traits<aclTensorList> {
-    static void destroy(void* p) {
-        ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
-    }
+template <> struct acl_resource_traits<aclTensorList> {
+    static void destroy(void * p) { ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList *>(p))); }
 };
 
 /**
@@ -800,14 +796,8 @@ struct acl_resource_traits<aclTensorList> {
  * @param ptr Raw pointer to ACL resource.
  * @return any_acl_resource Smart pointer that handles destruction.
  */
-template<typename T>
-any_acl_resource make_acl_resource(T* ptr) {
-    return any_acl_resource(
-        static_cast<void*>(ptr),
-        [](void* p) {
-            acl_resource_traits<T>::destroy(p);
-        }
-    );
+template <typename T> any_acl_resource make_acl_resource(T * ptr) {
+    return any_acl_resource(static_cast<void *>(ptr), [](void * p) { acl_resource_traits<T>::destroy(p); });
 }
 
 /**
@@ -817,8 +807,7 @@ any_acl_resource make_acl_resource(T* ptr) {
  * @param vec Target vector to hold ACL resources.
  * @param args Raw pointers to ACL resources.
  */
-template<typename... Args>
-void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
+template <typename... Args> void register_acl_resources(std::vector<any_acl_resource> & vec, Args *... args) {
     (vec.emplace_back(make_acl_resource(args)), ...);
 }
 
@@ -826,39 +815,36 @@ void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
  * @brief Task class that wraps the execution of an aclnn function call.
  */
 class aclnn_task : public cann_task {
-    public:
-        aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
-                   uint64_t workspace_size, aclOpExecutor * executor,
-                   aclrtStream stream) :
-            aclnn_func_(aclnn_func),
-            workspace_addr_(workspace_addr),
-            workspace_size_(workspace_size),
-            executor_(executor),
-            stream_(stream) {}
-        virtual void run_task() override {
-            ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
-        }
-    private:
-        aclnn_func_t aclnn_func_;
-        void *          workspace_addr_;
-        uint64_t        workspace_size_;
-        aclOpExecutor * executor_;
-        aclrtStream     stream_;
+  public:
+    aclnn_task(aclnn_func_t    aclnn_func,
+               void *          workspace_addr,
+               uint64_t        workspace_size,
+               aclOpExecutor * executor,
+               aclrtStream     stream) :
+        aclnn_func_(aclnn_func),
+        workspace_addr_(workspace_addr),
+        workspace_size_(workspace_size),
+        executor_(executor),
+        stream_(stream) {}
+
+    virtual void run_task() override { ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_)); }
+  private:
+    aclnn_func_t    aclnn_func_;
+    void *          workspace_addr_;
+    uint64_t        workspace_size_;
+    aclOpExecutor * executor_;
+    aclrtStream     stream_;
 };
 
 /**
  * @brief Task class that releases ACL resources after usage.
  */
 class release_resource_task : public cann_task {
-public:
-    release_resource_task(std::vector<any_acl_resource>&& resources){
-        resource_ = std::move(resources);
-    }
+  public:
+    release_resource_task(std::vector<any_acl_resource> && resources) { resource_ = std::move(resources); }
 
-    virtual void run_task() override {
-        resource_.clear();
-    }
-private:
+    virtual void run_task() override { resource_.clear(); }
+  private:
     std::vector<any_acl_resource> resource_;
 };
 
@@ -866,38 +852,40 @@ private:
  * @brief Task class for performing asynchronous memory copy operations.
  */
 class async_memcpy_task : public cann_task {
-public:
-    async_memcpy_task(void* dst, const void* src, size_t size,
-                      aclrtMemcpyKind kind, aclrtStream stream)
-        : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
-
-    virtual void run_task() override {
-        ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
-    }
-private:
-    void* dst_;
-    const void* src_;
-    size_t size_;
+  public:
+    async_memcpy_task(void * dst, const void * src, size_t size, aclrtMemcpyKind kind, aclrtStream stream) :
+        dst_(dst),
+        src_(src),
+        size_(size),
+        kind_(kind),
+        stream_(stream) {}
+
+    virtual void run_task() override { ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_)); }
+  private:
+    void *          dst_;
+    const void *    src_;
+    size_t          size_;
     aclrtMemcpyKind kind_;
-    aclrtStream stream_;
+    aclrtStream     stream_;
 };
 
 /**
  * @brief Task class for performing asynchronous memory set operations.
  */
 class async_memset_task : public cann_task {
-    public:
-    async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
-            : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
-
-        virtual void run_task() override {
-            ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
-        }
-    private:
-        void* buffer_;
-        size_t size_;
-        int32_t value_;
-        aclrtStream stream_;
+  public:
+    async_memset_task(void * buffer, size_t size, int32_t value, aclrtStream stream) :
+        buffer_(buffer),
+        size_(size),
+        value_(value),
+        stream_(stream) {}
+
+    virtual void run_task() override { ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_)); }
+  private:
+    void *      buffer_;
+    size_t      size_;
+    int32_t     value_;
+    aclrtStream stream_;
 };
 
 /**
@@ -918,25 +906,24 @@ class async_memset_task : public cann_task {
  * same stream are executed in queue order.
  */
 
-#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                          \
-    do {                                                                                    \
-        uint64_t        workspaceSize = 0;                                                  \
-        aclOpExecutor * executor;                                                           \
-        void *          workspaceAddr = nullptr;                                            \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\
-        /* workspace should alloced in main thread to keep malloc order when using vmm. */  \
-        if (workspaceSize > 0) {                                                            \
-            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);            \
-            workspaceAddr = workspace_allocator.get();                                      \
-        }                                                                                   \
-        if (CTX.async_mode) {                                                               \
-            auto task =                                                                     \
-                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize,  \
-                    executor, CTX.stream()); \
-            CTX.task_queue.submit_task(std::move(task));                                    \
-        } else {                                                                            \
-            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\
-        }                                                                                   \
+#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                                                  \
+    do {                                                                                                            \
+        uint64_t        workspaceSize = 0;                                                                          \
+        aclOpExecutor * executor;                                                                                   \
+        void *          workspaceAddr = nullptr;                                                                    \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));                        \
+        /* workspace should be allocated in main thread to keep malloc order when using vmm. */                     \
+        if (workspaceSize > 0) {                                                                                    \
+            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);                                    \
+            workspaceAddr = workspace_allocator.get();                                                              \
+        }                                                                                                           \
+        if (CTX.async_mode) {                                                                                       \
+            auto task =                                                                                             \
+                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize, executor, CTX.stream()); \
+            CTX.task_queue.submit_task(std::move(task));                                                            \
+        } else {                                                                                                    \
+            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));                        \
+        }                                                                                                           \
     } while (0)
 
 /**
@@ -947,11 +934,10 @@ class async_memset_task : public cann_task {
  * @param ctx Backend context which manages task submission and async mode.
  * @param args Pointers to ACL resources to be released.
  */
-template <typename... Args>
-void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
+template <typename... Args> void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
     std::vector<any_acl_resource> resources;
     register_acl_resources(resources, std::forward<Args>(args)...);
-    if(ctx.async_mode) {
+    if (ctx.async_mode) {
         auto task = std::make_unique<release_resource_task>(std::move(resources));
         ctx.task_queue.submit_task(std::move(task));
     }
@@ -966,8 +952,11 @@ void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... arg
  * @param len Size of memory to copy (in bytes).
  * @param kind Type of memory copy (host-to-device, device-to-host, etc).
  */
-inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
-                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx,
+                                   void *                      dst,
+                                   const void *                src,
+                                   size_t                      len,
+                                   aclrtMemcpyKind             kind) {
     if (ctx.async_mode) {
         auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
         ctx.task_queue.submit_task(std::move(task));
@@ -976,8 +965,11 @@ inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
     }
 }
 
-inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
-                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx,
+                                   void *                      dst,
+                                   const void *                src,
+                                   size_t                      len,
+                                   aclrtMemcpyKind             kind) {
     if (ctx->async_mode) {
         auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
         ctx->task_queue.submit_task(std::move(task));
@@ -994,8 +986,7 @@ inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
  * @param size Size of the memory buffer (in bytes).
  * @param value Value to set in the buffer.
  */
-inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
-                                   size_t size, int value) {
+inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer, size_t size, int value) {
     if (ctx.async_mode) {
         auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
         ctx.task_queue.submit_task(std::move(task));
@@ -1029,7 +1020,7 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
  * @param dst The destination tensor where the expert-weighted token outputs are stored.
  *            Expected to be of shape [M, K, N, 1].
  */
-void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 /**
  * @brief   Check whether a tensor is a weight tensor for matrix multiplication.
@@ -1041,20 +1032,14 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  *
  * @param tensor Pointer to the target ggml_tensor object (const-qualified).
  */
-static bool is_matmul_weight(const ggml_tensor* tensor) {
-    std::string name = ggml_get_name(tensor);
-    static const std::unordered_set<std::string> weight_suffixes{
-        "output.weight",
-        "attn_q.weight",
-        "attn_k.weight",
-        "attn_v.weight",
-        "attn_output.weight",
-        "ffn_gate.weight",
-        "ffn_up.weight",
-        "ffn_down.weight"
-    };
-
-    for (const auto& suffix : weight_suffixes) {
+static bool is_matmul_weight(const ggml_tensor * tensor) {
+    std::string                                  name = ggml_get_name(tensor);
+    static const std::unordered_set<std::string> weight_suffixes{ "output.weight",      "attn_q.weight",
+                                                                  "attn_k.weight",      "attn_v.weight",
+                                                                  "attn_output.weight", "ffn_gate.weight",
+                                                                  "ffn_up.weight",      "ffn_down.weight" };
+
+    for (const auto & suffix : weight_suffixes) {
         if (name.find(suffix) != std::string::npos) {
             return true;
         }
@@ -1078,14 +1063,13 @@ static bool is_matmul_weight(const ggml_tensor* tensor) {
  * @param ctx The CANN backend context used to manage execution and resources.
  * @param dst The destination tensor.
  */
-template <auto binary_op>
-void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+template <auto binary_op> void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
 
-    aclTensor* acl_src0;
-    aclTensor* acl_src1;
-    aclTensor* acl_dst;
+    aclTensor * acl_src0;
+    aclTensor * acl_src1;
+    aclTensor * acl_dst;
 
     // Need bcast
     bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
@@ -1094,7 +1078,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
 }
 
-
 /**
  * @brief Applies a unary operation to an input tensor using the CANN backend.
  *
@@ -1107,12 +1090,12 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  * @param ctx The CANN backend context for managing resources and execution.
  * @param dst The destination tensor. Its src[0] is treated as the input tensor.
  */
-template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
-    void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+template <void unary_op(ggml_backend_cann_context &, aclTensor *, aclTensor *)>
+void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
     unary_op(ctx, acl_src, acl_dst);
     ggml_cann_release_resources(ctx, acl_src, acl_dst);
@@ -1138,9 +1121,9 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
  *
  * @see GGML_CANN_CALL_OP_UNARY
  */
-void ggml_cann_op_unary(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                        ggml_backend_cann_context &                                                ctx,
+                        ggml_tensor *                                                              dst);
 
 /**
  * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
@@ -1172,9 +1155,9 @@ void ggml_cann_op_unary(
  *
  * @see GGML_CANN_CALL_OP_UNARY_GATED
  */
-void ggml_cann_op_unary_gated(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                              ggml_backend_cann_context &                                                ctx,
+                              ggml_tensor *                                                              dst);
 
 /**
  * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
@@ -1197,16 +1180,13 @@ void ggml_cann_op_unary_gated(
  * @see ggml_cann_op_unary
  * @see GGML_CANN_CALL_ACLNN_OP
  */
-#define GGML_CANN_CALL_OP_UNARY(OP_NAME)                              \
-    do {                                                              \
-        auto lambda = [](ggml_backend_cann_context& ctx,              \
-            aclTensor* acl_src,                                       \
-            aclTensor* acl_dst) {                                     \
-            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
-        };                                                            \
-        ggml_cann_op_unary(lambda, ctx, dst);                         \
-    }                                                                 \
-    while (0)
+#define GGML_CANN_CALL_OP_UNARY(OP_NAME)                                                              \
+    do {                                                                                              \
+        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
+        };                                                                                            \
+        ggml_cann_op_unary(lambda, ctx, dst);                                                         \
+    } while (0)
 
 /**
  * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
@@ -1229,15 +1209,12 @@ void ggml_cann_op_unary_gated(
  * @see ggml_cann_op_unary_gated
  * @see GGML_CANN_CALL_ACLNN_OP
  */
-#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                        \
-    do {                                                              \
-        auto lambda = [](ggml_backend_cann_context& ctx,              \
-            aclTensor* acl_src,                                       \
-            aclTensor* acl_dst) {                                     \
-            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
-        };                                                            \
-        ggml_cann_op_unary_gated(lambda, ctx, dst);                   \
-    }                                                                 \
-    while (0)
+#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                                                        \
+    do {                                                                                              \
+        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
+        };                                                                                            \
+        ggml_cann_op_unary_gated(lambda, ctx, dst);                                                   \
+    } while (0)
 
 #endif  // CANN_ACLNN_OPS

+ 92 - 99
ggml/src/ggml-cann/common.h

@@ -44,7 +44,7 @@
 #include "../include/ggml.h"
 #include "../ggml-impl.h"
 
-#define MATRIX_ROW_PADDING 512
+#define MATRIX_ROW_PADDING    512
 #define GGML_CANN_MAX_STREAMS 8
 
 /**
@@ -56,8 +56,7 @@
  * @param line The line number at which the error occurred.
  * @param msg The error message.
  */
-[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
-                                  const char* file, int line, const char* msg);
+[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
 
 /**
  * @brief Checks the result of a CANN function call and invokes the error
@@ -89,25 +88,24 @@ struct ggml_cann_device_info {
      * @brief Information about a single CANN device.
      */
     struct cann_device_info {
-        int cc;                 /**< Compute capability.                   */
+        int    cc;              /**< Compute capability.                   */
         size_t smpb;            /**< Maximum shared memory per block.      */
-        bool vmm;               /**< Virtual memory support.               */
+        bool   vmm;             /**< Virtual memory support.               */
         size_t vmm_granularity; /**< Granularity of virtual memory.        */
         size_t total_vram;      /**< Total video RAM available on the device. */
     };
 
-    cann_device_info devices[GGML_CANN_MAX_DEVICES] =
-        {}; /**< Array of CANN device information. */
+    cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; /**< Array of CANN device information. */
 };
 
-const ggml_cann_device_info& ggml_cann_info();
+const ggml_cann_device_info & ggml_cann_info();
 
-void ggml_cann_set_device(int32_t device);
+void    ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();
 
-std::optional<std::string> get_env(const std::string& name);
-bool parse_bool(const std::string& value);
-int parse_integer(const std::string& value);
+std::optional<std::string> get_env(const std::string & name);
+bool                       parse_bool(const std::string & value);
+int                        parse_integer(const std::string & value);
 
 /**
  * @brief Abstract base class for memory pools used by CANN.
@@ -126,7 +124,7 @@ struct ggml_cann_pool {
      *                     will be stored.
      * @return             Pointer to the allocated memory block.
      */
-    virtual void* alloc(size_t size, size_t* actual_size) = 0;
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
 
     /**
      * @brief Frees a previously allocated memory block.
@@ -136,16 +134,16 @@ struct ggml_cann_pool {
      * @note Note that all CANN opertors are running async. Make sure memory is
      *       still avaiable before this operator finished.
      */
-    virtual void free(void* ptr, size_t size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
 };
 
 /**
  * @brief RAII wrapper for managing memory allocations from a CANN memory pool.
  */
 struct ggml_cann_pool_alloc {
-    ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
-    void* ptr = nullptr;    /**< Pointer to the allocated memory block. */
-    size_t actual_size = 0; /**< Actual size of the allocated memory block. */
+    ggml_cann_pool * pool        = nullptr; /**< Pointer to the memory pool. */
+    void *           ptr         = nullptr; /**< Pointer to the allocated memory block. */
+    size_t           actual_size = 0;       /**< Actual size of the allocated memory block. */
 
     /**
      * @brief Default constructor.
@@ -156,16 +154,14 @@ struct ggml_cann_pool_alloc {
      * @brief Constructor that initializes the memory pool.
      * @param pool Reference to the memory pool.
      */
-    explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
+    explicit ggml_cann_pool_alloc(ggml_cann_pool & pool) : pool(&pool) {}
 
     /**
      * @brief Constructor that initializes the memory pool and allocates memory.
      * @param pool Reference to the memory pool.
      * @param size Size of the memory block to allocate.
      */
-    ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
-        alloc(size);
-    }
+    ggml_cann_pool_alloc(ggml_cann_pool & pool, size_t size) : pool(&pool) { alloc(size); }
 
     /**
      * @brief Destructor that frees the allocated memory block.
@@ -181,7 +177,7 @@ struct ggml_cann_pool_alloc {
      * @param size Size of the memory block to allocate.
      * @return Pointer to the allocated memory block.
      */
-    void* alloc(size_t size) {
+    void * alloc(size_t size) {
         GGML_ASSERT(pool != nullptr);
         GGML_ASSERT(ptr == nullptr);
         ptr = pool->alloc(size, &this->actual_size);
@@ -194,7 +190,7 @@ struct ggml_cann_pool_alloc {
      * @param size Size of the memory block to allocate.
      * @return Pointer to the allocated memory block.
      */
-    void* alloc(ggml_cann_pool& pool, size_t size) {
+    void * alloc(ggml_cann_pool & pool, size_t size) {
         this->pool = &pool;
         return alloc(size);
     }
@@ -203,25 +199,25 @@ struct ggml_cann_pool_alloc {
      * @brief Gets the pointer to the allocated memory block.
      * @return Pointer to the allocated memory block.
      */
-    void* get() { return ptr; }
+    void * get() { return ptr; }
 
     // Deleted copy constructor
-    ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
+    ggml_cann_pool_alloc(const ggml_cann_pool_alloc &) = delete;
 
     // Deleted move constructor
-    ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
+    ggml_cann_pool_alloc(ggml_cann_pool_alloc &&) = delete;
 
     // Deleted copy assignment operator
-    ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
+    ggml_cann_pool_alloc & operator=(const ggml_cann_pool_alloc &) = delete;
 
     // Deleted move assignment operator
-    ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
+    ggml_cann_pool_alloc & operator=(ggml_cann_pool_alloc &&) = delete;
 };
 
 /**
  * @brief Function pointer type for ACLNN operator calls.
  */
-using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream);
+using aclnn_func_t = aclnnStatus (*)(void *, uint64_t, aclOpExecutor *, aclrtStream);
 
 /**
  * @brief Base class for all CANN tasks to be submitted to the task queue.
@@ -229,7 +225,7 @@ using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStrea
  * Users should override the run_task() method with actual task logic.
  */
 class cann_task {
-public:
+  public:
     virtual void run_task() {}
 };
 
@@ -237,16 +233,20 @@ public:
  * @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances.
  */
 class cann_task_queue {
-public:
+  public:
     /**
      * @brief Constructs a task queue with a fixed power-of-two capacity for a specific device.
      *
      * @param capacity Queue capacity. Must be a power of 2.
      * @param device Target device ID (used for context setting).
      */
-    explicit cann_task_queue(size_t capacity, int32_t device)
-        : buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
-          running_(false), device_(device) {
+    explicit cann_task_queue(size_t capacity, int32_t device) :
+        buffer_(capacity),
+        capacity_(capacity),
+        head_(0),
+        tail_(0),
+        running_(false),
+        device_(device) {
         GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
         mask_ = capacity_ - 1;
     }
@@ -257,7 +257,7 @@ public:
      * @param item Unique pointer to the task.
      * @return true if the task was successfully enqueued, false if the queue was full.
      */
-    bool enqueue(std::unique_ptr<cann_task>&& item) {
+    bool enqueue(std::unique_ptr<cann_task> && item) {
         size_t next_tail = (tail_ + 1) & mask_;
 
         if (next_tail == head_) {
@@ -276,17 +276,16 @@ public:
      *
      * @param task Task to be submitted.
      */
-    void submit_task(std::unique_ptr<cann_task>&& task) {
-        while(!enqueue(std::move(task))) {
+    void submit_task(std::unique_ptr<cann_task> && task) {
+        while (!enqueue(std::move(task))) {
             std::this_thread::yield();
             continue;
         }
 
         if (!running_) {
             running_ = true;
-            thread_ = std::thread(&cann_task_queue::execute, this);
+            thread_  = std::thread(&cann_task_queue::execute, this);
         }
-
     }
 
     /**
@@ -309,7 +308,7 @@ public:
         }
     }
 
-private:
+  private:
     /**
      * @brief Worker thread function that continuously dequeues and executes tasks.
      */
@@ -317,7 +316,7 @@ private:
         ggml_cann_set_device(device_);
 
         while (running_) {
-            if(head_ == tail_) {
+            if (head_ == tail_) {
                 std::this_thread::yield();
                 continue;
             }
@@ -330,24 +329,24 @@ private:
     }
 
     std::vector<std::unique_ptr<cann_task>> buffer_;
-    const size_t capacity_;
-    size_t mask_;
-    size_t head_;
-    size_t tail_;
-    bool running_;
-    std::thread thread_;
-    int32_t device_;
+    const size_t                            capacity_;
+    size_t                                  mask_;
+    size_t                                  head_;
+    size_t                                  tail_;
+    bool                                    running_;
+    std::thread                             thread_;
+    int32_t                                 device_;
 };
 
 #ifdef USE_ACL_GRAPH
 struct ggml_graph_node_properties {
     // dst tensor
-    void * node_address;
+    void *  node_address;
     int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
+    size_t  nb[GGML_MAX_DIMS];
 
     // src tensor
-    void * src_address[GGML_MAX_SRC];
+    void *  src_address[GGML_MAX_SRC];
     int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
     size_t  src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
 
@@ -376,13 +375,11 @@ struct ggml_cann_graph {
  * move existing graphs to the front (most recently used), and clear the cache.
  */
 struct ggml_cann_graph_lru_cache {
-    size_t capacity;  /**< Maximum number of graphs in the cache. */
+    size_t capacity;                         /**< Maximum number of graphs in the cache. */
 
-    std::list<ggml_cann_graph*> cache_list; /**< List storing cached graphs as raw pointers. */
+    std::list<ggml_cann_graph *> cache_list; /**< List storing cached graphs as raw pointers. */
 
-    ggml_cann_graph_lru_cache() {
-        capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
-    }
+    ggml_cann_graph_lru_cache() { capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); }
 
     /**
      * @brief Push a new graph to the front of the cache.
@@ -390,11 +387,11 @@ struct ggml_cann_graph_lru_cache {
      * @param new_node Pointer to the new ggml_cann_graph to cache.
      *        Ownership is transferred to the cache (cache will delete it).
      */
-    void push(ggml_cann_graph* new_node) {
+    void push(ggml_cann_graph * new_node) {
         if (cache_list.size() >= capacity) {
-            ggml_cann_graph* old = cache_list.back();
+            ggml_cann_graph * old = cache_list.back();
             cache_list.pop_back();
-            delete old; // free the old graph
+            delete old;  // free the old graph
         }
         cache_list.push_front(new_node);
     }
@@ -403,7 +400,7 @@ struct ggml_cann_graph_lru_cache {
      * @brief Move an existing graph to the front of the cache.
      * @param node Pointer to the ggml_cann_graph to move.
      */
-    void move_to_front(ggml_cann_graph* node) {
+    void move_to_front(ggml_cann_graph * node) {
         cache_list.remove(node);
         cache_list.push_front(node);
     }
@@ -421,92 +418,89 @@ struct ggml_cann_graph_lru_cache {
     /**
      * @brief Destructor that clears the cache and frees all cached graphs.
      */
-    ~ggml_cann_graph_lru_cache() {
-        clear();
-    }
+    ~ggml_cann_graph_lru_cache() { clear(); }
 };
 #endif  // USE_ACL_GRAPH
 
 struct ggml_cann_rope_cache {
     ~ggml_cann_rope_cache() {
-        if(theta_scale_cache != nullptr) {
+        if (theta_scale_cache != nullptr) {
             ACL_CHECK(aclrtFree(theta_scale_cache));
         }
-        if(sin_cache != nullptr) {
+        if (sin_cache != nullptr) {
             ACL_CHECK(aclrtFree(sin_cache));
         }
-        if(cos_cache != nullptr) {
+        if (cos_cache != nullptr) {
             ACL_CHECK(aclrtFree(cos_cache));
         }
     }
 
-    void* theta_scale_cache = nullptr;
+    void *  theta_scale_cache  = nullptr;
     int64_t theta_scale_length = 0;
     // sin/cos cache, used only to accelerate first layer on each device
-    void* sin_cache = nullptr;
-    void* cos_cache = nullptr;
-    int64_t position_length = 0;
+    void *  sin_cache          = nullptr;
+    void *  cos_cache          = nullptr;
+    int64_t position_length    = 0;
     // Properties to check before reusing the sincos cache
-    bool cached = false;
-    float ext_factor = 0.0f;
-    float theta_scale = 0.0f;
-    float freq_scale = 0.0f;
-    float attn_factor = 0.0f;
-    bool is_neox = false;
+    bool    cached             = false;
+    float   ext_factor         = 0.0f;
+    float   theta_scale        = 0.0f;
+    float   freq_scale         = 0.0f;
+    float   attn_factor        = 0.0f;
+    bool    is_neox            = false;
 };
 
 struct ggml_cann_tensor_cache {
     ~ggml_cann_tensor_cache() {
-        if(cache != nullptr) {
+        if (cache != nullptr) {
             ACL_CHECK(aclrtFree(cache));
         }
     }
 
-    void* cache = nullptr;
-    int64_t size = 0;
+    void *  cache = nullptr;
+    int64_t size  = 0;
 };
 
 /**
  * @brief Context for managing CANN backend operations.
  */
 struct ggml_backend_cann_context {
-    int32_t device;                  /**< Device ID. */
-    std::string name;                /**< Name of the device. */
-    std::string description;         /**< Description of the device. */
-    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
+    int32_t     device;               /**< Device ID. */
+    std::string name;                 /**< Name of the device. */
+    std::string description;          /**< Description of the device. */
+    aclrtEvent  copy_event = nullptr; /**< Event for managing copy operations. */
 #ifdef USE_ACL_GRAPH
     /// Cached CANN ACL graph used for executing the current ggml computation graph.
     ggml_cann_graph_lru_cache graph_lru_cache;
-    bool acl_graph_mode = true;
+    bool                      acl_graph_mode = true;
 #endif
-    cann_task_queue task_queue;
-    bool async_mode;
+    cann_task_queue        task_queue;
+    bool                   async_mode;
     // Rope Cache
-    ggml_cann_rope_cache rope_cache;
+    ggml_cann_rope_cache   rope_cache;
     // Constant Pool
     ggml_cann_tensor_cache rms_norm_one_tensor_cache;
     ggml_cann_tensor_cache rms_norm_zero_tensor_cache;
 
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = { nullptr }; /**< Array of streams for the device. */
 
     /**
      * @brief Constructor for initializing the context with a given device.
      * @param device Device ID.
      */
-    explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
+    explicit ggml_backend_cann_context(int device) :
+        device(device),
+        name("CANN" + std::to_string(device)),
+        task_queue(1024, device) {
         ggml_cann_set_device(device);
         description = aclrtGetSocName();
 
         async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
-        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
-            device, async_mode ? "ON" : "OFF");
+        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__, device, async_mode ? "ON" : "OFF");
 #ifdef USE_ACL_GRAPH
         acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
-        GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
-              __func__, device,
-              acl_graph_mode ? "GRAPH" : "EAGER",
-              acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
+        GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", __func__, device, acl_graph_mode ? "GRAPH" : "EAGER",
+                      acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
 #endif
     }
 
@@ -549,8 +543,7 @@ struct ggml_backend_cann_context {
     aclrtStream stream() { return stream(0); }
 
     // TODO: each stream should have a memory pool.
-    std::unique_ptr<ggml_cann_pool>
-        mem_pool; /**< Memory pool for the device. */
+    std::unique_ptr<ggml_cann_pool> mem_pool; /**< Memory pool for the device. */
 
     /**
      * @brief Create a new memory pool for a given device.
@@ -563,7 +556,7 @@ struct ggml_backend_cann_context {
      * @brief Get or create the memory pool for the context.
      * @return Reference to the memory pool.
      */
-    ggml_cann_pool& pool() {
+    ggml_cann_pool & pool() {
         if (mem_pool == nullptr) {
             mem_pool = new_pool_for_device(device);
         }

Plik diff jest za duży
+ 195 - 249
ggml/src/ggml-cann/ggml-cann.cpp


Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików