
CANN: Improve loading efficiency after converting weights to NZ format. (#14985)

* CANN: Improve loading efficiency after converting weights to NZ format.

* CANN: fix typo
hipudding 5 months ago
Parent
Commit
11490b3672
3 changed files with 70 additions and 58 deletions
  1. docs/backend/CANN.md (+4 -2)
  2. ggml/src/ggml-cann/aclnn_ops.cpp (+3 -5)
  3. ggml/src/ggml-cann/ggml-cann.cpp (+63 -51)

+ 4 - 2
docs/backend/CANN.md

@@ -310,5 +310,7 @@ Specifies the memory pool management strategy:
 
 Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
 
-## TODO
-- Support more models and data types.
+### GGML_CANN_WEIGHT_NZ
+
+Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+

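As a usage note not taken from the commit: the option is an environment variable that the backend evaluates once at load time. A minimal standalone sketch of that gate, assuming std::getenv in place of the backend's get_env/parse_bool helpers and assuming the accepted truthy spellings shown below:

```cpp
#include <cctype>
#include <cstdlib>
#include <string>

// Sketch of the "check env once" gate used throughout this commit
// (static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ")...)).
// The set of accepted values here is an assumption for illustration only.
static bool cann_weight_nz_enabled() {
    static const bool enabled = [] {
        const char* v = std::getenv("GGML_CANN_WEIGHT_NZ");
        if (v == nullptr) {
            return false;  // unset: keep matmul weights in ND format
        }
        std::string s(v);
        for (char& c : s) {
            c = (char) std::tolower((unsigned char) c);
        }
        return s == "1" || s == "on" || s == "true" || s == "yes";
    }();
    return enabled;
}
```

In practice the switch would be set before loading the model, e.g. `GGML_CANN_WEIGHT_NZ=1`; which spellings besides `1` are honored depends on the backend's parse_bool helper.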
+ 3 - 5
ggml/src/ggml-cann/aclnn_ops.cpp

@@ -1913,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
                              bcast_weight_nb[4], bcast_weight_nb[5]};
     aclTensor* acl_weight_tensor;
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
-    if (weightToNZ && is_matmul_weight(weight)) {
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    if (weight_to_nz && is_matmul_weight(weight)) {
         int64_t acl_stride[2] = {1, transpose_ne[1]};
 
         // Reverse ne.

+ 63 - 51
ggml/src/ggml-cann/ggml-cann.cpp

@@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
-static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
-                      aclDataType dataType, aclTensor **tensor)
-{
-    uint64_t size = 1;
-    for (auto i : shape) {
-        size *= i;
+// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
+namespace {
+    void* g_nz_workspace = nullptr;
+    size_t g_nz_workspace_allocated = 0;
+
+    void release_nz_workspace() {
+        if (g_nz_workspace) {
+            aclrtFree(g_nz_workspace);
+            g_nz_workspace = nullptr;
+            g_nz_workspace_allocated = 0;
+        }
     }
 
-    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
-    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
-
-    size *= sizeof(int16_t);
-
-    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
-
-    std::vector<int64_t> strides(shape.size(), 1);
-    for (int64_t i = shape.size() - 2; i >= 0; i--) {
-        strides[i] = shape[i + 1] * strides[i + 1];
+    void relloc_nz_workspace(size_t new_size) {
+        if (new_size > g_nz_workspace_allocated) {
+            if (g_nz_workspace) {
+                aclrtFree(g_nz_workspace);
+                g_nz_workspace = nullptr;
+            }
+            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            g_nz_workspace_allocated = new_size;
+        }
     }
-
-    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
-                              shape.data(), shape.size(), *deviceAddr);
-    return 0;
 }
 
+/**
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
+ *
+ * This function creates a transposed tensor descriptor and performs the
+ * TransMatmulWeight operation. Converting tensor formats can significantly
+ * improve performance on certain hardware.
+ *
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
+ * @param data Pointer to the raw data buffer for the tensor weights.
+ * @param offset Byte offset within the tensor data buffer where weights start.
+ *
+ * @note The workspace buffer used in this function is managed globally and reused
+ *       across calls. This reduces overhead from repeated memory allocation and deallocation.
+ */
 static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
-    aclrtStream stream;
-    ACL_CHECK(aclrtCreateStream(&stream));
-
-    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
-    void *weightTransposedDeviceAddr = nullptr;
-    aclTensor *weightTransposed = nullptr;
-    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
-                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
-
+    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
+                                    tensor->nb, 2, ACL_FORMAT_ND, offset);
     uint64_t workspaceSize = 0;
     aclOpExecutor *executor;
-    void *workspaceAddr = nullptr;
 
     // TransMatmulWeight
-    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
-    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
-        workspaceAddrPtrTrans.reset(workspaceAddr);
-    }
-    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
+                                                    &workspaceSize, &executor));
+    // Avoid frequent malloc/free of the workspace.
+    relloc_nz_workspace(workspaceSize);
 
-    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
-
-    aclrtMemcpy((char *)tensor->data + offset, size,
-                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
-    aclrtFree(weightTransposedDeviceAddr);
 }
 
 // TODO: need handle tensor which has paddings.
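A sketch of the reuse pattern behind g_nz_workspace, written against plain malloc/free so it reads without the ACL runtime; the names are illustrative, not the backend's. The scratch buffer only grows while weights are being converted and is released once afterwards (in the commit, release_nz_workspace() runs at the start of graph compute):

```cpp
#include <cstdlib>
#include <cstddef>

namespace {
    void*  g_ws      = nullptr;  // shared scratch buffer (stand-in for the device workspace)
    size_t g_ws_size = 0;

    // Grow-only reservation: reallocate only when a conversion needs more space
    // than any previous one, instead of one malloc/free pair per weight.
    // Error handling omitted for brevity.
    void ws_reserve(size_t need) {
        if (need > g_ws_size) {
            std::free(g_ws);
            g_ws      = std::malloc(need);
            g_ws_size = need;
        }
    }

    // Called once after all conversions are done, mirroring release_nz_workspace().
    void ws_release() {
        std::free(g_ws);
        g_ws      = nullptr;
        g_ws_size = 0;
    }
}

int main() {
    // Simulate converting three weights with different workspace requirements.
    const size_t sizes[] = {1 << 20, 4 << 20, 2 << 20};
    for (size_t need : sizes) {
        ws_reserve(need);  // the third call reuses the 4 MiB buffer from the second
        // ... run the conversion using g_ws as scratch ...
    }
    ws_release();          // free the scratch once, not per weight
    return 0;
}
```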
@@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+        if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
             weight_format_to_nz(tensor, data, offset);
         }
     } else {
@@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+
     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.
     // TODO: quantized type?
     // int64_t line_size = ne0 * ggml_element_size(tensor);
     // int64_t line_size_align_32 = (line_size + 31) & ~31;
     // size += (line_size_align_32 - line_size);
-
-    // TODO: not support quantized yet.
-    // TODO: consider un-continue tensor.
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             size += ggml_row_size(
                 tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
+    } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+        // NZ-format weights do not support quantized types yet.
+        // Transforming an ND tensor to NZ may change its size.
+        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
+        GGML_ASSERT(tensor->ne[2] == 1);
+        GGML_ASSERT(tensor->ne[3] == 1);
+        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
+        size_t new_size;
+        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
+                    ggml_cann_type_mapping(tensor->type), &new_size));
+        ACL_CHECK(aclDestroyIntArray(acl_shape));
+        size = std::max(size, new_size);
     }
 
     return size;
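A hedged illustration of why the allocation size is bumped here: NZ is a blocked ("fractal") layout, so a weight can occupy more bytes than its ND ggml_nbytes() once its dimensions are padded to the block size. The authoritative value comes from aclnnCalculateMatmulWeightSizeV2 as in the diff; the 16-element block size below is an assumption used only for illustration:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Rough estimate of an NZ-padded weight footprint, assuming (hypothetically)
// that both dimensions are padded up to multiples of 16 elements.
static size_t nz_padded_size_guess(int64_t ne0, int64_t ne1, size_t elem_size) {
    const int64_t block = 16;
    const int64_t ne0_p = (ne0 + block - 1) / block * block;
    const int64_t ne1_p = (ne1 + block - 1) / block * block;
    return (size_t)(ne0_p * ne1_p) * elem_size;
}

// As in the diff, the buffer is never made smaller than the plain ND size:
//   size = std::max(size, new_size);
static size_t alloc_size_for_weight(int64_t ne0, int64_t ne1, size_t elem_size) {
    const size_t nd_size = (size_t)(ne0 * ne1) * elem_size;
    return std::max(nd_size, nz_padded_size_guess(ne0, ne1, elem_size));
}
```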
@@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
         (ggml_backend_cann_context*)backend->context;
 
     ggml_cann_set_device(cann_ctx->device);
+    // Release the temporary NZ workspace created while setting tensors.
+    release_nz_workspace();
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];