@@ -1,15 +1,4 @@
-#ifndef CANN_ACLNN_OPS
-#define CANN_ACLNN_OPS
-
 /**
- * @file acl_tensor
- * @brief This file contains related functions of ggml_tensor and acl_tensor.
- * Contains conversion from ggml_tensor to acl_tensor, broadcast and other
- * functions.
- * @author hipudding <huafengchun@gmail.com>
- * @author wangshuai09 <391746016@qq.com>
- * @date July 15, 2024
- *
  * Copyright (c) 2023-2024 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,6 +20,9 @@
  * IN THE SOFTWARE.
  */

+#ifndef CANN_ACLNN_OPS
+#define CANN_ACLNN_OPS
+
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
 #include <aclnnop/aclnn_exp.h>
@@ -483,8 +475,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  * operation is executed using the CANN backend for optimized performance.
  *
  * @param ctx The CANN context used for operations.
- * @param dst The destination tensor where the indices of the maximum values will be stored.
- *            dst->op is `GGML_OP_ARGMAX`.
+ * @param dst The destination tensor where the indices of the maximum values will
+ *            be stored. dst->op is `GGML_OP_ARGMAX`.
  */
 void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);

@@ -600,40 +592,8 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                aclTensor* acl_dst);

 /**
- * @brief Launches an asynchronous task using the memory allocator.
- *
- * This macro submit an asynchronous task on the specified stream.
- * The task uses memory allocated by the allocator. It is guaranteed
- * that the memory will not be accessed by other tasks until this task
- * completes, due to the sequential execution order within the same stream.
- *
- * @param OP_NAME aclnn operator name.
- * @param args Additional arguments required by the task.
- *
- * @note
- * Memory from the allocator will be "freed" immediately and can be
- * reallocated to other pointers. However, it won't be accessed by any
- * other task before this asynchronous task ends, because all tasks in the
- * same stream are executed in queue order.
- */
-#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...)                                     \
-    do {                                                                          \
-        uint64_t workspaceSize = 0;                                               \
-        aclOpExecutor * executor;                                                 \
-        void * workspaceAddr = nullptr;                                           \
-                                                                                  \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
-                                                                                  \
-        if (workspaceSize > 0) {                                                  \
-            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);  \
-            workspaceAddr = workspace_allocator.get();                            \
-        }                                                                         \
-        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
-    } while (0)
-
-
-/**
- * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor.
+ * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
+ *        output tensor.
  *
  * This function checks whether broadcasting is needed between `src0` and `src1`.
  * If broadcasting is required, it calculates the proper shapes and creates
@@ -647,14 +607,57 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
  * @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
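+ *
+ * Example (illustrative sketch inside a binary op implementation):
+ * @code
+ * aclTensor * acl_src0 = nullptr;
+ * aclTensor * acl_src1 = nullptr;
+ * aclTensor * acl_dst  = nullptr;
+ * bcast_shape(dst->src[0], dst->src[1], dst, &acl_src0, &acl_src1, &acl_dst);
+ * @endcode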
  */
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
-                 aclTensor ** acl_src1, aclTensor ** acl_dst);
+void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
+                 aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
+
+/**
+ * @brief Computes the 1D transposed convolution (deconvolution) of a ggml
+ * tensor using the CANN backend.
+ *
+ * @details This function performs a 1D transposed convolution (also known as
+ * deconvolution) operation on the input tensor. The computed result is stored
+ * in the destination tensor `dst`. The operation is optimized using the CANN
+ * backend for improved performance.
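+ *
+ * For reference, the standard transposed-convolution length relationship
+ * (assuming no extra output padding) is:
+ *   L_out = (L_in - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + 1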
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the transposed convolution result
+ * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
+ */
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);

 /**
- * @brief Applies a element-wise operation to two input tensors using the CANN backend.
+ * @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
+ * using the CANN backend.
+ *
+ * @details This function performs an element-wise ELU activation on the input
+ * tensor.
+ * The result is written to the destination tensor `dst` in-place.
+ * The ELU function is defined as:
+ *
+ * \text{ELU}(x) =
+ * \begin{cases}
+ * x, & \text{if } x > 0 \\
+ * \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
+ * \end{cases}
  *
- * This templated function takes a binary operator and applies it to two source tensors
- * associated with the destination tensor. The function handles broadcasting as needed.
+ * where α (alpha) is a hyperparameter, typically set to 1.0.
+ * This operation is optimized using the CANN backend for high-performance
+ * inference or training.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the ELU-activated result will be stored.
+ *            dst->op is expected to be `GGML_OP_ELU`.
+ */
+void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Applies an element-wise operation to two input tensors using the CANN
+ * backend.
+ *
+ * This templated function takes a binary operator and applies it to two source
+ * tensors associated with the destination tensor. The function handles
+ * broadcasting as needed.
  *
  * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
  *                   the binary operation to be performed. It must take three arguments:
@@ -681,6 +684,38 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }

+/**
+ * @brief Launches an asynchronous task using the memory allocator.
+ *
+ * This macro submits an asynchronous task on the specified stream.
+ * The task uses memory allocated by the allocator. It is guaranteed
+ * that the memory will not be accessed by other tasks until this task
+ * completes, due to the sequential execution order within the same stream.
+ *
+ * @param OP_NAME aclnn operator name.
+ * @param args Additional arguments required by the task.
+ *
+ * @note
+ * Memory from the allocator will be "freed" immediately and can be
+ * reallocated to other pointers. However, it won't be accessed by any
+ * other task before this asynchronous task ends, because all tasks in the
+ * same stream are executed in queue order.
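+ *
+ * Typical usage (illustrative sketch; `Abs` stands in for any aclnn operator,
+ * and `ctx`, `acl_src`, `acl_dst` are assumed to be in scope):
+ * @code
+ * GGML_CANN_CALL_ACLNN_OP(Abs, acl_src, acl_dst);
+ * @endcode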
+ */
+#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...)                                     \
+    do {                                                                          \
+        uint64_t workspaceSize = 0;                                               \
+        aclOpExecutor * executor;                                                 \
+        void * workspaceAddr = nullptr;                                           \
+                                                                                  \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+                                                                                  \
+        if (workspaceSize > 0) {                                                  \
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);  \
+            workspaceAddr = workspace_allocator.get();                            \
+        }                                                                         \
+        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
+    } while (0)
+
 /**
  * @brief Applies a unary operation to an input tensor using the CANN backend.
  *
@@ -690,7 +725,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  * @tparam unary_op A callable with the signature:
  *        void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
  *        where the first aclTensor is the source and the second is the destination.
- *
  * @param ctx The CANN backend context for managing resources and execution.
  * @param dst The destination tensor. Its src[0] is treated as the input tensor.
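+ *
+ * Example (illustrative; `aclnn_sin`, declared in this header, matches the
+ * required signature):
+ * @code
+ * ggml_cann_unary_op<aclnn_sin>(ctx, dst);
+ * @endcode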
  */
@@ -702,10 +736,30 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);

     unary_op(ctx, acl_src, acl_dst);
+
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }

+/**
+ * @brief Applies a unary operation to a ggml tensor using the CANN backend.
+ *
+ * @details This function performs a unary operation on the input tensor using
+ * a user-provided lambda or callable object `unary_op`, which accepts the CANN
+ * context and two ACL tensors (source and destination). Internally, this function
+ * creates ACL representations of the ggml tensors and invokes the unary operation.
+ * The result is stored in the destination tensor `dst`. This utility abstracts the
+ * common boilerplate of tensor conversion and cleanup when implementing unary ops.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN APIs.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ * The source tensor is retrieved from `dst->src[0]`.
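+ *
+ * Example (illustrative sketch; `Abs` stands in for any aclnn unary operator):
+ * @code
+ * ggml_cann_unary_op(
+ *     [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
+ *         GGML_CANN_CALL_ACLNN_OP(Abs, acl_src, acl_dst);
+ *     },
+ *     ctx, dst);
+ * @endcode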
+ */
+void ggml_cann_unary_op(
+    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
+    ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
  *
@@ -725,11 +779,12 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
  */
 #define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
     do {                                                         \
-        auto lambda = [](auto ctx, auto acl_src, auto acl_dst) { \
+        auto lambda = [](ggml_backend_cann_context& ctx,         \
+            aclTensor* acl_src,                                   \
+            aclTensor* acl_dst) {                                 \
             GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
         };                                                        \
-        ggml_cann_unary_op<lambda>(ctx, dst);                     \
+        ggml_cann_unary_op(lambda, ctx, dst);                     \
     } \
     while (0)
-
 #endif // CANN_ACLNN_OPS