@@ -23,6 +23,7 @@
 #ifndef CANN_ACLNN_OPS
 #define CANN_ACLNN_OPS

+#include <functional>
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
 #include <aclnnop/aclnn_exp.h>
@@ -713,6 +714,270 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);

+/**
+ * @brief A generic wrapper for ACL resources with custom deleter support.
+ */
+using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
+
+/**
+ * @brief Trait structure used to define how to destroy a given ACL resource type.
+ *
+ * @tparam T ACL resource type.
+ */
+template<typename T>
+struct acl_resource_traits;
+
+/**
+ * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
+ */
+template<>
+struct acl_resource_traits<aclTensor> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
+ */
+template<>
+struct acl_resource_traits<aclIntArray> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
+ */
+template<>
+struct acl_resource_traits<aclScalar> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
+ */
+template<>
+struct acl_resource_traits<aclTensorList> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
+    }
+};
+
+/**
+ * @brief Creates a generic ACL resource wrapper with proper destruction logic.
+ *
+ * @tparam T ACL resource type.
+ * @param ptr Raw pointer to ACL resource.
+ * @return any_acl_resource Smart pointer that handles destruction.
+ */
+template<typename T>
+any_acl_resource make_acl_resource(T* ptr) {
+    return any_acl_resource(
+        static_cast<void*>(ptr),
+        [](void* p) {
+            acl_resource_traits<T>::destroy(p);
+        }
+    );
+}
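+
+/*
+ * Illustrative sketch (hypothetical handle name): wrapping a raw aclTensor* so
+ * that aclDestroyTensor is called automatically when the wrapper goes out of
+ * scope. Here `acl_src` stands for any handle, e.g. one returned by
+ * ggml_cann_create_tensor():
+ *
+ *   any_acl_resource guard = make_acl_resource(acl_src);
+ *   // ... use acl_src ...
+ *   // destroying `guard` calls acl_resource_traits<aclTensor>::destroy(acl_src).
+ */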
+
+/**
+ * @brief Registers multiple ACL resources into a vector for lifetime management.
+ *
+ * @tparam Args Variadic list of ACL resource types.
+ * @param vec Target vector to hold ACL resources.
+ * @param args Raw pointers to ACL resources.
+ */
+template<typename... Args>
+void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
+    (vec.emplace_back(make_acl_resource(args)), ...);
+}
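+
+/*
+ * Illustrative sketch (hypothetical handle names `acl_src` / `acl_dst`):
+ *
+ *   std::vector<any_acl_resource> resources;
+ *   register_acl_resources(resources, acl_src, acl_dst);
+ *   // clearing or destroying `resources` destroys both handles.
+ */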
+
+/**
+ * @brief Task class that wraps the execution of an aclnn function call.
+ */
+class aclnn_task : public cann_task {
+  public:
+    aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
+               uint64_t workspace_size, aclOpExecutor * executor,
+               aclrtStream stream) :
+        aclnn_func_(aclnn_func),
+        workspace_addr_(workspace_addr),
+        workspace_size_(workspace_size),
+        executor_(executor),
+        stream_(stream) {}
+    virtual void run_task() override {
+        ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
+    }
+  private:
+    aclnn_func_t aclnn_func_;
+    void * workspace_addr_;
+    uint64_t workspace_size_;
+    aclOpExecutor * executor_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Task class that releases ACL resources after usage.
+ */
+class release_resource_task : public cann_task {
+public:
+    release_resource_task(std::vector<any_acl_resource>&& resources){
+        resource_ = std::move(resources);
+    }
+
+    virtual void run_task() override {
+        resource_.clear();
+    }
+private:
+    std::vector<any_acl_resource> resource_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory copy operations.
+ */
+class async_memcpy_task : public cann_task {
+public:
+    async_memcpy_task(void* dst, const void* src, size_t size,
+                      aclrtMemcpyKind kind, aclrtStream stream)
+        : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
+
+    virtual void run_task() override {
+        ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
+    }
+private:
+    void* dst_;
+    const void* src_;
+    size_t size_;
+    aclrtMemcpyKind kind_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory set operations.
+ */
+class async_memset_task : public cann_task {
+  public:
+    async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
+        : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
+
+    virtual void run_task() override {
+        ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
+    }
+  private:
+    void* buffer_;
+    size_t size_;
+    int32_t value_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Launches an asynchronous task using the memory allocator.
+ *
+ * This macro submits an asynchronous task on the specified stream.
+ * The task uses memory allocated by the allocator. It is guaranteed
+ * that the memory will not be accessed by other tasks until this task
+ * completes, due to the sequential execution order within the same stream.
+ *
+ * @param CTX     The CANN backend context that provides the memory pool,
+ *                stream, and task queue.
+ * @param OP_NAME aclnn operator name.
+ * @param args    Additional arguments required by the task.
+ *
+ * @note
+ * Memory from the allocator will be "freed" immediately and can be
+ * reallocated to other pointers. However, it won't be accessed by any
+ * other task before this asynchronous task ends, because all tasks in the
+ * same stream are executed in queue order.
+ */
+
+#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                           \
+    do {                                                                                     \
+        uint64_t workspaceSize = 0;                                                          \
+        aclOpExecutor * executor;                                                            \
+        void * workspaceAddr = nullptr;                                                      \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+        /* workspace should be allocated in the main thread to keep the malloc order when using vmm. */ \
+        if (workspaceSize > 0) {                                                             \
+            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);             \
+            workspaceAddr = workspace_allocator.get();                                       \
+        }                                                                                    \
+        if (CTX.async_mode) {                                                                \
+            auto task =                                                                      \
+                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize,   \
+                                             executor, CTX.stream());                        \
+            CTX.task_queue.submit_task(std::move(task));                                     \
+        } else {                                                                             \
+            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \
+        }                                                                                    \
+    } while (0)
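+
+/*
+ * Illustrative call sketch (hypothetical handles): dispatching aclnnExp on two
+ * pre-built tensor handles, where `acl_src` and `acl_dst` are aclTensor*
+ * values created elsewhere (e.g. via ggml_cann_create_tensor):
+ *
+ *   GGML_CANN_CALL_ACLNN_OP(ctx, Exp, acl_src, acl_dst);
+ */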
+
+/**
+ * @brief Registers and releases multiple ACL resources, optionally deferring the release
+ *        using a task.
+ *
+ * @tparam Args Types of the ACL resources.
+ * @param ctx  Backend context which manages task submission and async mode.
+ * @param args Pointers to ACL resources to be released.
+ */
+template <typename... Args>
+void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
+    std::vector<any_acl_resource> resources;
+    register_acl_resources(resources, std::forward<Args>(args)...);
+    if (ctx.async_mode) {
+        auto task = std::make_unique<release_resource_task>(std::move(resources));
+        ctx.task_queue.submit_task(std::move(task));
+    }
+}
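+
+/*
+ * Illustrative sketch of the typical pattern at the end of an op implementation
+ * (the handles `acl_src` / `acl_dst` are hypothetical):
+ *
+ *   GGML_CANN_CALL_ACLNN_OP(ctx, Exp, acl_src, acl_dst);
+ *   ggml_cann_release_resources(ctx, acl_src, acl_dst);
+ *
+ * In async mode the handles are destroyed by a release_resource_task queued
+ * behind the operator; otherwise they are destroyed before this call returns.
+ */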
+
+/**
+ * @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
+ *
+ * @param ctx  Backend context containing stream and async configuration.
+ * @param dst  Destination memory address.
+ * @param src  Source memory address.
+ * @param len  Size of memory to copy (in bytes).
+ * @param kind Type of memory copy (host-to-device, device-to-host, etc.).
+ */
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
+                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
+    }
+}
+
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
+                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx->async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
+        ctx->task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
+    }
+}
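+
+/*
+ * Illustrative sketch (hypothetical pointers/size): copying `nbytes` between
+ * two device buffers on the context stream:
+ *
+ *   ggml_cann_async_memcpy(ctx, dst_ptr, src_ptr, nbytes, ACL_MEMCPY_DEVICE_TO_DEVICE);
+ */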
+
+/**
+ * @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
+ *
+ * @param ctx    Backend context containing stream and async configuration.
+ * @param buffer Memory buffer to be set.
+ * @param size   Size of the memory buffer (in bytes).
+ * @param value  Value to set in the buffer.
+ */
+inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
+                                   size_t size, int value) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
+    }
+}
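+
+/*
+ * Illustrative sketch (hypothetical pointer/size): zero-filling a device buffer
+ * on the context stream:
+ *
+ *   ggml_cann_async_memset(ctx, buffer_ptr, nbytes, 0);
+ */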
+
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
@@ -742,42 +1007,9 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
     binary_op(ctx, acl_src0, acl_src1, acl_dst);

-    ACL_CHECK(aclDestroyTensor(acl_src0));
-    ACL_CHECK(aclDestroyTensor(acl_src1));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
 }

-/**
- * @brief Launches an asynchronous task using the memory allocator.
- *
- * This macro submit an asynchronous task on the specified stream.
- * The task uses memory allocated by the allocator. It is guaranteed
- * that the memory will not be accessed by other tasks until this task
- * completes, due to the sequential execution order within the same stream.
- *
- * @param OP_NAME aclnn operator name.
- * @param args Additional arguments required by the task.
- *
- * @note
- * Memory from the allocator will be "freed" immediately and can be
- * reallocated to other pointers. However, it won't be accessed by any
- * other task before this asynchronous task ends, because all tasks in the
- * same stream are executed in queue order.
- */
-#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...)                                                \
-    do {                                                                                     \
-        uint64_t workspaceSize = 0;                                                          \
-        aclOpExecutor * executor;                                                            \
-        void * workspaceAddr = nullptr;                                                      \
-                                                                                             \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
-                                                                                             \
-        if (workspaceSize > 0) {                                                             \
-            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);             \
-            workspaceAddr = workspace_allocator.get();                                       \
-        }                                                                                    \
-        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream()));     \
-    } while (0)

 /**
  * @brief Applies a unary operation to an input tensor using the CANN backend.
@@ -799,9 +1031,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);

     unary_op(ctx, acl_src, acl_dst);
-
-    ACL_CHECK(aclDestroyTensor(acl_src));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }

 /**
@@ -832,7 +1062,7 @@ void ggml_cann_unary_op(
  *
  * Internally, the lambda will call:
  * @code
- * GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
  * @endcode
  *
  * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
@@ -840,14 +1070,14 @@ void ggml_cann_unary_op(
  * @see ggml_cann_unary_op
  * @see GGML_CANN_CALL_ACLNN_OP
  */
-#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                          \
-    do {                                                          \
-        auto lambda = [](ggml_backend_cann_context& ctx,          \
-                         aclTensor* acl_src,                      \
-                         aclTensor* acl_dst) {                    \
-            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);   \
-        };                                                        \
-        ggml_cann_unary_op(lambda, ctx, dst);                     \
-    }                                                             \
+#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                              \
+    do {                                                              \
+        auto lambda = [](ggml_backend_cann_context& ctx,              \
+                         aclTensor* acl_src,                          \
+                         aclTensor* acl_dst) {                        \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
+        };                                                            \
+        ggml_cann_unary_op(lambda, ctx, dst);                         \
+    }                                                                 \
     while (0)
 #endif // CANN_ACLNN_OPS