@@ -894,14 +894,13 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
 }
 
 /**
- * @brief Get or expand a cached float32 tensor filled with a scalar value.
+ * @brief Get or expand a cached tensor filled with a scalar value.
  *
- * This function manages cached device memory for float32 tensors. If the current
+ * This function manages cached device memory for tensors. If the current
  * cache size is insufficient for the requested tensor shape, the old memory will
- * be released and new memory will be allocated. The allocated buffer is then
- * initialized either with zeros (when @p value == 0.0f) or with the given scalar
- * value using CANN operations. Finally, an aclTensor object is created from the
- * cached memory and returned.
+ * be released and new memory will be allocated. The allocated buffer is
+ * initialized with the given scalar value using CANN operations.
+ * Finally, an aclTensor object is created from the cached memory and returned.
  *
  * @param ctx The CANN backend context that manages device memory.
  * @param buffer A pointer to the cached device buffer (will be allocated
@@ -910,17 +909,19 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
  * updated when the cache is expanded.
  * @param ne The tensor shape array (number of elements in each dimension).
  * @param nb The stride size for each dimension.
+ * @param dtype The data type of the cached tensor.
  * @param dims The number of tensor dimensions.
  * @param value The scalar value used to fill the tensor (supports zero
  * initialization via memset or arbitrary values via fill_scalar).
  * @return An aclTensor pointer created from the cached buffer.
  */
-static aclTensor* get_f32_cache_acl_tensor(
+static aclTensor* get_cache_acl_tensor(
     ggml_backend_cann_context& ctx,
     void** buffer,
     int64_t &cache_element,
     int64_t* ne,
     size_t* nb,
+    ggml_type dtype,
     int64_t dims,
     float value) {
     // Calculate total number of elements
@@ -928,7 +929,7 @@ static aclTensor* get_f32_cache_acl_tensor(
     for (int i = 0; i < dims; i++) {
         n_element *= ne[i];
     }
-    size_t size = n_element * sizeof(float);
+    size_t size = n_element * ggml_type_size(dtype);
 
     // Allocate or expand cache if needed
     if (cache_element < n_element) {
@@ -941,19 +942,17 @@ static aclTensor* get_f32_cache_acl_tensor(
         cache_element = n_element;
 
         // Initialize cache
-        if (value == 0.0f) {
-            ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
-        } else {
-            int64_t pool_ne[1] = { n_element };
-            size_t pool_nb[1] = { sizeof(float) };
-            aclTensor* acl_value = ggml_cann_create_tensor(
-                *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
-            aclnn_fill_scalar(ctx, 1, acl_value);
-            ggml_cann_release_resources(ctx, acl_value);
-        }
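+        // Fill the newly allocated cache: view it as a flat 1-D tensor of dtype and set every element to value.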
+        int64_t pool_ne[1] = { n_element };
+        size_t pool_nb[1] = { ggml_type_size(dtype) };
+        aclTensor* acl_value = ggml_cann_create_tensor(
+            *buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype),
+            pool_ne, pool_nb, 1);
+        aclnn_fill_scalar(ctx, value, acl_value);
+        ggml_cann_release_resources(ctx, acl_value);
     }
 
-    return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
+    return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype),
+                                   ggml_type_size(dtype), ne, nb, dims);
 }
 
 void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -965,35 +964,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    // build gamma, one...
+    // build gamma.
     size_t acl_gamma_nb[GGML_MAX_DIMS];
-    acl_gamma_nb[0] = sizeof(float);
+    // gamma's type is the same as dst's.
+    acl_gamma_nb[0] = ggml_type_size(dst->type);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
         acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
     }
-    aclTensor* acl_gamma = get_f32_cache_acl_tensor(
+    aclTensor* acl_gamma = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_one_tensor_cache.cache,
         ctx.rms_norm_one_tensor_cache.size,
         src->ne,
         acl_gamma_nb,
+        dst->type,
         1, // dims
         1.0f // value
     );
 
-    // build rstd, zero...
+    // build rstd.
     int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
     size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
+    // rstd will always be F32.
     acl_rstd_nb[0] = sizeof(float);
     for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
         acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
     }
-    aclTensor* acl_rstd = get_f32_cache_acl_tensor(
+    aclTensor* acl_rstd = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_zero_tensor_cache.cache,
         ctx.rms_norm_zero_tensor_cache.size,
         acl_rstd_ne,
         acl_rstd_nb,
+        GGML_TYPE_F32,
         GGML_MAX_DIMS - 1,
         0.0f // value
     );
@@ -1765,33 +1768,35 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0]; // src
     ggml_tensor* src1 = dst->src[1]; // index
 
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
     switch (src0->type) {
-        case GGML_TYPE_F32: {
-            aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
-                                  dst->data, dst->ne, dst->nb,
-                                  src1, dst->type);
-            break;
-        }
-        case GGML_TYPE_F16: {
-            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-            ggml_cann_pool_alloc src_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-            void* src_trans_buffer = src_buffer_allocator.get();
-            size_t src_trans_nb[GGML_MAX_DIMS];
-            src_trans_nb[0] = sizeof(float);
-            for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
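+            // Same dtype: gather rows from src0 straight into dst; otherwise cast first.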
+            if(src0->type == dst->type) {
+                aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
+                                      dst->data, dst->ne, dst->nb,
+                                      src1, dst->type);
+            } else {
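+                // Types differ: cast src0 into a dst-typed staging buffer, then gather from it.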
+                aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = dst->nb[0];
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                    src0->ne, src_trans_nb, GGML_MAX_DIMS);
+                aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+                aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
+                                      dst->data, dst->ne, dst->nb,
+                                      src1, dst->type);
+                ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             }
-            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
-                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
-                src0->ne, src_trans_nb, GGML_MAX_DIMS);
-            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
-            aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
-                                  dst->data, dst->ne, dst->nb,
-                                  src1, dst->type);
-            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             break;
-        }
         case GGML_TYPE_Q8_0: {
             // add 1 dim for bcast mul.
             size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
@@ -1799,7 +1804,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
                 *dequant_ne;
             int64_t scale_offset = 0;
-
             // [3,4,5,64] -> [3,4,5,2,32]
             weight_ne[0] = QK8_0;
             weight_ne[1] = src0->ne[0] / QK8_0;
@@ -1809,7 +1813,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 weight_ne[i] = src0->ne[i - 1];
                 weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,1]
             scale_ne[0] = 1;
             scale_ne[1] = src0->ne[0] / QK8_0;
@@ -1819,18 +1822,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 scale_ne[i] = src0->ne[i - 1];
                 scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,32]
             dequant_ne = weight_ne;
-            dequant_nb[0] = sizeof(float);
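+            // dequantized values are produced directly in dst->type (f16 or f32).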
+            dequant_nb[0] = ggml_type_size(dst->type);
             for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
             }
-
             scale_offset = ggml_nelements(src0) * sizeof(int8_t);
             ggml_cann_pool_alloc dequant_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-
+                ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type));
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
                 GGML_MAX_DIMS + 1);
@@ -1838,16 +1838,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
                 GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
             aclTensor* dequant_tensor = ggml_cann_create_tensor(
-                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
+                dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
                 dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
-
             aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
-            dequant_nb[0] = sizeof(float);
+            dequant_nb[0] = ggml_type_size(dst->type);
             dequant_ne = src0->ne;
             for (int i = 1; i < GGML_MAX_DIMS; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
             }
-
             aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
                                   dequant_ne, dequant_nb,
                                   dst->data, dst->ne, dst->nb,
@@ -1965,16 +1963,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // Only check env once.
     static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (weight_to_nz && is_matmul_weight(weight)) {
-        int64_t acl_stride[2] = {1, transpose_ne[1]};
-
-        // Reverse ne.
-        std::reverse(transpose_ne, transpose_ne + n_dims);
-
-        std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
-
-        acl_weight_tensor = aclCreateTensor(
-            transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
-            0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
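+        // weight is stored in FRACTAL_NZ layout here, so build the view with that format.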
+        acl_weight_tensor =
+            ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
     } else {
         acl_weight_tensor =
             ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
@@ -3178,7 +3168,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         aclTensor* acl_src0_f16_tensor = nullptr;
         aclTensor* acl_src1_f16_tensor = nullptr;
         aclTensor* acl_src2_f16_tensor = nullptr;
-        aclTensor* acl_dst_f16_tensor = nullptr;
 
         // Step 1: cast the src0 (Query) to fp16 if needed
         ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
@@ -3216,22 +3205,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
                                                       src2_bsnd_nb, GGML_MAX_DIMS);
 
-        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
-        void* out_f16_buffer = out_f16_allocator.alloc(
-            ggml_nelements(dst) * faElemSize);
-
-        int64_t* out_f16_ne = src0_bsnd_ne;
-        size_t out_f16_nb[GGML_MAX_DIMS];
-        out_f16_nb[0] = faElemSize;
-        for(int i = 1; i < GGML_MAX_DIMS; ++i){
-            out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
-        }
-
-        acl_dst_f16_tensor = ggml_cann_create_tensor(
-            out_f16_buffer, faDataType, faElemSize,
-            out_f16_ne, out_f16_nb, GGML_MAX_DIMS
-        );
-
         // Step 3: create the PSEShift tensor if needed
         // this tensor is considered as mask (f16) in the llama.cpp
         aclTensor* bcast_pse_tensor = nullptr;
@@ -3334,8 +3307,29 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
        int64_t keyAntiquantMode = 0;
        int64_t valueAntiquantMode = 0;

-        // Step 5: launch the FusedInferAttentionScoreV2 kernel.
-        // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
+        GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+        aclTensor * fa_dst_tensor = nullptr;
+        aclTensor * acl_dst_tensor = nullptr;
+        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
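+        // The FA kernel writes faDataType; for f32 dst stage the result in a scratch buffer and cast afterwards, for f16 write into dst directly.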
+        if (dst->type == GGML_TYPE_F32) {
+            void* out_f16_buffer = out_f16_allocator.alloc(
+                ggml_nelements(dst) * faElemSize);
+
+            int64_t* out_f16_ne = src0_bsnd_ne;
+            size_t out_f16_nb[GGML_MAX_DIMS];
+            out_f16_nb[0] = faElemSize;
+            for(int i = 1; i < GGML_MAX_DIMS; ++i){
+                out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
+            }
+
+            fa_dst_tensor = ggml_cann_create_tensor(
+                out_f16_buffer, faDataType, faElemSize,
+                out_f16_ne, out_f16_nb, GGML_MAX_DIMS
+            );
+        }
+        else {
+            fa_dst_tensor = ggml_cann_create_tensor(dst);
+        }
 
         GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
             acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
@@ -3357,23 +3351,24 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             blockSize, antiquantMode, // blockSize, antiquantMode
             softmaxLseFlag, // softmaxLseFlag
             keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
-            acl_dst_f16_tensor, // attentionOut
+            fa_dst_tensor, // attentionOut
             nullptr // softmaxLse
         );
 
-        // Step 6: post-processing, permute and cast to f32
-        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-        // TODO: when dst is fp16, don't need cast
-        aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
-        ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
-                                    acl_src1_f16_tensor,
-                                    acl_src2_f16_tensor,
-                                    acl_dst_f16_tensor,
-                                    acl_dst_tensor);
-        if(src3 != nullptr){
-            ggml_cann_release_resources(ctx, bcast_pse_tensor);
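+        // Only the f32 path needs the staged FA result cast back into dst; an f16 dst was written in place.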
+        if (dst->type == GGML_TYPE_F32) {
+            // Step 6: post-processing, permute and cast to f32
+            acl_dst_tensor = ggml_cann_create_tensor(dst);
+            aclnn_cast(ctx, fa_dst_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
         }
-    }else{
+
+        ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
+                                    acl_src1_f16_tensor,
+                                    acl_src2_f16_tensor,
+                                    fa_dst_tensor,
+                                    acl_dst_tensor,
+                                    bcast_pse_tensor);
+
+    } else {
         GGML_ABORT("Function is not implemented.");
     }
 }