@@ -329,7 +329,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         } else
 #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
-            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            if (src0->type == GGML_TYPE_F32) {
+                ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            } else {
+                CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            }
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
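Why the F32 special case: cudaMemcpyAsync records fixed source and destination
addresses when a CUDA graph is captured, so the graph must be updated or
rebuilt (or graphs disabled entirely) whenever the destination buffer moves.
The copy kernel instead takes dest_ptrs_d and graph_cpynode_index, so it can
resolve its destination through a device-side pointer table at execution time.
A minimal sketch of that indirection idea, using hypothetical names
(copy_f32_indirect, dst_indirect, node_idx); this is not ggml's actual
cpy_flt kernel:

    __global__ void copy_f32_indirect(const float * src, float * dst_direct,
                                      float ** dst_indirect, int node_idx, int64_t n) {
        // If an indirection table is supplied, look the destination up on the
        // device at execution time; a captured graph then stays valid when only
        // the table entry changes between launches.
        float * dst = dst_indirect ? dst_indirect[node_idx] : dst_direct;
        const int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            dst[i] = src[i];
        }
    }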
@@ -400,7 +404,13 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        return nullptr;
+        // Prioritize CUDA graph compatibility over direct memory copy optimization.
+        // Using copy kernels here maintains graph indirection support, preventing a performance regression from disabled CUDA graphs.
+        if (src0->type == GGML_TYPE_F32) {
+            return (void*) cpy_flt<cpy_1_flt<float, float>>;
+        } else {
+            return nullptr;
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_flt<cpy_1_flt<float, float>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
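Hunk context: ggml_cuda_cpy_fn previously reported nullptr for every
contiguous same-type copy, meaning "plain memcpy". Returning the real kernel
pointer for the F32 case keeps such copy nodes eligible for graph indirection
instead of forcing CUDA graphs off. A hedged caller-side sketch of how the
returned pointer could be consumed (use_cuda_graph is a hypothetical flag; the
actual call site in the CUDA backend is not shown in this diff):

    void * fn = ggml_cuda_cpy_fn(src0, src1);
    if (fn == nullptr) {
        // No indirection-capable kernel: the destination address would be
        // baked into a captured graph, so fall back to non-graph execution.
        use_cuda_graph = false;
    }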