@@ -2358,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
 }
 
 GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
-
     ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
     ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
 
-    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
         return false;
     }
 
-    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
         return false;
     }
 
-    // device -> device
+    // device -> device copy
     ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
     ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
 
-    if (backend_src != backend_dst) {
-        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
-        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
 
-        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
-        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+#ifndef NDEBUG
+        GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+#endif
+        return false;
+    }
 
+    if (backend_src != backend_dst) {
         // copy on src stream
         if (cuda_ctx_src->device == cuda_ctx_dst->device) {
-            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
         } else {
 #ifdef GGML_CUDA_NO_PEER_COPY
             return false;
@@ -2393,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
 #endif
         }
 
-        // record event on src stream
+        // record event on src stream after the copy
         if (!cuda_ctx_src->copy_event) {
             ggml_cuda_set_device(cuda_ctx_src->device);
             CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
@@ -2405,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
     } else {
         // src and dst are on the same backend
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
    }
    return true;
 }
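The cross-backend path of this patch is a three-step stream-ordering pattern: issue the copy on the source stream (so it is ordered after whatever work produced src->data, which is why the patch moves both cudaMemcpyAsync calls from cuda_ctx_dst->stream() to cuda_ctx_src->stream()), record an event on that stream right behind the copy, and make the destination stream wait on the event before consuming the data. The sketch below shows the same pattern outside of ggml, on a single device with two streams standing in for the two backends. It is a minimal illustration, not ggml's code: the CHECK macro and the stream_src / stream_dst / copy_event names are made up here, and the multi-device peer-copy and debug-logging branches above are left out.

// Standalone sketch of the copy -> record event -> wait pattern used above.
// All identifiers here are illustrative, not ggml's.
#include <cstdio>
#include <cuda_runtime.h>

// illustrative error-check macro (ggml's CUDA_CHECK is a separate, internal macro)
#define CHECK(call)                                                   \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            return 1;                                                 \
        }                                                             \
    } while (0)

int main() {
    const size_t nbytes = 1 << 20;
    float * src_data = nullptr;
    float * dst_data = nullptr;
    cudaStream_t stream_src, stream_dst;
    cudaEvent_t  copy_event;

    CHECK(cudaMalloc((void **) &src_data, nbytes));
    CHECK(cudaMalloc((void **) &dst_data, nbytes));
    CHECK(cudaStreamCreate(&stream_src));
    CHECK(cudaStreamCreate(&stream_dst));
    // timing disabled, as in the patch: the event is used only for ordering
    CHECK(cudaEventCreateWithFlags(&copy_event, cudaEventDisableTiming));

    // 1. issue the copy on the src stream, so it is ordered after whatever
    //    earlier work on that stream produced src_data
    CHECK(cudaMemcpyAsync(dst_data, src_data, nbytes, cudaMemcpyDeviceToDevice, stream_src));
    // 2. record an event on the src stream immediately after the copy
    CHECK(cudaEventRecord(copy_event, stream_src));
    // 3. make the dst stream wait on the event: anything enqueued on it from
    //    here on runs only after the copy has completed
    CHECK(cudaStreamWaitEvent(stream_dst, copy_event, 0));
    // subsequent work on dst_data, now guaranteed to run after the copy
    CHECK(cudaMemsetAsync(dst_data, 0, nbytes, stream_dst));

    CHECK(cudaStreamSynchronize(stream_dst));
    CHECK(cudaEventDestroy(copy_event));
    CHECK(cudaStreamDestroy(stream_src));
    CHECK(cudaStreamDestroy(stream_dst));
    CHECK(cudaFree(src_data));
    CHECK(cudaFree(dst_data));
    return 0;
}

Note also the design choice in the first hunk: the hard GGML_ASSERTs on the backend/buffer device pairing become a logged return false, so a mismatch makes the caller fall back to a regular synchronous tensor copy instead of aborting the process.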