@@ -1,9 +1,9 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
 #include "cpy-utils.cuh"
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
 #include "ggml-musa/mudnn.cuh"
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 
 // Copy destination pointers to GPU to be available when pointer indirection is in use
 void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
         CUDA_CHECK(cudaStreamSynchronize(stream));
         if (cuda_graph->dest_ptrs_d != nullptr) {
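Aside: the body this guard now also covers on MUSA follows a grow-then-upload pattern: synchronize, free the undersized device array, reallocate, then async-copy the host pointer array. A minimal sketch of that pattern in bare CUDA, with error checking omitted (the struct and helper names are made up for illustration; they are not ggml API):

    #include <cuda_runtime.h>

    // Hypothetical stand-ins for cuda_graph->dest_ptrs_d / dest_ptrs_size.
    struct dest_ptrs_buf {
        char ** d    = nullptr; // device-side array of destination pointers
        int     size = 0;       // current capacity
    };

    static void upload_dest_ptrs(dest_ptrs_buf & buf, char ** host_ptrs, int n, cudaStream_t stream) {
        if (buf.size < n) {
            // grow: wait for in-flight work before freeing the old buffer
            cudaStreamSynchronize(stream);
            if (buf.d != nullptr) {
                cudaFree(buf.d);
            }
            cudaMalloc((void **) &buf.d, n * sizeof(char *));
            buf.size = n;
        }
        // async upload; kernels launched later on `stream` can index into buf.d
        cudaMemcpyAsync(buf.d, host_ptrs, n * sizeof(char *), cudaMemcpyHostToDevice, stream);
    }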
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
         if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
             CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
         } else
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
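The `} else` left dangling before the `#endif` is what makes this hunk work: when both MUSA macros are defined, the muDNN call and the brace block below it fuse into a single if/else, and when either macro is missing only the brace block survives and runs unconditionally. A reduced, self-contained sketch of the same idiom (`fast_copy`/`plain_copy` are stand-ins, not real APIs):

    #include <cstdio>

    static bool fast_path_ok() { return true;             }
    static void fast_copy()    { std::puts("fast copy");  }
    static void plain_copy()   { std::puts("plain copy"); }

    void copy_dispatch() {
    #if defined(USE_FAST_COPY)
        if (fast_path_ok()) {
            fast_copy();
        } else
    #endif // USE_FAST_COPY
        {
            // without USE_FAST_COPY this block always runs;
            // with it, it becomes the else-branch of the check above
            plain_copy();
        }
    }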
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                    ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
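Taken together, the three graph guards extended here implement one round-trip: read `graph_cpynode_index` out of the context when indirection is active, let the copy launches advance a local copy, and store the final value back for the next node in the graph. A condensed sketch of that bookkeeping (the type and the explicit increment are illustrative, not the ggml definitions):

    // Condensed sketch of the cpynode-index round-trip across one copy op.
    struct graph_state {
        bool    use_cpy_indirection = false;
        char ** dest_ptrs_d         = nullptr;
        int     graph_cpynode_index = 0;
    };

    static void copy_op(graph_state & g, bool disable_indirection) {
        char ** dest_ptrs_d         = nullptr;
        int     graph_cpynode_index = -1;
        if (g.use_cpy_indirection && !disable_indirection) {
            dest_ptrs_d         = g.dest_ptrs_d;         // kernels read their target from this array
            graph_cpynode_index = g.graph_cpynode_index; // position of this node's pointer
        }

        // ... launch copy kernel(s) here; each launch consumes
        // dest_ptrs_d[graph_cpynode_index] and bumps the local index ...
        graph_cpynode_index++;

        if (g.use_cpy_indirection && !disable_indirection) {
            g.graph_cpynode_index = graph_cpynode_index; // persist for the next op
        }
    }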