8 months ago · 33983057d0
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,10 +1,10 @@
 
				 ARG UBUNTU_VERSION=22.04
			
 
				 # This needs to generally match the container host's environment.
			
 
				-ARG MUSA_VERSION=rc3.1.1
			
 
				+ARG MUSA_VERSION=rc4.0.1
			
 
				 # Target the MUSA build image
			
 
				-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
			
 
				+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
			
 
				 
			
 
				-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
			
 
				+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
			
 
				 
			
 
				 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
			
 
				 
			
@@ -21,21 +21,14 @@ RUN apt-get update && \
 
				     libcurl4-openssl-dev \
			
 
				     libgomp1
			
 
				 
			
 
				-COPY requirements.txt   requirements.txt
			
 
				-COPY requirements       requirements
			
 
				-
			
 
				-RUN pip install --upgrade pip setuptools wheel \
			
 
				-    && pip install -r requirements.txt
			
 
				-
			
 
				 WORKDIR /app
			
 
				 
			
 
				 COPY . .
			
 
				 
			
 
				-# Use the default MUSA archs if not specified
			
 
				 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
			
 
				         export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
			
 
				     fi && \
			
 
				-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
			
 
				+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
			
 
				     cmake --build build --config Release -j$(nproc)
			
 
				 
			
 
				 RUN mkdir -p /app/lib && \
			
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -351,7 +351,7 @@ jobs:
 
				 
			
 
				   ubuntu-22-cmake-musa:
			
 
				     runs-on: ubuntu-22.04
			
 
				-    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
			
 
				+    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
			
 
				 
			
 
				     steps:
			
 
				       - name: Clone
			
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
 
				 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
			
 
				 - AVX, AVX2, AVX512 and AMX support for x86 architectures
			
 
				 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
			
 
				-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
			
 
				+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
			
 
				 - Vulkan and SYCL backend support
			
 
				 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
			
 
				 
			
@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 
				 | [BLAS](docs/build.md#blas-build) | All |
			
 
				 | [BLIS](docs/backend/BLIS.md) | All |
			
 
				 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
			
 
				-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
			
 
				+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
			
 
				 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
			
 
				 | [HIP](docs/build.md#hip) | AMD GPU |
			
 
				 | [Vulkan](docs/build.md#vulkan) | GPU |
			
--- a/ci/README.md
+++ b/ci/README.md
@@ -54,7 +54,7 @@ docker run --privileged -it \
 
				     -v $HOME/llama.cpp/ci-cache:/ci-cache \
			
 
				     -v $HOME/llama.cpp/ci-results:/ci-results \
			
 
				     -v $PWD:/ws -w /ws \
			
 
				-    mthreads/musa:rc3.1.1-devel-ubuntu22.04
			
 
				+    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
			
 
				 ```
			
 
				 
			
 
				 Inside the container, execute the following commands:
			
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -107,7 +107,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
 
				 
			
 
				 The defaults are:
			
 
				 
			
 
				-- `MUSA_VERSION` set to `rc3.1.1`
			
 
				+- `MUSA_VERSION` set to `rc4.0.1`
			
 
				 
			
 
				 The resulting images, are essentially the same as the non-MUSA images:
			
 
				 
			
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -1,5 +1,8 @@
 
				 #include "cpy.cuh"
			
 
				 #include "dequantize.cuh"
			
 
				+#ifdef GGML_USE_MUSA
			
 
				+#include "ggml-musa/mudnn.cuh"
			
 
				+#endif // GGML_USE_MUSA
			
 
				 
			
 
				 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
			
 
				 
			
@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 
				 #endif
			
 
				     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
			
 
				         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
			
 
				-        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
			
 
				+#ifdef GGML_USE_MUSA
			
 
				+        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
			
 
				+            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
			
 
				+        } else
			
 
				+#endif // GGML_USE_MUSA
			
 
				+        {
			
 
				+            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
			
 
				+        }
			
 
				     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
			
 
				         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
			
 
				     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
			
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -772,7 +772,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 
				     GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
			
 
				     GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
			
 
				     GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
			
 
				-    GGML_UNUSED(kb0);
			
 
				+    GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
			
 
				     NO_DEVICE_CODE;
			
 
				 #endif // NEW_MMA_AVAILABLE
			
 
				 }
			
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)
 
				 
			
 
				     file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
			
 
				     list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
			
 
				+    list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")
			
 
				 
			
 
				     file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
			
 
				     file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
			
 
				     list(APPEND GGML_SOURCES_MUSA ${SRCS})
			
 
				     file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
			
 
				     list(APPEND GGML_SOURCES_MUSA ${SRCS})
			
 
				+    file(GLOB   SRCS "../ggml-musa/*.cu")
			
 
				+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
			
 
				 
			
 
				     if (GGML_CUDA_FA_ALL_QUANTS)
			
 
				         file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
			
@@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
 
				                             )
			
 
				 
			
 
				     # TODO: do not use CUDA definitions for MUSA
			
 
				-    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
			
 
				+    if (NOT GGML_BACKEND_DL)
			
 
				+        target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
			
 
				+    endif()
			
 
				 
			
 
				     add_compile_definitions(GGML_USE_MUSA)
			
 
				     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
			
@@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
 
				     endif()
			
 
				 
			
 
				     if (GGML_STATIC)
			
 
				+        # TODO: mudnn has not provided static libraries yet
			
 
				         target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
			
 
				     else()
			
 
				-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
			
 
				+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
			
 
				     endif()
			
 
				 
			
 
				     if (GGML_CUDA_NO_VMM)
			
--- a/ggml/src/ggml-musa/mudnn.cu
+++ b/ggml/src/ggml-musa/mudnn.cu
@@ -0,0 +1,112 @@
 
				+#include <mutex>
			
 
				+#include <mudnn.h>
			
 
				+
			
 
				+#include "mudnn.cuh"
			
 
				+
			
 
				+namespace mudnn = musa::dnn;
			
 
				+
			
 
				+// Returns a human-readable error string for mudnn::Status
			
 
				+const char* mudnnGetErrorString(mudnn::Status err) {
			
 
				+    switch (err) {
			
 
				+        case mudnn::Status::SUCCESS:
			
 
				+            return "Success";
			
 
				+        case mudnn::Status::INVALID_PARAMETER:
			
 
				+            return "Invalid parameter";
			
 
				+        case mudnn::Status::NOT_INITIALIZED:
			
 
				+            return "Not initialized";
			
 
				+        case mudnn::Status::ALLOC_FAILED:
			
 
				+            return "Allocation failed";
			
 
				+        case mudnn::Status::NOT_SUPPORTED:
			
 
				+            return "Not supported";
			
 
				+        case mudnn::Status::INTERNAL_ERROR:
			
 
				+            return "Internal error";
			
 
				+        case mudnn::Status::ARCH_MISMATCH:
			
 
				+            return "Architecture mismatch";
			
 
				+        case mudnn::Status::EXECUTION_FAILED:
			
 
				+            return "Execution failed";
			
 
				+        default:
			
 
				+            return "Unknown mudnn status";
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// Error checking macro for MUDNN calls
			
 
				+#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)
			
 
				+
			
 
				+namespace {
			
 
				+    // Thread-safe cache for mudnn::Handle objects per device
			
 
				+    std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache;
			
 
				+    std::mutex handle_cache_mutex;
			
 
				+
			
 
				+    mudnn::Handle* get_cached_handle(int device_id) {
			
 
				+        std::lock_guard<std::mutex> lock(handle_cache_mutex);
			
 
				+        auto it = handle_cache.find(device_id);
			
 
				+        if (it != handle_cache.end()) {
			
 
				+            return it->second.get();
			
 
				+        }
			
 
				+        auto handle = std::make_unique<mudnn::Handle>(device_id);
			
 
				+        mudnn::Handle* handle_ptr = handle.get();
			
 
				+        handle_cache[device_id] = std::move(handle);
			
 
				+        return handle_ptr;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// Extracts dimensions and strides from a ggml_tensor
			
 
				+int get_ggml_dims_and_strides(const ggml_tensor* tensor,
			
 
				+                              std::vector<int64_t>& dims,
			
 
				+                              std::vector<int64_t>& strides) {
			
 
				+    const int ndims = ggml_n_dims(tensor);
			
 
				+    const size_t element_size = ggml_element_size(tensor);
			
 
				+
			
 
				+    dims.resize(ndims);
			
 
				+    strides.resize(ndims);
			
 
				+
			
 
				+    for (int i = 0; i < ndims; ++i) {
			
 
				+        dims[i] = tensor->ne[i];
			
 
				+        strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size);
			
 
				+    }
			
 
				+    return ndims;
			
 
				+}
			
 
				+
			
 
				+// Converts ggml_type to mudnn::Tensor::Type
			
 
				+mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
			
 
				+    switch (type) {
			
 
				+        case GGML_TYPE_F32:
			
 
				+            return mudnn::Tensor::Type::FLOAT;
			
 
				+        case GGML_TYPE_F16:
			
 
				+            return mudnn::Tensor::Type::HALF;
			
 
				+
			
 
				+        // TODO: Add support for other types
			
 
				+
			
 
				+        default:
			
 
				+            MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED);
			
 
				+    }
			
 
				+
			
 
				+    return mudnn::Tensor::Type::FLOAT; // Default fallback
			
 
				+}
			
 
				+
			
 
				+// Asynchronous memory copy using mudnn::Unary::IDENTITY
			
 
				+musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
			
 
				+    mudnn::Tensor tensor_dst, tensor_src;
			
 
				+
			
 
				+    MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
			
 
				+    MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));
			
 
				+
			
 
				+    std::vector<int64_t> dims, strides;
			
 
				+    const int ndims = get_ggml_dims_and_strides(src, dims, strides);
			
 
				+
			
 
				+    MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data()));
			
 
				+    MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
			
 
				+    MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
			
 
				+    MUDNN_CHECK(tensor_src.SetAddr(src->data));
			
 
				+
			
 
				+    mudnn::Unary op;
			
 
				+    MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
			
 
				+    MUDNN_CHECK(op.SetAlpha(0.0f));
			
 
				+    MUDNN_CHECK(op.SetBeta(0.0f));
			
 
				+
			
 
				+    mudnn::Handle* handle = get_cached_handle(ctx.device);
			
 
				+    MUDNN_CHECK(handle->SetStream(ctx.stream()));
			
 
				+    MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));
			
 
				+
			
 
				+    return musaSuccess;
			
 
				+}
			
--- a/ggml/src/ggml-musa/mudnn.cuh
+++ b/ggml/src/ggml-musa/mudnn.cuh
@@ -0,0 +1,12 @@
 
				+#pragma once
			
 
				+
			
 
				+#include "../include/ggml.h"
			
 
				+#include "../ggml-cuda/common.cuh"
			
 
				+
			
 
				+// Asynchronously copies data from src tensor to dst tensor using the provided context.
			
 
				+// Returns a musaError_t indicating success or failure.
			
 
				+musaError_t mudnnMemcpyAsync(
			
 
				+    ggml_backend_cuda_context &ctx,
			
 
				+    const ggml_tensor *dst,
			
 
				+    const ggml_tensor *src
			
 
				+);