vor 11 Monaten · 5f0db9522f
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -154,6 +154,7 @@ option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashA
 
				 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
			
 
				 
			
 
				 option(GGML_HIP                             "ggml: use HIP"                                   OFF)
			
 
				+option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
			
 
				 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
			
 
				 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
			
 
				 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
			
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -588,7 +588,7 @@ struct ggml_tensor_extra_gpu {
 
				 };
			
 
				 
			
 
				 
			
 
				-#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
			
 
				+#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
			
 
				 #define USE_CUDA_GRAPH
			
 
				 #endif
			
 
				 
			
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
				 [[noreturn]]
			
 
				 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
			
 
				     int id = -1; // in case cudaGetDevice fails
			
 
				-    cudaGetDevice(&id);
			
 
				+    (void)cudaGetDevice(&id);
			
 
				 
			
 
				     GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
			
 
				     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
			
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
				     for (int id = 0; id < info.device_count; ++id) {
			
 
				         int device_vmm = 0;
			
 
				 
			
 
				-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
			
 
				+#if !defined(GGML_CUDA_NO_VMM)
			
 
				         CUdevice device;
			
 
				         CU_CHECK(cuDeviceGet(&device, id));
			
 
				         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
			
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
				             alloc_prop.location.id = id;
			
 
				             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
			
 
				         }
			
 
				-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
			
 
				+#endif // !defined(GGML_CUDA_NO_VMM)
			
 
				         info.devices[id].vmm = !!device_vmm;
			
 
				 
			
 
				         cudaDeviceProp prop;
			
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 
				 };
			
 
				 
			
 
				 // pool with virtual memory
			
 
				-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
			
 
				+#if !defined(GGML_CUDA_NO_VMM)
			
 
				 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
			
 
				     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
			
 
				 
			
@@ -309,6 +309,9 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
				     size_t pool_used = 0;
			
 
				     size_t pool_size = 0;
			
 
				     size_t granularity;
			
 
				+#if defined(GGML_USE_HIP)
			
 
				+    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
			
 
				+#endif
			
 
				 
			
 
				     explicit ggml_cuda_pool_vmm(int device) :
			
 
				         device(device),
			
@@ -317,7 +320,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
				 
			
 
				     ~ggml_cuda_pool_vmm() {
			
 
				         if (pool_addr != 0) {
			
 
				+#if defined(GGML_USE_HIP)
			
 
				+            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
			
 
				+            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
			
 
				+                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
			
 
				+            }
			
 
				+#else
			
 
				             CU_CHECK(cuMemUnmap(pool_addr, pool_size));
			
 
				+#endif
			
 
				             CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
			
 
				         }
			
 
				     }
			
@@ -350,7 +360,11 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
				             }
			
 
				 
			
 
				             // map at the end of the pool
			
 
				-            CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
			
 
				+            CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
			
 
				+            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
			
 
				+#if defined(GGML_USE_HIP)
			
 
				+            mappings.push_back({start_ptr, reserve_size});
			
 
				+#endif
			
 
				 
			
 
				             // the memory allocation handle is no longer needed after mapping
			
 
				             CU_CHECK(cuMemRelease(handle));
			
@@ -360,7 +374,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
				             access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
			
 
				             access.location.id = device;
			
 
				             access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
			
 
				-            CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
			
 
				+            CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
			
 
				 
			
 
				             // add to the pool
			
 
				             pool_size += reserve_size;
			
@@ -372,7 +386,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
				 
			
 
				         GGML_ASSERT(pool_addr != 0);
			
 
				 
			
 
				-        void * ptr = (void *) (pool_addr + pool_used);
			
 
				+        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
			
 
				         *actual_size = size;
			
 
				         pool_used += size;
			
 
				 
			
@@ -391,17 +405,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
				         pool_used -= size;
			
 
				 
			
 
				         // all deallocations must be in reverse order of the allocations
			
 
				-        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
			
 
				+        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
			
 
				     }
			
 
				 };
			
 
				-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
			
 
				+#endif // !defined(GGML_CUDA_NO_VMM)
			
 
				 
			
 
				 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
			
 
				-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
			
 
				+#if !defined(GGML_CUDA_NO_VMM)
			
 
				     if (ggml_cuda_info().devices[device].vmm) {
			
 
				         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
			
 
				     }
			
 
				-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
			
 
				+#endif // !defined(GGML_CUDA_NO_VMM)
			
 
				     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
			
 
				 }
			
 
				 
			
@@ -547,7 +561,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
				     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
			
 
				     if (err != cudaSuccess) {
			
 
				         // clear the error
			
 
				-        cudaGetLastError();
			
 
				+        (void)cudaGetLastError();
			
 
				         GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
			
 
				         return nullptr;
			
 
				     }
			
@@ -962,7 +976,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
 
				     cudaError_t err = cudaMallocHost((void **) &ptr, size);
			
 
				     if (err != cudaSuccess) {
			
 
				         // clear the error
			
 
				-        cudaGetLastError();
			
 
				+        (void)cudaGetLastError();
			
 
				         GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
			
 
				                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
			
 
				         return nullptr;
			
@@ -1209,7 +1223,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
 
				                         CUDA_CHECK(err);
			
 
				                     } else {
			
 
				                         // reset the error
			
 
				-                        cudaGetLastError();
			
 
				+                        (void)cudaGetLastError();
			
 
				                     }
			
 
				                 } else {
			
 
				                     cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
			
@@ -1217,7 +1231,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
 
				                         CUDA_CHECK(err);
			
 
				                     } else {
			
 
				                         // reset the error
			
 
				-                        cudaGetLastError();
			
 
				+                        (void)cudaGetLastError();
			
 
				                     }
			
 
				                 }
			
 
				             }
			
@@ -2452,7 +2466,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
 
				                     if (stat == cudaErrorInvalidDeviceFunction) {
			
 
				                         // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
			
 
				                         // We don't need to update blas nodes, so clear error and move on.
			
 
				-                        cudaGetLastError();
			
 
				+                        (void)cudaGetLastError();
			
 
				                     } else {
			
 
				                         GGML_ASSERT(stat == cudaSuccess);
			
 
				                     }
			
@@ -2507,14 +2521,20 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
 
				 static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
			
 
				 
			
 
				     cudaGraphExecUpdateResultInfo result_info;
			
 
				+#ifdef __HIP_PLATFORM_AMD__
			
 
				+    hipGraphNode_t errorNode;
			
 
				+    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
			
 
				+#else
			
 
				     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
			
 
				+#endif
			
 
				     if (stat == cudaErrorGraphExecUpdateFailure) {
			
 
				 #ifndef NDEBUG
			
 
				         GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
			
 
				 #endif
			
 
				+
			
 
				         // The pre-existing graph exec cannot be updated due to violated constraints
			
 
				         // so instead clear error and re-instantiate
			
 
				-        cudaGetLastError();
			
 
				+        (void)cudaGetLastError();
			
 
				         CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
			
 
				         cuda_ctx->cuda_graph->instance = nullptr;
			
 
				         CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
			
@@ -2742,7 +2762,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
 
				     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
			
 
				     if (err != cudaSuccess) {
			
 
				         // clear the error
			
 
				-        cudaGetLastError();
			
 
				+        (void)cudaGetLastError();
			
 
				 
			
 
				         GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
			
 
				                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
			
@@ -2762,7 +2782,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
 
				     cudaError_t err = cudaHostUnregister(buffer);
			
 
				     if (err != cudaSuccess) {
			
 
				         // clear the error
			
 
				-        cudaGetLastError();
			
 
				+        (void)cudaGetLastError();
			
 
				     }
			
 
				 }
			
 
				 
			
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -19,6 +19,12 @@
 
				 #define CUBLAS_TF32_TENSOR_OP_MATH 0
			
 
				 #define CUDA_R_16F  HIPBLAS_R_16F
			
 
				 #define CUDA_R_32F  HIPBLAS_R_32F
			
 
				+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
			
 
				+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
			
 
				+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
			
 
				+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
			
 
				+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
			
 
				+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
			
 
				 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
			
 
				 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
			
 
				 #define cublasCreate hipblasCreate
			
@@ -74,6 +80,21 @@
 
				 #define cudaMemGetInfo hipMemGetInfo
			
 
				 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
			
 
				 #define cudaSetDevice hipSetDevice
			
 
				+#define cuDeviceGet hipDeviceGet
			
 
				+#define CUdevice hipDevice_t
			
 
				+#define CUdeviceptr hipDeviceptr_t
			
 
				+#define cuMemUnmap hipMemUnmap
			
 
				+#define CUmemAccessDesc hipMemAccessDesc
			
 
				+#define cuMemAddressFree hipMemAddressFree
			
 
				+#define cuMemRelease hipMemRelease
			
 
				+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
			
 
				+#define cuMemCreate hipMemCreate
			
 
				+#define cuMemAddressReserve hipMemAddressReserve
			
 
				+#define cuMemMap hipMemMap
			
 
				+#define cuMemSetAccess hipMemSetAccess
			
 
				+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
			
 
				+#define CUmemAllocationProp hipMemAllocationProp
			
 
				+#define cuDeviceGetAttribute hipDeviceGetAttribute
			
 
				 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
			
 
				 #define cudaStreamDestroy hipStreamDestroy
			
 
				 #define cudaStreamFireAndForget hipStreamFireAndForget
			
@@ -81,6 +102,28 @@
 
				 #define cudaStreamPerThread hipStreamPerThread
			
 
				 #define cudaStreamSynchronize hipStreamSynchronize
			
 
				 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
			
 
				+#define cudaGraphExec_t hipGraphExec_t
			
 
				+#define cudaGraphNode_t hipGraphNode_t
			
 
				+#define cudaKernelNodeParams hipKernelNodeParams
			
 
				+#define cudaKernelNodeParams hipKernelNodeParams
			
 
				+#define cudaGraphExecDestroy hipGraphExecDestroy
			
 
				+#define cudaGraphLaunch hipGraphLaunch
			
 
				+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
			
 
				+#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
			
 
				+#define cudaGraphNodeType hipGraphNodeType
			
 
				+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
			
 
				+#define cudaGraphInstantiate hipGraphInstantiate
			
 
				+#define cudaStreamEndCapture hipStreamEndCapture
			
 
				+#define cudaGraphDestroy hipGraphDestroy
			
 
				+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
			
 
				+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
			
 
				+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
			
 
				+#define cudaGraphNodeGetType hipGraphNodeGetType
			
 
				+#define cudaGraphGetNodes hipGraphGetNodes
			
 
				+#define cudaGraphExecUpdate hipGraphExecUpdate
			
 
				+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
			
 
				+#define cudaStreamBeginCapture hipStreamBeginCapture
			
 
				+#define cudaGraph_t hipGraph_t
			
 
				 #define cudaStream_t hipStream_t
			
 
				 #define cudaSuccess hipSuccess
			
 
				 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
			
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -92,6 +92,14 @@ if (GGML_CUDA_NO_PEER_COPY)
 
				     add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
			
 
				 endif()
			
 
				 
			
 
				+if (GGML_HIP_GRAPHS)
			
 
				+    add_compile_definitions(GGML_HIP_GRAPHS)
			
 
				+endif()
			
 
				+
			
 
				+if (GGML_CUDA_NO_VMM)
			
 
				+    add_compile_definitions(GGML_CUDA_NO_VMM)
			
 
				+endif()
			
 
				+
			
 
				 if (CXX_IS_HIPCC)
			
 
				     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
			
 
				     target_link_libraries(ggml-hip PRIVATE hip::device)