CUDA/HIP: Share the same unified memory allocation logic. (#12934)

Replace compile-time `GGML_HIP_UMA` with environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies the usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs.
David Huang · 9 months ago
commit 84778e9770

6 files changed, 22 insertions(+), 26 deletions(-)
  1. Makefile (+0 −4)
  2. docs/build.md (+4 −2)
  3. ggml/CMakeLists.txt (+0 −1)
  4. ggml/src/ggml-cuda/ggml-cuda.cu (+16 −15)
  5. ggml/src/ggml-cuda/vendors/hip.h (+2 −0)
  6. ggml/src/ggml-hip/CMakeLists.txt (+0 −4)

+ 0 - 4
Makefile

@@ -780,10 +780,6 @@ ifdef GGML_HIP
 
 	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 
-ifdef GGML_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # GGML_HIP_UMA
-
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas

+ 4 - 2
docs/build.md

@@ -259,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
       cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build --config Release -- -j 16
   ```
-  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
   To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
 
@@ -296,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 
+### Unified Memory
+
+On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
 ## Vulkan
 
 **Windows**
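To illustrate the new "Unified Memory" docs section above: since the switch is just an environment variable, it can also be set programmatically before the backend initializes. A hypothetical snippet (`setenv` is POSIX-only, matching the Linux-only scope of the feature):

```cpp
#include <stdlib.h>

int main(void) {
    // Equivalent to launching the binary with
    // GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 in the environment.
    setenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY", "1", /*overwrite=*/ 1);

    // ... initialize llama.cpp / ggml as usual; subsequent device buffer
    // allocations will go through cudaMallocManaged / hipMallocManaged.
    return 0;
}
```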

+ 0 - 1
ggml/CMakeLists.txt

@@ -170,7 +170,6 @@ option(GGML_HIP                             "ggml: use HIP"
 option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
-option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)

+ 16 - 15
ggml/src/ggml-cuda/ggml-cuda.cu

@@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
 
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-
-#if !defined(GGML_USE_HIP)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
         err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+                warned_unsupported = true;
+            }
+
+            err = cudaMalloc(ptr, size);
+        }
+#endif // defined(GGML_USE_HIP)
     }
     else
     {
         err = cudaMalloc(ptr, size);
     }
     return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-
-#endif
 }
 
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
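Note that the `cudaMemAdvise` call in the new code runs only under HIP, where `hipMemAdviseSetCoarseGrain` relaxes fine-grained coherence in exchange for better performance on AMD hardware. Purely as a sketch, not part of this commit, an NVIDIA-side analogue using a hint that does exist in the CUDA runtime could look like this:

```cpp
#include <cuda_runtime.h>

// Sketch only: pair the managed allocation with a cudaMemAdvise hint,
// much as the HIP branch does with hipMemAdviseSetCoarseGrain.
static cudaError_t alloc_managed_prefer_device(void ** ptr, size_t size, int device) {
    cudaError_t err = cudaMallocManaged(ptr, size);
    if (err == cudaSuccess) {
        // Advise the driver to keep these pages resident on `device`.
        cudaMemAdvise(*ptr, size, cudaMemAdviseSetPreferredLocation, device);
    }
    return err;
}
```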

+ 2 - 0
ggml/src/ggml-cuda/vendors/hip.h

@@ -71,6 +71,8 @@
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
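The two added defines extend the vendor shim that lets `ggml-cuda.cu` compile unchanged against HIP. A reduced illustration of the pattern (simplified from the real header, which maps many more symbols):

```cpp
#if defined(GGML_USE_HIP)
#include <hip/hip_runtime.h>
// Map the CUDA spellings used in the shared source onto HIP equivalents.
#define cudaError_t       hipError_t
#define cudaMallocManaged hipMallocManaged
#define cudaMemAdvise     hipMemAdvise
#else
#include <cuda_runtime.h>
#endif

// The same call resolves to hipMallocManaged under HIP and to the real
// CUDA runtime function when building for NVIDIA.
static cudaError_t alloc_managed(void ** ptr, size_t size) {
    return cudaMallocManaged(ptr, size);
}
```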

+ 0 - 4
ggml/src/ggml-hip/CMakeLists.txt

@@ -89,10 +89,6 @@ endif()
 
 add_compile_definitions(GGML_USE_HIP)
 
-if (GGML_HIP_UMA)
-    add_compile_definitions(GGML_HIP_UMA)
-endif()
-
 if (GGML_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()