@@ -1,6 +1,12 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
 
+// Currently LLVM with the amdgcn target does not support unrolling loops
+// that contain a break that cannot be resolved at compile time.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+#endif // __clang__
 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #ifndef GGML_USE_HIP
 __launch_bounds__(D, 1)
@@ -341,6 +347,9 @@ static __global__ void flash_attn_vec_ext_f16(
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif // __clang__
 
 template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
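
For illustration, here is a minimal standalone sketch of the pattern this patch guards against. It is not part of the patch: the kernel `scan_until_zero` and its template parameter `N` are hypothetical, and only the `#pragma clang diagnostic` / `#pragma unroll` usage mirrors the diff. When clang targets amdgcn, the data-dependent `break` prevents the requested unroll, and clang reports the failed transformation as `-Wpass-failed`; the push/ignored/pop scoping silences that warning only for the affected code.

```cpp
// Hypothetical example; only the pragma pattern matches the patch above.
// On clang/amdgcn, the #pragma unroll request below cannot be honored
// because the break depends on runtime data, which triggers -Wpass-failed.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__

template <int N>
__global__ void scan_until_zero(const int * data, int * out) {
    int acc = 0;
#pragma unroll
    for (int i = 0; i < N; ++i) {
        if (data[i] == 0) {
            break; // runtime-dependent break: not resolvable at compile time
        }
        acc += data[i];
    }
    *out = acc;
}

#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
```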