@@ -333,6 +333,28 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
+        // RDNA 4 is consistently worse on rocblas
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts are almost always better on MMQ
+            // due to the large number of graph splits
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
+                return true;
+            }
+
+            switch (type) {
+                // These quants are really bad on MMQ
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q6_K:
+                // These quants are usually worse but not always
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return ne11 <= 128;
+                default:
+                    return true;
+            }
+        }
         return true;
     }
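For reference, here is a minimal standalone sketch of what the new RDNA 3 branch decides for a few representative inputs. The `quant` enum and `rdna3_should_use_mmq` below are hypothetical stand-ins for the real `GGML_TYPE_*` values and `ggml_cuda_should_use_mmq`; only the branch logic is taken from the patch.

```cpp
// Hypothetical stand-in sketch; not the real ggml API.
#include <cstdint>
#include <cstdio>

// Illustrative subset of quant types; q4_0 stands in for "any other quant".
enum class quant { q2_k, q6_k, iq2_xs, iq2_s, q4_0 };

// Mirrors the RDNA 3 decision above: MoE workloads with many experts always
// take MMQ; otherwise the quants that are slow on MMQ fall back to rocBLAS
// once the batch dimension (ne11) grows past 128.
static bool rdna3_should_use_mmq(quant type, std::int64_t ne11, std::int64_t n_experts) {
    if (n_experts >= 64) {
        return true;
    }
    switch (type) {
        case quant::q2_k:
        case quant::q6_k:
        case quant::iq2_xs:
        case quant::iq2_s:
            return ne11 <= 128;
        default:
            return true;
    }
}

int main() {
    // A Q6_K matmul stays on MMQ for small batches but moves to rocBLAS at
    // larger ones; a 128-expert MoE model stays on MMQ regardless of batch.
    std::printf("Q6_K, ne11=32:        %d\n", rdna3_should_use_mmq(quant::q6_k, 32, 0));    // 1
    std::printf("Q6_K, ne11=512:       %d\n", rdna3_should_use_mmq(quant::q6_k, 512, 0));   // 0
    std::printf("Q6_K, 128 experts:    %d\n", rdna3_should_use_mmq(quant::q6_k, 512, 128)); // 1
    std::printf("Q4_0, ne11=512:       %d\n", rdna3_should_use_mmq(quant::q4_0, 512, 0));   // 1
    return 0;
}
```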