@@ -90,7 +90,7 @@ struct tile_x_sizes {
 
 static int get_mmq_x_max_host(const int cc) {
     return new_mma_available(cc) ? 128 :
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc) ?
+        GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
 #ifdef GGML_CUDA_FORCE_MMQ
         128 : 64;
 #else
@@ -124,7 +124,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
 
 static int get_mmq_y_host(const int cc) {
     return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
-        ((ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) ? 128 : 64);
+        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
 }
 
 static constexpr __device__ int get_mmq_y_device() {
@@ -2832,7 +2832,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
     const int mmq_x_max = get_mmq_x_max_host(cc);
     const int mmq_y = get_mmq_y_host(cc);
     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
-    const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc);
+    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
 
     int mmq_x_best = 0;
     int nparts_best = INT_MAX;
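
The change in all three hunks is the same: the operands of && are swapped so that, through short-circuit evaluation, the cheap vendor check runs first and ggml_cuda_highest_compiled_arch(cc) is only evaluated when cc actually belongs to an NVIDIA device. The boolean result is unchanged. A minimal standalone sketch of the evaluation-order difference follows; the constants and function bodies in it are illustrative stand-ins, not the real ggml-cuda definitions.

// sketch.cpp -- hypothetical stand-ins for the ggml-cuda symbols in the patch
#include <cstdio>

constexpr int GGML_CUDA_CC_VOLTA      = 700;     // stand-in value
constexpr int GGML_CUDA_CC_OFFSET_AMD = 1000000; // hypothetical AMD cc offset

static bool GGML_CUDA_CC_IS_NVIDIA(int cc) { return cc < GGML_CUDA_CC_OFFSET_AMD; }

// Stand-in for ggml_cuda_highest_compiled_arch: assumed to be meaningful only
// for NVIDIA cc values; the printf makes it visible when it gets evaluated.
static int ggml_cuda_highest_compiled_arch(int cc) {
    printf("  arch lookup called with cc=%d\n", cc);
    return cc;
}

int main() {
    const int amd_cc = GGML_CUDA_CC_OFFSET_AMD + 1010; // some non-NVIDIA device

    printf("old operand order:\n"); // arch lookup runs even for the AMD cc
    const bool before = ggml_cuda_highest_compiled_arch(amd_cc) >= GGML_CUDA_CC_VOLTA
                     && GGML_CUDA_CC_IS_NVIDIA(amd_cc);

    printf("new operand order:\n"); // && short-circuits; the lookup never runs
    const bool after = GGML_CUDA_CC_IS_NVIDIA(amd_cc)
                    && ggml_cuda_highest_compiled_arch(amd_cc) >= GGML_CUDA_CC_VOLTA;

    printf("before=%d after=%d\n", before, after); // both false: result unchanged
    return 0;
}

With the hypothetical AMD cc above, both operand orders yield false, but only the old order ever passes a non-NVIDIA value to the arch lookup.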