@@ -109,8 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3  =  dst->nb[3] / ts_dst;
 
-    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc);
 
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -252,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
     const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+        || GGML_CUDA_CC_IS_CDNA(cc))
         && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
@@ -306,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return false;
     }
 
-    if (new_mma_available(cc) || amd_mfma_available(cc)) {
+    if (new_mma_available(cc)) {
        return true;
     }
 
@@ -322,5 +322,21 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
+    if (amd_mfma_available(cc)) {
+        // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
+        // performs better but is currently suffering from a crash on this architecture.
+        // TODO: Revisit when hipblaslt is fixed on CDNA3
+        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
+            return true;
+        }
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+        return false;
+    }
+
     return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
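
For illustration, here is a minimal standalone sketch of the MFMA dispatch heuristic added in the last hunk. The enum values and the helper name should_use_mmq_mfma_sketch are stand-ins introduced for this example, not the real ggml definitions; the actual code operates on ggml_type and the compute-capability value cc.

// Standalone sketch of the amd_mfma_available() branch in ggml_cuda_should_use_mmq,
// under the assumptions stated above. On CDNA3 MMQ is always preferred (rocblas/tensile
// is slow there as of ROCm 7.0); on other MFMA-capable (CDNA) GPUs MMQ is preferred only
// for small batches or for quantization types where MMQ wins regardless of batch size.

#include <cstdint>
#include <cstdio>

enum mmq_type { Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q4_K, Q5_K, Q6_K }; // stand-in for ggml_type

static bool should_use_mmq_mfma_sketch(bool is_cdna3, mmq_type type, int64_t ne11) {
    if (is_cdna3) {
        return true; // always MMQ on CDNA3
    }
    if (ne11 <= 128 || type == Q4_0 || type == Q4_1 || type == Q5_0 || type == Q5_1) {
        return true; // small batch, or a type where MMQ is faster anyway
    }
    if (ne11 <= 256 && (type == Q4_K || type == Q5_K)) {
        return true; // K-quants get a slightly larger batch cutoff
    }
    return false; // otherwise fall back to the BLAS path
}

int main() {
    // Example: on CDNA1/2, a Q6_K matmul with ne11 = 512 falls back to rocBLAS,
    // while the same shape with Q4_0 stays on MMQ.
    std::printf("Q6_K, ne11=512: %s\n", should_use_mmq_mfma_sketch(false, Q6_K, 512) ? "MMQ" : "rocBLAS");
    std::printf("Q4_0, ne11=512: %s\n", should_use_mmq_mfma_sketch(false, Q4_0, 512) ? "MMQ" : "rocBLAS");
    std::printf("Q5_K, ne11=256: %s\n", should_use_mmq_mfma_sketch(false, Q5_K, 256) ? "MMQ" : "rocBLAS");
    return 0;
}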