|
|
@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
|
|
|
case GGML_TYPE_Q6_K:
|
|
|
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
|
|
|
break;
|
|
|
+ case GGML_TYPE_IQ2_XXS:
|
|
|
+ mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
|
|
|
+ break;
|
|
|
+ case GGML_TYPE_IQ2_XS:
|
|
|
+ mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
|
|
|
+ break;
|
|
|
+ case GGML_TYPE_IQ2_S:
|
|
|
+ mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
|
|
|
+ break;
|
|
|
+ case GGML_TYPE_IQ3_XXS:
|
|
|
+ mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
|
|
|
+ break;
|
|
|
+ case GGML_TYPE_IQ3_S:
|
|
|
+ mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
|
|
|
+ break;
|
|
|
+ case GGML_TYPE_IQ1_S:
|
|
|
+ mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
|
|
|
+ break;
|
|
|
case GGML_TYPE_IQ4_XS:
|
|
|
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
|
|
|
break;
|
|
|
@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
|
|
case GGML_TYPE_Q4_K:
|
|
|
case GGML_TYPE_Q5_K:
|
|
|
case GGML_TYPE_Q6_K:
|
|
|
+ case GGML_TYPE_IQ2_XXS:
|
|
|
+ case GGML_TYPE_IQ2_XS:
|
|
|
+ case GGML_TYPE_IQ2_S:
|
|
|
+ case GGML_TYPE_IQ3_XXS:
|
|
|
+ case GGML_TYPE_IQ3_S:
|
|
|
+ case GGML_TYPE_IQ1_S:
|
|
|
case GGML_TYPE_IQ4_XS:
|
|
|
case GGML_TYPE_IQ4_NL:
|
|
|
mmq_supported = true;
|