|
@@ -13,7 +13,7 @@
|
|
|
|
|
|
|
|
It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
|
|
It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
|
|
|
*/
|
|
*/
|
|
|
-template <size_t n_experts, bool with_norm>
|
|
|
|
|
|
|
+template <int n_experts, bool with_norm>
|
|
|
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
|
|
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
|
|
|
float * weights,
|
|
float * weights,
|
|
|
int32_t * ids,
|
|
int32_t * ids,
|
|
@@ -204,8 +204,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
|
|
|
|
|
|
|
GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
|
|
GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
|
|
|
|
|
|
|
|
- cudaStream_t stream = ctx.stream();
|
|
|
|
|
-
|
|
|
|
|
const int n_expert_used = weights->ne[1];
|
|
const int n_expert_used = weights->ne[1];
|
|
|
|
|
|
|
|
if (with_norm) {
|
|
if (with_norm) {
|