|
@@ -9,6 +9,10 @@
|
|
|
#include "ggml-impl.h"
|
|
#include "ggml-impl.h"
|
|
|
#include "ggml-wgsl-shaders.hpp"
|
|
#include "ggml-wgsl-shaders.hpp"
|
|
|
|
|
|
|
|
|
|
+#ifdef __EMSCRIPTEN__
|
|
|
|
|
+# include <emscripten/emscripten.h>
|
|
|
|
|
+#endif
|
|
|
|
|
+
|
|
|
#include <webgpu/webgpu_cpp.h>
|
|
#include <webgpu/webgpu_cpp.h>
|
|
|
|
|
|
|
|
#include <atomic>
|
|
#include <atomic>
|
|
@@ -261,9 +265,12 @@ struct webgpu_context_struct {
|
|
|
wgpu::Queue queue;
|
|
wgpu::Queue queue;
|
|
|
wgpu::Limits limits;
|
|
wgpu::Limits limits;
|
|
|
|
|
|
|
|
|
|
+ uint32_t subgroup_size;
|
|
|
|
|
+
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
bool supports_subgroup_matrix = false;
|
|
bool supports_subgroup_matrix = false;
|
|
|
- uint32_t subgroup_size;
|
|
|
|
|
wgpu::SubgroupMatrixConfig subgroup_matrix_config;
|
|
wgpu::SubgroupMatrixConfig subgroup_matrix_config;
|
|
|
|
|
+#endif
|
|
|
|
|
|
|
|
// Separate this out from limits since on some Metal systems, the limit returned by
|
|
// Separate this out from limits since on some Metal systems, the limit returned by
|
|
|
// querying the limits is higher than the actual allowed maximum.
|
|
// querying the limits is higher than the actual allowed maximum.
|
|
@@ -449,8 +456,8 @@ static void ggml_backend_webgpu_wait(webgpu_context & ct
|
|
|
// If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
|
|
// If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
|
|
|
// inflight_max may be 0, meaning that we must wait on all futures.
|
|
// inflight_max may be 0, meaning that we must wait on all futures.
|
|
|
uint64_t timeout_ms = block ? UINT64_MAX : 0;
|
|
uint64_t timeout_ms = block ? UINT64_MAX : 0;
|
|
|
- uint inflight_threads = ctx->inflight_threads;
|
|
|
|
|
- uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
|
|
|
|
|
|
|
+ uint32_t inflight_threads = ctx->inflight_threads;
|
|
|
|
|
+ uint32_t inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
|
|
|
while (futures.size() >= inflight_max && futures.size() > 0) {
|
|
while (futures.size() >= inflight_max && futures.size() > 0) {
|
|
|
ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
|
|
ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
|
|
|
futures.erase(futures.begin());
|
|
futures.erase(futures.begin());
|
|
@@ -986,6 +993,7 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
|
|
|
pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
|
|
pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
|
|
|
uint32_t wg_m;
|
|
uint32_t wg_m;
|
|
|
uint32_t wg_n;
|
|
uint32_t wg_n;
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
if (ctx->supports_subgroup_matrix) {
|
|
if (ctx->supports_subgroup_matrix) {
|
|
|
// The total number of subgroups/workgroups needed per matrix.
|
|
// The total number of subgroups/workgroups needed per matrix.
|
|
|
uint32_t wg_m_sg_tile =
|
|
uint32_t wg_m_sg_tile =
|
|
@@ -995,11 +1003,15 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
|
|
|
WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
|
|
WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
|
|
|
wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
|
|
wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
|
|
|
} else {
|
|
} else {
|
|
|
|
|
+#endif
|
|
|
uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
|
|
uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
|
|
|
uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
|
|
uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
|
|
|
wg_m = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
|
|
wg_m = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
|
|
|
wg_n = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
|
|
wg_n = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
}
|
|
}
|
|
|
|
|
+#endif
|
|
|
|
|
+
|
|
|
wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
|
|
wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -1419,9 +1431,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
|
|
commands.push_back(*cmd);
|
|
commands.push_back(*cmd);
|
|
|
}
|
|
}
|
|
|
// compute the batch size based on the number of inflight threads
|
|
// compute the batch size based on the number of inflight threads
|
|
|
- uint inflight_threads = ctx->inflight_threads;
|
|
|
|
|
- uint batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
|
|
|
|
|
- WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
|
|
|
|
|
|
|
+ uint32_t inflight_threads = ctx->inflight_threads;
|
|
|
|
|
+ uint32_t batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
|
|
|
|
|
+ WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
|
|
|
if (commands.size() >= batch_size) {
|
|
if (commands.size() >= batch_size) {
|
|
|
futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
|
|
futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
|
|
|
// Process events and check for completed submissions
|
|
// Process events and check for completed submissions
|
|
@@ -1758,6 +1770,17 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
|
|
|
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
|
|
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
|
|
|
wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
|
|
wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
|
|
|
|
|
|
|
|
|
|
+ std::string proc_mul_mat_f32_f32;
|
|
|
|
|
+ std::string proc_mul_mat_f32_f32_vec;
|
|
|
|
|
+ std::string proc_mul_mat_f16_f32;
|
|
|
|
|
+ std::string proc_mul_mat_f16_f32_vec;
|
|
|
|
|
+ std::string proc_mul_mat_f16_f16;
|
|
|
|
|
+ std::string proc_mul_mat_f16_f16_vec;
|
|
|
|
|
+ std::string proc_mul_mat_q4_0_f32;
|
|
|
|
|
+ std::string proc_mul_mat_q4_0_f32_vec;
|
|
|
|
|
+
|
|
|
|
|
+ std::vector<wgpu::ConstantEntry> mul_mat_constants;
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
if (webgpu_ctx->supports_subgroup_matrix) {
|
|
if (webgpu_ctx->supports_subgroup_matrix) {
|
|
|
std::map<std::string, std::string> sg_matrix_repls;
|
|
std::map<std::string, std::string> sg_matrix_repls;
|
|
|
sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
|
|
sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
|
|
@@ -1770,100 +1793,57 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
|
|
|
sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
|
|
sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
|
|
|
sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
|
|
sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
|
|
|
|
|
|
|
|
- std::string proc_mul_mat_subgroup_matrix_f32_f32 =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
|
|
|
|
|
- std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
|
|
|
|
|
|
|
+ proc_mul_mat_f32_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
|
|
|
|
|
+ proc_mul_mat_f32_f32_vec =
|
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
|
|
|
- std::string proc_mul_mat_subgroup_matrix_f16_f32 =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
|
|
|
|
|
- std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
|
|
|
|
|
|
|
+ proc_mul_mat_f16_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
|
|
|
|
|
+ proc_mul_mat_f16_f32_vec =
|
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
|
|
|
- std::string proc_mul_mat_subgroup_matrix_f16_f16 =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
|
|
|
|
|
- std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
|
|
|
|
|
|
|
+ proc_mul_mat_f16_f16 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
|
|
|
|
|
+ proc_mul_mat_f16_f16_vec =
|
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
|
|
|
- std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
|
|
|
|
|
|
|
+ proc_mul_mat_q4_0_f32 =
|
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
|
|
|
- std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
|
|
|
|
|
|
|
+ proc_mul_mat_q4_0_f32_vec =
|
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
|
|
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
|
|
|
-
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
- webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
|
|
|
|
|
- "mul_mat_subgroup_matrix_f32_f32_vec");
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
- webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
|
|
|
|
|
- "mul_mat_subgroup_matrix_f16_f32_vec");
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
- webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
|
|
|
|
|
- "mul_mat_subgroup_matrix_f16_f16_vec");
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
- webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
|
|
|
|
|
- "mul_mat_subgroup_matrix_q4_0_f32_vec");
|
|
|
|
|
} else {
|
|
} else {
|
|
|
- std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
|
|
|
|
|
- mul_mat_reg_tile_constants[0].key = "TILE_K";
|
|
|
|
|
- mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
|
|
|
|
|
- mul_mat_reg_tile_constants[1].key = "WORKGROUP_SIZE_M";
|
|
|
|
|
- mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
|
|
|
|
|
- mul_mat_reg_tile_constants[2].key = "WORKGROUP_SIZE_N";
|
|
|
|
|
- mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;
|
|
|
|
|
|
|
+#endif
|
|
|
|
|
+ mul_mat_constants.push_back({ .key = "TILE_K", .value = WEBGPU_MUL_MAT_TILE_K });
|
|
|
|
|
+ mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_M", .value = WEBGPU_MUL_MAT_WG_SIZE_M });
|
|
|
|
|
+ mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_N", .value = WEBGPU_MUL_MAT_WG_SIZE_N });
|
|
|
|
|
|
|
|
std::map<std::string, std::string> reg_repls;
|
|
std::map<std::string, std::string> reg_repls;
|
|
|
reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
|
|
reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
|
|
|
reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);
|
|
reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);
|
|
|
|
|
|
|
|
- // Process each reg-tile shader with tile replacements.
|
|
|
|
|
- // Keep the processed strings in-scope so .c_str() remains valid.
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_f32_f32 =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_f32_f32_vec =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_f16_f32 =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_f16_f32_vec =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_f16_f16 =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_f16_f16_vec =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_q4_0_f32 =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
|
|
|
|
|
- std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
|
|
|
|
|
- ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
|
|
|
|
|
-
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
|
|
|
|
|
- webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
|
|
|
|
|
- ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
|
|
|
|
|
- "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
|
|
|
|
|
|
|
+ proc_mul_mat_f32_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
|
|
|
|
|
+ proc_mul_mat_f32_f32_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
|
|
|
|
|
+ proc_mul_mat_f16_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
|
|
|
|
|
+ proc_mul_mat_f16_f32_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
|
|
|
|
|
+ proc_mul_mat_f16_f16 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
|
|
|
|
|
+ proc_mul_mat_f16_f16_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
|
|
|
|
|
+ proc_mul_mat_q4_0_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
|
|
|
|
|
+ proc_mul_mat_q4_0_f32_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
}
|
|
}
|
|
|
|
|
+#endif
|
|
|
|
|
+
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_f32_f32.c_str(), "mul_mat_f32_f32", mul_mat_constants);
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_f32_f32_vec.c_str(), "mul_mat_f32_f32_vec", mul_mat_constants);
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_f16_f32.c_str(), "mul_mat_f16_f32", mul_mat_constants);
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_f16_f32_vec.c_str(), "mul_mat_f16_f32_vec", mul_mat_constants);
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_f16_f16.c_str(), "mul_mat_f16_f16", mul_mat_constants);
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_f16_f16_vec.c_str(), "mul_mat_f16_f16_vec", mul_mat_constants);
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_q4_0_f32.c_str(), "mul_mat_q4_0_f32", mul_mat_constants);
|
|
|
|
|
+ webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
|
|
|
|
|
+ webgpu_ctx->device, proc_mul_mat_q4_0_f32_vec.c_str(), "mul_mat_q4_0_f32_vec", mul_mat_constants);
|
|
|
|
|
|
|
|
std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
|
|
std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
|
|
|
mul_mat_vec_constants[0].key = "WORKGROUP_SIZE";
|
|
mul_mat_vec_constants[0].key = "WORKGROUP_SIZE";
|
|
@@ -2384,13 +2364,17 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
|
|
|
|
|
webgpu_context ctx = reg_ctx->webgpu_ctx;
|
|
webgpu_context ctx = reg_ctx->webgpu_ctx;
|
|
|
|
|
|
|
|
|
|
+ wgpu::RequestAdapterOptions options = {};
|
|
|
|
|
+
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
// TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
|
|
// TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
|
|
|
const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
|
|
const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
|
|
|
wgpu::DawnTogglesDescriptor adapterTogglesDesc;
|
|
wgpu::DawnTogglesDescriptor adapterTogglesDesc;
|
|
|
adapterTogglesDesc.enabledToggles = adapterEnabledToggles;
|
|
adapterTogglesDesc.enabledToggles = adapterEnabledToggles;
|
|
|
adapterTogglesDesc.enabledToggleCount = 2;
|
|
adapterTogglesDesc.enabledToggleCount = 2;
|
|
|
- wgpu::RequestAdapterOptions options = {};
|
|
|
|
|
options.nextInChain = &adapterTogglesDesc;
|
|
options.nextInChain = &adapterTogglesDesc;
|
|
|
|
|
+#endif
|
|
|
|
|
+
|
|
|
ctx->instance.WaitAny(ctx->instance.RequestAdapter(
|
|
ctx->instance.WaitAny(ctx->instance.RequestAdapter(
|
|
|
&options, wgpu::CallbackMode::AllowSpontaneous,
|
|
&options, wgpu::CallbackMode::AllowSpontaneous,
|
|
|
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
|
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
|
@@ -2406,11 +2390,13 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
ctx->adapter.GetLimits(&ctx->limits);
|
|
ctx->adapter.GetLimits(&ctx->limits);
|
|
|
ctx->max_wg_size_x = 288; // default value
|
|
ctx->max_wg_size_x = 288; // default value
|
|
|
|
|
|
|
|
- wgpu::AdapterInfo info{};
|
|
|
|
|
|
|
+ wgpu::AdapterInfo info{};
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
|
|
wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
|
|
|
if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
|
|
if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
|
|
|
info.nextInChain = &subgroup_matrix_configs;
|
|
info.nextInChain = &subgroup_matrix_configs;
|
|
|
}
|
|
}
|
|
|
|
|
+#endif
|
|
|
ctx->adapter.GetInfo(&info);
|
|
ctx->adapter.GetInfo(&info);
|
|
|
|
|
|
|
|
wgpu::SupportedFeatures features;
|
|
wgpu::SupportedFeatures features;
|
|
@@ -2418,6 +2404,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
// we require f16 support
|
|
// we require f16 support
|
|
|
GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
|
|
GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
|
|
|
|
|
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
// Only support square f16 matrices of size 8 or 16 for now
|
|
// Only support square f16 matrices of size 8 or 16 for now
|
|
|
bool valid_subgroup_matrix_config = false;
|
|
bool valid_subgroup_matrix_config = false;
|
|
|
if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
|
|
if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
|
|
@@ -2433,36 +2420,27 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
|
|
|
|
|
+#endif
|
|
|
// For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
|
|
// For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
|
|
|
// Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
|
|
// Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
|
|
|
- ctx->subgroup_size = info.subgroupMaxSize;
|
|
|
|
|
- ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
|
|
|
|
|
|
|
+ ctx->subgroup_size = info.subgroupMaxSize;
|
|
|
|
|
|
|
|
// Initialize device
|
|
// Initialize device
|
|
|
- std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
|
|
|
|
|
- wgpu::FeatureName::ImplicitDeviceSynchronization };
|
|
|
|
|
|
|
+ std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
|
|
|
|
|
+
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
|
|
+ required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
|
|
|
if (ctx->supports_subgroup_matrix) {
|
|
if (ctx->supports_subgroup_matrix) {
|
|
|
required_features.push_back(wgpu::FeatureName::Subgroups);
|
|
required_features.push_back(wgpu::FeatureName::Subgroups);
|
|
|
required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
|
|
required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
|
|
|
}
|
|
}
|
|
|
|
|
+#endif
|
|
|
|
|
|
|
|
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
|
required_features.push_back(wgpu::FeatureName::TimestampQuery);
|
|
required_features.push_back(wgpu::FeatureName::TimestampQuery);
|
|
|
#endif
|
|
#endif
|
|
|
|
|
|
|
|
- // Enable Dawn-specific toggles to increase native performance
|
|
|
|
|
- // TODO: Don't enable for WASM builds, they won't have an effect anyways
|
|
|
|
|
- // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
|
|
|
|
|
- // only for native performance?
|
|
|
|
|
- const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init",
|
|
|
|
|
- "disable_polyfills_on_integer_div_and_mod" };
|
|
|
|
|
- const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
|
|
|
|
|
- wgpu::DawnTogglesDescriptor deviceTogglesDesc;
|
|
|
|
|
- deviceTogglesDesc.enabledToggles = deviceEnabledToggles;
|
|
|
|
|
- deviceTogglesDesc.enabledToggleCount = 4;
|
|
|
|
|
- deviceTogglesDesc.disabledToggles = deviceDisabledToggles;
|
|
|
|
|
- deviceTogglesDesc.disabledToggleCount = 1;
|
|
|
|
|
-
|
|
|
|
|
wgpu::DeviceDescriptor dev_desc;
|
|
wgpu::DeviceDescriptor dev_desc;
|
|
|
dev_desc.requiredLimits = &ctx->limits;
|
|
dev_desc.requiredLimits = &ctx->limits;
|
|
|
dev_desc.requiredFeatures = required_features.data();
|
|
dev_desc.requiredFeatures = required_features.data();
|
|
@@ -2480,7 +2458,23 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
|
|
GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
|
|
|
std::string(message).c_str());
|
|
std::string(message).c_str());
|
|
|
});
|
|
});
|
|
|
|
|
+
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
|
|
+ // Enable Dawn-specific toggles to increase native performance
|
|
|
|
|
+ // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
|
|
|
|
|
+ // only for native performance?
|
|
|
|
|
+ const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init",
|
|
|
|
|
+ "disable_polyfills_on_integer_div_and_mod" };
|
|
|
|
|
+ const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
|
|
|
|
|
+ wgpu::DawnTogglesDescriptor deviceTogglesDesc;
|
|
|
|
|
+ deviceTogglesDesc.enabledToggles = deviceEnabledToggles;
|
|
|
|
|
+ deviceTogglesDesc.enabledToggleCount = 4;
|
|
|
|
|
+ deviceTogglesDesc.disabledToggles = deviceDisabledToggles;
|
|
|
|
|
+ deviceTogglesDesc.disabledToggleCount = 1;
|
|
|
|
|
+
|
|
|
dev_desc.nextInChain = &deviceTogglesDesc;
|
|
dev_desc.nextInChain = &deviceTogglesDesc;
|
|
|
|
|
+#endif
|
|
|
|
|
+
|
|
|
ctx->instance.WaitAny(ctx->adapter.RequestDevice(
|
|
ctx->instance.WaitAny(ctx->adapter.RequestDevice(
|
|
|
&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
|
|
&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
|
|
|
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
|
|
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
|
|
@@ -2578,18 +2572,27 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
|
|
|
ctx.name = GGML_WEBGPU_NAME;
|
|
ctx.name = GGML_WEBGPU_NAME;
|
|
|
ctx.device_count = 1;
|
|
ctx.device_count = 1;
|
|
|
|
|
|
|
|
- const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
|
|
|
|
|
-
|
|
|
|
|
- wgpu::DawnTogglesDescriptor instanceTogglesDesc;
|
|
|
|
|
- instanceTogglesDesc.enabledToggles = instanceEnabledToggles;
|
|
|
|
|
- instanceTogglesDesc.enabledToggleCount = 1;
|
|
|
|
|
wgpu::InstanceDescriptor instance_descriptor{};
|
|
wgpu::InstanceDescriptor instance_descriptor{};
|
|
|
std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
|
|
std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
|
|
|
instance_descriptor.requiredFeatures = instance_features.data();
|
|
instance_descriptor.requiredFeatures = instance_features.data();
|
|
|
instance_descriptor.requiredFeatureCount = instance_features.size();
|
|
instance_descriptor.requiredFeatureCount = instance_features.size();
|
|
|
- instance_descriptor.nextInChain = &instanceTogglesDesc;
|
|
|
|
|
|
|
+
|
|
|
|
|
+#ifndef __EMSCRIPTEN__
|
|
|
|
|
+ const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
|
|
|
|
|
+ wgpu::DawnTogglesDescriptor instanceTogglesDesc;
|
|
|
|
|
+ instanceTogglesDesc.enabledToggles = instanceEnabledToggles;
|
|
|
|
|
+ instanceTogglesDesc.enabledToggleCount = 1;
|
|
|
|
|
+ instance_descriptor.nextInChain = &instanceTogglesDesc;
|
|
|
|
|
+#endif
|
|
|
|
|
|
|
|
webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
|
|
webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
|
|
|
|
|
+
|
|
|
|
|
+#ifdef __EMSCRIPTEN__
|
|
|
|
|
+ if (webgpu_ctx->instance == nullptr) {
|
|
|
|
|
+ GGML_LOG_ERROR("ggml_webgpu: Failed to create WebGPU instance. Make sure either -sASYNCIFY or -sJSPI is set\n");
|
|
|
|
|
+ return nullptr;
|
|
|
|
|
+ }
|
|
|
|
|
+#endif
|
|
|
GGML_ASSERT(webgpu_ctx->instance != nullptr);
|
|
GGML_ASSERT(webgpu_ctx->instance != nullptr);
|
|
|
|
|
|
|
|
static ggml_backend_reg reg = {
|
|
static ggml_backend_reg reg = {
|