ggml webgpu: add support for emscripten builds (#17184)

* Faster tensors (#8)

Add fast matrix and matrix/vector multiplication.

* Use map for shader replacements instead of pair of strings

* Wasm (#9)

* webgpu : fix build on emscripten

* more debugging stuff

* test-backend-ops: force single thread on wasm

* fix single-thread case for init_tensor_uniform

* use jspi

* add pthread

* test: remember to set n_thread for cpu backend

* Add buffer label and enable dawn-specific toggles to turn off some checks

* Intermediate state

* Fast working f16/f32 vec4

* Working float fast mul mat

* Clean up naming of mul_mat to match logical model, start work on q mul_mat

* Setup for subgroup matrix mat mul

* Basic working subgroup matrix

* Working subgroup matrix tiling

* Handle weirder sg matrix sizes (but still a multiple of the sg matrix size)

* Initial working gemv

* Working f16 accumulation with shared-memory staging

* Print out available subgroup matrix configurations

* Vectorize dst stores for sg matrix shader

* Working scalar gemv

* Minor set_rows optimization (#4)

* Updated optimization, fixed errors

* Non-vectorized version now dispatches one thread per element

* Simplify

* Change logic for set_rows pipelines

---------

Co-authored-by: Neha Abbas <nehaabbas@macbookpro.lan>
Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
Co-authored-by: Reese Levine <reeselevine1@gmail.com>

* Comment on dawn toggles

* Working subgroup matrix code for (semi)generic sizes

* Remove some comments

* Cleanup code

* Update dawn version and move to portable subgroup size

* Try to fix new dawn release

* Update subgroup size comment

* Only check for subgroup matrix configs if they are supported

* Add toggles for subgroup matrix/f16 support on nvidia+vulkan

* Make row/col naming consistent

* Refactor shared memory loading

* Move sg matrix stores to correct file

* Working q4_0

* Formatting

* Work with emscripten builds

* Fix test-backend-ops emscripten for f16/quantized types

* Use emscripten memory64 to support get_memory

* Add build flags and try ci

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* Remove extra whitespace

* Move wasm single-thread logic out of test-backend-ops for cpu backend

* Disable multiple threads for emscripten single-thread builds in ggml_graph_plan

* Fix .gitignore

* Add memory64 option and remove unneeded macros for setting threads to 1

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Reese Levine 1 month ago
parent commit 7ca5991d2b

+ 40 - 0
.github/workflows/build.yml

@@ -547,6 +547,46 @@ jobs:
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 3600
 
+  ubuntu-24-wasm-webgpu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-wasm-webgpu
+          evict-old-files: 1d
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20251027.212519"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_CURL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.1.2

+ 2 - 0
.gitignore

@@ -134,3 +134,5 @@ poetry.toml
 # IDE
 /*.code-workspace
 /.windsurf/
+# emscripten
+a.out.*

+ 15 - 1
CMakeLists.txt

@@ -33,10 +33,24 @@ endif()
 
 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
 
+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)
 
-    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+    # Use 64-bit memory to support backend_get_memory queries
+    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+    if (LLAMA_WASM_MEM64)
+      add_compile_options("-sMEMORY64=1")
+      add_link_options("-sMEMORY64=1")
+    endif()
+    add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+    if (LLAMA_BUILD_HTML)
+        set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    endif()
 else()
     if (MINGW)
         set(BUILD_SHARED_LIBS_DEFAULT OFF)
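
Why MEMORY64 defaults to ON: ggml memory queries (the backend_get_memory path named in the comment above) report sizes through size_t, and on wasm32 size_t is 32 bits, so anything at or above 4 GiB would truncate. A minimal sketch of the failure mode, assuming only standard C++ (no ggml calls):

```cpp
// Sketch (not ggml code): why MEMORY64 matters for memory reporting.
// On wasm32, size_t is 32 bits, so the cast below truncates 8 GiB;
// with -sMEMORY64, size_t is 64 bits and the value survives.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t device_mem = 8ull << 30;          // 8 GiB reported by the adapter
    const size_t   reported   = (size_t) device_mem; // lossy on wasm32, lossless on 64-bit
    printf("sizeof(size_t) = %zu, reported = %zu bytes\n", sizeof(size_t), reported);
    return 0;
}
```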

+ 3 - 0
common/arg.cpp

@@ -30,6 +30,7 @@
 #include <thread> // for hardware_concurrency
 #include <vector>
 
+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -41,6 +42,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
 using json = nlohmann::ordered_json;

+ 2 - 0
common/common.cpp

@@ -902,6 +902,8 @@ std::string fs_get_cache_directory() {
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
         cache_directory = std::getenv("LOCALAPPDATA");
+#elif defined(__EMSCRIPTEN__)
+        GGML_ABORT("not implemented on this platform");
 #else
 #  error Unknown architecture
 #endif

+ 3 - 0
common/download.cpp

@@ -24,6 +24,7 @@
 #include "http.h"
 #endif
 
+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -35,6 +36,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
 // isatty

+ 1 - 1
ggml/CMakeLists.txt

@@ -226,7 +226,7 @@ option(GGML_WEBGPU                          "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
 option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)"             OFF)
 option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
-
+option(GGML_WEBGPU_JSPI                     "ggml: use JSPI for WebGPU"                       ON)
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)

+ 5 - 0
ggml/src/ggml-cpu/ggml-cpu.c

@@ -2698,6 +2698,11 @@ struct ggml_cplan ggml_graph_plan(
         n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
     }
 
+#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+    // Emscripten without pthreads support can only use a single thread
+    n_threads = 1;
+#endif
+
     size_t work_size = 0;
 
     struct ggml_cplan cplan;
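
The clamp above relies on __EMSCRIPTEN_PTHREADS__, which Emscripten defines only when the build enables pthreads (-pthread); without it there are no workers for ggml's threadpool to run on. A standalone sketch of the same guard, using a hypothetical helper name:

```cpp
// Sketch of the guard with a hypothetical helper (not part of ggml):
// __EMSCRIPTEN_PTHREADS__ is defined only when building with -pthread.
#include <cstdio>

static int effective_n_threads(int requested) {
#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
    (void) requested;
    return 1; // no worker threads available in this build
#else
    return requested;
#endif
}

int main() {
    printf("n_threads = %d\n", effective_n_threads(8));
    return 0;
}
```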

+ 20 - 2
ggml/src/ggml-webgpu/CMakeLists.txt

@@ -39,8 +39,23 @@ add_dependencies(ggml-webgpu generate_shaders)
 if(EMSCRIPTEN)
     set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg")
 
-    target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
-    target_link_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+    if(NOT EMDAWNWEBGPU_DIR)
+        # default built-in port
+        target_compile_options(ggml-webgpu PRIVATE "--use-port=emdawnwebgpu")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=emdawnwebgpu")
+    else()
+        # custom port
+        target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+    endif()
+
+    if (GGML_WEBGPU_JSPI)
+        target_compile_options(ggml-webgpu PRIVATE "-fwasm-exceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sJSPI" "-fwasm-exceptions")
+    else()
+        target_compile_options(ggml-webgpu PRIVATE "-fexceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sASYNCIFY" "-exceptions")
+    endif()
 else()
     find_package(Dawn REQUIRED)
     set(DawnWebGPU_TARGET dawn::webgpu_dawn)
@@ -48,6 +63,9 @@ endif()
 
 if (GGML_WEBGPU_DEBUG)
     target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1)
+    if(EMSCRIPTEN)
+        target_link_options(ggml-webgpu INTERFACE "-sASSERTIONS=2")
+    endif()
 endif()
 
 if (GGML_WEBGPU_CPU_PROFILE)
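
A note on the two modes above: -sJSPI uses the JavaScript Promise Integration proposal to suspend the wasm stack while WebGPU promises settle, and pairs with native wasm exceptions (-fwasm-exceptions); -sASYNCIFY achieves the same suspension by instrumenting the generated code, which runs in more browsers but costs binary size and speed. That trade-off is presumably why GGML_WEBGPU_JSPI defaults to ON in ggml/CMakeLists.txt.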

+ 116 - 113
ggml/src/ggml-webgpu/ggml-webgpu.cpp

@@ -9,6 +9,10 @@
 #include "ggml-impl.h"
 #include "ggml-wgsl-shaders.hpp"
 
+#ifdef __EMSCRIPTEN__
+#    include <emscripten/emscripten.h>
+#endif
+
 #include <webgpu/webgpu_cpp.h>
 
 #include <atomic>
@@ -261,9 +265,12 @@ struct webgpu_context_struct {
     wgpu::Queue    queue;
     wgpu::Limits   limits;
 
+    uint32_t subgroup_size;
+
+#ifndef __EMSCRIPTEN__
     bool                       supports_subgroup_matrix = false;
-    uint32_t                   subgroup_size;
     wgpu::SubgroupMatrixConfig subgroup_matrix_config;
+#endif
 
     // Separate this out from limits since on some Metal systems, the limit returned by
     // querying the limits is higher than the actual allowed maximum.
@@ -449,8 +456,8 @@ static void ggml_backend_webgpu_wait(webgpu_context &                         ct
     // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
     // inflight_max may be 0, meaning that we must wait on all futures.
     uint64_t timeout_ms       = block ? UINT64_MAX : 0;
-    uint     inflight_threads = ctx->inflight_threads;
-    uint     inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
+    uint32_t inflight_threads = ctx->inflight_threads;
+    uint32_t inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
     while (futures.size() >= inflight_max && futures.size() > 0) {
         ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
         futures.erase(futures.begin());
@@ -986,6 +993,7 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
             pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
             uint32_t wg_m;
             uint32_t wg_n;
+#ifndef __EMSCRIPTEN__
             if (ctx->supports_subgroup_matrix) {
                 // The total number of subgroups/workgroups needed per matrix.
                 uint32_t wg_m_sg_tile =
@@ -995,11 +1003,15 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
                     WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
                 wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
             } else {
+#endif
                 uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
                 uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
                 wg_m              = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
                 wg_n              = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
+#ifndef __EMSCRIPTEN__
             }
+#endif
+
             wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
         }
     }
@@ -1419,9 +1431,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
             commands.push_back(*cmd);
         }
         // compute the batch size based on the number of inflight threads
-        uint inflight_threads = ctx->inflight_threads;
-        uint batch_size       = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
-                                         WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
+        uint32_t inflight_threads = ctx->inflight_threads;
+        uint32_t batch_size       = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
+                                             WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
         if (commands.size() >= batch_size) {
             futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
             // Process events and check for completed submissions
@@ -1758,6 +1770,17 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
                                 wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
 
+    std::string proc_mul_mat_f32_f32;
+    std::string proc_mul_mat_f32_f32_vec;
+    std::string proc_mul_mat_f16_f32;
+    std::string proc_mul_mat_f16_f32_vec;
+    std::string proc_mul_mat_f16_f16;
+    std::string proc_mul_mat_f16_f16_vec;
+    std::string proc_mul_mat_q4_0_f32;
+    std::string proc_mul_mat_q4_0_f32_vec;
+
+    std::vector<wgpu::ConstantEntry> mul_mat_constants;
+#ifndef __EMSCRIPTEN__
     if (webgpu_ctx->supports_subgroup_matrix) {
         std::map<std::string, std::string> sg_matrix_repls;
         sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
@@ -1770,100 +1793,57 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
         sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
         sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
 
-        std::string proc_mul_mat_subgroup_matrix_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
+        proc_mul_mat_f32_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
+        proc_mul_mat_f32_f32_vec =
             ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
+        proc_mul_mat_f16_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
+        proc_mul_mat_f16_f32_vec =
             ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
+        proc_mul_mat_f16_f16 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
+        proc_mul_mat_f16_f16_vec =
             ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
+        proc_mul_mat_q4_0_f32 =
             ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
+        proc_mul_mat_q4_0_f32_vec =
             ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f32_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f16_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_q4_0_f32_vec");
     } else {
-        std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
-        mul_mat_reg_tile_constants[0].key   = "TILE_K";
-        mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
-        mul_mat_reg_tile_constants[1].key   = "WORKGROUP_SIZE_M";
-        mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
-        mul_mat_reg_tile_constants[2].key   = "WORKGROUP_SIZE_N";
-        mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;
+#endif
+        mul_mat_constants.push_back({ .key = "TILE_K", .value = WEBGPU_MUL_MAT_TILE_K });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_M", .value = WEBGPU_MUL_MAT_WG_SIZE_M });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_N", .value = WEBGPU_MUL_MAT_WG_SIZE_N });
 
         std::map<std::string, std::string> reg_repls;
         reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
         reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);
 
-        // Process each reg-tile shader with tile replacements.
-        // Keep the processed strings in-scope so .c_str() remains valid.
-        std::string proc_mul_mat_reg_tile_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f32_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
-                                         "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
-                                         "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
-                                         "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
+        proc_mul_mat_f32_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
+        proc_mul_mat_f32_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
+        proc_mul_mat_f16_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
+        proc_mul_mat_f16_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
+        proc_mul_mat_f16_f16      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
+        proc_mul_mat_f16_f16_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
+        proc_mul_mat_q4_0_f32     = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
+        proc_mul_mat_q4_0_f32_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
+#ifndef __EMSCRIPTEN__
     }
+#endif
+
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32.c_str(), "mul_mat_f32_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32_vec.c_str(), "mul_mat_f32_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32.c_str(), "mul_mat_f16_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32_vec.c_str(), "mul_mat_f16_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16.c_str(), "mul_mat_f16_f16", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16_vec.c_str(), "mul_mat_f16_f16_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32.c_str(), "mul_mat_q4_0_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32_vec.c_str(), "mul_mat_q4_0_f32_vec", mul_mat_constants);
 
     std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
     mul_mat_vec_constants[0].key   = "WORKGROUP_SIZE";
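
The consolidation above leans on ggml_webgpu_process_shader_repls accepting a std::map of placeholder-to-value pairs (the "use map for shader replacements" change from the commit log). Its body is not part of this diff; a plausible self-contained sketch of such a helper:

```cpp
// Hedged sketch of a map-based placeholder substitution helper; the real
// ggml_webgpu_process_shader_repls is not shown in this diff and may differ.
#include <cstdio>
#include <map>
#include <string>

static std::string process_shader_repls(std::string src,
                                        const std::map<std::string, std::string> & repls) {
    for (const auto & [key, value] : repls) {
        size_t pos = 0;
        while ((pos = src.find(key, pos)) != std::string::npos) {
            src.replace(pos, key.size(), value);
            pos += value.size(); // skip past the replacement
        }
    }
    return src;
}

int main() {
    std::map<std::string, std::string> repls = { { "WEBGPU_TILE_M", "4" } };
    std::string wgsl = "const TILE_M = WEBGPU_TILE_M;";
    printf("%s\n", process_shader_repls(wgsl, repls).c_str()); // const TILE_M = 4;
    return 0;
}
```
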
@@ -2384,13 +2364,17 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
 
     webgpu_context ctx = reg_ctx->webgpu_ctx;
 
+    wgpu::RequestAdapterOptions options = {};
+
+#ifndef __EMSCRIPTEN__
     // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
     const char * const          adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
     wgpu::DawnTogglesDescriptor adapterTogglesDesc;
     adapterTogglesDesc.enabledToggles     = adapterEnabledToggles;
     adapterTogglesDesc.enabledToggleCount = 2;
-    wgpu::RequestAdapterOptions options   = {};
     options.nextInChain                   = &adapterTogglesDesc;
+#endif
+
     ctx->instance.WaitAny(ctx->instance.RequestAdapter(
                               &options, wgpu::CallbackMode::AllowSpontaneous,
                               [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
@@ -2406,11 +2390,13 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     ctx->adapter.GetLimits(&ctx->limits);
     ctx->max_wg_size_x = 288;  // default value
 
-    wgpu::AdapterInfo                            info{};
+    wgpu::AdapterInfo info{};
+#ifndef __EMSCRIPTEN__
     wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
     if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
         info.nextInChain = &subgroup_matrix_configs;
     }
+#endif
     ctx->adapter.GetInfo(&info);
 
     wgpu::SupportedFeatures features;
@@ -2418,6 +2404,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     // we require f16 support
     GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
 
+#ifndef __EMSCRIPTEN__
     // Only support square f16 matrices of size 8 or 16 for now
     bool valid_subgroup_matrix_config = false;
     if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
@@ -2433,36 +2420,27 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
         }
     }
 
+    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
+#endif
     // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
     // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
-    ctx->subgroup_size            = info.subgroupMaxSize;
-    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
+    ctx->subgroup_size = info.subgroupMaxSize;
 
     // Initialize device
-    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
-                                                         wgpu::FeatureName::ImplicitDeviceSynchronization };
+    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
+
+#ifndef __EMSCRIPTEN__
+    required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
     if (ctx->supports_subgroup_matrix) {
         required_features.push_back(wgpu::FeatureName::Subgroups);
         required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
     }
+#endif
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
     required_features.push_back(wgpu::FeatureName::TimestampQuery);
 #endif
 
-    // Enable Dawn-specific toggles to increase native performance
-    // TODO: Don't enable for WASM builds, they won't have an effect anyways
-    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
-    //       only for native performance?
-    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
-                                                   "disable_polyfills_on_integer_div_and_mod" };
-    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
-    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
-    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
-    deviceTogglesDesc.enabledToggleCount  = 4;
-    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
-    deviceTogglesDesc.disabledToggleCount = 1;
-
     wgpu::DeviceDescriptor dev_desc;
     dev_desc.requiredLimits       = &ctx->limits;
     dev_desc.requiredFeatures     = required_features.data();
@@ -2480,7 +2458,23 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
             GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
                        std::string(message).c_str());
         });
+
+#ifndef __EMSCRIPTEN__
+    // Enable Dawn-specific toggles to increase native performance
+    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
+    //       only for native performance?
+    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
+                                                   "disable_polyfills_on_integer_div_and_mod" };
+    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
+    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
+    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
+    deviceTogglesDesc.enabledToggleCount  = 4;
+    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
+    deviceTogglesDesc.disabledToggleCount = 1;
+
     dev_desc.nextInChain = &deviceTogglesDesc;
+#endif
+
     ctx->instance.WaitAny(ctx->adapter.RequestDevice(
                               &dev_desc, wgpu::CallbackMode::AllowSpontaneous,
                               [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
@@ -2578,18 +2572,27 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
     ctx.name         = GGML_WEBGPU_NAME;
     ctx.device_count = 1;
 
-    const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
-
-    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
-    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
-    instanceTogglesDesc.enabledToggleCount = 1;
     wgpu::InstanceDescriptor               instance_descriptor{};
     std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
     instance_descriptor.requiredFeatures                     = instance_features.data();
     instance_descriptor.requiredFeatureCount                 = instance_features.size();
-    instance_descriptor.nextInChain                          = &instanceTogglesDesc;
+
+#ifndef __EMSCRIPTEN__
+    const char * const          instanceEnabledToggles[] = { "allow_unsafe_apis" };
+    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
+    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
+    instanceTogglesDesc.enabledToggleCount = 1;
+    instance_descriptor.nextInChain        = &instanceTogglesDesc;
+#endif
 
     webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
+
+#ifdef __EMSCRIPTEN__
+    if (webgpu_ctx->instance == nullptr) {
+        GGML_LOG_ERROR("ggml_webgpu: Failed to create WebGPU instance. Make sure either -sASYNCIFY or -sJSPI is set\n");
+        return nullptr;
+    }
+#endif
     GGML_ASSERT(webgpu_ctx->instance != nullptr);
 
     static ggml_backend_reg reg = {

+ 110 - 0
scripts/serve-static.js

@@ -0,0 +1,110 @@
+const http = require('http');
+const fs = require('fs').promises;
+const path = require('path');
+
+// This file is used for testing wasm build from emscripten
+// Example build command:
+// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_CURL=OFF
+// cmake --build build-wasm --target test-backend-ops -j
+
+const PORT = 8080;
+const STATIC_DIR = path.join(__dirname, '../build-wasm/bin');
+console.log(`Serving static files from: ${STATIC_DIR}`);
+
+const mimeTypes = {
+  '.html': 'text/html',
+  '.js': 'text/javascript',
+  '.css': 'text/css',
+  '.png': 'image/png',
+  '.jpg': 'image/jpeg',
+  '.gif': 'image/gif',
+  '.svg': 'image/svg+xml',
+  '.json': 'application/json',
+  '.woff': 'font/woff',
+  '.woff2': 'font/woff2',
+};
+
+async function generateDirListing(dirPath, reqUrl) {
+  const files = await fs.readdir(dirPath);
+  let html = `
+    <!DOCTYPE html>
+    <html>
+    <head>
+      <title>Directory Listing</title>
+      <style>
+        body { font-family: Arial, sans-serif; padding: 20px; }
+        ul { list-style: none; padding: 0; }
+        li { margin: 5px 0; }
+        a { text-decoration: none; color: #0066cc; }
+        a:hover { text-decoration: underline; }
+      </style>
+    </head>
+    <body>
+      <h1>Directory: ${reqUrl}</h1>
+      <ul>
+  `;
+
+  if (reqUrl !== '/') {
+    html += `<li><a href="../">../ (Parent Directory)</a></li>`;
+  }
+
+  for (const file of files) {
+    const filePath = path.join(dirPath, file);
+    const stats = await fs.stat(filePath);
+    const link = encodeURIComponent(file) + (stats.isDirectory() ? '/' : '');
+    html += `<li><a href="${link}">${file}${stats.isDirectory() ? '/' : ''}</a></li>`;
+  }
+
+  html += `
+      </ul>
+    </body>
+    </html>
+  `;
+  return html;
+}
+
+const server = http.createServer(async (req, res) => {
+  try {
+    // Set COOP and COEP headers
+    res.setHeader('Cross-Origin-Opener-Policy', 'same-origin');
+    res.setHeader('Cross-Origin-Embedder-Policy', 'require-corp');
+    res.setHeader('Cache-Control', 'no-store, no-cache, must-revalidate, proxy-revalidate');
+    res.setHeader('Pragma', 'no-cache');
+    res.setHeader('Expires', '0');
+
+    const filePath = path.join(STATIC_DIR, decodeURIComponent(req.url));
+    const stats = await fs.stat(filePath);
+
+    if (stats.isDirectory()) {
+      const indexPath = path.join(filePath, 'index.html');
+      try {
+        const indexData = await fs.readFile(indexPath);
+        res.writeHeader(200, { 'Content-Type': 'text/html' });
+        res.end(indexData);
+      } catch {
+        // No index.html, generate directory listing
+        const dirListing = await generateDirListing(filePath, req.url);
+        res.writeHeader(200, { 'Content-Type': 'text/html' });
+        res.end(dirListing);
+      }
+    } else {
+      const ext = path.extname(filePath).toLowerCase();
+      const contentType = mimeTypes[ext] || 'application/octet-stream';
+      const data = await fs.readFile(filePath);
+      res.writeHeader(200, { 'Content-Type': contentType });
+      res.end(data);
+    }
+  } catch (err) {
+    if (err.code === 'ENOENT') {
+      res.writeHeader(404, { 'Content-Type': 'text/plain' });
+      res.end('404 Not Found');
+    } else {
+      res.writeHeader(500, { 'Content-Type': 'text/plain' });
+      res.end('500 Internal Server Error');
+    }
+  }
+});
+
+server.listen(PORT, () => {
+  console.log(`Server running at http://localhost:${PORT}/`);
+});
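
To try the wasm build locally (assuming the build commands from this file's header comment), run `node scripts/serve-static.js` and open http://localhost:8080/ in a WebGPU-enabled browser; with LLAMA_BUILD_HTML on, the test binary is emitted under build-wasm/bin as test-backend-ops.html. The COOP/COEP headers set in the handler grant cross-origin isolation, which SharedArrayBuffer (and therefore pthread builds) requires.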

+ 38 - 22
tests/test-backend-ops.cpp

@@ -41,12 +41,18 @@
 #include <vector>
 #include <unordered_map>
 
+#ifdef __EMSCRIPTEN__
+#   define N_THREADS 1
+#else
+#   define N_THREADS std::thread::hardware_concurrency()
+#endif
+
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
     size_t nels = ggml_nelements(tensor);
     std::vector<float> data(nels);
     {
         // parallel initialization
-        static const size_t n_threads = std::thread::hardware_concurrency();
+        static const size_t n_threads = N_THREADS;
         // static RNG initialization (revisit if n_threads stops being constant)
         static std::vector<std::default_random_engine> generators = []() {
             std::random_device rd;
@@ -65,15 +71,19 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         };
 
-        std::vector<std::future<void>> tasks;
-        tasks.reserve(n_threads);
-        for (size_t i = 0; i < n_threads; i++) {
-            size_t start =     i*nels/n_threads;
-            size_t end   = (i+1)*nels/n_threads;
-            tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
-        }
-        for (auto & t : tasks) {
-            t.get();
+        if (n_threads == 1) {
+            init_thread(0, 0, nels);
+        } else {
+            std::vector<std::future<void>> tasks;
+            tasks.reserve(n_threads);
+            for (size_t i = 0; i < n_threads; i++) {
+                size_t start =     i*nels/n_threads;
+                size_t end   = (i+1)*nels/n_threads;
+                tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
+            }
+            for (auto & t : tasks) {
+                t.get();
+            }
         }
     }
 
@@ -105,17 +115,23 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             };
 
             const size_t min_blocks_per_thread = 1;
-            const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
-                                                      std::max<size_t>(1, n_blocks / min_blocks_per_thread));
-            std::vector<std::future<void>> tasks;
-            tasks.reserve(n_threads);
-            for (size_t i = 0; i < n_threads; i++) {
-                size_t start =     i*n_blocks/n_threads;
-                size_t end   = (i+1)*n_blocks/n_threads;
-                tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
-            }
-            for (auto & t : tasks) {
-                t.get();
+            const size_t n_quant_threads = std::min<size_t>(std::max<size_t>(N_THREADS/2, 1),
+                                                            std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+
+            if (n_quant_threads == 1) {
+                // single-threaded quantization: do all blocks in the current thread
+                quantize_thread(0, n_blocks);
+            } else {
+                std::vector<std::future<void>> tasks;
+                tasks.reserve(n_quant_threads);
+                for (size_t i = 0; i < n_quant_threads; i++) {
+                    size_t start =     i*n_blocks/n_quant_threads;
+                    size_t end   = (i+1)*n_blocks/n_quant_threads;
+                    tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
+                }
+                for (auto & t : tasks) {
+                    t.get();
+                }
             }
         }
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
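
With N_THREADS pinned to 1 on Emscripten, both factors in the clamp above collapse to 1 and quantization runs inline instead of through std::async. A standalone sketch of the arithmetic:

```cpp
// Sketch of the n_quant_threads clamp above: cap the thread count by both
// half the hardware threads (at least 1) and the number of blocks.
#include <algorithm>
#include <cstdio>

int main() {
    const size_t n_threads = 1;   // N_THREADS under __EMSCRIPTEN__
    const size_t n_blocks  = 37;
    const size_t min_blocks_per_thread = 1;

    const size_t n_quant_threads =
        std::min(std::max(n_threads / 2, (size_t) 1),
                 std::max((size_t) 1, n_blocks / min_blocks_per_thread));

    printf("n_quant_threads = %zu\n", n_quant_threads); // 1 -> run inline
    return 0;
}
```
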
@@ -8363,7 +8379,7 @@ int main(int argc, char ** argv) {
         auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
         if (ggml_backend_set_n_threads_fn) {
             // TODO: better value for n_threads
-            ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
+            ggml_backend_set_n_threads_fn(backend, N_THREADS);
         }
 
         size_t free, total;  // NOLINT