@@ -1,6 +1,6 @@
 #include "ggml-vulkan.h"
 #include <vulkan/vulkan_core.h>
-#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
+#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
 #include <chrono>
 #include "ggml-cpu.h"
 #endif
@@ -184,9 +184,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
 #endif
-#ifdef GGML_VULKAN_PERF
 class vk_perf_logger;
-#endif
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
@@ -442,9 +440,11 @@ struct vk_device_struct {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
     std::unique_ptr<vk_memory_logger> memory_logger;
 #endif
-#ifdef GGML_VULKAN_PERF
+
+    // for GGML_VK_PERF_LOGGER
     std::unique_ptr<vk_perf_logger> perf_logger;
-#endif
+    vk::QueryPool query_pool;
+    uint32_t num_queries;
 
     ~vk_device_struct() {
         VK_LOG_DEBUG("destroy device " << name);
@@ -828,8 +828,6 @@ private:
 #define VK_LOG_MEMORY(msg) ((void) 0)
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
-#if defined(GGML_VULKAN_PERF)
-
 class vk_perf_logger {
 public:
     void print_timings() {
@@ -839,7 +837,7 @@ public:
             for (const auto& time : t.second) {
                 total += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
         }
 
         timings.clear();
@@ -868,7 +866,6 @@ public:
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
 };
-#endif // GGML_VULKAN_PERF
 
 struct ggml_backend_vk_context {
     std::string name;
@@ -958,6 +955,8 @@ struct vk_instance_t {
 static bool vk_instance_initialized = false;
 static vk_instance_t vk_instance;
 
+static bool vk_perf_logger_enabled = false;
+
 #ifdef GGML_VULKAN_CHECK_RESULTS
 static size_t vk_skip_checks;
 static size_t vk_output_tensor;
@@ -2757,9 +2756,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
         device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
 #endif
-#ifdef GGML_VULKAN_PERF
-        device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
-#endif
+        if (vk_perf_logger_enabled) {
+            device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
+        }
 
         size_t dev_num = vk_instance.device_indices[idx];
 
@@ -3547,6 +3546,8 @@ static void ggml_vk_instance_init() {
     vk_instance.instance = vk::createInstance(instance_create_info);
     vk_instance_initialized = true;
 
+    vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+
     size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
 
     // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -8885,7 +8886,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
 
-#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
+#if defined(GGML_VULKAN_CHECK_RESULTS)
     // Force context reset on each node so that each tensor ends up in its own context
     // and can be run and compared to its CPU equivalent separately
     last_node = true;
@@ -9505,6 +9506,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     bool first_node_in_batch = true; // true if next node will be first node in a batch
     int submit_node_idx = 0; // index to first node in a batch
 
+    vk_context compute_ctx;
+    if (vk_perf_logger_enabled) {
+        // allocate/resize the query pool
+        if (ctx->device->num_queries < cgraph->n_nodes + 1) {
+            if (ctx->device->query_pool) {
+                ctx->device->device.destroyQueryPool(ctx->device->query_pool);
+            }
+            VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
+            query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+            query_create_info.queryCount = cgraph->n_nodes + 100;
+            ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
+            ctx->device->num_queries = query_create_info.queryCount;
+        }
+
+        ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
+
+        GGML_ASSERT(ctx->compute_ctx.expired());
+        compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+        ctx->compute_ctx = compute_ctx;
+        ggml_vk_ctx_begin(ctx->device, compute_ctx);
+        compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
+    }
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
@@ -9532,6 +9556,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 
         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
 
+        if (vk_perf_logger_enabled) {
+            if (ctx->compute_ctx.expired()) {
+                compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+                ctx->compute_ctx = compute_ctx;
+                ggml_vk_ctx_begin(ctx->device, compute_ctx);
+            } else {
+                compute_ctx = ctx->compute_ctx.lock();
+            }
+            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
+        }
+
         if (enqueued) {
             ++submitted_nodes;
 
@@ -9553,9 +9588,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         }
     }
 
-#ifdef GGML_VULKAN_PERF
-    ctx->device->perf_logger->print_timings();
-#endif
+    if (vk_perf_logger_enabled) {
+        // End the command buffer and submit/wait
+        GGML_ASSERT(!ctx->compute_ctx.expired());
+        compute_ctx = ctx->compute_ctx.lock();
+        ggml_vk_ctx_end(compute_ctx);
+
+        ggml_vk_submit(compute_ctx, ctx->device->fence);
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
+        ctx->device->device.resetFences({ ctx->device->fence });
+
+        // Get the results and pass them to the logger
+        std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
+        ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (!ggml_vk_is_empty(cgraph->nodes[i])) {
+                ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
+            }
+        }
+
+        ctx->device->perf_logger->print_timings();
+    }
 
     ggml_vk_graph_cleanup(ctx);
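
Note: with this change the perf logger is enabled at runtime through the GGML_VK_PERF_LOGGER environment variable rather than the compile-time GGML_VULKAN_PERF define, and it times nodes with GPU timestamp queries (in nanoseconds) instead of forcing each node into its own context. Since print_timings() divides the averages by 1000.0, the printed unit label changes from "ms" to "us" to match. A typical invocation looks like this (the binary name and model path are illustrative):

    GGML_VK_PERF_LOGGER=1 ./llama-bench -m model.gguf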
|
|
|
|
|
|
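For reference, here is a minimal sketch of the timestamp-query pattern the new code path follows, written against the same vulkan.hpp API: one timestamp is written before the first node and one after each node, the raw ticks are read back once the submission has completed, and adjacent differences are scaled by VkPhysicalDeviceLimits::timestampPeriod into nanoseconds. The helper names are hypothetical; the sketch assumes an already-created vk::Device, a command buffer in the recording state, and host query reset (core in Vulkan 1.2), which the patch itself also relies on.

#include <vulkan/vulkan.hpp>
#include <cstdint>
#include <vector>

// Create and host-reset a timestamp query pool with one query per
// measurement point (host reset is core in Vulkan 1.2).
static vk::QueryPool create_timestamp_pool(vk::Device device, uint32_t query_count) {
    vk::QueryPoolCreateInfo info{};
    info.queryType  = vk::QueryType::eTimestamp;
    info.queryCount = query_count;
    vk::QueryPool pool = device.createQueryPool(info);
    device.resetQueryPool(pool, 0, query_count);
    return pool;
}

// While recording: write a timestamp that lands after all prior commands.
// The caller records the measured work between successive calls.
static void stamp(vk::CommandBuffer cmd, vk::QueryPool pool, uint32_t query) {
    cmd.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, pool, query);
}

// After the submission has completed (e.g. after waiting on its fence):
// fetch the raw 64-bit ticks and convert adjacent differences to
// nanoseconds using timestampPeriod from the physical-device limits.
static std::vector<uint64_t> read_durations_ns(vk::Device device, vk::QueryPool pool,
                                               uint32_t query_count, float timestamp_period) {
    std::vector<uint64_t> ticks(query_count);
    (void) device.getQueryPoolResults(pool, 0, query_count,
                                      query_count * sizeof(uint64_t), ticks.data(),
                                      sizeof(uint64_t),
                                      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
    std::vector<uint64_t> ns(query_count - 1);
    for (uint32_t i = 0; i + 1 < query_count; ++i) {
        ns[i] = uint64_t((ticks[i + 1] - ticks[i]) * timestamp_period);
    }
    return ns;
}

In the patch above, query 0 is the timestamp written before the first node and query i+1 the one written after node i, so node i's duration is timestamps[i+1] - timestamps[i], which is exactly what gets passed to log_timing(). The caller is responsible for destroying the pool with device.destroyQueryPool() when it is no longer needed, as the patch does when resizing it.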