@@ -2541,27 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -2637,6 +2616,7 @@ struct ggml_backend_cann_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2713,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }
 
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the configured minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param dev Pointer to the CANN backend device.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *
@@ -2829,12 +2829,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
         if (!initialized) {
             aclInit(nullptr);
             ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_cann_info().device_count; i++) {
                 ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
                 dev_ctx->description = aclrtGetSocName();
                 dev_ctx->device = i;
                 dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
                 ggml_cann_set_device(i);
                 ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
                                                                   /* .reg = */ &reg,