@@ -4,6 +4,7 @@
 
 #include <future>
 #include <vector>
+#include <cstring>
 
 #if defined(GGML_USE_ACCELERATE)
 #   include <Accelerate/Accelerate.h>
@@ -26,30 +27,6 @@ struct ggml_backend_blas_context {
 #endif
 };
 
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-
 static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@@ -235,7 +212,7 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
 
 // backend interface
 
-static const char * ggml_backend_blas_name(ggml_backend_t backend) {
+static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
     return "BLAS";
 
     GGML_UNUSED(backend);
@@ -285,29 +262,8 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
     GGML_UNUSED(backend);
 }
 
-static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-
-    return (op->op == GGML_OP_MUL_MAT  && ggml_backend_blas_use_blas(op)) ||
-           (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
-            op->src[1]->type == GGML_TYPE_F32 &&
-            ggml_is_matrix(src0) &&
-            ggml_is_matrix(src1) &&
-            ggml_is_contiguous(src0) &&
-            (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
-
-    GGML_UNUSED(backend);
-}
-
 static struct ggml_backend_i blas_backend_i = {
-    /* .get_name                = */ ggml_backend_blas_name,
+    /* .get_name                = */ ggml_backend_blas_get_name,
     /* .free                    = */ ggml_backend_blas_free,
     /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
@@ -319,8 +275,8 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_blas_graph_compute,
-    /* .supports_op             = */ ggml_backend_blas_supports_op,
-    /* .supports_buft           = */ ggml_backend_blas_supports_buft,
+    /* .supports_op             = */ NULL,
+    /* .supports_buft           = */ NULL,
     /* .offload_op              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
@@ -337,7 +293,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_blas_guid(),
         /* .interface = */ blas_backend_i,
-        /* .device    = */ nullptr,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
         /* .context   = */ ctx,
     };
 
@@ -364,3 +320,203 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
     ctx->n_threads = n_threads;
 }
+
+// device interface
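+// (the supports_op/supports_buft logic removed from the backend interface
+// above is reimplemented below at the device level)
+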
+static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
+    return "BLAS";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
+    #if defined(GGML_USE_ACCELERATE)
+        return "Accelerate";
+    #elif defined(GGML_BLAS_USE_MKL)
+        return "MKL";
+    #elif defined(GGML_BLAS_USE_BLIS)
+        return "BLIS";
+    #elif defined(GGML_BLAS_USE_NVPL)
+        return "NVPL";
+    #elif defined(OPENBLAS_VERSION)
+        return "OpenBLAS";
+    #else
+        return "BLAS";
+    #endif
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free  = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_blas_device_get_name(dev);
+    props->description = ggml_backend_blas_device_get_description(dev);
+    props->type        = ggml_backend_blas_device_get_type(dev);
+    ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                = */ false,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ false,
+    };
+}
+
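+// note: buffer_from_host_ptr is the only capability advertised above; the BLAS
+// backend computes in host memory, so existing host pointers can be wrapped
+// directly (see ggml_backend_blas_device_buffer_from_ptr below)
+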
+static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_blas_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT:
+        {
+            // BLAS is usually only faster for large matrices
+            const int64_t ne10 = src1->ne[0];
+
+            const int64_t ne0 = op->ne[0];
+            const int64_t ne1 = op->ne[1];
+
+            // TODO: find the optimal value
+            const int64_t min_batch = 32;
+
+            return ggml_is_contiguous(src0) &&
+                   ggml_is_contiguous(src1) &&
+                   src1->type == GGML_TYPE_F32 &&
+                   (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch);
+        }
+
+        case GGML_OP_OUT_PROD:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   ggml_is_matrix(src0) &&
+                   ggml_is_matrix(src1) &&
+                   ggml_is_contiguous(src0) &&
+                   (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+
+        default:
+            return false;
+    }
+
+    GGML_UNUSED(dev);
+}
+
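+// example: an f32 mat-mul with src0 = 512x512 and src1 = 512x512 has
+// ne0 = ne1 = ne10 = 512 >= min_batch, so it is routed to BLAS; a 4x4
+// multiply fails the size check and falls back to another backend
+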
+static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
+    /* .get_name             = */ ggml_backend_blas_device_get_name,
+    /* .get_description      = */ ggml_backend_blas_device_get_description,
+    /* .get_memory           = */ ggml_backend_blas_device_get_memory,
+    /* .get_type             = */ ggml_backend_blas_device_get_type,
+    /* .get_props            = */ ggml_backend_blas_device_get_props,
+    /* .init_backend         = */ ggml_backend_blas_device_init,
+    /* .get_buffer_type      = */ ggml_backend_blas_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_blas_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_blas_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
+    return "BLAS";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_device ggml_backend_blas_device = {
+        /* .iface   = */ ggml_backend_blas_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_blas_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
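+// note: the device is a function-local static, so a single BLAS device
+// instance is created lazily on the first call and reused afterwards
+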
+static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_blas_set_n_threads;
+    }
+    return NULL;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(name);
+}
+
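+// note: this lets callers look up the backend-specific
+// ggml_backend_blas_set_n_threads under the generic name
+// "ggml_backend_set_n_threads" without linking the symbol directly;
+// the returned pointer is expected to match the (ggml_backend_t, int) signature
+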
+static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
+    /* .get_name         = */ ggml_backend_blas_reg_get_name,
+    /* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_blas_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_blas_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_blas_reg(void) {
+    static struct ggml_backend_reg ggml_backend_blas_reg = {
+        /* .iface   = */ ggml_backend_blas_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_blas_reg;
+}
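+
+// usage sketch (assumes the registry API introduced by this change):
+//
+//   ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0);
+//   ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); // same as ggml_backend_blas_init()
+//   ggml_backend_free(backend);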