@@ -149,6 +149,8 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include <cblas.h>
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #undef MIN
@@ -4363,6 +4365,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         // initialize cuBLAS
         #if defined(GGML_USE_CUBLAS)
         ggml_init_cublas();
+        #elif defined(GGML_USE_CLBLAST)
+        ggml_cl_init();
         #endif
 
         is_first_call = false;
@@ -8104,7 +8108,7 @@ static void ggml_compute_forward_rms_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8129,6 +8133,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
     return false;
 }
+
 #endif
 
 static void ggml_compute_forward_mul_mat_f32(
@@ -8144,7 +8149,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -8201,7 +8206,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8250,8 +8255,15 @@ static void ggml_compute_forward_mul_mat_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#else
+#elif defined(GGML_USE_CLBLAST)
                 // zT = y * xT
+                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01,
+                        GGML_TYPE_F32);
+#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
@@ -8395,7 +8407,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -8472,6 +8484,19 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
+#elif defined(GGML_USE_CLBLAST)
+                const float * x = wdata;
+                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+
+                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+
+                // zT = y * xT
+                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01,
+                        GGML_TYPE_F32);
 #else
                 const float * x = wdata;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
@@ -8646,7 +8671,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8698,7 +8723,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
         else {
             GGML_ASSERT(false);
         }
-#else
+#elif !defined(GGML_USE_CLBLAST)
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 #endif
@@ -8717,6 +8742,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
                 CUDA_CHECK(cudaGetLastError());
+#elif defined(GGML_USE_CLBLAST)
+                const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
                 {
                     size_t id = 0;
@@ -8743,8 +8770,15 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#else
+#elif defined(GGML_USE_CLBLAST)
                 // zT = y * xT
+                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01,
+                        type);
+#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
@@ -11583,7 +11617,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     size_t cur = 0;
 
                     if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
@@ -11600,7 +11634,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                         cur = 0;
                     } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1;
                             cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -13100,7 +13134,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     return 1;
 #else
     return 0;
@@ -13115,6 +13149,18 @@ int ggml_cpu_has_cublas(void) {
 #endif
 }
 
+int ggml_cpu_has_clblast(void) {
+#if defined(GGML_USE_CLBLAST)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_gpublas(void) {
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
+}
+
 int ggml_cpu_has_sse3(void) {
 #if defined(__SSE3__)
     return 1;