@@ -249,9 +249,8 @@ class tinyBLAS {
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
     }

   private:
@@ -458,9 +457,8 @@ class tinyBLAS_Q0_ARM {
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
     }

   private:
@@ -596,9 +594,8 @@ class tinyBLAS_Q0_AVX {
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
     }

   private:
@@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
  * For example, for single-threaded single-precision GEMM you can say
  *
  *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
- *                     0, 1, GGML_TASK_TYPE_COMPUTE,
+ *                     0, 1,
  *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
  *
  * @param m is rows in `A` and `C`
@@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
  * @param ldc is row stride of `C`
  * @param ith is thread id (must be less than `nth`)
  * @param nth is number of threads (must be greater than zero)
- * @param task is GGML task type
  * @param Atype is GGML data type of `A`
  * @param Btype is GGML data type of `B`
  * @param Ctype is GGML data type of `C`
  * @return true if this function was able to service the matmul request
  */
 bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
-                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
+                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
     assert(m >= 0);
     assert(n >= 0);
     assert(k >= 0);
@@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__AVX__) || defined(__AVX2__)
         if (k % 8)
@@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_NEON)
         if (n < 4)
@@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;
@@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
         if (k % 8)
@@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
         if (n < 8)
@@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const ggml_fp16_t *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_NEON) && !defined(_MSC_VER)
         if (k % 4)
@@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;
@@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_DOTPROD)
         tinyBLAS_Q0_ARM<block_q8_0> tb{
@@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;
@@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_DOTPROD)
         tinyBLAS_Q0_ARM<block_q4_0> tb{
@@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;
@@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     (void)ldc;
     (void)ith;
     (void)nth;
-    (void)task;
     (void)Atype;
     (void)Btype;
     (void)Ctype;
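
After this patch, callers invoke `llamafile_sgemm` without the task argument, as the updated doc comment shows. Below is a minimal caller sketch under the new signature; the matrix shapes, fill values, and the `sgemm.h`/`ggml.h` include names are illustrative assumptions, not part of the patch:

    // Hedged sketch: single-threaded single-precision GEMM through the new
    // llamafile_sgemm signature (no task argument). Shapes are assumptions.
    #include <cstdint>
    #include <cstdio>
    #include "ggml.h"    // GGML_TYPE_F32
    #include "sgemm.h"   // llamafile_sgemm declaration (assumed include name)

    int main() {
        const int64_t m = 4, n = 4, k = 16;  // k % 16 == 0 satisfies the strictest path
        static float A[m * k], B[n * k], C[m * n];
        for (int64_t i = 0; i < m * k; ++i) A[i] = 1.0f;
        for (int64_t i = 0; i < n * k; ++i) B[i] = 2.0f;
        // lda and ldb are row strides of length k; ldc is the row stride of C.
        // ith = 0, nth = 1 requests single-threaded execution.
        bool ok = llamafile_sgemm(m, n, k, A, k, B, k, C, m,
                                  0, 1,
                                  GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
        if (ok)
            printf("C[0] = %g\n", (double)C[0]);  // dot of 16 ones with 16 twos: 32
        return ok ? 0 : 1;
    }

If no supported SIMD path was compiled in, the function returns false per its `@return` contract, so callers should keep checking the return value and fall back to a generic matmul when it fails.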