@@ -1964,9 +1964,10 @@ struct test_mul_mat : public test_case {
     const std::array<int64_t, 2> bs; // dims 3 and 4
     const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
     const std::array<int64_t, 4> per; // permutation of dimensions
+    const bool v; // whether a is a non-contiguous view

     std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
+        return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
     }

     double max_nmse_err() override {
@@ -1986,8 +1987,9 @@ struct test_mul_mat : public test_case {
             int64_t m = 32, int64_t n = 32, int64_t k = 32,
             std::array<int64_t, 2> bs = {10, 10},
             std::array<int64_t, 2> nr = {2, 2},
-            std::array<int64_t, 4> per = {0, 1, 2, 3})
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
+            std::array<int64_t, 4> per = {0, 1, 2, 3},
+            bool v = false)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -1997,6 +1999,7 @@ struct test_mul_mat : public test_case {
         const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
         if (npermuted > 0) {
             GGML_ASSERT(npermuted == 2);
+            GGML_ASSERT(!v); // not handled
             GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
             GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);

@@ -2020,7 +2023,13 @@ struct test_mul_mat : public test_case {
             ggml_set_name(a, "a_permuted");
             ggml_set_name(b, "b_permuted");
         } else {
-            a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
+
+            if (v) {
+                a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
+                a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
+            } else {
+                a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
+            }
             b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
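For context on the `v == true` branch above: the view created by `ggml_view_4d` inherits the byte strides `nb[1..3]` of the oversized backing tensor, so the view's row stride is larger than its row size and `ggml_is_contiguous()` reports false. A minimal standalone sketch of the same trick (the 2-D variant and the sizes are illustrative, not part of this patch):

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t k = 128, m = 4;

    // allocate rows twice as wide as the view will need
    struct ggml_tensor * full = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k*2, m);

    // view the first k elements of each row; the row stride nb[1] comes from `full`
    struct ggml_tensor * view = ggml_view_2d(ctx, full, k, m, full->nb[1], 0);

    printf("row size:   %zu bytes\n", ggml_row_size(view->type, k)); // k * 4
    printf("row stride: %zu bytes\n", view->nb[1]);                  // k * 2 * 4
    printf("contiguous: %d\n", ggml_is_contiguous(view));            // 0

    ggml_free(ctx);
    return 0;
}
```

Allocating `k*2` columns and viewing the first `k` is a cheap way to hand the mul_mat kernels a well-defined non-contiguous `a` without involving a permutation, which is why the permuted path asserts `!v`.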
@@ -4176,6 +4185,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));

+    for (auto bs : {1,2,4,8}) {
+        for (auto nr : {1,4}) {
+            for (uint32_t m = 0; m < 2; ++m) {
+                for (uint32_t k = 0; k < 2; ++k) {
+                    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
+                    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
+                }
+            }
+        }
+    }
+
     // sycl backend will limit task global_range < MAX_INT
     // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
     // however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
@@ -4444,6 +4464,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));

+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));
+
     for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
         for (ggml_type type_a : all_types) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {
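Assuming the standard CMake build layout, the new cases can be exercised in isolation by filtering on the op name (optionally adding `-b <backend>` to select a single backend):

```sh
./build/bin/test-backend-ops test -o MUL_MAT
./build/bin/test-backend-ops perf -o MUL_MAT
```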
|