cturan
/
llama.cpp
mirror of https://github.com/cturan/llama.cpp


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628
							#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
#include "ggml.h"

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif

#define MAX_NARGS 3

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

#define GGML_SILU_FP16

//
// logging
//

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)

static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}

static int irand(int n) {
    if (n == 0) return 0;
    return rand()%n;
}

static void get_random_dims(int64_t * dims, int ndims) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;

    for (int i = 0; i < ndims; i++) {
        dims[i] = 1 + irand(4);
    }
}

static struct ggml_tensor * get_random_tensor_f32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return result;
}

static struct ggml_tensor * get_random_tensor_f16(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return result;
}

static struct ggml_tensor * get_random_tensor_i32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        int32_t imin,
        int32_t imax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return result;
}

static void print_elements(const char* label, const struct ggml_tensor * t) {
    if (!t) {
        printf("%s: %s = null\n", __func__, label);
        return;
    }
    const int nelements = ggml_nelements(t);
    printf("%s: %s = [", __func__, label);
    for (int k = 0; k < nelements; ++k) {
        if (k > 0) { printf(", "); }
        printf("%.5f", ggml_get_f32_1d(t, k));
    }
    printf("] shape: [");
    for (int k = 0; k < t->n_dims; ++k) {
        if (k > 0) { printf(", "); }
        printf("%d", (int)t->ne[k]);
    }
    printf("]\n");

}

static bool check_gradient(
        const char * op_name,
        struct ggml_context * ctx0,
        struct ggml_tensor * x[],
        struct ggml_tensor * f,
        int ndims,
        int nargs,
        float eps,
        float max_error_abs,
        float max_error_rel) {

    static int n_threads = -1;
    if (n_threads < 0) {
        n_threads = GGML_DEFAULT_N_THREADS;

        const char *env = getenv("GGML_N_THREADS");
        if (env) {
            n_threads = atoi(env);
        }

        printf("GGML_N_THREADS = %d\n", n_threads);
    }

    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f);
    struct ggml_cgraph * gb = ggml_new_graph(ctx0);
    *gb = *gf;
    ggml_build_backward_expand(ctx0, gf, gb, false);

    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

    ggml_graph_reset  (gf);
    ggml_set_f32      (f->grad, 1.0f);

    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");

    for (int i = 0; i < nargs; ++i) {
        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
            // compute gradient using finite differences
            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);

            ggml_set_f32_1d(x[i], k, x0);

            // compute gradient using backward graph
            ggml_graph_reset  (gf);
            ggml_set_f32      (f->grad, 1.0f);

            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

            const double error_abs = fabs(g0 - g1);
            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;

            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
                            op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
                //assert(false);
                return false;
            }
        }
    }

    return true;
}

// TODO: clean-up this ..
static bool check_mat_mul(
        const struct ggml_tensor * y,
        const struct ggml_tensor * x0,
        const struct ggml_tensor * x1) {
    float * dst  = (float *) y->data;
    float * src0 = (float *) x0->data;
    float * src1 = (float *) x1->data;

    const int nc = x0->ne[1];
    const int nr = x1->ne[1];
    const int nk = x0->ne[0];

    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);

    GGML_PRINT_DEBUG("x0:\n");
    for (int j = 0; j < x0->ne[1]; ++j) {
        for (int i = 0; i < x0->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("x1:\n");
    for (int j = 0; j < x1->ne[1]; ++j) {
        for (int i = 0; i < x1->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
    for (int j = 0; j < y->ne[1]; ++j) {
        for (int i = 0; i < y->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }

    for (int i = 0; i < nr; ++i) {
        for (int j = 0; j < nc; ++j) {
            float sum = 0.0f;

            for (int k = 0; k < nk; ++k) {
                sum += src0[j*nk + k]*src1[i*nk + k];
            }

            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
                assert(false);
                return false;
            }
        }
    }

    return true;
}

#define NUM_PERMUTATIONS (4*3*2*1)

int main(int argc, const char ** argv) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 256*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    int64_t ne[4];

    int all_permutations[4 * NUM_PERMUTATIONS];
    {
        int count = 0;
        for (int ax0=0; ax0<4; ++ax0) {
            for (int ax1=0; ax1<4; ++ax1) {
                if (ax1 == ax0) continue;
                for (int ax2=0; ax2<4; ++ax2) {
                    if (ax2 == ax0) continue;
                    if (ax2 == ax1) continue;
                    for (int ax3=0; ax3<4; ++ax3) {
                        if (ax3 == ax0) continue;
                        if (ax3 == ax1) continue;
                        if (ax3 == ax2) continue;
                        assert(count < NUM_PERMUTATIONS);
                        all_permutations[count*4+0] = ax0;
                        all_permutations[count*4+1] = ax1;
                        all_permutations[count*4+2] = ax2;
                        all_permutations[count*4+3] = ax3;
                        ++count;
                    }
                }
            }
        }
    }

    unsigned seed_iter = 1;

    // original loop: 1000
    int niter = 4;
    const char *env = getenv("GGML_NLOOP");
    if (env != NULL) {
        niter = atoi(env);
    }
    if (argc > 1) {
        niter = atoi(argv[1]);
    }
    for (int iter = 0; iter < niter; ++iter) {
        srand(seed_iter);
        seed_iter = rand();
        unsigned seed = rand();

        printf("test-grad0: iter:%d/%d\n", iter, niter);
        struct ggml_context * ctx0 = ggml_init(params);

        get_random_dims(ne, 4);

        struct ggml_tensor * x[MAX_NARGS];

        // add f32
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));

                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
            }
        }

        // add f16
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));

                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
            }
        }

        // sub
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));

                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // mul
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));

                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // div
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));

                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
            }
        }

        // sqr
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));

                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // sqrt
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));

                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
            }
        }

        // log
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));

                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
            }
        }

        // sum
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);

                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }


        // sum_rows
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));

                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // mean, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));

                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // argmax
        if (0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));

                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // repeat
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;

            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));

                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // repeat back
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;

            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));

                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // abs (finite differences do not work)
        //{
        //    const int nargs = 1;

        //    for (int ndims = 1; ndims <= 2; ++ndims) {
        //        for (int i = 0; i < nargs; ++i) {
        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
        //            ggml_set_param(ctx0, x[i]);
        //        }

        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));

        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
        //    }
        //}

        // sgn
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));

                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // neg
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));

                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // step
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));

                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // tanh, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));

                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // mul_mat
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 2; ndims <= 4; ++ndims) {
                int max_nrep = (ndims >= 3) ? 2 : 1;
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
                        {
                            int64_t ne2[4];
                            get_random_dims(ne2, 4);
                            ne2[0] = ne[0];
                            ne2[2] = nrep2 * ne[2];
                            ne2[3] = nrep3 * ne[3];
                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                        }

                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);

                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
                        struct ggml_tensor * f = ggml_sum(ctx0, m);

                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);

                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
                        if (ndims == 2) {
                            // check_mat_mul does not support ndims > 2
                            check_mat_mul(m, x[1], x[0]);
                        }
                    }
                }
            }
        }

        // elu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));

                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // relu
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));

                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // gelu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));

                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // silu
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));

#ifdef GGML_SILU_FP16
                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
#else
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
#endif
            }
        }

        // rms_norm
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));

                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
            }
        }

        // scale
        {
            srand(seed);
            const int nargs = 2;

            int64_t ne2[4];
            ne2[0] = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);
                ggml_set_param(ctx0, x[1]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));

                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f32
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));

                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f16
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));

                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
            }
        }

        // reshape (1d->nd)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);


                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // reshape (nd->1d)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);


                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 1d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 2d
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 3d
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 3; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 3);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 3);
                }

                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                const int offset = offsets[0] + offsets[1] + offsets[2];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 4d
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 4; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 4);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 4);
                }

                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // set_1d
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));

                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // set_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 1;
            for (int ndims = 2; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));

                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_1d
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                const int k0 = irand(ggml_nelements(x[0]));
                const int k1 = irand(ggml_nelements(x[0]));
                const int i0 = MIN(k0, k1);
                const int i1 = MAX(k0, k1);

                const int offset = i0 * sizeof(float);
                const int nelem  = i1 - i0;

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));

                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t nb2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                get_random_dims(ne2, 2);
                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 2);
                }
                const int count = ne2[0]*ne2[1];

                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];

                ggml_set_param(ctx0, x[0]);

                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));

                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_3d
        {
            srand(seed);
            int64_t ne2[4] = {1,1,1,1};
            int64_t nb2[4] = {0,0,0,0};

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                get_random_dims(ne2, 3);
                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 3);
                }
                const int count = ne2[0]*ne2[1]*ne2[2];

                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                nb2[2] = nb2[1]*ne2[1];

                ggml_set_param(ctx0, x[0]);

                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));

                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // permute
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_permute will set axes of dimensions below n_dims to 1.
                // to make ggml_permute work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                const int p = irand(NUM_PERMUTATIONS);
                const int ax0 = all_permutations[p*4+0];
                const int ax1 = all_permutations[p*4+1];
                const int ax2 = all_permutations[p*4+2];
                const int ax3 = all_permutations[p*4+3];

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));

                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // transpose
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_transpose will set axes of dimensions below n_dims to 1.
                // to make ggml_transpose work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));

                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // get_rows
        {
            srand(seed);
            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);

            ggml_set_param(ctx0, x[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));

            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_inf
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));

            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_zero
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));

            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // softmax
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 3; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                float eps = 1e-6f;
                // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
                struct ggml_tensor * f = ggml_sum(ctx0,
                                            ggml_log(ctx0,
                                                ggml_add1(ctx0,
                                                    ggml_scale(ctx0,
                                                        ggml_soft_max(ctx0, x[0]),
                                                        ggml_new_f32(ctx0, 1.0f - eps)),
                                                    ggml_new_f32(ctx0, eps))));

                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
                // this may result in different gradients too finite differences.
                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
                // if only the table lookup causes gradients to differ this is acceptable.
            }
        }

        // cross_entropy_loss
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                // the second argument to cross_entropy_loss must sum up to 1 for each row
                int nr = ggml_nrows(x[1]);
                int nc = ggml_nelements(x[1]) / nr;
                for (int ir = 0; ir < nr; ++ir) {
                    float sum = 0;
                    for (int ic = 0; ic < nc; ++ic) {
                        sum += ((float *) x[1]->data)[ic + ir*nc];
                    }
                    for (int ic = 0; ic < nc; ++ic) {
                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
                    }
                }
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);

                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
            }
        }

        // rope f32
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // rope f16
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
                    }
                }
            }
        }

        // flash_attn f32
        {
            srand(seed);
            const int nargs = 3;

            int64_t ne2[4];

            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];

            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int max_nrep = (ndims >= 3) ? 2 : 1;
                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
                        int64_t nek[4] = { D, M, B, ne[3] };
                        int64_t nev[4] = { M, D, B, ne[3] };
                        if (ndims == 2) {
                            neq[2] = 1; neq[3] = 1;
                            nek[2] = 1; nek[3] = 1;
                            nev[2] = 1; nev[3] = 1;
                        } else if (ndims == 3) {
                            neq[3] = 1;
                            nek[3] = 1;
                            nev[3] = 1;
                        }
                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);
                        ggml_set_param(ctx0, x[2]);

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // flash_attn f16, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 3;

            int64_t ne2[4];

            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];

            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int64_t neq[4] = { D, N, B, ne[3] };
                    int64_t nek[4] = { D, M, B, ne[3] };
                    int64_t nev[4] = { M, D, B, ne[3] };
                    if (ndims == 2) {
                        neq[2] = 1; neq[3] = 1;
                        nek[2] = 1; nek[3] = 1;
                        nev[2] = 1; nev[3] = 1;
                    } else if (ndims == 3) {
                        neq[3] = 1;
                        nek[3] = 1;
                        nev[3] = 1;
                    }
                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
                    ggml_set_param(ctx0, x[0]);
                    ggml_set_param(ctx0, x[1]);
                    ggml_set_param(ctx0, x[2]);

                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                }
            }
        }
        ggml_free(ctx0);
    }

    return 0;
}