// test-grad0.cpp

#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h"

#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <initializer_list>
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif

#define MAX_NARGS 3

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

#define GGML_SILU_FP16

//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)
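
// Random helpers: frand() returns a uniform float in [0, 1), irand(n) a uniform int in [0, n),
// and get_random_dims() fills the first ndims sizes with values in [1, 4] (remaining dims stay 1).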
static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}

static int irand(int n) {
    if (n == 0) return 0;
    return rand()%n;
}

static void get_random_dims(int64_t * dims, int ndims) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;

    for (int i = 0; i < ndims; i++) {
        dims[i] = 1 + irand(4);
    }
}

static struct ggml_tensor * get_random_tensor_f32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}

static struct ggml_tensor * get_random_tensor_f16(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}

static struct ggml_tensor * get_random_tensor_i32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        int32_t imin,
        int32_t imax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}
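
// check_gradient() validates ggml's backward pass against central finite differences:
// for each element k of each parameter tensor x[i] it recomputes the forward graph at
// x_k +/- eps and forms g0 = (f(x_k + eps) - f(x_k - eps)) / (2*eps), then compares g0
// with the gradient g1 produced by the backward graph, requiring the absolute error to
// stay below max_error_abs and the relative error below max_error_rel.
// For ops with discontinuities, expected_vals lists the admissible analytic gradients;
// elements whose numerical gradient matches none of them are skipped rather than compared.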
static bool check_gradient(
        const char * op_name,
        struct ggml_context * ctx0,
        struct ggml_tensor * x[],
        struct ggml_tensor * f,
        int ndims,
        int nargs,
        float eps,
        float max_error_abs,
        float max_error_rel,
        std::vector<double> expected_vals) {

    static int n_threads = -1;
    if (n_threads < 0) {
        n_threads = GGML_DEFAULT_N_THREADS;

        const char *env = getenv("GGML_N_THREADS");
        if (env) {
            n_threads = atoi(env);
        }

        printf("GGML_N_THREADS = %d\n", n_threads);
    }

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
    ggml_build_forward_expand(gf, f);
    ggml_graph_cpy(gf, gb);
    ggml_build_backward_expand(ctx0, gf, gb, false);

    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

    ggml_graph_reset(gb);
    if (f->grad) {
        ggml_set_f32(f->grad, 1.0f);
    }

    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");

    for (int i = 0; i < nargs; ++i) {
        bool all_g0_bad = true;

        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
            // Calculate gradient numerically:
            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);

            // The numerical calculation of the gradient fails around discontinuities (e.g. 0 for ReLU).
            // In such cases, provide a vector of expected values and skip the comparison for failed calculations.
            if (!expected_vals.empty()) {
                bool matches_any = false;
                for (const double & ev : expected_vals) {
                    const double error_abs = std::fabs(g0 - ev);
                    if (error_abs > max_error_abs) {
                        continue;
                    }
                    const double error_rel = g0 != 0.0 ? fabs(g0 - ev)/fabs(g0) : 0.0;
                    if (error_rel > max_error_rel) {
                        continue;
                    }

                    matches_any = true;
                    break;
                }
                if (!matches_any) {
                    continue;
                }
            }
            all_g0_bad = false;

            ggml_set_f32_1d(x[i], k, x0);

            // compute gradient using backward graph
            ggml_graph_reset(gb);
            if (f->grad) {
                ggml_set_f32(f->grad, 1.0f);
            }

            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

            const double error_abs = fabs(g0 - g1);
            const double error_rel = g0 != 0.0 ? fabs(g0 - g1)/fabs(g0) : 0.0;

            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
                        op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
                //assert(false);
                return false;
            }
        }
        if (all_g0_bad) {
            printf("%s: numerical calculation of the gradient failed for all values\n", op_name);
            return false;
        }
    }

    return true;
}
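
// Brute-force reference check for 2-D ggml_mul_mat: every output element is recomputed as the
// dot product of a row of x0 with a row of x1 and compared against y within 1e-5.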
// TODO: clean-up this ..
static bool check_mat_mul(
        const struct ggml_tensor * y,
        const struct ggml_tensor * x0,
        const struct ggml_tensor * x1) {
    float * dst  = (float *) y->data;
    float * src0 = (float *) x0->data;
    float * src1 = (float *) x1->data;

    const int nc = x0->ne[1];
    const int nr = x1->ne[1];
    const int nk = x0->ne[0];

    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);

    GGML_PRINT_DEBUG("x0:\n");
    for (int j = 0; j < x0->ne[1]; ++j) {
        for (int i = 0; i < x0->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("x1:\n");
    for (int j = 0; j < x1->ne[1]; ++j) {
        for (int i = 0; i < x1->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
    for (int j = 0; j < y->ne[1]; ++j) {
        for (int i = 0; i < y->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }

    for (int i = 0; i < nr; ++i) {
        for (int j = 0; j < nc; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < nk; ++k) {
                sum += src0[j*nk + k]*src1[i*nk + k];
            }

            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
                assert(false);
                return false;
            }
        }
    }

    return true;
}
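
// main() runs each op test once per iteration with a fresh random seed; the permutation table
// built below enumerates all 4! = 24 axis orderings used by the permute test.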
#define NUM_PERMUTATIONS (4*3*2*1)

int main(int argc, const char ** argv) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 256*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    int64_t ne[4];

    int all_permutations[4 * NUM_PERMUTATIONS];
    {
        int count = 0;
        for (int ax0=0; ax0<4; ++ax0) {
            for (int ax1=0; ax1<4; ++ax1) {
                if (ax1 == ax0) continue;
                for (int ax2=0; ax2<4; ++ax2) {
                    if (ax2 == ax0) continue;
                    if (ax2 == ax1) continue;
                    for (int ax3=0; ax3<4; ++ax3) {
                        if (ax3 == ax0) continue;
                        if (ax3 == ax1) continue;
                        if (ax3 == ax2) continue;
                        assert(count < NUM_PERMUTATIONS);
                        all_permutations[count*4+0] = ax0;
                        all_permutations[count*4+1] = ax1;
                        all_permutations[count*4+2] = ax2;
                        all_permutations[count*4+3] = ax3;
                        ++count;
                    }
                }
            }
        }
    }

    unsigned seed_iter = 1;

    // original loop: 1000
    int niter = 4;
    const char *env = getenv("GGML_NLOOP");
    if (env != NULL) {
        niter = atoi(env);
    }
    if (argc > 1) {
        niter = atoi(argv[1]);
    }
    for (int iter = 0; iter < niter; ++iter) {
        srand(seed_iter);
        seed_iter = rand();
        unsigned seed = rand();

        printf("test-grad0: iter:%d/%d\n", (iter+1), niter);
        struct ggml_context * ctx0 = ggml_init(params);

        get_random_dims(ne, 4);

        struct ggml_tensor * x[MAX_NARGS];

        // add f32
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f, {});
            }
        }

        // add f16
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f, {});
            }
        }

        // sub
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // mul
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // div
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f, {});
            }
        }

        // sqr
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // sqrt
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f, {});
            }
        }

        // log
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f, {});
            }
        }

        // sum
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // sum_rows
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
            }
        }

        // mean, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // argmax
        if (0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // repeat
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;

            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
            }
        }

        // repeat back
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;

            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
            }
        }

        // abs
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
                check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f, {-1.0, 1.0});
            }
        }

        // sgn
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
            }
        }

        // neg
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // step
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
            }
        }

        // tanh, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // mul_mat
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                int max_nrep = (ndims >= 3) ? 2 : 1;
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
                        {
                            int64_t ne2[4];
                            get_random_dims(ne2, 4);
                            ne2[0] = ne[0];
                            ne2[2] = nrep2 * ne[2];
                            ne2[3] = nrep3 * ne[3];
                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                        }
                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);

                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
                        struct ggml_tensor * f = ggml_sum(ctx0, m);

                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);

                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
                        if (ndims == 2) {
                            // check_mat_mul does not support ndims > 2
                            check_mat_mul(m, x[1], x[0]);
                        }
                    }
                }
            }
        }

        // elu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // relu
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {0.0, 1.0});
            }
        }

        // gelu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
            }
        }

        // silu
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
#ifdef GGML_SILU_FP16
                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY, {});
#else
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
#endif
            }
        }

        // rms_norm
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY, {});
            }
        }

        // scale
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                const float s = -1.0f + 2.0f*frand();

                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // cpy f32
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // cpy f16
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
            }
        }

        // reshape (1d->nd)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // reshape (nd->1d)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
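
        // Note for the acc/set tests below: the offsets passed to ggml_acc/ggml_set_* are byte
        // offsets, which is why they are built from ggml_element_size(x[0]) or the byte strides x[0]->nb[..].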
        // acc 1d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // acc 2d
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // acc 3d
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 3; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 3);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 3);
                }

                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                const int offset = offsets[0] + offsets[1] + offsets[2];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // acc 4d
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 4; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 4);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 4);
                }

                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // set_1d
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // set_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 1;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }
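
        // The view/permute/transpose tests only mark x[0] as a parameter; elements of x[0] that fall
        // outside the selected view should get zero gradient from both the numerical and backward passes.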
        // view_1d
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                const int k0 = irand(ggml_nelements(x[0]));
                const int k1 = irand(ggml_nelements(x[0]));
                const int i0 = MIN(k0, k1);
                const int i1 = MAX(k0, k1);

                const int offset = i0 * sizeof(float);
                const int nelem  = i1 - i0;

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // view_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t nb2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                get_random_dims(ne2, 2);
                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 2);
                }
                const int count = ne2[0]*ne2[1];

                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];

                ggml_set_param(ctx0, x[0]);

                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // view_3d
        {
            srand(seed);
            int64_t ne2[4] = {1,1,1,1};
            int64_t nb2[4] = {0,0,0,0};

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                get_random_dims(ne2, 3);
                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 3);
                }
                const int count = ne2[0]*ne2[1]*ne2[2];

                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                nb2[2] = nb2[1]*ne2[1];

                ggml_set_param(ctx0, x[0]);

                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // permute
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_permute will set axes of dimensions below n_dims to 1.
                // to make ggml_permute work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                const int p = irand(NUM_PERMUTATIONS);
                const int ax0 = all_permutations[p*4+0];
                const int ax1 = all_permutations[p*4+1];
                const int ax2 = all_permutations[p*4+2];
                const int ax3 = all_permutations[p*4+3];

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // transpose
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_transpose will set axes of dimensions below n_dims to 1.
                // to make ggml_transpose work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // get_rows
        {
            srand(seed);
            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);

            ggml_set_param(ctx0, x[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
        }

        // diag_mask_inf
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
        }

        // diag_mask_zero
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
        }

        // softmax
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 3; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                float eps = 1e-6f;
                // don't use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
                struct ggml_tensor * f = ggml_sum(ctx0,
                                            ggml_log(ctx0,
                                                ggml_add1(ctx0,
                                                    ggml_scale(ctx0,
                                                        ggml_soft_max(ctx0, x[0]),
                                                        1.0f - eps),
                                                    ggml_new_f32(ctx0, eps))));

                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY, {});
                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
                // this may result in different gradients compared to finite differences.
                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
                // if only the table lookup causes gradients to differ this is acceptable.
            }
        }

        // cross_entropy_loss
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2,  0.0f, 1.0f);
                // the second argument to cross_entropy_loss must sum up to 1 for each row
                int nr = ggml_nrows(x[1]);
                int nc = ggml_nelements(x[1]) / nr;
                for (int ir = 0; ir < nr; ++ir) {
                    float sum = 0;
                    for (int ic = 0; ic < nc; ++ic) {
                        sum += ((float *) x[1]->data)[ic + ir*nc];
                    }
                    for (int ic = 0; ic < nc; ++ic) {
                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
                    }
                }
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
            }
        }

        // rope f32
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));

                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
                    }
                }
            }
        }

        // rope f16
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));

                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
                    }
                }
            }
        }

        // im2col f32
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 4;

            for (const bool is_2D : {false, true}) {
                int64_t ne0[ndims];
                int64_t ne1[ndims];
                get_random_dims(ne0, ndims);
                get_random_dims(ne1, ndims);

                // Ensure that the output is not zero-sized:
                ne1[0] += 8;
                ne1[1] += 8;

                if (is_2D) {
                    ne1[2] = ne0[2];
                } else {
                    ne1[1] = ne0[1];
                    ne0[3] = 1;
                    ne1[3] = 1;
                }

                // The order of arguments is swapped because the first tensor is only used for its shape.
                x[1] = get_random_tensor_f16(ctx0, ndims, ne0, -1.0f, 1.0f);
                x[0] = get_random_tensor_f32(ctx0, ndims, ne1, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                const int s0 =         1 + irand(2);
                const int s1 = is_2D ? 1 + irand(2) : 0;
                const int p0 =         0 + irand(2);
                const int p1 = is_2D ? 0 + irand(2) : 0;
                const int d0 =         1 + irand(2);
                const int d1 = is_2D ? 1 + irand(2) : 0;

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_im2col(ctx0, x[1], x[0], s0, s1, p0, p1, d0, d1, is_2D, GGML_TYPE_F32));

                GGML_PRINT_DEBUG("im2col f32: is_2D=%s, s0=%d, s1=%d, p0=%d, p1=%d, d0=%d, d1=%d\n", is_2D ? "yes" : "no", s0, s1, p0, p1, d0, d1);
                check_gradient("im2col f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
            }
        }

        // pool_2d f32
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 4;

            for (const enum ggml_op_pool op : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
                int64_t ne0[ndims];
                get_random_dims(ne0, ndims);

                ne0[0] += 8;
                ne0[1] += 8;

                x[0] = get_random_tensor_f32(ctx0, ndims, ne0, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                const int k0 = 2 + irand(2);
                const int k1 = 2 + irand(2);
                const int s0 = 2 + irand(2);
                const int s1 = 2 + irand(2);
                const int p0 = 0 + irand(2);
                const int p1 = 0 + irand(2);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_pool_2d(ctx0, x[0], op, k0, k1, s0, s1, p0, p1));

                GGML_PRINT_DEBUG("ggml_pool_2d f32: op=%s k0=%d, k1=%d, s0=%d, s1=%d, p0=%d, p1=%d\n",
                                 op == GGML_OP_POOL_MAX ? "max" : "avg", k0, k1, s0, s1, p0, p1);
                std::vector<double> expected_vals;
                if (op == GGML_OP_POOL_MAX) {
                    expected_vals.push_back(0.0);
                    expected_vals.push_back(1.0);
                }
                check_gradient("ggml_pool_2d f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, expected_vals);
            }
        }

        // flash_attn f32
        // TODO: adapt to ggml_flash_attn_ext() changes
        //{
        //    srand(seed);
        //    const int nargs = 3;
        //
        //    int64_t ne2[4];
        //    get_random_dims(ne2, 4);
        //    int64_t D = ne2[0];
        //    int64_t N = ne2[1];
        //    int64_t M = ne2[2] + N;
        //    int64_t B = ne2[3];
        //
        //    for (int masked = 0; masked <= 1; ++masked) {
        //        for (int ndims = 2; ndims <= 4; ++ndims) {
        //            int max_nrep = (ndims >= 3) ? 2 : 1;
        //            for (int nrep = 1; nrep < max_nrep; ++nrep) {
        //                int64_t neq[4] = { D, N, B*nrep, ne[3] };
        //                int64_t nek[4] = { D, M, B, ne[3] };
        //                int64_t nev[4] = { M, D, B, ne[3] };
        //                if (ndims == 2) {
        //                    neq[2] = 1; neq[3] = 1;
        //                    nek[2] = 1; nek[3] = 1;
        //                    nev[2] = 1; nev[3] = 1;
        //                } else if (ndims == 3) {
        //                    neq[3] = 1;
        //                    nek[3] = 1;
        //                    nev[3] = 1;
        //                }
        //                x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
        //                x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
        //                x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
        //                ggml_set_param(ctx0, x[0]);
        //                ggml_set_param(ctx0, x[1]);
        //                ggml_set_param(ctx0, x[2]);
        //                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
        //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY, {});
        //            }
        //        }
        //    }
        //}

        ggml_free(ctx0);
    }

    return 0;
}