#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h"

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif

#define MAX_NARGS 3

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

#define GGML_SILU_FP16

//
// logging
//

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)

static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}

static int irand(int n) {
    if (n == 0) return 0;
    return rand()%n;
}

static void get_random_dims(int64_t * dims, int ndims) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;
    for (int i = 0; i < ndims; i++) {
        dims[i] = 1 + irand(4);
    }
}

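// helpers that allocate a new tensor in ctx0 and fill it with uniform random
// values in [fmin, fmax] (or random integers in [imin, imax) for the i32 variant)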
static struct ggml_tensor * get_random_tensor_f32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return result;
}

static struct ggml_tensor * get_random_tensor_f16(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return result;
}

static struct ggml_tensor * get_random_tensor_i32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        int32_t imin,
        int32_t imax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return result;
}

static void print_elements(const char* label, const struct ggml_tensor * t) {
    if (!t) {
        printf("%s: %s = null\n", __func__, label);
        return;
    }
    const int nelements = ggml_nelements(t);
    printf("%s: %s = [", __func__, label);
    for (int k = 0; k < nelements; ++k) {
        if (k > 0) { printf(", "); }
        printf("%.5f", ggml_get_f32_1d(t, k));
    }
    printf("] shape: [");
    for (int k = 0; k < t->n_dims; ++k) {
        if (k > 0) { printf(", "); }
        printf("%d", (int)t->ne[k]);
    }
    printf("]\n");
}

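// compare the analytic gradients from the backward graph against central finite
// differences: df/dx[i][k] ~= (f(x + eps) - f(x - eps)) / (2*eps).
// the check fails when the absolute or the relative error exceeds its bound.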
static bool check_gradient(
        const char * op_name,
        struct ggml_context * ctx0,
        struct ggml_tensor * x[],
        struct ggml_tensor * f,
        int ndims,
        int nargs,
        float eps,
        float max_error_abs,
        float max_error_rel) {
    static int n_threads = -1;
    if (n_threads < 0) {
        n_threads = GGML_DEFAULT_N_THREADS;

        const char *env = getenv("GGML_N_THREADS");
        if (env) {
            n_threads = atoi(env);
        }

        printf("GGML_N_THREADS = %d\n", n_threads);
    }

    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f);
    struct ggml_cgraph * gb = ggml_new_graph(ctx0);
    *gb = *gf;
    ggml_build_backward_expand(ctx0, gf, gb, false);

    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

    ggml_graph_reset (gf);
    ggml_set_f32     (f->grad, 1.0f);

    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");

    for (int i = 0; i < nargs; ++i) {
        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
            // compute gradient using finite differences
            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);

            ggml_set_f32_1d(x[i], k, x0);

            // compute gradient using backward graph
            ggml_graph_reset (gf);
            ggml_set_f32     (f->grad, 1.0f);

            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

            const double error_abs = fabs(g0 - g1);
            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;

            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
                        op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
                //assert(false);
                return false;
            }
        }
    }

    return true;
}

// TODO: clean-up this ..
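// naive reference check for ggml_mul_mat: y[i][j] must equal the dot product of
// row j of x0 and row i of x1 (both operands share the inner dimension nk = x0->ne[0])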
static bool check_mat_mul(
        const struct ggml_tensor * y,
        const struct ggml_tensor * x0,
        const struct ggml_tensor * x1) {
    float * dst  = (float *) y->data;
    float * src0 = (float *) x0->data;
    float * src1 = (float *) x1->data;

    const int nc = x0->ne[1];
    const int nr = x1->ne[1];
    const int nk = x0->ne[0];

    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);

    GGML_PRINT_DEBUG("x0:\n");
    for (int j = 0; j < x0->ne[1]; ++j) {
        for (int i = 0; i < x0->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("x1:\n");
    for (int j = 0; j < x1->ne[1]; ++j) {
        for (int i = 0; i < x1->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
    for (int j = 0; j < y->ne[1]; ++j) {
        for (int i = 0; i < y->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }

    for (int i = 0; i < nr; ++i) {
        for (int j = 0; j < nc; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < nk; ++k) {
                sum += src0[j*nk + k]*src1[i*nk + k];
            }
            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
                assert(false);
                return false;
            }
        }
    }

    return true;
}

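// number of distinct orderings of the 4 tensor axes (4! = 24), used for the permutation table in main()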
#define NUM_PERMUTATIONS (4*3*2*1)

int main(int argc, const char ** argv) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 256*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    int64_t ne[4];

    int all_permutations[4 * NUM_PERMUTATIONS];
    {
        int count = 0;
        for (int ax0=0; ax0<4; ++ax0) {
            for (int ax1=0; ax1<4; ++ax1) {
                if (ax1 == ax0) continue;
                for (int ax2=0; ax2<4; ++ax2) {
                    if (ax2 == ax0) continue;
                    if (ax2 == ax1) continue;
                    for (int ax3=0; ax3<4; ++ax3) {
                        if (ax3 == ax0) continue;
                        if (ax3 == ax1) continue;
                        if (ax3 == ax2) continue;
                        assert(count < NUM_PERMUTATIONS);
                        all_permutations[count*4+0] = ax0;
                        all_permutations[count*4+1] = ax1;
                        all_permutations[count*4+2] = ax2;
                        all_permutations[count*4+3] = ax3;
                        ++count;
                    }
                }
            }
        }
    }

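    // seed_iter drives the per-iteration seeds; each op block below calls srand(seed)
    // so that all ops within one iteration draw the same random shapes and values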
    unsigned seed_iter = 1;

    // original loop: 1000
    int niter = 4;
    const char *env = getenv("GGML_NLOOP");
    if (env != NULL) {
        niter = atoi(env);
    }
    if (argc > 1) {
        niter = atoi(argv[1]);
    }
    for (int iter = 0; iter < niter; ++iter) {
        srand(seed_iter);
        seed_iter = rand();
        unsigned seed = rand();

        printf("test-grad0: iter:%d/%d\n", iter, niter);
        struct ggml_context * ctx0 = ggml_init(params);

        get_random_dims(ne, 4);

        struct ggml_tensor * x[MAX_NARGS];

        // add f32
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
            }
        }

        // add f16
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
            }
        }

        // sub
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // mul
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // div
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
            }
        }

        // sqr
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // sqrt
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
            }
        }

        // log
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
            }
        }

        // sum
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // sum_rows
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // mean, not yet fully implemented
        if (0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // argmax
        if (0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // repeat
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // repeat back
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // abs (finite differences do not work)
        //{
        //    const int nargs = 1;
        //    for (int ndims = 1; ndims <= 2; ++ndims) {
        //        for (int i = 0; i < nargs; ++i) {
        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
        //            ggml_set_param(ctx0, x[i]);
        //        }
        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
        //    }
        //}

        // sgn
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // neg
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // step
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // tanh, not yet fully implemented
        if (0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // mul_mat
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                int max_nrep = (ndims >= 3) ? 2 : 1;
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
                        {
                            int64_t ne2[4];
                            get_random_dims(ne2, 4);
                            ne2[0] = ne[0];
                            ne2[2] = nrep2 * ne[2];
                            ne2[3] = nrep3 * ne[3];
                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                        }
                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);
                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
                        struct ggml_tensor * f = ggml_sum(ctx0, m);
                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
                        if (ndims == 2) {
                            // check_mat_mul does not support ndims > 2
                            check_mat_mul(m, x[1], x[0]);
                        }
                    }
                }
            }
        }

        // elu, not yet fully implemented
        if (0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // relu
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // gelu, not yet fully implemented
        if (0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // silu
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
#ifdef GGML_SILU_FP16
                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
#else
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
#endif
            }
        }

        // rms_norm
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
            }
        }

        // scale
        {
            srand(seed);
            const int nargs = 2;
            int64_t ne2[4];
            ne2[0] = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                ggml_set_param(ctx0, x[1]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));
                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f32
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f16
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
            }
        }

        // reshape (1d->nd)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // reshape (nd->1d)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 1d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 2d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }
                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 3d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 2;
            for (int ndims = 3; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 3);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 3);
                }
                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                const int offset = offsets[0] + offsets[1] + offsets[2];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 4d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 2;
            for (int ndims = 4; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 4);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 4);
                }
                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // set_1d
        {
            srand(seed);
            int64_t ne2[4];
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // set_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 1;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }
                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_1d
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                const int k0 = irand(ggml_nelements(x[0]));
                const int k1 = irand(ggml_nelements(x[0]));
                const int i0 = MIN(k0, k1);
                const int i1 = MAX(k0, k1);
                const int offset = i0 * sizeof(float);
                const int nelem = i1 - i0;
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t nb2[4];
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                get_random_dims(ne2, 2);
                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 2);
                }
                const int count = ne2[0]*ne2[1];
                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                ggml_set_param(ctx0, x[0]);
                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_3d
        {
            srand(seed);
            int64_t ne2[4] = {1,1,1,1};
            int64_t nb2[4] = {0,0,0,0};
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                get_random_dims(ne2, 3);
                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 3);
                }
                const int count = ne2[0]*ne2[1]*ne2[2];
                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                nb2[2] = nb2[1]*ne2[1];
                ggml_set_param(ctx0, x[0]);
                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // permute
        {
            srand(seed);
            int64_t ne2[4];
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_permute will set axes of dimensions below n_dims to 1.
                // to make ggml_permute work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                const int p = irand(NUM_PERMUTATIONS);
                const int ax0 = all_permutations[p*4+0];
                const int ax1 = all_permutations[p*4+1];
                const int ax2 = all_permutations[p*4+2];
                const int ax3 = all_permutations[p*4+3];
                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // transpose
        {
            srand(seed);
            int64_t ne2[4];
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_transpose will set axes of dimensions below n_dims to 1.
                // to make ggml_transpose work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // get_rows
        {
            srand(seed);
            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
            ggml_set_param(ctx0, x[0]);
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_inf
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);
            int n_past = irand(ne[0]);
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_zero
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);
            int n_past = irand(ne[0]);
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // softmax
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 3; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                float eps = 1e-6f;
                // don't use only sum as aggregation, because the sum of softmax is always 1 -> finite differences would not work
                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
                struct ggml_tensor * f = ggml_sum(ctx0,
                                            ggml_log(ctx0,
                                                ggml_add1(ctx0,
                                                    ggml_scale(ctx0,
                                                        ggml_soft_max(ctx0, x[0]),
                                                        ggml_new_f32(ctx0, 1.0f - eps)),
                                                    ggml_new_f32(ctx0, eps))));
                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
                // this may result in gradients that differ from finite differences.
                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
                // if only the table lookup causes gradients to differ this is acceptable.
            }
        }

        // cross_entropy_loss
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                // the second argument to cross_entropy_loss must sum up to 1 for each row
                int nr = ggml_nrows(x[1]);
                int nc = ggml_nelements(x[1]) / nr;
                for (int ir = 0; ir < nr; ++ir) {
                    float sum = 0;
                    for (int ic = 0; ic < nc; ++ic) {
                        sum += ((float *) x[1]->data)[ic + ir*nc];
                    }
                    for (int ic = 0; ic < nc; ++ic) {
                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
                    }
                }
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
            }
        }

        // rope f32
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];
            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }
                        ggml_set_param(ctx0, x[0]);
                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // rope f16
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];
            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }
                        ggml_set_param(ctx0, x[0]);
                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
                    }
                }
            }
        }

        // flash_attn f32
        {
            srand(seed);
            const int nargs = 3;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];
            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int max_nrep = (ndims >= 3) ? 2 : 1;
                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
                        int64_t nek[4] = { D, M, B, ne[3] };
                        int64_t nev[4] = { M, D, B, ne[3] };
                        if (ndims == 2) {
                            neq[2] = 1; neq[3] = 1;
                            nek[2] = 1; nek[3] = 1;
                            nev[2] = 1; nev[3] = 1;
                        } else if (ndims == 3) {
                            neq[3] = 1;
                            nek[3] = 1;
                            nev[3] = 1;
                        }
                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);
                        ggml_set_param(ctx0, x[2]);
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // flash_attn f16, not yet fully implemented
        if (0)
        {
            srand(seed);
            const int nargs = 3;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];
            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int64_t neq[4] = { D, N, B, ne[3] };
                    int64_t nek[4] = { D, M, B, ne[3] };
                    int64_t nev[4] = { M, D, B, ne[3] };
                    if (ndims == 2) {
                        neq[2] = 1; neq[3] = 1;
                        nek[2] = 1; nek[3] = 1;
                        nev[2] = 1; nev[3] = 1;
                    } else if (ndims == 3) {
                        neq[3] = 1;
                        nek[3] = 1;
                        nev[3] = 1;
                    }
                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
                    ggml_set_param(ctx0, x[0]);
                    ggml_set_param(ctx0, x[1]);
                    ggml_set_param(ctx0, x[2]);
                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                }
            }
        }

        ggml_free(ctx0);
    }

    return 0;
}