1
0

test-barrier.cpp 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
#include "ggml.h"
#include "ggml-cpu.h"

#include <algorithm>
#include <cassert>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
  10. #define MAX_NARGS 2
  11. static void test_barrier(int n_threads, int n_rounds) {
  12. struct ggml_init_params params = {
  13. /* .mem_size = */ 1024*1024*1024,
  14. /* .mem_buffer = */ NULL,
  15. /* .no_alloc = */ false,
  16. };
  17. struct ggml_context * ctx = ggml_init(params);
  18. // Create graph
  19. struct ggml_cgraph * gf = ggml_new_graph(ctx);
  20. // Lots of small, parallel ops where barriers in between will dominate
  21. struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
  22. for (int i = 0; i < 1000; i++) {
  23. struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
  24. out = ggml_mul_mat(ctx, a, out);
  25. struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
  26. out = ggml_mul_mat(ctx, d, out);
  27. }
  28. ggml_build_forward_expand(gf, out);
  29. int n_nodes = ggml_graph_n_nodes(gf);
  30. // Create threadpool
  31. struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
  32. struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
  33. if (!threadpool) {
  34. fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
  35. exit(1);
  36. }
  37. // The test runs with constant number of threads
  38. struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
  39. std::vector<uint8_t> work_data(cplan.work_size);
  40. cplan.work_data = work_data.data();
  41. std::cerr << "graph-compute with"
  42. << "\n n_threads: " << n_threads
  43. << "\n n_nodes: " << n_nodes
  44. << "\n n_rounds: " << n_rounds
  45. << "\n";
  46. // ggml_graph_print(gf);
  47. // Warmup
  48. ggml_graph_compute(gf, &cplan);
  49. auto t0 = std::chrono::high_resolution_clock::now();
  50. for (int i=0; i < n_rounds; i++) {
  51. ggml_graph_compute(gf, &cplan);
  52. }
  53. auto t1 = std::chrono::high_resolution_clock::now();
  54. auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
  55. auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
  56. std::cerr << "graph-compute took " << usec << " usec "
  57. << "\n " << (float) usec / n_rounds << " usec per-iter"
  58. << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
  59. << "\n";
  60. ggml_threadpool_free(threadpool);
  61. ggml_free(ctx);
  62. }
  63. static void test_active(int n_threads, int n_rounds) {
  64. struct ggml_init_params params = {
  65. /* .mem_size = */ 1024*1024*1024,
  66. /* .mem_buffer = */ NULL,
  67. /* .no_alloc = */ false,
  68. };
  69. struct ggml_context * ctx = ggml_init(params);
  70. // Create graph
  71. struct ggml_cgraph * gf = ggml_new_graph(ctx);
  72. // Small graph with, parallel ops with barriers
  73. struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
  74. for (int i = 0; i < 2; i++) {
  75. struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
  76. out = ggml_mul_mat(ctx, a, out);
  77. struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
  78. out = ggml_mul_mat(ctx, d, out);
  79. }
  80. ggml_build_forward_expand(gf, out);
  81. int n_nodes = ggml_graph_n_nodes(gf);
  82. // Create threadpool
  83. struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
  84. struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
  85. if (!threadpool) {
  86. fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
  87. exit(1);
  88. }
  89. std::cerr << "graph-compute with"
  90. << "\n n_threads: " << n_threads
  91. << "\n n_nodes: " << n_nodes
  92. << "\n n_rounds: " << n_rounds
  93. << "\n";
  94. // ggml_graph_print(gf);
  95. // In this test we keep changing the number of threads every 4th iteration
  96. // to test for race conditions in that path
  97. for (int i=0; i < n_rounds; i++) {
  98. struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
  99. std::vector<uint8_t> work_data(cplan.work_size);
  100. cplan.work_data = work_data.data();
  101. ggml_graph_compute(gf, &cplan);
  102. }
  103. ggml_threadpool_free(threadpool);
  104. ggml_free(ctx);
  105. }
  106. static void test_multi_graph(int n_threads, int n_rounds) {
  107. struct ggml_init_params params = {
  108. /* .mem_size = */ 1024*1024*1024,
  109. /* .mem_buffer = */ NULL,
  110. /* .no_alloc = */ false,
  111. };
  112. struct ggml_context * ctx = ggml_init(params);
  113. // Create graphs
  114. struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
  115. {
  116. // Small graph with parallel ops with barriers
  117. struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
  118. for (int i = 0; i < 2; i++) {
  119. struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
  120. out = ggml_mul_mat(ctx, a, out);
  121. struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
  122. out = ggml_mul_mat(ctx, d, out);
  123. }
  124. ggml_build_forward_expand(gf0, out);
  125. }
  126. struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
  127. {
  128. // Small graph with parallel ops with barriers
  129. // Use larger tensors to make sure work_data size is larger than gf0
  130. struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
  131. for (int i = 0; i < 4; i++) {
  132. struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
  133. out = ggml_mul_mat(ctx, a, out);
  134. struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
  135. out = ggml_mul_mat(ctx, d, out);
  136. }
  137. ggml_build_forward_expand(gf1, out);
  138. }
  139. // Create threadpool
  140. struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
  141. struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
  142. if (!threadpool) {
  143. fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
  144. exit(1);
  145. }
  146. std::cerr << "graph-compute with"
  147. << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
  148. << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
  149. << "\n n_threads: " << n_threads
  150. << "\n n_rounds: " << n_rounds
  151. << "\n";
  152. // In this test we keep changing the number of threads every 4th iteration
  153. // and we compute two graphs back to back to test graph frequent graph switching
  154. for (int i=0; i < n_rounds; i++) {
  155. struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
  156. std::vector<uint8_t> work_data0(cplan0.work_size);
  157. cplan0.work_data = work_data0.data();
  158. struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
  159. std::vector<uint8_t> work_data1(cplan1.work_size);
  160. cplan1.work_data = work_data1.data();
  161. ggml_graph_compute(gf0, &cplan0);
  162. ggml_graph_compute(gf1, &cplan1);
  163. }
  164. ggml_threadpool_free(threadpool);
  165. ggml_free(ctx);
  166. }
  167. int main(int argc, char *argv[]) {
  168. int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
  169. int n_rounds = 100;
  170. if (argc > 1) {
  171. n_threads = std::atoi(argv[1]);
  172. }
  173. if (argc > 2) {
  174. n_rounds = std::atoi(argv[2]);
  175. }
  176. test_barrier(n_threads, n_rounds);
  177. test_active(n_threads, n_rounds * 100);
  178. test_multi_graph(n_threads, n_rounds * 10);
  179. return 0;
  180. }