1
0

test-barrier.cpp 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. #include "ggml.h"
  2. #include "ggml-backend.h"
  3. #include <chrono>
  4. #include <iostream>
  5. #include <cstdio>
  6. #include <cstdlib>
  7. #include <cassert>
  8. #include <vector>
  9. #define MAX_NARGS 2
  10. int main(int argc, char *argv[]) {
  11. int n_threads = 4;
  12. int n_rounds = 100;
  13. if (argc > 1) {
  14. n_threads = std::atoi(argv[1]);
  15. }
  16. if (argc > 2) {
  17. n_rounds = std::atoi(argv[2]);
  18. }
  19. struct ggml_init_params params = {
  20. /* .mem_size = */ 1024*1024*1024,
  21. /* .mem_buffer = */ NULL,
  22. /* .no_alloc = */ false,
  23. };
  24. struct ggml_context * ctx = ggml_init(params);
  25. // Create graph
  26. struct ggml_cgraph * gf = ggml_new_graph(ctx);
  27. // Lots of small, parallel ops where barriers in between will dominate
  28. struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
  29. for (int i = 0; i < 1000; i++) {
  30. struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
  31. out = ggml_mul_mat(ctx, a, out);
  32. struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
  33. out = ggml_mul_mat(ctx, d, out);
  34. }
  35. ggml_build_forward_expand(gf, out);
  36. int n_nodes = ggml_graph_n_nodes(gf);
  37. // Create threadpool
  38. struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
  39. struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
  40. if (!threadpool) {
  41. fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
  42. exit(1);
  43. }
  44. // Create compute plan
  45. struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
  46. std::vector<uint8_t> work_data(cplan.work_size);
  47. cplan.work_data = work_data.data();
  48. std::cerr << "graph-compute with"
  49. << "\n n_threads: " << n_threads
  50. << "\n n_nodes: " << n_nodes
  51. << "\n n_rounds: " << n_rounds
  52. << "\n";
  53. // ggml_graph_print(gf);
  54. // Warmup
  55. ggml_graph_compute(gf, &cplan);
  56. auto t0 = std::chrono::high_resolution_clock::now();
  57. for (int i=0; i < n_rounds; i++) {
  58. ggml_graph_compute(gf, &cplan);
  59. }
  60. auto t1 = std::chrono::high_resolution_clock::now();
  61. auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
  62. auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
  63. std::cerr << "graph-compute took " << usec << " usec "
  64. << "\n " << (float) usec / n_rounds << " usec per-iter"
  65. << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
  66. << "\n";
  67. ggml_threadpool_free(threadpool);
  68. ggml_free(ctx);
  69. return 0;
  70. }