pca.hpp

#include "common.h"
#include "llama.h"
#include "ggml.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cstdio>
#include <cmath>
#include <ctime>
#include <random>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>

#define DEBUG_POS 5

static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
    if (!with_data) return;
    printf("%s: %s[0] = [", __func__, t->name);
    for (size_t i = 0; i <= DEBUG_POS; i++) {
        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
    }
    printf(" ... ]\n");
}

namespace PCA {

// input params for PCA computations
struct pca_params {
    int n_threads    = 1;
    int n_batch      = 20; // number of iterations to do in one batch; the larger the batch, the more memory is used
    int n_iterations = 1000;
    float tolerance  = 1e-7;

    // for debugging
    int i_layer  = 0;
    int n_layers = 0;
};
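
// Note: power_iteration() below runs n_iterations / n_batch outer steps (1000 / 20 = 50
// with the defaults above), each of which executes n_batch matrix-vector updates inside
// a single graph. Within a batch, `tolerance` is the L2 distance between consecutive
// eigenvector estimates below which an estimate is accepted as converged.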

// result from each iteration
struct pca_result {
    struct ggml_tensor * calculated_square = NULL;
    std::vector<struct ggml_tensor *> eigenvectors;
    std::vector<float> distances;
};

struct pca_model {
    ggml_backend_t backend = NULL;
    ggml_backend_buffer_t buffer;
    struct ggml_context * ctx;      // context to compute graph on target device
    struct ggml_context * ctx_host; // host context to store results

    // tensors on target device
    struct ggml_tensor * dev_input;
    struct ggml_tensor * dev_square;
    struct ggml_tensor * dev_eigenvector;

    pca_model(struct ggml_tensor * t_input) {
        // TODO: enable GPU support when support for GGML_OP_SQRT is added
        //#ifdef GGML_USE_CUDA
        //    fprintf(stderr, "%s: using CUDA backend\n", __func__);
        //    backend = ggml_backend_cuda_init(0); // init device 0
        //    if (!backend) {
        //        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
        //    }
        //#endif

        //#ifdef GGML_USE_METAL
        //    fprintf(stderr, "%s: using Metal backend\n", __func__);
        //    backend = ggml_backend_metal_init();
        //    if (!backend) {
        //        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
        //    }
        //#endif

        // if there is no GPU backend, fall back to the CPU backend
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }

        const int num_tensors = 4;
        struct ggml_init_params params {
            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx = ggml_init(params);

        auto n_samples = t_input->ne[0];
        auto n_embd    = t_input->ne[1];

        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        ggml_set_name(dev_input,       "dev_input");
        ggml_set_name(dev_square,      "dev_square");
        ggml_set_name(dev_eigenvector, "dev_eigenvector");

        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));

        // initialize eigenvector to random normalized vector
        {
            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
            std::uniform_real_distribution<float> distribution(0.0, 1.0);
            float sum_sqr = 0.0; // for normalizing random_vec
            for (size_t i = 0; i < random_vec.size(); ++i) {
                float f = distribution(generator);
                sum_sqr += f * f;
                random_vec[i] = f;
            }
            // normalize it
            float random_vec_norm = std::sqrt(sum_sqr);
            for (size_t i = 0; i < random_vec.size(); ++i) {
                random_vec[i] /= random_vec_norm;
            }
            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
        }
    }

    ~pca_model() {
        ggml_free(ctx);
        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
    }
};
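
// build_graph_piter() below implements classic power iteration. On the first call it
// forms tmp_square = ggml_mul_mat(dev_input, dev_input): an n_embd x n_embd matrix whose
// (i, j) entry is the dot product of embedding dimensions i and j across all samples
// (X^T X for a mathematical X of shape n_samples x n_embd). The dominant eigenvector of
// that matrix is the first principal direction of the samples (up to mean-centering).
// Each batched step then computes v <- S*v / ||S*v||. A host-side sketch of one such
// update, for illustration only (S, v and n_embd are hypothetical host buffers, not
// members of this file):
//
//     std::vector<float> Sv(n_embd, 0.0f);
//     for (int i = 0; i < n_embd; ++i)
//         for (int j = 0; j < n_embd; ++j)
//             Sv[i] += S[i * n_embd + j] * v[j];   // Sv = S * v
//     float norm = 0.0f;
//     for (float x : Sv) norm += x * x;
//     norm = std::sqrt(norm);                      // ||S * v||
//     for (int i = 0; i < n_embd; ++i) v[i] = Sv[i] / norm;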

static struct ggml_cgraph * build_graph_piter(
        const struct pca_params & params,
        const pca_model & model,
        bool calc_square = false) {
    GGML_ASSERT(params.n_batch > 0);
    // TODO: buf_size must be able to scale with params.n_batch
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };
    // create a temporary context to build the graph
    struct ggml_context * ctx0 = ggml_init(params0);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // turn v_diff_original into square matrix if needed
    struct ggml_tensor * tmp_square = NULL;
    if (calc_square) {
        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
        ggml_set_name(tmp_square, "tmp_square");
    }

    struct ggml_tensor * b_tensor;
    struct ggml_tensor * distance;
    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;

    for (int i = 0; i < params.n_batch; ++i) {
        // b_tensor = square * eigenvector^T
        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
        ggml_set_name(b_tensor, "b_tensor");

        // normalize
        b_tensor = ggml_div_inplace(ctx0,
            b_tensor,
            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
        );
        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);

        // calculate distance(new eigenvector - old eigenvector)
        // we don't use ggml_sub because it may not be implemented on GPU backend
        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
        distance = ggml_sqrt_inplace(ctx0,
            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
        ggml_format_name(distance, "distance_%d", i);

        old_eigen = b_tensor;

        // build operations nodes
        ggml_build_forward_expand(gf, distance);
    }

    // delete the temporary context used to build the graph
    ggml_free(ctx0);
    return gf;
}
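
// Note: only the `distance_%d` tensors are expanded into the graph above, but each one
// depends on its `b_tensor_norm_%d` estimate (and, when calc_square is set, on
// `tmp_square`), so all of these appear as named graph nodes. compute_piter() below
// matches exactly these names to recover the per-iteration eigenvector estimates, the
// convergence distances, and the optional square matrix after the graph has run.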

static ggml_status compute_piter(
        const struct pca_params & params,
        const pca_model & model,
        struct ggml_cgraph * gf,
        ggml_gallocr_t allocr,
        struct pca_result & result) {
    // allocate tensors
    ggml_gallocr_alloc_graph(allocr, gf);

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
    }

    // TODO: enable GPU support when support for GGML_OP_SQRT is added
    //#ifdef GGML_USE_METAL
    //    if (ggml_backend_is_metal(model.backend)) {
    //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
    //    }
    //#endif

    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
    if (res == GGML_STATUS_SUCCESS) {
        auto extract_i = [](std::string prefix, std::string str) -> int {
            int i = -1;
            if (str.rfind(prefix, 0) == 0) {
                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
            }
            return i;
        };
        result.calculated_square = NULL;
        result.eigenvectors.clear();
        result.distances.clear();
        result.eigenvectors.resize(params.n_batch);
        result.distances.resize(params.n_batch);
        // get output nodes
        for (int i = 0; i < gf->n_nodes; ++i) {
            auto node = gf->nodes[i];
            int iter = -1;
            // find b_tensor (without copying data from device)
            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
                result.eigenvectors[iter] = node;
            }
            // find distances, then copy data from device
            if ((iter = extract_i("distance_", node->name)) > -1) {
                float d;
                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
                result.distances[iter] = d;
                // std::cout << node->name << " = " << d << "\n";
            }
            // find tmp_square if it exists (without copying data from device)
            if (std::string(node->name) == "tmp_square") {
                result.calculated_square = node;
            }
        }
    }
    return res;
}

static void power_iteration(
        const struct pca_params & params,
        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
        struct ggml_tensor * output) {
    //printf("in power iteration\n");
    struct pca_model model(input);
    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
    struct pca_result result;
    struct ggml_tensor * last_eigenvector = NULL;

    int n_iters = params.n_iterations / params.n_batch; // larger batch, fewer outer iterations
    for (int iter = 0; iter < n_iters; ++iter) {
        bool calc_square = (iter == 0); // only need to calculate square for first iteration
        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
        compute_piter(params, model, gf, allocr, result);

        for (size_t k = 0; k < result.distances.size(); ++k) {
            last_eigenvector = result.eigenvectors[k];
            if (result.distances[k] < params.tolerance) {
                break; // done
            }
        }

        if (calc_square) {
            // copy and store the square matrix if needed
            GGML_ASSERT(result.calculated_square != NULL);
            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
        }

        {
            // copy the last eigenvector and store it as the input for the next iteration
            GGML_ASSERT(last_eigenvector != NULL);
            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
        }

        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
    }

    // get output tensor
    GGML_ASSERT(last_eigenvector);
    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
    //print_debug_tensor(output);
    ggml_gallocr_free(allocr);
}

static void run_pca(
        struct pca_params & params,
        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
        const std::vector<struct ggml_tensor *> & v_output) {
    printf("%s: Running PCA...\n", __func__);
    for (size_t il = 0; il < v_input.size(); ++il) {

        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
        ggml_format_name(ctrl_out, "direction.%ld", il+1);

        // run power_iteration
        params.i_layer  = il;
        params.n_layers = v_input.size();
        power_iteration(params, v_input[il], ctrl_out);
        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
    }
}

}
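
// Usage sketch (illustrative only; v_diff and v_final are hypothetical caller-side
// vectors of ggml tensors, one [n_samples, n_embd] input matrix and one pre-allocated
// [n_embd] output tensor per layer):
//
//     PCA::pca_params pca_params;
//     pca_params.n_threads    = 8;
//     pca_params.n_batch      = 20;
//     pca_params.n_iterations = 1000;
//     pca_params.tolerance    = 1e-7;
//     PCA::run_pca(pca_params, v_diff, v_final);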