llama.cpp

#include "llama-impl.h"

#include "llama-chat.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-model-loader.h"
#include "llama-model.h"

#include "ggml.h"
#include "ggml-backend.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

//
// interface implementation
//

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
    };

    return result;
}

size_t llama_max_devices(void) {
    return 16;
}

bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
}

bool llama_supports_gpu_offload(void) {
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
           llama_supports_rpc();
}

bool llama_supports_rpc(void) {
    return ggml_backend_reg_by_name("RPC") != nullptr;
}
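
// Illustrative usage sketch: wiring the capability queries above into model
// parameters before loading. `llama_model_default_params()` and the
// `use_mmap`/`use_mlock`/`n_gpu_layers` fields are the ones declared in llama.h.
//
//     struct llama_model_params mparams = llama_model_default_params();
//     mparams.use_mmap  = llama_supports_mmap();
//     mparams.use_mlock = llama_supports_mlock();
//     if (!llama_supports_gpu_offload()) {
//         mparams.n_gpu_layers = 0;
//     }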

void llama_backend_init(void) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
}

void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        GGML_ASSERT(dev && "CPU backend is not loaded");
        auto * reg = ggml_backend_dev_backend_reg(dev);
        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
        numa_init_fn(numa);
    }
}

void llama_backend_free(void) {
    ggml_quantize_free();
}
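
// Illustrative call order for the global backend state managed above:
//
//     llama_backend_init();                            // one-time process-wide init (timers, f16 tables)
//     llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);  // optional, only when NUMA placement is wanted
//     // ... load models, create contexts, run inference ...
//     llama_backend_free();                            // releases quantization scratch state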

int64_t llama_time_us(void) {
    return ggml_time_us();
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = 0;
    time_meas tm(model.t_load_us);

    model.t_start_us = tm.t_start_us;

    try {
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);

        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;

        try {
            model.load_arch(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
        }
        try {
            model.load_hparams(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
        try {
            model.load_vocab(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
        }

        model.load_stats(ml);
        model.print_info();

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return 0;
        }

        if (!model.load_tensors(ml)) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
    }

    return 0;
}

static struct llama_model * llama_model_load_from_file_impl(
        const std::string & path_model,
        std::vector<std::string> & splits,
        struct llama_model_params params) {
    ggml_time_init();

    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                LLAMA_LOG_CONT(".");
                if (percentage >= 100) {
                    LLAMA_LOG_CONT("\n");
                }
            }
            return true;
        };
    }

    llama_model * model = new llama_model(params);

    // create list of devices to use with this model
    if (params.devices) {
        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
            model->devices.push_back(*dev);
        }
    } else {
        std::vector<ggml_backend_dev_t> rpc_servers;
        // use all available devices
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            switch (ggml_backend_dev_type(dev)) {
                case GGML_BACKEND_DEVICE_TYPE_CPU:
                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
                    // skip CPU backends since they are handled separately
                    break;

                case GGML_BACKEND_DEVICE_TYPE_GPU:
                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                        rpc_servers.push_back(dev);
                    } else {
                        model->devices.push_back(dev);
                    }
                    break;
            }
        }
        // add RPC servers at the front of the list
        if (!rpc_servers.empty()) {
            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
        }
    }

    // if using single GPU mode, remove all except the main GPU
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
            llama_model_free(model);
            return nullptr;
        }
        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
        model->devices.clear();
        model->devices.push_back(main_gpu);
    }

    for (auto * dev : model->devices) {
        size_t free, total; // NOLINT
        ggml_backend_dev_memory(dev, &free, &total);
        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
    }

    const int status = llama_model_load(path_model, splits, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
        if (status == -1) {
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        } else if (status == -2) {
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }

        llama_model_free(model);
        return nullptr;
    }

    return model;
}

// deprecated
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    return llama_model_load_from_file(path_model, params);
}

struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(path_model, splits, params);
}
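
// Illustrative usage sketch for the loader above; "model.gguf" is a placeholder path.
//
//     llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
//     if (model != NULL) {
//         // ... use the model ...
//         llama_model_free(model);
//     }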

struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    std::vector<std::string> splits;
    if (n_paths == 0) {
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(splits.front(), splits, params);
}
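
// Illustrative usage sketch for split models; the file names are placeholders that
// follow the naming convention produced by llama_split_path() below.
//
//     const char * paths[] = {
//         "model-00001-of-00002.gguf",
//         "model-00002-of-00002.gguf",
//     };
//     llama_model * model = llama_model_load_from_splits(paths, 2, llama_model_default_params());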

//
// chat templates
//

int32_t llama_chat_apply_template(
        const char * tmpl,
        const struct llama_chat_message * chat,
        size_t n_msg,
        bool add_ass,
        char * buf,
        int32_t length) {
    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);

    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }

    std::string formatted_chat;
    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
        return -1;
    }
    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}
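
// Illustrative usage sketch: the return value is the length of the fully formatted
// prompt, so when it exceeds the supplied buffer the caller grows the buffer and
// applies the template again (`tmpl`, `msgs` and `n_msg` are caller-provided placeholders).
//
//     std::vector<char> buf(1024);
//     int32_t len = llama_chat_apply_template(tmpl, msgs, n_msg, true, buf.data(), buf.size());
//     if (len > (int32_t) buf.size()) {
//         buf.resize(len);
//         len = llama_chat_apply_template(tmpl, msgs, n_msg, true, buf.data(), buf.size());
//     }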

//
// model split
//

int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
        return strlen(split_path);
    }
    return 0;
}

int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);

    // check if split_path ends with postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }
    return 0;
}
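
// Illustrative round-trip of the split naming convention ("mymodel" is a placeholder prefix):
//
//     char path[128];
//     llama_split_path(path, sizeof(path), "mymodel", 1, 4);   // -> "mymodel-00002-of-00004.gguf" (split_no is 0-based)
//
//     char prefix[128];
//     llama_split_prefix(prefix, sizeof(prefix), path, 1, 4);  // -> "mymodel", returns 7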

const char * llama_print_system_info(void) {
    static std::string s;
    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.

    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        auto * reg = ggml_backend_reg_get(i);
        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (get_features_fn) {
            ggml_backend_feature * features = get_features_fn(reg);
            s += ggml_backend_reg_name(reg);
            s += " : ";
            for (; features->name; features++) {
                s += features->name;
                s += " = ";
                s += features->value;
                s += " | ";
            }
        }
    }

    return s.c_str();
}
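
// Illustrative usage sketch: dump the reported backend features at startup.
//
//     fprintf(stderr, "system info: %s\n", llama_print_system_info());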