// llama-adapter.cpp

#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>

// vec

ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}

bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}

bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return false;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return true;
}

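// Usage sketch (illustrative, not part of this file): apply() is normally reached
// through the public llama.h control-vector entry point (llama_apply_adapter_cvec()
// in current headers; verify the name against your version). Assuming a hypothetical
// loader that returns n_embd floats per layer, concatenated starting at layer 1:
//
//   std::vector<float> cvec = load_control_vector(path);        // hypothetical helper
//   llama_apply_adapter_cvec(ctx, cvec.data(), cvec.size(),
//                            n_embd, /*il_start=*/1, /*il_end=*/n_layer - 1);
//   llama_apply_adapter_cvec(ctx, nullptr, 0, n_embd, 0, 0);    // disable again
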
// lora

llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

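// Note: at graph-build time each matched base weight W is augmented roughly as
//   W x  +  scale * B (A x)
// where scale is derived from the adapter alpha and the LoRA rank (see
// llama_adapter_lora_weight::get_scale() in llama-adapter.h, approximately
// adapter_scale * alpha / rank). The loader below only stores the A/B pairs;
// the actual matmuls are built elsewhere in the graph-construction code.
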
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        const gguf_context * gguf_ctx = ctx_gguf.get();

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);

        // get metadata as string
        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
                : gguf_type_name(type);
            const char * name       = gguf_get_key(gguf_ctx, i);
            const std::string value = gguf_kv_to_str(gguf_ctx, i);

            if (type != GGUF_TYPE_ARRAY) {
                adapter.gguf_kv.emplace(name, value);
            }

            const size_t MAX_VALUE_LEN = 40;
            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
            replace_all(print_value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
        }

        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));

        // parse alora invocation sequence vector
        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (kid >= 0) {
            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
                throw std::runtime_error("invalid gguf type for " + key);
            }
            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
            if (arr_type != GGUF_TYPE_UINT32) {
                throw std::runtime_error("invalid gguf element type for " + key);
            }

            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
            const void * data    = gguf_get_arr_data(ctx_gguf.get(), kid);
            adapter.alora_invocation_tokens.resize(seq_len);
            std::copy(
                (const llama_token *) data,
                (const llama_token *) data + seq_len,
                adapter.alora_invocation_tokens.begin());
        }
    }

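    // In summary, the metadata block above requires (via the LLM_KV key mapping):
    //   - general.type         == "adapter"
    //   - general.architecture == the base model's architecture
    //   - adapter.type         == "lora"
    // and optionally reads the LoRA alpha plus the aLoRA invocation-token array.
    // The literal GGUF key strings shown here follow the usual convention but may
    // differ between versions; the enums above are the source of truth.
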
    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        }

        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

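    // Expected naming convention (illustrative): each adapter tensor mirrors a base
    // model tensor name plus a suffix, e.g.
    //   "blk.0.attn_q.weight.lora_a"  ->  ab_map["blk.0.attn_q.weight"].a
    //   "blk.0.attn_q.weight.lora_b"  ->  ab_map["blk.0.attn_q.weight"].b
    // so that the map key can later be matched against model.get_tensor().
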
    // get extra buffer types of the CPU
    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_extra.emplace_back(*extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
        for (auto & ex : buft_extra) {
            if (ex == buft) {
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
            }
        }

        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

        ggml_context * dev_ctx = ctx_for_buft(buft);

        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}

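// Usage sketch (assumes the llama.h adapter API; names as in current headers,
// verify against your version):
//
//   llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
//   if (adapter) {
//       llama_set_adapter_lora(ctx, adapter, /*scale=*/1.0f);  // attach to a context
//       // ... run inference ...
//       llama_rm_adapter_lora(ctx, adapter);                   // detach when done
//       llama_adapter_lora_free(adapter);
//   }
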
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
    const auto & it = adapter->gguf_kv.find(key);
    if (it == adapter->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
    return (int) adapter->gguf_kv.size();
}

int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int) adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int) adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

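// Usage sketch: enumerating adapter metadata with the accessors defined above
// (buffer sizes are arbitrary; values are truncated by snprintf if they don't fit):
//
//   char key[128], val[256];
//   for (int32_t i = 0; i < llama_adapter_meta_count(adapter); i++) {
//       llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key));
//       llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val));
//       printf("%s = %s\n", key, val);
//   }
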
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    delete adapter;
}

uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
    if (!adapter) {
        return 0;
    }
    return adapter->alora_invocation_tokens.size();
}

const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
    GGML_ASSERT(adapter);
    return adapter->alora_invocation_tokens.data();
}

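// Usage sketch for activated LoRA (aLoRA) adapters: a caller can check whether an
// adapter carries an invocation sequence and, if so, look for that subsequence in
// the tokenized prompt before enabling the adapter for the tokens that follow
// (the scanning logic itself lives in the caller and is only hinted at here):
//
//   const uint64_t n_inv = llama_adapter_get_alora_n_invocation_tokens(adapter);
//   if (n_inv > 0) {
//       const llama_token * inv = llama_adapter_get_alora_invocation_tokens(adapter);
//       // search the prompt tokens for the subsequence inv[0..n_inv) ...
//   }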