#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <map>
#include <cassert>
#include <stdexcept>

// vec

ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}
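
// lazily create one F32 control-vector tensor of n_embd elements per layer
// (layer 0 never has one); each tensor is placed on the buffer type selected
// for its layer and the backing buffers are zero-initialized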
bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}
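
// upload control-vector data: `data` holds (n_layer - 1) direction vectors of
// n_embd floats each, concatenated starting at layer 1 (layer 0 has no entry);
// the [il_start, il_end] range only gates which layers are added in apply_to()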
bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return false;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return true;
}

// lora
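
// a LoRA adapter stores, for each base-model weight it modifies, a low-rank pair
// (lora_a, lora_b) keyed by the base tensor's name in ab_map; callers look the
// pair up via get_weight() when building the compute graph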
llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        };
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // get extra buffer types of the CPU
    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_extra.emplace_back(*extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
        for (auto & ex : buft_extra) {
            if (ex == buft) {
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
            }
        }

        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

        ggml_context * dev_ctx = ctx_for_buft(buft);

        // validate tensor shape
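        // expected dims (ggml order): base W = [ne0, ne1], lora_a = [ne0, r], lora_b = [r, ne1],
        // where r is the LoRA rank; for token_embd the roles of A and B are swapped (checked below)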
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
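
// Example (sketch, assuming the public llama.h entry points llama_adapter_lora_init()
// and llama_set_adapter_lora()): typical application-side usage looks like
//
//     llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
//     if (adapter != nullptr) {
//         llama_set_adapter_lora(ctx, adapter, 1.0f); // apply with scale 1.0
//     }
//     // ...
//     llama_adapter_lora_free(adapter); // free once no context uses it anymore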
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    delete adapter;
}