#!/usr/bin/env python3
# finetune checkpoint --> gguf conversion
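"""Convert a llama.cpp finetune-lora checkpoint (binary 'ggcl' file) to GGUF.

The checkpoint mirrors the structs written by the finetune example: file
magic and version, training counters, the LoRA model tensors, and finally
the optimizer context (AdamW or L-BFGS state).
"""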

import argparse
import struct
from pathlib import Path

import gguf
import numpy as np

# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"

LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"

LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"

LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"

LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
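
# A tensor record in the checkpoint is laid out as:
#   uint32 nd        number of dimensions
#   uint32 namelen   length of the name in bytes
#   uint32 dtype     0 == f32 (the only type handled here)
#   uint32 ne[nd]    number of elements per dimension
#   bytes  name
#   padding to a 32-byte boundary, then the raw f32 data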
class Tensor:
    def __init__(self, dtype='f', ne=None):
        if ne is None:
            ne = []
        self.dtype = dtype
        self.ne = ne
        self.nbytes = 0
        if self.dtype == 'f':
            if len(self.ne) == 0:
                self.nbytes = 0
            else:
                self.nbytes = int(np.prod(self.ne)) * 4  # np.product was removed in NumPy 2.0
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")

    def load(self, data, offset):
        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        assert nd == len(self.ne)
        ne = []
        for d in range(nd):
            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
            ne.append(n)
        if tuple(ne) != tuple(self.ne):
            raise ValueError(f"Tensor.load: Expected number of elements {self.ne} does not match what is read from file {ne}")
        if self.dtype == 'f':
            assert dtype == 0
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")
        self.name = bytes(data[offset:offset + namelen]); offset += namelen
        # skip padding up to the next 32-byte boundary
        offset += (0 - offset) & 31
        self.data = data[offset:offset + self.nbytes]
        offset += self.nbytes
        return offset

    def max_storage_size(self):
        result = 0
        result += 4  # nd
        result += 4  # namelen
        result += 4  # dtype
        result += len(self.ne) * 8  # ne (upper bound; 4 bytes each on disk)
        result += 48  # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
        result += 31  # 32-byte alignment
        result += self.nbytes
        return result

    def save_gguf(self, gguf_writer, name):
        gguf_writer.add_tensor(
            name=name,
            tensor=self.data,
            raw_shape=np.array(list(reversed(self.ne))),
            raw_dtype=gguf.GGMLQuantizationType.F32)
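
# ggml_opt_context as serialized by the finetune example. Version 1 of the
# optimizer blob did not store the optimizer type (AdamW vs. L-BFGS), so
# load() infers it from the number of bytes remaining in the file.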
class OptimizationContext:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
        offset += 4

        if self.version != 1:
            raise ValueError('Invalid version of optimization context in checkpoint file')

        self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        # size_t in the C++ writer; read it as little-endian uint64 so the
        # conversion also works on non-64-bit or big-endian hosts
        self.nx = struct.unpack('<Q', bytes(data[offset:offset + 8]))[0]; offset += 8
        self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4

        self.adam_m  = Tensor('f', [self.nx])
        self.adam_v  = Tensor('f', [self.nx])
        self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

        self.lbfgs_x    = Tensor('f', [self.nx])
        self.lbfgs_xp   = Tensor('f', [self.nx])
        self.lbfgs_g    = Tensor('f', [self.nx])
        self.lbfgs_gp   = Tensor('f', [self.nx])
        self.lbfgs_d    = Tensor('f', [self.nx])
        self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
        self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
        self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
        self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
        self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])

        # the optimizer type was not saved in version 1:
        # guess self.type from the number of remaining bytes
        size_type_0 = 12 + sum([t.max_storage_size() for t in  # 12 = trailing AdamW scalars
                                [self.adam_m, self.adam_v]
                                + ([self.adam_pf] if (self.past > 0) else [])])
        # lbfgs_pf is always serialized (it is just empty when past == 0),
        # so it is counted unconditionally here
        size_type_1 = 24 + sum([t.max_storage_size() for t in  # 24 = trailing L-BFGS scalars
                                [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
                                 self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
                                 self.lbfgs_lmal, self.lbfgs_lmys,
                                 self.lbfgs_lms, self.lbfgs_lmy]])
        # due to alignment padding the sizes are not exact, but the difference
        # between the two types is significant, so we can use whichever is closest
        remaining = len(data) - offset
        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
            self.type = 0
        else:
            self.type = 1

        if self.type == 0:
            offset = self.adam_m.load(data, offset)
            offset = self.adam_v.load(data, offset)
            offset = self.adam_pf.load(data, offset)

            self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        elif self.type == 1:
            offset = self.lbfgs_x.load(data, offset)
            offset = self.lbfgs_xp.load(data, offset)
            offset = self.lbfgs_g.load(data, offset)
            offset = self.lbfgs_gp.load(data, offset)
            offset = self.lbfgs_d.load(data, offset)
            offset = self.lbfgs_pf.load(data, offset)
            offset = self.lbfgs_lmal.load(data, offset)
            offset = self.lbfgs_lmys.load(data, offset)
            offset = self.lbfgs_lms.load(data, offset)
            offset = self.lbfgs_lmy.load(data, offset)

            self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        else:
            raise ValueError(f"Invalid optimizer type '{self.type}'")

        return offset
    def save_gguf(self, gguf_writer):
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)

        if self.type == 0:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)

            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
            if self.past > 0:
                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
        elif self.type == 1:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)

            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
            if self.past > 0:
                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
        else:
            raise ValueError('Unknown optimizer type')
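
# Per-tensor LoRA ranks as stored in the checkpoint. The w1/w2/w3 naming
# follows the original LLaMA code: w1 = ffn_gate, w2 = ffn_down, w3 = ffn_up
# (see the GGUF key mapping in save_gguf below).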
class LoraParams:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wq             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wk             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wv             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wo             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_ffn_norm       = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_w1             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_w2             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_w3             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_norm           = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_output         = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,  self.n_rank_tok_embeddings)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT,      self.n_rank_output)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,   self.n_rank_attention_norm)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q,      self.n_rank_wq)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K,      self.n_rank_wk)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V,      self.n_rank_wv)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,    self.n_rank_wo)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM,    self.n_rank_ffn_norm)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE,    self.n_rank_w1)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,    self.n_rank_w2)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP,      self.n_rank_w3)

class ModelParams:
    def __init__(self, n_ff = None):
        self.n_ff = n_ff

    def load(self, data, offset):
        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset
    def get_n_ff(self):
        if self.n_ff is None:
            # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
            return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
        else:
            return self.n_ff

    def save_gguf(self, gguf_writer):
        # self.n_vocab not saved
        gguf_writer.add_embedding_length(self.n_embd)
        gguf_writer.add_head_count(self.n_head)
        gguf_writer.add_block_count(self.n_layer)
        gguf_writer.add_rope_dimension_count(self.n_rot)
        gguf_writer.add_feed_forward_length(self.get_n_ff())

def tensor_name(key, bid=None, suffix=".weight"):
    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
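# e.g. tensor_name(gguf.MODEL_TENSOR.ATTN_Q, 0, ".weight.lora_a")
#      yields "blk.0.attn_q.weight.lora_a"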

class Layer:
    def __init__(self, params, lora_params, bid):
        self.bid = bid
        self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
        self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
        self.wq_a       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
        self.wq_b       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
        self.wk_a       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
        self.wk_b       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
        self.wv_a       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
        self.wv_b       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
        self.wo_a       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
        self.wo_b       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
        self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
        self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
        self.w1_a       = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
        self.w1_b       = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
        self.w2_a       = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
        self.w2_b       = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
        self.w3_a       = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
        self.w3_b       = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])

    def load(self, data, offset):
        offset = self.att_norm_a.load(data, offset)
        offset = self.att_norm_b.load(data, offset)
        offset = self.wq_a.load(data, offset)
        offset = self.wq_b.load(data, offset)
        offset = self.wk_a.load(data, offset)
        offset = self.wk_b.load(data, offset)
        offset = self.wv_a.load(data, offset)
        offset = self.wv_b.load(data, offset)
        offset = self.wo_a.load(data, offset)
        offset = self.wo_b.load(data, offset)
        offset = self.ffn_norm_a.load(data, offset)
        offset = self.ffn_norm_b.load(data, offset)
        offset = self.w1_a.load(data, offset)
        offset = self.w1_b.load(data, offset)
        offset = self.w2_a.load(data, offset)
        offset = self.w2_b.load(data, offset)
        offset = self.w3_a.load(data, offset)
        offset = self.w3_b.load(data, offset)
        return offset

    def save_gguf(self, gguf_writer):
        self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
        self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
        self.wq_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_a"))
        self.wq_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_b"))
        self.wk_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_a"))
        self.wk_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_b"))
        self.wv_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_a"))
        self.wv_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_b"))
        self.wo_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_a"))
        self.wo_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_b"))
        self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_a"))
        self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_b"))
        self.w1_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_a"))
        self.w1_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_b"))
        self.w2_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_a"))
        self.w2_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_b"))
        self.w3_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_a"))
        self.w3_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_b"))
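
# Each transformer block thus contributes 18 tensors: a lora_a/lora_b pair for
# the two norms, the four attention projections, and the three FFN projections.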

class LoraModel:
    def __init__(self, n_ff = None):
        self.params = ModelParams(n_ff = n_ff)
        self.lora_params = LoraParams()
        self.layers = []

    def load(self, data, offset):
        offset = self.params.load(data, offset)
        offset = self.lora_params.load(data, offset)

        self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
        self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
        self.norm_a     = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
        self.norm_b     = Tensor('f', [self.lora_params.n_rank_norm, 1])
        self.output_a   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
        self.output_b   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])

        offset = self.tok_embd_a.load(data, offset)
        offset = self.tok_embd_b.load(data, offset)
        offset = self.norm_a.load(data, offset)
        offset = self.norm_b.load(data, offset)
        offset = self.output_a.load(data, offset)
        offset = self.output_b.load(data, offset)

        self.layers.clear()
        for bid in range(self.params.n_layer):
            layer = Layer(self.params, self.lora_params, bid)
            offset = layer.load(data, offset)
            self.layers.append(layer)

        return offset

    def save_gguf(self, gguf_writer):
        self.params.save_gguf(gguf_writer)
        self.lora_params.save_gguf(gguf_writer)

        self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_a"))
        self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_b"))
        self.norm_a.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
        self.norm_b.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
        self.output_a.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_a"))
        self.output_b.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_b"))

        for layer in self.layers:
            layer.save_gguf(gguf_writer)
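
# Top-level checkpoint layout: 4-byte magic 'ggcl' (stored byte-reversed on
# disk), uint32 file version, three uint32 training counters, the LoRA model,
# then the optimizer context.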
class LoraCheckpoint:
    def __init__(self, n_ff = None):
        self.model = LoraModel(n_ff = n_ff)
        self.opt_ctx = OptimizationContext()

    def load(self, data, offset):
        magic = bytes(reversed(data[offset:offset + 4])); offset += 4
        if magic != b'ggcl':
            raise ValueError(f"File header magic indicates that this is not a finetune-lora checkpoint file. Expected 'ggcl', got '{str(magic)}'")

        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        if self.version != 0:
            raise ValueError('Invalid version of checkpoint file')

        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4

        offset = self.model.load(data, offset)
        offset = self.opt_ctx.load(data, offset)
        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
        gguf_writer.add_layer_norm_rms_eps(1e-5)
        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
        gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
        self.model.save_gguf(gguf_writer)
        self.opt_ctx.save_gguf(gguf_writer)

def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
    parser.add_argument('--input',  '-i', type = Path, help = 'Input finetune checkpoint filename', required = True)
    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required = True)
    parser.add_argument('--ff', type = int, help = "Feedforward size; if not provided, it is computed from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required = False)
    return parser.parse_args()

def main():
    cfg = handle_args()
    print(cfg)
    data = np.memmap(cfg.input, mode = 'r')
    chk = LoraCheckpoint(n_ff = cfg.ff)
    offset = 0
    offset = chk.load(data, offset)
    # we should have read all available data
    assert offset == len(data)

    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
    chk.save_gguf(gguf_writer)
    print(" gguf: write header")
    gguf_writer.write_header_to_file()
    print(" gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print(" gguf: write tensors")
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()

if __name__ == '__main__':
    main()
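
# Example invocation (filenames are illustrative):
#   python convert_finetune_checkpoint_to_gguf.py \
#       --input chk-lora-LATEST.bin --output chk-lora-LATEST.gguf
# Pass --ff explicitly (e.g. --ff 11008 for LLaMA-7B) if loading fails with a
# tensor element count mismatch.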