convert_train_checkpoint_to_gguf.py

#!/usr/bin/env python3
# train-text-from-scratch checkpoint --> gguf conversion

import argparse
import os
import struct
import sys
import numpy as np
from pathlib import Path

# use the gguf-py package bundled with the repository unless NO_LOCAL_GGUF is set
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
import gguf

# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"

LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"

LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"

LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
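
# These key and tensor names must match the constants used by the llama.cpp
# training code, which reads the optimizer and training state back from the
# GGUF file produced here.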

class Tensor:
    def __init__(self, dtype='f', ne=None):
        if ne is None:
            ne = []
        self.dtype = dtype
        self.ne = ne
        self.nbytes = 0
        if self.dtype == 'f':
            if len(self.ne) == 0:
                self.nbytes = 0
            else:
                self.nbytes = int(np.prod(self.ne)) * 4
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")
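
    # A tensor is serialized in the checkpoint as (integers little-endian):
    #   uint32 nd          number of dimensions
    #   uint32 namelen     length of the tensor name in bytes
    #   uint32 dtype       ggml type id (0 == F32, the only type handled here)
    #   uint32 ne[nd]      size of each dimension
    #   char   name[namelen]
    #   ...                padding up to the next 32-byte boundary
    #   f32    data[prod(ne)]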
    def load(self, data, offset):
        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        assert(nd == len(self.ne))
        ne = []
        for d in range(nd):
            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
            ne.append(n)
        assert(tuple(ne) == tuple(self.ne))
        if self.dtype == 'f':
            assert(dtype == 0)
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")
        self.name = bytes(data[offset:offset+namelen]); offset += namelen
        # skip to the next 32-byte boundary: (0 - offset) & 31 is the pad size,
        # e.g. offset 37 is padded by 27 bytes up to 64
        offset += (0 - offset) & 31
        self.data = data[offset:offset+self.nbytes]
        offset += self.nbytes
        return offset

    def max_storage_size(self):
        result = 0
        result += 4 # nd
        result += 4 # namelen
        result += 4 # dtype
        result += len(self.ne)*8 # ne
        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
        result += 31 # 32-byte alignment
        result += self.nbytes
        return result
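
    # max_storage_size() is an upper bound (name length, ne entries and padding
    # are overestimated); it is only used by the file-version-1 heuristic below
    # to guess which optimizer's state is stored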

    def save_gguf(self, gguf_writer, name):
        gguf_writer.add_tensor(
            name=name,
            tensor=self.data,
            raw_shape=np.array(list(reversed(self.ne))), # numpy shape order, i.e. reversed ggml ne
            raw_dtype=gguf.GGMLQuantizationType.F32)
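
# Illustrative use: reading one f32 vector of length n from the raw bytes
# advances the offset past the header, padding and data:
#   t = Tensor('f', [n])
#   offset = t.load(data, offset)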

class OptimizationParamsV0:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
        self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
        self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset
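
# Note: the field order above presumably mirrors how train-text-from-scratch
# serialized ggml's optimization parameters in file version 0; only type,
# past and lbfgs_m are actually used by this converter.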

class OptimizationContext:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
        offset += 4

        if self.version == 0:
            params = OptimizationParamsV0()
            offset = params.load(data, offset)
            self.past = params.past
            self.lbfgs_m = params.lbfgs_m
            self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8 # native size_t (assumes a 64-bit platform)
            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
            self.type = params.type # 0 == adam, 1 == lbfgs

            self.adam_m = Tensor('f', [self.nx])
            self.adam_v = Tensor('f', [self.nx])
            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

            self.lbfgs_x = Tensor('f', [self.nx])
            self.lbfgs_xp = Tensor('f', [self.nx])
            self.lbfgs_g = Tensor('f', [self.nx])
            self.lbfgs_gp = Tensor('f', [self.nx])
            self.lbfgs_d = Tensor('f', [self.nx])
            self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
            self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])

            if self.type == 0:
                # these tensors are stored, but we don't need their data
                x = Tensor('f', [self.nx])
                g = Tensor('f', [self.nx])
                g2 = Tensor('f', [self.nx])
                mh = Tensor('f', [self.nx])
                vh = Tensor('f', [self.nx])

                offset = x.load(data, offset)
                offset = g.load(data, offset)
                offset = g2.load(data, offset)
                offset = self.adam_m.load(data, offset)
                offset = self.adam_v.load(data, offset)
                offset = mh.load(data, offset)
                offset = vh.load(data, offset)
                offset = self.adam_pf.load(data, offset)

                self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            elif self.type == 1:
                offset = self.lbfgs_x.load(data, offset)
                offset = self.lbfgs_xp.load(data, offset)
                offset = self.lbfgs_g.load(data, offset)
                offset = self.lbfgs_gp.load(data, offset)
                offset = self.lbfgs_d.load(data, offset)
                offset = self.lbfgs_pf.load(data, offset)
                offset = self.lbfgs_lmal.load(data, offset)
                offset = self.lbfgs_lmys.load(data, offset)
                offset = self.lbfgs_lms.load(data, offset)
                offset = self.lbfgs_lmy.load(data, offset)

                self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            else:
                raise ValueError('Unknown optimizer type')
        elif self.version == 1:
            self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8 # native size_t (assumes a 64-bit platform)
            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4

            self.adam_m = Tensor('f', [self.nx])
            self.adam_v = Tensor('f', [self.nx])
            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

            self.lbfgs_x = Tensor('f', [self.nx])
            self.lbfgs_xp = Tensor('f', [self.nx])
            self.lbfgs_g = Tensor('f', [self.nx])
            self.lbfgs_gp = Tensor('f', [self.nx])
            self.lbfgs_d = Tensor('f', [self.nx])
            self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
            self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])

            # forgot to save type in version 1:
            # guess self.type from the number of remaining bytes;
            # 12 resp. 24 bytes of trailing scalars follow the tensors
            size_type_0 = 12 + sum([t.max_storage_size() for t in
                                    [self.adam_m, self.adam_v]
                                    +([self.adam_pf] if (self.past > 0) else [])])
            size_type_1 = 24 + sum([t.max_storage_size() for t in
                                    [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
                                     self.lbfgs_gp, self.lbfgs_d,
                                     self.lbfgs_lmal, self.lbfgs_lmys,
                                     self.lbfgs_lms, self.lbfgs_lmy]
                                    +([self.lbfgs_pf] if (self.past > 0) else [])])
            # due to alignment padding the size might not be exact,
            # but the difference in size for both types is significant,
            # so we can just use whichever is closest
            remaining = len(data) - offset
            if abs(remaining - size_type_0) < abs(remaining - size_type_1):
                self.type = 0
            else:
                self.type = 1
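            # e.g. for nx = 1,000,000 parameters, size_type_0 is roughly 8 MB
            # (adam_m + adam_v) while size_type_1 is at least 20 MB (five
            # nx-sized vectors plus the L-BFGS memory), so the guess is
            # unambiguous in practice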
            if self.type == 0:
                offset = self.adam_m.load(data, offset)
                offset = self.adam_v.load(data, offset)
                offset = self.adam_pf.load(data, offset)

                self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            elif self.type == 1:
                offset = self.lbfgs_x.load(data, offset)
                offset = self.lbfgs_xp.load(data, offset)
                offset = self.lbfgs_g.load(data, offset)
                offset = self.lbfgs_gp.load(data, offset)
                offset = self.lbfgs_d.load(data, offset)
                offset = self.lbfgs_pf.load(data, offset)
                offset = self.lbfgs_lmal.load(data, offset)
                offset = self.lbfgs_lmys.load(data, offset)
                offset = self.lbfgs_lms.load(data, offset)
                offset = self.lbfgs_lmy.load(data, offset)

                self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        else:
            raise ValueError('Invalid version of checkpoint file')

        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)

        if self.type == 0:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)

            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
            if self.past > 0:
                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
        elif self.type == 1:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)

            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
            if self.past > 0:
                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
        else:
            raise ValueError('Unknown optimizer type')

class ModelParams:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset

    def get_n_ff(self):
        # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
        # rounds 2*(4*n_embd)/3 up to the next multiple of n_mult,
        # e.g. n_embd=256, n_mult=256 gives n_ff=768
        return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult

    def save_gguf(self, gguf_writer):
        # self.n_vocab not saved
        gguf_writer.add_embedding_length(self.n_embd)
        gguf_writer.add_head_count(self.n_head)
        gguf_writer.add_block_count(self.n_layer)
        gguf_writer.add_rope_dimension_count(self.n_rot)
        gguf_writer.add_feed_forward_length(self.get_n_ff())

def tensor_name(key, bid=None):
    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
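# e.g. tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid=0) == "blk.0.attn_q.weight"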

class Layer:
    def __init__(self, params, bid):
        self.bid = bid
        self.att_norm = Tensor('f', [params.n_embd])
        self.wq = Tensor('f', [params.n_embd, params.n_embd])
        self.wk = Tensor('f', [params.n_embd, params.n_embd])
        self.wv = Tensor('f', [params.n_embd, params.n_embd])
        self.wo = Tensor('f', [params.n_embd, params.n_embd])
        self.ffn_norm = Tensor('f', [params.n_embd])
        self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
        self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
        self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])

    def load(self, data, offset):
        offset = self.att_norm.load(data, offset)
        offset = self.wq.load(data, offset)
        offset = self.wk.load(data, offset)
        offset = self.wv.load(data, offset)
        offset = self.wo.load(data, offset)
        offset = self.ffn_norm.load(data, offset)
        offset = self.w1.load(data, offset)
        offset = self.w2.load(data, offset)
        offset = self.w3.load(data, offset)
        return offset

    def save_gguf(self, gguf_writer):
        self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
        self.wq.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid))
        self.wk.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid))
        self.wv.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid))
        self.wo.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid))
        self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid))
        self.w1.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid))
        self.w2.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid))
        self.w3.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid))

class Model:
    def __init__(self):
        self.params = ModelParams()
        self.layers = []

    def load(self, data, offset):
        offset = self.params.load(data, offset)

        self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
        self.norm = Tensor('f', [self.params.n_embd])
        self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])

        offset = self.tok_embd.load(data, offset)
        offset = self.norm.load(data, offset)
        offset = self.output.load(data, offset)

        self.layers.clear()
        for bid in range(self.params.n_layer):
            layer = Layer(self.params, bid)
            offset = layer.load(data, offset)
            self.layers.append(layer)
        return offset

    def save_gguf(self, gguf_writer):
        self.params.save_gguf(gguf_writer)
        self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
        self.norm.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
        self.output.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
        for layer in self.layers:
            layer.save_gguf(gguf_writer)
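
# Overall layout of a 'ggcp' checkpoint file, as parsed below:
#   magic 'ggcp' (stored as a little-endian uint32), file version,
#   train_its / train_samples / train_tokens, model hyperparameters,
#   tok_embd / norm / output tensors, one tensor block per layer,
#   and finally the optimization context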

class Checkpoint:
    def __init__(self):
        self.model = Model()
        self.opt_ctx = OptimizationContext()

    def load(self, data, offset):
        # the magic is stored as a little-endian uint32, so reverse the bytes to compare it as text
        magic = bytes(reversed(data[offset:offset + 4])); offset += 4
        if magic != b'ggcp':
            raise ValueError(f"File header magic indicates that this is not a checkpoint file. Expected 'ggcp', got '{str(magic)}'")

        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        if self.version != 0:
            raise ValueError('Invalid version of checkpoint file')

        self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4

        offset = self.model.load(data, offset)
        offset = self.opt_ctx.load(data, offset)
        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
        gguf_writer.add_layer_norm_rms_eps(1e-5)
        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
        gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
        self.model.save_gguf(gguf_writer)
        self.opt_ctx.save_gguf(gguf_writer)

def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
    parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required = True)
    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required = True)
    return parser.parse_args()

def main():
    cfg = handle_args()
    data = np.memmap(cfg.input, mode = 'r')
    chk = Checkpoint()
    offset = 0
    offset = chk.load(data, offset)
    # we should have read all available data
    assert(offset == len(data))

    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
    chk.save_gguf(gguf_writer)
    print(" gguf: write header")
    gguf_writer.write_header_to_file()
    print(" gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print(" gguf: write tensors")
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()

if __name__ == '__main__':
    main()
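
# Example invocation (file names are illustrative):
#   python convert_train_checkpoint_to_gguf.py --input checkpoint.bin --output checkpoint.gguf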