convert-llama-ggmlv3-to-gguf.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import os
import struct
import sys
from pathlib import Path

import numpy as np

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
    gguf.GGMLQuantizationType.F32  : (1, 4),
    gguf.GGMLQuantizationType.F16  : (1, 2),
    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
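
# Worked example of the (block size, type size) table above: Q4_0 packs 32
# elements into 2 + 16 = 18 bytes (one fp16 scale plus 32 four-bit values),
# so a tensor of n elements occupies n * 18 // 32 bytes -- the same
# arithmetic Tensor.load() applies below. Illustrative numbers: a 4096 x 4096
# Q4_0 tensor takes 4096 * 4096 * 18 // 32 = 9437184 bytes (exactly 9 MiB).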

class Hyperparameters:
    def __init__(self):
        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
        self.n_ff = 0

    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
        ff_tensor = model.tensors[ff_tensor_idx]
        self.n_ff = ff_tensor.dims[1]

    def load(self, data, offset):
        (
            self.n_vocab,
            self.n_embd,
            self.n_mult,
            self.n_head,
            self.n_layer,
            self.n_rot,
            self.ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
        return 4 * 7

    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
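
# The hyperparameter block parsed above is seven little-endian uint32 values
# packed back to back (28 bytes). A minimal sketch of exercising load() with
# hypothetical values (not taken from any real model):
#
#   hdr = struct.pack('<7I', 32000, 4096, 256, 32, 32, 128, 1)
#   hp = Hyperparameters()
#   assert hp.load(hdr, 0) == 28 and hp.n_vocab == 32000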

class Vocab:
    def __init__(self):
        self.items = []

    def load(self, data, offset, n_vocab):
        orig_offset = offset
        for _ in range(n_vocab):
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
            vocab = bytes(data[offset:offset + itemlen])
            offset += itemlen
            score = struct.unpack('<f', data[offset:offset + 4])[0]
            offset += 4
            self.items.append((vocab, score))
        return offset - orig_offset
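
# Each serialized vocab entry is a uint32 byte length, the raw token bytes,
# then a float32 score. Sketch of round-tripping a single hypothetical entry:
#
#   entry = struct.pack('<I', 5) + b'hello' + struct.pack('<f', -1.5)
#   v = Vocab()
#   assert v.load(entry, 0, 1) == 13 and v.items == [(b'hello', -1.5)]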

class Tensor:
    def __init__(self):
        self.name = None
        self.dims: tuple[int, ...] = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = np.int64(0)

    def load(self, data, offset):
        orig_offset = offset
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert 0 <= n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
        self.dtype = dtype
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
        pad = ((offset + 31) & ~31) - offset
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset
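
# The padding expression ((offset + 31) & ~31) - offset rounds the offset up
# to the next 32-byte boundary, since GGJTv3 aligns tensor data to 32 bytes.
# E.g. offset 100 -> (131 & ~31) = 128, so pad = 28; an offset already at a
# multiple of 32 gets pad = 0.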

class GGMLV3Model:
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
        self.tensor_map = {}
        self.tensors = []

    def validate_header(self, data, offset):
        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
            raise ValueError('Only GGJTv3 supported')
        return 8

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
        vocab = Vocab()
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors: list[Tensor] = []
        tensor_map = {}
        while offset < len(data):
            tensor = Tensor()
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
        self.hyperparameters = hp
        self.vocab = vocab
        self.tensors = tensors
        self.tensor_map = tensor_map
        hp.set_n_ff(self)
        return offset
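
# Overall GGJTv3 layout as parsed above: 4-byte magic b'tjgg', uint32 version
# (must be 3), the 28-byte hyperparameter block, n_vocab vocab entries, then
# tensor records (header plus 32-byte-aligned data) packed until end of file.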

class GGMLToGGUF:
    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
        hp = ggml_model.hyperparameters
        self.model = ggml_model
        self.data = data
        self.cfg = cfg
        self.params_override = params_override
        self.vocab_override = vocab_override
        self.special_vocab = special_vocab
        if params_override is not None:
            n_kv_head = params_override.n_head_kv
        else:
            if cfg.gqa == 1:
                n_kv_head = hp.n_head
            else:
                gqa = float(cfg.gqa)
                n_kv_head = None
                for x in range(1, 256):
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
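
    # When --gqa is given without a metadata override, n_kv_head is recovered
    # by searching for the divisor x with n_head / x == gqa. Illustrative
    # numbers matching the --gqa help text below: LLaMA2 70B has n_head = 64,
    # so --gqa 8 yields n_kv_head = 64 / 8 = 8.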

    def save(self):
        print('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
        print("    gguf: write header")
        gguf_writer.write_header_to_file()
        print("    gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        print("    gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
        print('* Adding model parameters and KV items')
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
        if self.params_override is not None:
            po = self.params_override
            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
            gguf_writer.add_context_length      (po.n_ctx)
            gguf_writer.add_embedding_length    (po.n_embd)
            gguf_writer.add_block_count         (po.n_layer)
            gguf_writer.add_feed_forward_length (po.n_ff)
            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
            gguf_writer.add_head_count          (po.n_head)
            gguf_writer.add_head_count_kv       (po.n_head_kv)
            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
            return
        gguf_writer.add_context_length(cfg.context_length)
        gguf_writer.add_embedding_length(hp.n_embd)
        gguf_writer.add_block_count(hp.n_layer)
        gguf_writer.add_feed_forward_length(hp.n_ff)
        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
        gguf_writer.add_head_count(hp.n_head)
        gguf_writer.add_head_count_kv(self.n_kv_head)
        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
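
    # The RoPE dimension count written above is the per-head width
    # n_embd // n_head; e.g. a hypothetical model with n_embd = 4096 and
    # n_head = 32 gets 4096 // 32 = 128.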

    def add_vocab(self, gguf_writer):
        hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
        tokens = []
        scores = []
        toktypes = []
        if self.vocab_override is not None:
            vo = self.vocab_override
            print('* Adding vocab item(s)')
            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
                toktypes.append(ttype)
            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
            gguf_writer.add_token_list(tokens)
            gguf_writer.add_token_scores(scores)
            if len(toktypes) > 0:
                gguf_writer.add_token_types(toktypes)
            return
        print(f'* Adding {hp.n_vocab} vocab item(s)')
        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
            tt = 1  # Normal
            # Special handling for UNK, BOS, EOS tokens.
            if tokid <= 2:
                if tokid == 0:
                    vbytes = b'<unk>'
                    tt = 2  # Unknown
                elif tokid == 1:
                    vbytes = b'<s>'
                    tt = 3  # Control
                else:
                    vbytes = b'</s>'
                    tt = 3  # Control
            elif len(vbytes) == 0:
                tt = 3  # Control
            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
                tt = 6  # Byte
            else:
                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
            toktypes.append(tt)
            tokens.append(vbytes)
            scores.append(vscore)
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        gguf_writer.add_token_types(toktypes)
        gguf_writer.add_unk_token_id(0)
        gguf_writer.add_bos_token_id(1)
        gguf_writer.add_eos_token_id(2)
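
    # The token type codes used above follow the GGUF convention:
    # 1 = normal, 2 = unknown, 3 = control, 6 = byte. Token ids 3..258 of a
    # llama vocab are the 256 raw byte tokens, hence the <0xNN> renaming, and
    # b'\xe2\x96\x81' is U+2581, SentencePiece's word-boundary marker that
    # stands in for a space.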

    def add_tensors(self, gguf_writer):
        tensor_map = self.name_map
        data = self.data
        print(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            assert mapped_name is not None, f'Bad name {name}'
            tempdims = list(tensor.dims[:])
            if len(tempdims) > 1:
                tempdims[0], tempdims[1] = tempdims[1], tempdims[0]
            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
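
# Note on the dim swap above: GGML records dims in the reverse order of the
# row-major shape GGUFWriter.add_tensor() expects via raw_shape, so the first
# two dims are exchanged; the quantized payload itself is copied through
# byte-for-byte (raw_dtype) with no re-quantization.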

def handle_metadata(cfg, hp):
    import convert
    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
    hf_config_path   = cfg.model_metadata_dir / "config.json"
    orig_config_path = cfg.model_metadata_dir / "params.json"
    # We pass a fake model here. "original" mode will check the shapes of some
    # tensors if information is missing in the .json file: other than that, the
    # model data isn't used so this should be safe (at least for now).
    fakemodel = {
        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
    }
    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
    if hf_config_path.exists():
        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
    elif orig_config_path.exists():
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
    # FIXME: Respect cfg.vocab_dir?
    svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return (params, vocab, svocab)

def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
    parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
    parser.add_argument('--output', '-o', type = Path, required = True, help = 'Output GGUF filename')
    parser.add_argument('--name', help = 'Set model name')
    parser.add_argument('--desc', help = 'Set model description')
    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
    parser.add_argument('--context-length', '-c', type = int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
    parser.add_argument('--model-metadata-dir', '-m', type = Path, help = 'Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument('--vocab-dir', type = Path, help = 'directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir')
    parser.add_argument('--vocabtype', choices = ['spm', 'bpe'], default = 'spm', help = 'vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)')
    return parser.parse_args()
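
# A typical invocation, with hypothetical file names:
#
#   python convert-llama-ggmlv3-to-gguf.py --input llama-7b.ggmlv3.q4_0.bin \
#       --output llama-7b.q4_0.gguf --eps 1e-6
#
# For a LLaMA2 70B model, add --gqa 8 and use --eps 1e-5, per the option help
# above.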

def main():
    cfg = handle_args()
    print(f'* Using config: {cfg}')
    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLV3Model()
    print('* Scanning GGML input file')
    offset = model.load(data, 0)
    print(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
    special_vocab = None
    if cfg.model_metadata_dir is not None:
        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
        print(f'* Overriding params: {params_override}')
        print(f'* Overriding vocab: {vocab_override}')
        print(f'* Special vocab: {special_vocab}')
    else:
        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
    converter.save()
    print(f'* Successful completion. Output saved to: {cfg.output}')


if __name__ == '__main__':
    main()