convert-llama-ggml-to-gguf.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import os
import struct
import sys
from enum import IntEnum
from pathlib import Path

import numpy as np

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
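

# Legacy llama.cpp container formats, in the order they appeared: bare GGML,
# versioned GGMF, and GGJT (which added alignment so tensor data can be mmap'd).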
class GGMLFormat(IntEnum):
    GGML = 0
    GGMF = 1
    GGJT = 2


class GGMLFType(IntEnum):
    ALL_F32              = 0
    MOSTLY_F16           = 1
    MOSTLY_Q4_0          = 2
    MOSTLY_Q4_1          = 3
    MOSTLY_Q4_1_SOME_F16 = 4
    MOSTLY_Q8_0          = 7
    MOSTLY_Q5_0          = 8
    MOSTLY_Q5_1          = 9
    MOSTLY_Q2_K          = 10
    MOSTLY_Q3_K_S        = 11
    MOSTLY_Q3_K_M        = 12
    MOSTLY_Q3_K_L        = 13
    MOSTLY_Q4_K_S        = 14
    MOSTLY_Q4_K_M        = 15
    MOSTLY_Q5_K_S        = 16
    MOSTLY_Q5_K_M        = 17
    MOSTLY_Q6_K          = 18


class Hyperparameters:
    def __init__(self):
        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
        self.n_layer = self.n_rot = self.n_ff = 0
        self.ftype = GGMLFType.ALL_F32

    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
        ff_tensor = model.tensors[ff_tensor_idx]
        self.n_ff = ff_tensor.dims[1]
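
    # The legacy header packs seven little-endian uint32 fields:
    # n_vocab, n_embd, n_mult, n_head, n_layer, n_rot and ftype.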
    def load(self, data, offset):
        (
            self.n_vocab,
            self.n_embd,
            self.n_mult,
            self.n_head,
            self.n_layer,
            self.n_rot,
            ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
        try:
            self.ftype = GGMLFType(ftype)
        except ValueError:
            raise ValueError(f'Invalid ftype {ftype}')
        return 4 * 7

    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'


class Vocab:
    def __init__(self, load_scores = True):
        self.items = []
        self.load_scores = load_scores
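
    # Each vocab entry is a uint32 byte length, the raw token bytes, and (for
    # GGMF/GGJT files) a float32 score; bare GGML files carry no scores.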
    def load(self, data, offset, n_vocab):
        orig_offset = offset
        for _ in range(n_vocab):
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
            item_text = bytes(data[offset:offset + itemlen])
            offset += itemlen
            if self.load_scores:
                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
                offset += 4
            else:
                item_score = 0.0
            self.items.append((item_text, item_score))
        return offset - orig_offset


class Tensor:
    def __init__(self, use_padding = True):
        self.name = None
        self.dims: tuple[int, ...] = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = np.int64(0)
        self.use_padding = use_padding
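
    # A tensor record is three uint32s (n_dims, name_len, dtype), the dims,
    # the name bytes, optional padding to a 32-byte boundary (GGJT only), and
    # then the data itself: tysize bytes per block of blksize elements.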
    def load(self, data, offset):
        orig_offset = offset
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = gguf.GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
        self.dtype = dtype
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset


class GGMLModel:
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
        self.tensor_map = {}
        self.tensors = []
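
    # The magic constants 'ggml', 'ggmf' and 'ggjt' are stored as little-endian
    # uint32s, so their bytes appear reversed ('lmgg', 'fmgg', 'tjgg') when
    # sliced from the file.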
    def validate_header(self, data, offset):
        magic = bytes(data[offset:offset + 4])
        if magic == b'GGUF':
            raise ValueError('File is already in GGUF format.')
        if magic == b'lmgg':
            self.file_format = GGMLFormat.GGML
            self.format_version = 1
            return 4
        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
        if magic == b'fmgg':
            if version != 1:
                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
            self.file_format = GGMLFormat.GGMF
            self.format_version = version
            return 8
        if magic == b'tjgg':
            if version < 1 or version > 3:
                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
            self.file_format = GGMLFormat.GGJT
            self.format_version = version
            return 8
        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")

    def validate_conversion(self, ftype):
        err = ''
        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
        if len(err) > 0:
            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
        self.validate_conversion(hp.ftype)
        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors: list[Tensor] = []
        tensor_map = {}
        while offset < len(data):
            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
        self.hyperparameters = hp
        self.vocab = vocab
        self.tensors = tensors
        self.tensor_map = tensor_map
        hp.set_n_ff(self)
        return offset


class GGMLToGGUF:
    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
        hp = ggml_model.hyperparameters
        self.model = ggml_model
        self.data = data
        self.cfg = cfg
        self.params_override = params_override
        self.vocab_override = vocab_override
        self.special_vocab = special_vocab
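        # Without metadata overrides, derive the KV head count from the GQA
        # factor by searching for x with n_head / x == gqa
        # (i.e. n_head_kv = n_head / gqa).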
        if params_override is not None:
            n_kv_head = params_override.n_head_kv
        else:
            if cfg.gqa == 1:
                n_kv_head = hp.n_head
            else:
                gqa = float(cfg.gqa)
                n_kv_head = None
                for x in range(1, 256):
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)

    def save(self):
        print('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(
            self.cfg.output,
            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
            use_temp_file = False)
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
        print("    gguf: write header")
        gguf_writer.write_header_to_file()
        print("    gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        print("    gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
        if cfg.desc is not None:
            desc = cfg.desc
        else:
            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
        print('* Adding model parameters and KV items')
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
        gguf_writer.add_file_type(int(hp.ftype))
        if self.params_override is not None:
            po = self.params_override
            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
            gguf_writer.add_context_length      (po.n_ctx)
            gguf_writer.add_embedding_length    (po.n_embd)
            gguf_writer.add_block_count         (po.n_layer)
            gguf_writer.add_feed_forward_length (po.n_ff)
            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
            gguf_writer.add_head_count          (po.n_head)
            gguf_writer.add_head_count_kv       (po.n_head_kv)
            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
            return
        gguf_writer.add_context_length(cfg.context_length)
        gguf_writer.add_embedding_length(hp.n_embd)
        gguf_writer.add_block_count(hp.n_layer)
        gguf_writer.add_feed_forward_length(hp.n_ff)
        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
        gguf_writer.add_head_count(hp.n_head)
        gguf_writer.add_head_count_kv(self.n_kv_head)
        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))

    def add_vocab(self, gguf_writer):
        hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
        tokens = []
        scores = []
        toktypes = []
        if self.vocab_override is not None:
            vo = self.vocab_override
            print('* Adding vocab item(s)')
            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
                toktypes.append(ttype)
            assert len(tokens) == hp.n_vocab, \
                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
            gguf_writer.add_token_list(tokens)
            gguf_writer.add_token_scores(scores)
            if len(toktypes) > 0:
                gguf_writer.add_token_types(toktypes)
            return
        print(f'* Adding {hp.n_vocab} vocab item(s)')
        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
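        # Token types follow the GGUF enum: 1 = normal, 2 = unknown,
        # 3 = control, 6 = byte.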
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
            tt = 1  # Normal
            # Special handling for UNK, BOS, EOS tokens.
            if tokid <= 2:
                if tokid == 0:
                    vbytes = b'<unk>'
                    tt = 2
                elif tokid == 1:
                    vbytes = b'<s>'
                    tt = 3
                else:
                    vbytes = b'</s>'
                    tt = 3
            elif len(vbytes) == 0:
                tt = 3  # Control
            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
                tt = 6  # Byte
            else:
                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
            toktypes.append(tt)
            tokens.append(vbytes)
            scores.append(vscore)
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        gguf_writer.add_token_types(toktypes)
        gguf_writer.add_unk_token_id(0)
        gguf_writer.add_bos_token_id(1)
        gguf_writer.add_eos_token_id(2)

    def add_tensors(self, gguf_writer):
        tensor_map = self.name_map
        data = self.data
        print(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            assert mapped_name is not None, f'Bad name {name}'
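            # GGML lists the fastest-varying dimension first (ne order); the
            # GGUF writer expects a numpy-style shape, so swap the first two.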
            tempdims = list(tensor.dims[:])
            if len(tempdims) > 1:
                tempdims[0], tempdims[1] = tempdims[1], tempdims[0]
            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
            gguf_writer.add_tensor(
                mapped_name,
                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
                raw_shape = tempdims,
                raw_dtype = tensor.dtype)


def handle_metadata(cfg, hp):
    import convert
    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
    hf_config_path   = cfg.model_metadata_dir / "config.json"
    orig_config_path = cfg.model_metadata_dir / "params.json"
    # We pass a fake model here. "original" mode will check the shapes of some
    # tensors if information is missing in the .json file: other than that, the
    # model data isn't used so this should be safe (at least for now).
    fakemodel = {
        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
    }
    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
    if hf_config_path.exists():
        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
    elif orig_config_path.exists():
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
    vocab_factory = convert.VocabFactory(vocab_path)
    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return params, vocab, special_vocab


def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
    parser.add_argument('--input', '-i', type = Path, required = True,
                        help = 'Input GGMLv3 filename')
    parser.add_argument('--output', '-o', type = Path, required = True,
                        help = 'Output GGUF filename')
    parser.add_argument('--name',
                        help = 'Set model name')
    parser.add_argument('--desc',
                        help = 'Set model description')
    parser.add_argument('--gqa', type = int, default = 1,
                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
    parser.add_argument('--eps', default = '5.0e-06',
                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
    parser.add_argument('--context-length', '-c', type = int, default = 2048,
                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
    parser.add_argument('--model-metadata-dir', '-m', type = Path,
                        help = 'Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument("--vocab-dir", type = Path,
                        help = "directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
    parser.add_argument("--vocabtype", choices = ["spm", "bpe"], default = "spm",
                        help = "vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
    return parser.parse_args()


def main():
    cfg = handle_args()
    print(f'* Using config: {cfg}')
    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLModel()
    print('* Scanning GGML input file')
    offset = model.load(data, 0)  # noqa
    print(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
    special_vocab = None
    if cfg.model_metadata_dir is not None:
        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
        print(f'* Overriding params: {params_override}')
        print(f'* Overriding vocab: {vocab_override}')
        print(f'* Special vocab: {special_vocab}')
    else:
        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
        if model.file_format == GGMLFormat.GGML:
            print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
    converter = GGMLToGGUF(
        model, data, cfg,
        params_override = params_override,
        vocab_override = vocab_override,
        special_vocab = special_vocab
    )
    converter.save()
    print(f'* Successful completion. Output saved to: {cfg.output}')


if __name__ == '__main__':
    main()
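
# A minimal sketch of an invocation (paths and filenames are illustrative):
#   python convert-llama-ggml-to-gguf.py \
#       --input models/llama-2-7b.ggmlv3.q4_0.bin \
#       --output models/llama-2-7b.q4_0.gguf \
#       --eps 1e-5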