convert_llama_ggml_to_gguf.py

#!/usr/bin/env python3
from __future__ import annotations

import logging
import argparse
import os
import struct
import sys
from enum import IntEnum
from pathlib import Path

import numpy as np

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

logger = logging.getLogger("ggml-to-gguf")


class GGMLFormat(IntEnum):
    GGML = 0
    GGMF = 1
    GGJT = 2
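

# Note: ftype values 5 and 6 are intentionally absent from the enum below;
# they belonged to the Q4_2/Q4_3 quantizations, which llama.cpp removed long
# before this converter existed (an explanatory note based on llama.cpp history).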
class GGMLFType(IntEnum):
    ALL_F32              = 0
    MOSTLY_F16           = 1
    MOSTLY_Q4_0          = 2
    MOSTLY_Q4_1          = 3
    MOSTLY_Q4_1_SOME_F16 = 4
    MOSTLY_Q8_0          = 7
    MOSTLY_Q5_0          = 8
    MOSTLY_Q5_1          = 9
    MOSTLY_Q2_K          = 10
    MOSTLY_Q3_K_S        = 11
    MOSTLY_Q3_K_M        = 12
    MOSTLY_Q3_K_L        = 13
    MOSTLY_Q4_K_S        = 14
    MOSTLY_Q4_K_M        = 15
    MOSTLY_Q5_K_S        = 16
    MOSTLY_Q5_K_M        = 17
    MOSTLY_Q6_K          = 18


class Hyperparameters:
    def __init__(self):
        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
        self.n_layer = self.n_rot = self.n_ff = 0
        self.ftype = GGMLFType.ALL_F32
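
    # The legacy GGML header does not store n_ff, so it is recovered from the
    # shape of the first feed-forward tensor once the tensor list has been scanned.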
    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
        ff_tensor = model.tensors[ff_tensor_idx]
        self.n_ff = ff_tensor.dims[1]
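
    # The header is seven consecutive little-endian uint32 values. For
    # reference, a matching header for a hypothetical tiny model could be
    # built with (values purely illustrative):
    #   struct.pack('<7I', 32000, 4096, 256, 32, 32, 128, int(GGMLFType.MOSTLY_F16))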
    def load(self, data, offset):
        (
            self.n_vocab,
            self.n_embd,
            self.n_mult,
            self.n_head,
            self.n_layer,
            self.n_rot,
            ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
        try:
            self.ftype = GGMLFType(ftype)
        except ValueError:
            raise ValueError(f'Invalid ftype {ftype}')
        return 4 * 7

    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
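

# Each vocab entry on disk is a little-endian uint32 byte length, followed by
# the token text, followed (in GGMF/GGJT files) by a float32 score. The oldest
# GGML format stores no scores, hence the load_scores flag.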
class Vocab:
    def __init__(self, load_scores = True):
        self.items = []
        self.load_scores = load_scores

    def load(self, data, offset, n_vocab):
        orig_offset = offset
        for _ in range(n_vocab):
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
            item_text = bytes(data[offset:offset + itemlen])
            offset += itemlen
            if self.load_scores:
                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
                offset += 4
            else:
                item_score = 0.0
            self.items.append((item_text, item_score))
        return offset - orig_offset
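

# A tensor record is three little-endian uint32 values (n_dims, name length,
# dtype), then n_dims uint32 dimensions, then the name bytes. GGJT files also
# pad the tensor data to a 32-byte boundary ((offset + 31) & ~31 rounds up to
# the next multiple of 32); earlier formats do not, hence use_padding. For
# block-quantized dtypes the payload size is n_elems * type_size / block_size.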
class Tensor:
    def __init__(self, use_padding = True):
        self.name = None
        self.dims: tuple[int, ...] = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = np.int64(0)
        self.use_padding = use_padding

    def load(self, data, offset):
        orig_offset = offset
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = gguf.GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
        self.dtype = dtype
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
        return offset - orig_offset


class GGMLModel:
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
        self.tensor_map = {}
        self.tensors = []
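
    # The magic is the first four bytes of the file. 'ggml'/'ggmf'/'ggjt' show
    # up reversed ('lmgg'/'fmgg'/'tjgg') because the magics are written to
    # disk as little-endian uint32 values.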
    def validate_header(self, data, offset):
        magic = bytes(data[offset:offset + 4])
        if magic == b'GGUF':
            raise ValueError('File is already in GGUF format.')
        if magic == b'lmgg':
            self.file_format = GGMLFormat.GGML
            self.format_version = 1
            return 4
        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
        if magic == b'fmgg':
            if version != 1:
                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
            self.file_format = GGMLFormat.GGMF
            self.format_version = version
            return 8
        if magic == b'tjgg':
            if version < 1 or version > 3:
                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
            self.file_format = GGMLFormat.GGJT
            self.format_version = version
            return 8
        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
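
    # Quantized block layouts changed in GGJT v2 and again in v3, so quantized
    # tensor data from older files cannot simply be reinterpreted; only
    # unquantized (F32/F16) older files are eligible for conversion.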
    def validate_conversion(self, ftype):
        err = ''
        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
        if len(err) > 0:
            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
        logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
        self.validate_conversion(hp.ftype)
        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors: list[Tensor] = []
        tensor_map = {}
        while offset < len(data):
            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
        self.hyperparameters = hp
        self.vocab = vocab
        self.tensors = tensors
        self.tensor_map = tensor_map
        hp.set_n_ff(self)
        return offset


class GGMLToGGUF:
    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
        hp = ggml_model.hyperparameters
        self.model = ggml_model
        self.data = data
        self.cfg = cfg
        self.params_override = params_override
        self.vocab_override = vocab_override
        self.special_vocab = special_vocab
        if params_override is not None:
            n_kv_head = params_override.n_head_kv
        else:
            if cfg.gqa == 1:
                n_kv_head = hp.n_head
            else:
                gqa = float(cfg.gqa)
                n_kv_head = None
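                # Brute-force search for the head-count divisor that produces
                # the requested GQA factor.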
                for x in range(1, 256):
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
                logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)

    def save(self):
        logger.info('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(
            self.cfg.output,
            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
            use_temp_file = False)
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
        logger.info(" gguf: write header")
        gguf_writer.write_header_to_file()
        logger.info(" gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        logger.info(" gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
        if cfg.desc is not None:
            desc = cfg.desc
        else:
            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
        logger.info('* Adding model parameters and KV items')
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
        gguf_writer.add_file_type(int(hp.ftype))
        if self.params_override is not None:
            po = self.params_override
            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
            gguf_writer.add_context_length      (po.n_ctx)
            gguf_writer.add_embedding_length    (po.n_embd)
            gguf_writer.add_block_count         (po.n_layer)
            gguf_writer.add_feed_forward_length (po.n_ff)
            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
            gguf_writer.add_head_count          (po.n_head)
            gguf_writer.add_head_count_kv       (po.n_head_kv)
            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
            return
        gguf_writer.add_context_length(cfg.context_length)
        gguf_writer.add_embedding_length(hp.n_embd)
        gguf_writer.add_block_count(hp.n_layer)
        gguf_writer.add_feed_forward_length(hp.n_ff)
        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
        gguf_writer.add_head_count(hp.n_head)
        gguf_writer.add_head_count_kv(self.n_kv_head)
        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))

    def add_vocab(self, gguf_writer):
        hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
        gguf_writer.add_tokenizer_pre('default')
        tokens = []
        scores = []
        toktypes = []
        if self.vocab_override is not None:
            vo = self.vocab_override
            logger.info('* Adding vocab item(s)')
            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
                toktypes.append(ttype)
            assert len(tokens) == hp.n_vocab, \
                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
            gguf_writer.add_token_list(tokens)
            gguf_writer.add_token_scores(scores)
            if len(toktypes) > 0:
                gguf_writer.add_token_types(toktypes)
            return
        logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
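        # Token type codes used below follow gguf's TokenType numbering:
        # 1 = normal, 2 = unknown, 3 = control, 6 = byte. IDs 0-2 are pinned
        # to <unk>/<s>/</s>, IDs 3-258 are the 256 raw-byte tokens rendered as
        # <0xXX>, and spaces in ordinary tokens become U+2581, the
        # sentencepiece word-boundary marker.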
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
            tt = 1 # Normal
            # Special handling for UNK, BOS, EOS tokens.
            if tokid <= 2:
                if tokid == 0:
                    vbytes = b'<unk>'
                    tt = 2
                elif tokid == 1:
                    vbytes = b'<s>'
                    tt = 3
                else:
                    vbytes = b'</s>'
                    tt = 3
            elif len(vbytes) == 0:
                tt = 3 # Control
            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
                tt = 6 # Byte
            else:
                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
            toktypes.append(tt)
            tokens.append(vbytes)
            scores.append(vscore)
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        gguf_writer.add_token_types(toktypes)
        gguf_writer.add_unk_token_id(0)
        gguf_writer.add_bos_token_id(1)
        gguf_writer.add_eos_token_id(2)

    def add_tensors(self, gguf_writer):
        tensor_map = self.name_map
        data = self.data
        logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            assert mapped_name is not None, f'Bad name {name}'
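            # GGML stores dimensions fastest-varying first; raw_shape expects
            # the numpy-style (slowest first) order, so swap the first two.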
            tempdims = list(tensor.dims[:])
            if len(tempdims) > 1:
                tempdims[0], tempdims[1] = tempdims[1], tempdims[0]
            gguf_writer.add_tensor(
                mapped_name,
                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
                raw_shape = tempdims,
                raw_dtype = tensor.dtype)


def handle_metadata(cfg, hp):
    import convert
    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
    hf_config_path   = cfg.model_metadata_dir / "config.json"
    orig_config_path = cfg.model_metadata_dir / "params.json"
    # We pass a fake model here. "original" mode will check the shapes of some
    # tensors if information is missing in the .json file: other than that, the
    # model data isn't used so this should be safe (at least for now).
    fakemodel = {
        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
    }
    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
    if hf_config_path.exists():
        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
    elif orig_config_path.exists():
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
    vocab_factory = convert.VocabFactory(vocab_path)
    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return params, vocab, special_vocab


def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
    parser.add_argument('--input', '-i', type = Path, required = True,
                        help = 'Input GGMLv3 filename')
    parser.add_argument('--output', '-o', type = Path, required = True,
                        help = 'Output GGUF filename')
    parser.add_argument('--name',
                        help = 'Set model name')
    parser.add_argument('--desc',
                        help = 'Set model description')
    parser.add_argument('--gqa', type = int, default = 1,
                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
    parser.add_argument('--eps', default = '5.0e-06',
                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
    parser.add_argument('--context-length', '-c', type = int, default = 2048,
                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
    parser.add_argument('--model-metadata-dir', '-m', type = Path,
                        help = 'Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument("--vocab-dir", type = Path,
                        help = "directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
    parser.add_argument("--vocabtype", default = "spm,hfft",
                        help = "vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
    parser.add_argument("--verbose", action = "store_true", help = "increase output verbosity")
    return parser.parse_args()
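

# Example invocation (file names are illustrative):
#   python convert_llama_ggml_to_gguf.py --input llama-7b.ggmlv3.q5_1.bin \
#       --output llama-7b-q5_1.gguf --eps 1e-5 --context-length 4096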
def main():
    cfg = handle_args()
    logging.basicConfig(level = logging.DEBUG if cfg.verbose else logging.INFO)
    logger.info(f'* Using config: {cfg}')
    logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
        logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLModel()
    logger.info('* Scanning GGML input file')
    offset = model.load(data, 0)  # noqa
    logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
    special_vocab = None
    if cfg.model_metadata_dir is not None:
        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
        logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
        logger.info(f'* Overriding params: {params_override}')
        logger.info(f'* Overriding vocab: {vocab_override}')
        logger.info(f'* Special vocab: {special_vocab}')
    else:
        logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
        if model.file_format == GGMLFormat.GGML:
            logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
    converter = GGMLToGGUF(
        model, data, cfg,
        params_override = params_override,
        vocab_override = vocab_override,
        special_vocab = special_vocab
    )
    converter.save()
    logger.info(f'* Successful completion. Output saved to: {cfg.output}')


if __name__ == '__main__':
    main()