convert_llama_ggml_to_gguf.py

#!/usr/bin/env python3
from __future__ import annotations

import logging
import argparse
import os
import struct
import sys
from enum import IntEnum
from pathlib import Path

import numpy as np

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

logger = logging.getLogger("ggml-to-gguf")
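

# The three legacy llama.cpp container formats this script understands, oldest
# first: bare GGML (magic 'lmgg'), GGMF ('fmgg', adds a version field and vocab
# scores) and GGJT ('tjgg', adds 32-byte alignment of tensor data).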
class GGMLFormat(IntEnum):
    GGML = 0
    GGMF = 1
    GGJT = 2
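

# File-level quantization type stored in the legacy header. The values mirror
# llama.cpp's LLAMA_FTYPE enum; the gap at 5-6 corresponds to quantization
# types (Q4_2/Q4_3) that llama.cpp later removed.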
class GGMLFType(IntEnum):
    ALL_F32              = 0
    MOSTLY_F16           = 1
    MOSTLY_Q4_0          = 2
    MOSTLY_Q4_1          = 3
    MOSTLY_Q4_1_SOME_F16 = 4
    MOSTLY_Q8_0          = 7
    MOSTLY_Q5_0          = 8
    MOSTLY_Q5_1          = 9
    MOSTLY_Q2_K          = 10
    MOSTLY_Q3_K_S        = 11
    MOSTLY_Q3_K_M        = 12
    MOSTLY_Q3_K_L        = 13
    MOSTLY_Q4_K_S        = 14
    MOSTLY_Q4_K_M        = 15
    MOSTLY_Q5_K_S        = 16
    MOSTLY_Q5_K_M        = 17
    MOSTLY_Q6_K          = 18
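

# The legacy header: seven little-endian uint32 values. n_ff is not stored in
# the header and is recovered later from the shape of the first feed-forward
# tensor (see set_n_ff).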
class Hyperparameters:
    def __init__(self):
        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
        self.n_layer = self.n_rot = self.n_ff = 0
        self.ftype = GGMLFType.ALL_F32

    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
        ff_tensor = model.tensors[ff_tensor_idx]
        self.n_ff = ff_tensor.dims[1]

    def load(self, data, offset):
        (
            self.n_vocab,
            self.n_embd,
            self.n_mult,
            self.n_head,
            self.n_layer,
            self.n_rot,
            ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
        try:
            self.ftype = GGMLFType(ftype)
        except ValueError:
            raise ValueError(f'Invalid ftype {ftype}')
        return 4 * 7

    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
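

# Legacy vocab: n_vocab records of a uint32 byte length followed by the token
# text, plus a float32 score everywhere except bare GGML files, which predate
# vocab scores.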
class Vocab:
    def __init__(self, load_scores = True):
        self.items = []
        self.load_scores = load_scores

    def load(self, data, offset, n_vocab):
        orig_offset = offset
        for _ in range(n_vocab):
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
            item_text = bytes(data[offset:offset + itemlen])
            offset += itemlen
            if self.load_scores:
                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
                offset += 4
            else:
                item_score = 0.0
            self.items.append((item_text, item_score))
        return offset - orig_offset
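

# A single tensor record: a (n_dims, name_len, dtype) header, the dims, the
# name, then, for GGJT files, padding to a 32-byte boundary before the raw
# data. Only the data's offset and byte length are recorded; the bytes are
# copied verbatim at write time.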
class Tensor:
    def __init__(self, use_padding = True):
        self.name = None
        self.dims: tuple[int, ...] = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = np.int64(0)
        self.use_padding = use_padding

    def load(self, data, offset):
        orig_offset = offset
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = gguf.GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
        self.dtype = gguf.GGMLQuantizationType(dtype)
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
        return offset - orig_offset
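

# Parses an entire legacy file: magic/version, hyperparameters, vocab, then
# tensor records until EOF. Tensor data stays in the memory-mapped input;
# only offsets into it are kept.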
class GGMLModel:

    file_format: GGMLFormat
    format_version: int

    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
        self.tensor_map = {}
        self.tensors = []

    def validate_header(self, data, offset):
        magic = bytes(data[offset:offset + 4])
        if magic == b'GGUF':
            raise ValueError('File is already in GGUF format.')
        if magic == b'lmgg':
            self.file_format = GGMLFormat.GGML
            self.format_version = 1
            return 4
        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
        if magic == b'fmgg':
            if version != 1:
                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
            self.file_format = GGMLFormat.GGMF
            self.format_version = version
            return 8
        if magic == b'tjgg':
            if version < 1 or version > 3:
                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
            self.file_format = GGMLFormat.GGJT
            self.format_version = version
            return 8
        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")

    def validate_conversion(self, ftype):
        err = ''
        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
        if len(err) > 0:
            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
        logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
        self.validate_conversion(hp.ftype)
        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors: list[Tensor] = []
        tensor_map = {}
        while offset < len(data):
            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
        self.hyperparameters = hp
        self.vocab = vocab
        self.tensors = tensors
        self.tensor_map = tensor_map
        hp.set_n_ff(self)
        return offset
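

# Drives the conversion: determines n_kv_head (from metadata overrides or the
# --gqa factor), maps legacy tensor names to GGUF names, and writes everything
# out through gguf.GGUFWriter.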
class GGMLToGGUF:
    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
        hp = ggml_model.hyperparameters
        self.model = ggml_model
        self.data = data
        self.cfg = cfg
        self.params_override = params_override
        self.vocab_override = vocab_override
        self.special_vocab = special_vocab
        if params_override is not None:
            n_kv_head = params_override.n_head_kv
        else:
            if cfg.gqa == 1:
                n_kv_head = hp.n_head
            else:
                gqa = float(cfg.gqa)
                n_kv_head = None
                for x in range(1, 256):
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
                logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
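
    # Write the GGUF output. All KV metadata and tensor info is queued on the
    # writer first; the header, KV data and tensor data are then flushed to
    # disk in that order.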
    def save(self):
        logger.info('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(
            self.cfg.output,
            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
            use_temp_file = False)
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
        logger.info("    gguf: write header")
        gguf_writer.write_header_to_file()
        logger.info("    gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        logger.info("    gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
        if cfg.desc is not None:
            desc = cfg.desc
        else:
            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
        logger.info('* Adding model parameters and KV items')
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
        gguf_writer.add_file_type(int(hp.ftype))
        if self.params_override is not None:
            po = self.params_override
            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
            gguf_writer.add_context_length      (po.n_ctx)
            gguf_writer.add_embedding_length    (po.n_embd)
            gguf_writer.add_block_count         (po.n_layer)
            gguf_writer.add_feed_forward_length (po.n_ff)
            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
            gguf_writer.add_head_count          (po.n_head)
            gguf_writer.add_head_count_kv       (po.n_head_kv)
            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
            return
        gguf_writer.add_context_length(cfg.context_length)
        gguf_writer.add_embedding_length(hp.n_embd)
        gguf_writer.add_block_count(hp.n_layer)
        gguf_writer.add_feed_forward_length(hp.n_ff)
        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
        gguf_writer.add_head_count(hp.n_head)
        gguf_writer.add_head_count_kv(self.n_kv_head)
        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))

    def add_vocab(self, gguf_writer):
        hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
        gguf_writer.add_tokenizer_pre('default')
        tokens = []
        scores = []
        toktypes = []
        if self.vocab_override is not None:
            vo = self.vocab_override
            logger.info('* Adding vocab item(s)')
            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
                toktypes.append(ttype)
            assert len(tokens) == hp.n_vocab, \
                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
            gguf_writer.add_token_list(tokens)
            gguf_writer.add_token_scores(scores)
            if len(toktypes) > 0:
                gguf_writer.add_token_types(toktypes)
            return
        logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
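        # Token types follow gguf.TokenType: 1 = normal, 2 = unknown,
        # 3 = control, 6 = byte. Token ids 0-2 are assumed to be <unk>,
        # <s> and </s>; single-byte tokens 3-258 are rewritten as <0xXX>
        # markers, and spaces elsewhere become U+2581, SentencePiece style.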
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
            tt = 1 # Normal
            # Special handling for UNK, BOS, EOS tokens.
            if tokid <= 2:
                if tokid == 0:
                    vbytes = b'<unk>'
                    tt = 2
                elif tokid == 1:
                    vbytes = b'<s>'
                    tt = 3
                else:
                    vbytes = b'</s>'
                    tt = 3
            elif len(vbytes) == 0:
                tt = 3 # Control
            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
                tt = 6 # Byte
            else:
                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
            toktypes.append(tt)
            tokens.append(vbytes)
            scores.append(vscore)
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        gguf_writer.add_token_types(toktypes)
        gguf_writer.add_unk_token_id(0)
        gguf_writer.add_bos_token_id(1)
        gguf_writer.add_eos_token_id(2)

    def add_tensors(self, gguf_writer):
        tensor_map = self.name_map
        data = self.data
        logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            assert mapped_name is not None, f'Bad name {name}'
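            # GGML stores dimensions in the reverse of GGUF's expected order,
            # so swap the first two dims; LLaMA tensors are at most 2-D, so
            # this is sufficient.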
            tempdims = list(tensor.dims[:])
            if len(tempdims) > 1:
                temp = tempdims[1]
                tempdims[1] = tempdims[0]
                tempdims[0] = temp
            gguf_writer.add_tensor(
                mapped_name,
                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
                raw_shape = tempdims,
                raw_dtype = tensor.dtype)
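

# Pull accurate params and vocab from a HuggingFace config.json or an original
# LLaMA params.json via the legacy convert script, instead of relying on the
# guesses above. A fake model dict is passed in because only the shapes of two
# tensors are ever consulted.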
def handle_metadata(cfg, hp):
    import examples.convert_legacy_llama as convert

    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
    hf_config_path   = cfg.model_metadata_dir / "config.json"
    orig_config_path = cfg.model_metadata_dir / "params.json"
    # We pass a fake model here. "original" mode will check the shapes of some
    # tensors if information is missing in the .json file: other than that, the
    # model data isn't used so this should be safe (at least for now).
    fakemodel = {
        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
    }
    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
    if hf_config_path.exists():
        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
    elif orig_config_path.exists():
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
    vocab_factory = convert.VocabFactory(vocab_path)
    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return params, vocab, special_vocab
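

# Command-line interface. Note that --gqa, --eps and --context-length are
# ignored when --model-metadata-dir supplies authoritative values.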
def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
    parser.add_argument('--input', '-i', type = Path, required = True,
                        help = 'Input GGMLv3 filename')
    parser.add_argument('--output', '-o', type = Path, required = True,
                        help = 'Output GGUF filename')
    parser.add_argument('--name',
                        help = 'Set model name')
    parser.add_argument('--desc',
                        help = 'Set model description')
    parser.add_argument('--gqa', type = int, default = 1,
                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
    parser.add_argument('--eps', default = '5.0e-06',
                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
    parser.add_argument('--context-length', '-c', type = int, default = 2048,
                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
    parser.add_argument('--model-metadata-dir', '-m', type = Path,
                        help = 'Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument("--vocab-dir", type = Path,
                        help = "directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
    parser.add_argument("--vocabtype", default = "spm,hfft",
                        help = "vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
    parser.add_argument("--verbose", action = "store_true", help = "increase output verbosity")
    return parser.parse_args()
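

# Entry point: memory-map the input file, scan it as a legacy GGML model,
# optionally load metadata overrides, then convert and save the GGUF output.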
def main():
    cfg = handle_args()
    logging.basicConfig(level = logging.DEBUG if cfg.verbose else logging.INFO)
    logger.info(f'* Using config: {cfg}')
    logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
        logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLModel()
    logger.info('* Scanning GGML input file')
    offset = model.load(data, 0)  # noqa
    logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
    special_vocab = None
    if cfg.model_metadata_dir is not None:
        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
        logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
        logger.info(f'* Overriding params: {params_override}')
        logger.info(f'* Overriding vocab: {vocab_override}')
        logger.info(f'* Special vocab: {special_vocab}')
    else:
        logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
        if model.file_format == GGMLFormat.GGML:
            logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
    converter = GGMLToGGUF(
        model, data, cfg,
        params_override = params_override,
        vocab_override = vocab_override,
        special_vocab = special_vocab
    )
    converter.save()
    logger.info(f'* Successful completion. Output saved to: {cfg.output}')


if __name__ == '__main__':
    main()