convert-hf-to-gguf.py
#!/usr/bin/env python3

from __future__ import annotations

import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast

import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
    for k in keys:
        if k in d:
            return d[k]
    print(f"Could not find any of {keys}")
    sys.exit()
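

# Illustrative usage (hypothetical hparams dict): configs expose the layer count under
# different keys depending on the model family, so a call such as
#   get_key_opts(hparams, ["num_hidden_layers", "n_layer"])
# returns whichever of the two keys is present, as done in Phi2Model below.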


###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6
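    # NOTE: these values are assumed to mirror the token-type numbering used by
    # gguf/llama.cpp (gguf.TokenType uses the same NORMAL..BYTE values).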


class Model:
    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)

    def set_vocab(self):
        self._set_vocab_gpt2()

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for part_name in self.part_names:
            print(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                for name in model_part.keys():
                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
                    yield name, data

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams.get(
            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
        ))
        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
            self.gguf_writer.add_context_length(n_ctx)
        if (n_embd := self.hparams.get("hidden_size")) is not None:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

    def write(self):
        self.write_tensors()
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file()
        self.gguf_writer.close()

    def write_vocab(self):
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()

    @staticmethod
    def count_model_parts(dir_model: Path, prefix: str) -> int:
        num_parts = 0
        for filename in os.listdir(dir_model):
            if filename.endswith(prefix):
                num_parts += 1

        return num_parts
    @staticmethod
    def load_hparams(dir_model):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def from_model_architecture(model_architecture):
        if model_architecture == "GPTNeoXForCausalLM":
            return GPTNeoXModel
        if model_architecture == "BloomForCausalLM":
            return BloomModel
        if model_architecture == "MPTForCausalLM":
            return MPTModel
        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return BaichuanModel
        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
            return FalconModel
        if model_architecture == "GPTBigCodeForCausalLM":
            return StarCoderModel
        if model_architecture == "GPTRefactForCausalLM":
            return RefactModel
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
        if model_architecture == "Qwen2ForCausalLM":
            return Model
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "GPT2LMHeadModel":
            return GPT2Model
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
        if model_architecture == "PlamoForCausalLM":
            return PlamoModel
        if model_architecture == "CodeShellForCausalLM":
            return CodeShellModel
        return Model

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

    def _get_part_names(self):
        if self.is_safetensors:
            if self.num_parts == 1:  # there's only one .safetensors file
                return ("model.safetensors",)
            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))

        if self.num_parts == 1:  # there's only one .bin file
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
        arch = self.hparams["architectures"][0]
        if arch == "GPTNeoXForCausalLM":
            return gguf.MODEL_ARCH.GPTNEOX
        if arch == "BloomForCausalLM":
            return gguf.MODEL_ARCH.BLOOM
        if arch == "MPTForCausalLM":
            return gguf.MODEL_ARCH.MPT
        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
        if arch in ("FalconForCausalLM", "RWForCausalLM"):
            return gguf.MODEL_ARCH.FALCON
        if arch == "GPTBigCodeForCausalLM":
            return gguf.MODEL_ARCH.STARCODER
        if arch == "GPTRefactForCausalLM":
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
        if arch == "Qwen2ForCausalLM":
            return gguf.MODEL_ARCH.QWEN2
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "GPT2LMHeadModel":
            return gguf.MODEL_ARCH.GPT2
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
        if arch == "PlamoForCausalLM":
            return gguf.MODEL_ARCH.PLAMO
        if arch == "CodeShellForCausalLM":
            return gguf.MODEL_ARCH.CODESHELL

        raise NotImplementedError(f'Architecture "{arch}" not supported!')
    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode('utf-8')
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size

        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[QwenModel.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
        added_vocab = tokenizer.special_tokens
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        # only add special tokens when they were not already loaded from config.json
        if len(special_vocab.special_token_ids) == 0:
            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        # this one is usually not in config.json anyway
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
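

# A hypothetical sketch of how a new architecture could be wired in (illustrative only,
# "FooModel"/"FooForCausalLM" are made-up names): subclass Model, override
# set_gguf_parameters() (and set_vocab()/write_tensors() when the defaults do not fit),
# then register the HF architecture string in both from_model_architecture() and
# _get_model_architecture().
#
#   class FooModel(Model):
#       def set_gguf_parameters(self):
#           self.gguf_writer.add_name("Foo")
#           self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
#           self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
#           self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
#           self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])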


class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


class BloomModel(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams["n_layer"]
        tensors = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        has_lm_head = True
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

        for name, data_torch in tensors.items():
            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
                has_lm_head = False

            name = re.sub(r'transformer\.', '', name)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                # Map bloom-style qkv_linear to gpt-style qkv_linear
                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
                data = np.concatenate(
                    (
                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
                    (
                        qkv_bias[:, 0, :].reshape((n_embed,)),
                        qkv_bias[:, 1, :].reshape((n_embed,)),
                        qkv_bias[:, 2, :].reshape((n_embed,)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.bias")

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


class MPTModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
            self.gguf_writer.add_head_count_kv(kv_n_heads)
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
                if new_name is not None:
                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # note: MPT output is tied to (same as) wte in original model;
            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
            if new_name == "token_embd.weight":
                self.gguf_writer.add_tensor("output.weight", data)


class BaichuanModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: cannot find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

        for i in range(block_count):
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                print(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                    self._reverse_hf_part(w, 2)
                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def _reverse_hf_permute_part(
        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
    ) -> Tensor:
        r = weights.shape[0] // 3
        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)

    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
        r = weights.shape[0] // 3
        return weights[r * n_part:r * n_part + r, ...]


class FalconModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        self.gguf_writer.add_name("Falcon")
        self.gguf_writer.add_context_length(2048)  # not in config.json
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        head_dim = self.hparams["hidden_size"] // n_head
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
            if "query_key_value" in name:
                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class StarCoderModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("StarCoder")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)


class RefactModel(Model):
    def set_gguf_parameters(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("Refact")
        # Refact uses ALiBi, so this n_positions value comes from config.json (and may reflect the training setup)
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(ff_dim)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        n_head = self.hparams["n_head"]
        n_head_kv = 1
        head_dim = self.hparams["n_embd"] // n_head
        block_count = self.hparams["n_layer"]

        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        tensors = dict(self.get_tensors())
        for i in range(block_count):
            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
                del tensors[f"transformer.h.{i}.attn.kv.weight"]
            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
                del tensors[f"transformer.h.{i}.attn.q.weight"]
            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

        for name, data_torch in tensors.items():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class PersimmonModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])

        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
        # than the head size?
        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)

        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class StableLMModel(Model):
    def set_vocab(self):
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
            self._set_vocab_qwen()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)


class MixtralModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()


class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
        byte_encoder = bytes_to_unicode()
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

    @staticmethod
    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            assert min_idx is not None
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts
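
    # Illustrative example (not from the upstream script): with mergeable_ranks = {b"ab": 0},
    # QwenModel.bpe(mergeable_ranks, b"abc") merges the lowest-ranked adjacent pair first and
    # returns [b"ab", b"c"]. Passing max_rank equal to a token's own rank stops just before
    # that merge, recovering the two sub-parts that form the token, which is how
    # _set_vocab_qwen() reconstructs the merges list.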

    def set_vocab(self):
        self._set_vocab_qwen()

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])

    def write_tensors(self):
        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class GPT2Model(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
                continue

            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight")):
                data_torch = data_torch.transpose(1, 0)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # note: GPT2 output is tied to (same as) wte in original model
            if new_name == "token_embd.weight":
                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
                self.gguf_writer.add_tensor("output.weight", data)


class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])

        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])

        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(4 * n_embd)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)


class PlamoModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name("PLaMo")
        self.gguf_writer.add_context_length(4096)  # not in config.json
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"] is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(8, 5, 128, 5120)
        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def shuffle_attn_output_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(5120, 8, 5, 128)
        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if "self_attn.rotary_emb.inv_freq" in name:
                continue

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            # shuffle for broadcasting of gqa in ggml_mul_mat
            if new_name.endswith("attn_q.weight"):
                data_torch = self.shuffle_attn_q_weight(data_torch)
            elif new_name.endswith("attn_output.weight"):
                data_torch = self.shuffle_attn_output_weight(data_torch)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class CodeShellModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("CodeShell")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_rope_freq_base(10000.0)
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        tensors = dict(self.get_tensors())
        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
        for name, data_torch in tensors.items():
            # we don't need these
            if name.endswith(".attn.rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "transformer.wte.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


###### CONVERSION LOGIC ######

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a Hugging Face model to a GGML-compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--awq-path", type=Path, default=None,
        help="Path to scale awq cache file",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on a big-endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
    )

    return parser.parse_args()
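

# Example invocation (paths are illustrative):
#   python convert-hf-to-gguf.py ./models/my-hf-model --outtype f16 --outfile ./models/my-model-f16.gguf
# If --outfile is omitted, main() below writes ggml-model-<outtype>.gguf into the model directory.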


def main() -> None:
    args = parse_args()

    dir_model = args.model

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")

    if not dir_model.is_dir():
        print(f'Error: {args.model} is not a directory', file=sys.stderr)
        sys.exit(1)

    ftype_map = {
        "f32": gguf.GGMLQuantizationType.F32,
        "f16": gguf.GGMLQuantizationType.F16,
    }

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

    print(f"Loading model: {dir_model.name}")

    hparams = Model.load_hparams(dir_model)

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

        print("Set model parameters")
        model_instance.set_gguf_parameters()

        print("Set model tokenizer")
        model_instance.set_vocab()

        if args.vocab_only:
            print(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
            print(f"Exporting model to '{fname_out}'")
            model_instance.write()

        print(f"Model successfully exported to '{fname_out}'")


if __name__ == '__main__':
    main()