convert-hf-to-gguf.py

#!/usr/bin/env python3

from __future__ import annotations

import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional

import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
    for k in keys:
        if k in d:
            return d[k]
    print(f"Could not find any of {keys}")
    sys.exit()
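
# For example, get_key_opts(hparams, ["num_hidden_layers", "n_layer"]) returns
# hparams["num_hidden_layers"] if that key is present, otherwise hparams["n_layer"];
# if neither key exists, the script prints a message and exits.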

###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


class Model:
    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)

    def set_vocab(self):
        self._set_vocab_gpt2()

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for part_name in self.part_names:
            print(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                for name in model_part.keys():
                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
                    yield name, data

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams.get(
            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
        ))
        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
            self.gguf_writer.add_context_length(n_ctx)
        if (n_embd := self.hparams.get("hidden_size")) is not None:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

    def write(self):
        self.write_tensors()
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file()
        self.gguf_writer.close()

    def write_vocab(self):
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()

    @staticmethod
    def count_model_parts(dir_model: Path, prefix: str) -> int:
        num_parts = 0
        for filename in os.listdir(dir_model):
            if filename.endswith(prefix):
                num_parts += 1
        return num_parts

    @staticmethod
    def load_hparams(dir_model):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def from_model_architecture(model_architecture):
        if model_architecture == "GPTNeoXForCausalLM":
            return GPTNeoXModel
        if model_architecture == "BloomForCausalLM":
            return BloomModel
        if model_architecture == "MPTForCausalLM":
            return MPTModel
        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return BaichuanModel
        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
            return FalconModel
        if model_architecture == "GPTBigCodeForCausalLM":
            return StarCoderModel
        if model_architecture == "GPTRefactForCausalLM":
            return RefactModel
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "GPT2LMHeadModel":
            return GPT2Model
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
        if model_architecture == "PlamoForCausalLM":
            return PlamoModel
        if model_architecture == "CodeShellForCausalLM":
            return CodeShellModel
        return Model

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

    def _get_part_names(self):
        if self.is_safetensors:
            if self.num_parts == 1:  # there's only one .safetensors file
                return ("model.safetensors",)
            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))

        if self.num_parts == 1:  # there's only one .bin file
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
        arch = self.hparams["architectures"][0]
        if arch == "GPTNeoXForCausalLM":
            return gguf.MODEL_ARCH.GPTNEOX
        if arch == "BloomForCausalLM":
            return gguf.MODEL_ARCH.BLOOM
        if arch == "MPTForCausalLM":
            return gguf.MODEL_ARCH.MPT
        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
        if arch in ("FalconForCausalLM", "RWForCausalLM"):
            return gguf.MODEL_ARCH.FALCON
        if arch == "GPTBigCodeForCausalLM":
            return gguf.MODEL_ARCH.STARCODER
        if arch == "GPTRefactForCausalLM":
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "GPT2LMHeadModel":
            return gguf.MODEL_ARCH.GPT2
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
        if arch == "PlamoForCausalLM":
            return gguf.MODEL_ARCH.PLAMO
        if arch == "CodeShellForCausalLM":
            return gguf.MODEL_ARCH.CODESHELL

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode('utf-8')
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)


class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


class BloomModel(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams["n_layer"]
        tensors = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        has_lm_head = True
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

        for name, data_torch in tensors.items():
            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
                has_lm_head = False

            name = re.sub(r'transformer\.', '', name)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                # Map bloom-style qkv_linear to gpt-style qkv_linear
                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
                data = np.concatenate(
                    (
                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
                    (
                        qkv_bias[:, 0, :].reshape((n_embed,)),
                        qkv_bias[:, 1, :].reshape((n_embed,)),
                        qkv_bias[:, 2, :].reshape((n_embed,)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.bias")

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


class MPTModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
            self.gguf_writer.add_head_count_kv(kv_n_heads)
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
                if new_name is not None:
                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # note: MPT output is tied to (same as) wte in original model;
            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
            if new_name == "token_embd.weight":
                self.gguf_writer.add_tensor("output.weight", data)


class BaichuanModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

        for i in range(block_count):
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                print(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                    self._reverse_hf_part(w, 2)
                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def _reverse_hf_permute_part(
        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
    ) -> Tensor:
        r = weights.shape[0] // 3
        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)

    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
        r = weights.shape[0] // 3
        return weights[r * n_part:r * n_part + r, ...]


class FalconModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        self.gguf_writer.add_name("Falcon")
        self.gguf_writer.add_context_length(2048)  # not in config.json
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        head_dim = self.hparams["hidden_size"] // n_head
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
            if "query_key_value" in name:
                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class StarCoderModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("StarCoder")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)


class RefactModel(Model):
    def set_gguf_parameters(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("Refact")
        # Refact uses ALiBi, so this n_positions value from config.json may just be the training context length.
        self.gguf_writer.add_context_length(self.hparams["n_positions"])

        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(ff_dim)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        n_head = self.hparams["n_head"]
        n_head_kv = 1
        head_dim = self.hparams["n_embd"] // n_head
        block_count = self.hparams["n_layer"]

        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        tensors = dict(self.get_tensors())
        for i in range(block_count):
            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
                del tensors[f"transformer.h.{i}.attn.kv.weight"]
            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
                del tensors[f"transformer.h.{i}.attn.q.weight"]
            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

        for name, data_torch in tensors.items():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class PersimmonModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])

        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
        # than the head size?
        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)

        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class StableLMModel(Model):
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)


class MixtralModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()


class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
        byte_encoder = bytes_to_unicode()
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
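
    # Note on the helper below: given tiktoken-style mergeable ranks, it repeatedly merges the
    # lowest-ranked adjacent pair of byte sequences in `token`; when max_rank is set to the
    # token's own rank, the loop stops just before that merge, leaving exactly the two pieces
    # that combine into the token (this is how set_vocab reconstructs the BPE merge list).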
    @staticmethod
    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            assert min_idx is not None
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts

    def set_vocab(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size

        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[self.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(self.token_bytes_to_string, merged)))

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
        added_vocab = tokenizer.special_tokens

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])

    def write_tensors(self):
        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class GPT2Model(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
                continue

            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight")):
                data_torch = data_torch.transpose(1, 0)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # note: GPT2 output is tied to (same as) wte in original model
            if new_name == "token_embd.weight":
                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
                self.gguf_writer.add_tensor("output.weight", data)


class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])

        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])

        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))

        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(4 * n_embd)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)


class PlamoModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name("PLaMo")
        self.gguf_writer.add_context_length(4096)  # not in config.json
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"] is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(8, 5, 128, 5120)
        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def shuffle_attn_output_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(5120, 8, 5, 128)
        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if "self_attn.rotary_emb.inv_freq" in name:
                continue

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            # shuffle for broadcasting of gqa in ggml_mul_mat
            if new_name.endswith("attn_q.weight"):
                data_torch = self.shuffle_attn_q_weight(data_torch)
            elif new_name.endswith("attn_output.weight"):
                data_torch = self.shuffle_attn_output_weight(data_torch)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class CodeShellModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("CodeShell")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_rope_freq_base(10000.0)
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        tensors = dict(self.get_tensors())
        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
        for name, data_torch in tensors.items():
            # we don't need these
            if name.endswith(".attn.rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "transformer.wte.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


###### CONVERSION LOGIC ######

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--awq-path", type=Path, default=None,
        help="Path to scale awq cache file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
    )
    return parser.parse_args()
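
# Example invocation (illustrative; the flags mirror the parser above, and the paths are placeholders):
#
#   python convert-hf-to-gguf.py /path/to/hf-model --outtype f16 --outfile /path/to/ggml-model-f16.gguf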

def main() -> None:
    args = parse_args()

    dir_model = args.model

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")

    if not dir_model.is_dir():
        print(f'Error: {args.model} is not a directory', file=sys.stderr)
        sys.exit(1)

    ftype_map = {
        "f32": gguf.GGMLQuantizationType.F32,
        "f16": gguf.GGMLQuantizationType.F16,
    }

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

    print(f"Loading model: {dir_model.name}")

    hparams = Model.load_hparams(dir_model)

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

        print("Set model parameters")
        model_instance.set_gguf_parameters()

        print("Set model tokenizer")
        model_instance.set_vocab()

        if args.vocab_only:
            print(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
            print(f"Exporting model to '{fname_out}'")
            model_instance.write()

        print(f"Model successfully exported to '{fname_out}'")


if __name__ == '__main__':
    main()