#!/usr/bin/env python3
from __future__ import annotations
import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
import numpy as np
import torch
if TYPE_CHECKING:
    from torch import Tensor
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6
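
# NOTE: the values above match gguf.TokenType, so the SentencePiece token types
# collected in _set_vocab_sentencepiece() can be passed straight to
# GGUFWriter.add_token_types() without remapping.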


class Model:
    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)

    def set_vocab(self):
        self._set_vocab_gpt2()

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for part_name in self.part_names:
            print(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
            with ctx as model_part:
                for name in model_part.keys():
                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
                    yield name, data

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams.get(
            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
        ))
        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
            self.gguf_writer.add_context_length(n_ctx)
        if (n_embd := self.hparams.get("hidden_size")) is not None:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def write(self):
        self.write_tensors()
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file()
        self.gguf_writer.close()

    def write_vocab(self):
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()

    @staticmethod
    def count_model_parts(dir_model: Path, prefix: str) -> int:
        num_parts = 0
        for filename in os.listdir(dir_model):
            if filename.endswith(prefix):
                num_parts += 1
        return num_parts

    @staticmethod
    def load_hparams(dir_model):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def from_model_architecture(model_architecture):
        if model_architecture == "GPTNeoXForCausalLM":
            return GPTNeoXModel
        if model_architecture == "BloomForCausalLM":
            return BloomModel
        if model_architecture == "MPTForCausalLM":
            return MPTModel
        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return BaichuanModel
        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
            return FalconModel
        if model_architecture == "GPTBigCodeForCausalLM":
            return StarCoderModel
        if model_architecture == "GPTRefactForCausalLM":
            return RefactModel
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "GPT2LMHeadModel":
            return GPT2Model
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
        if model_architecture == "PlamoForCausalLM":
            return PlamoModel
        return Model
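
    # NOTE: the architecture-name checks above must be kept in sync with
    # _get_model_architecture() below, which maps the same strings to gguf.MODEL_ARCH values.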

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

    def _get_part_names(self):
        if self.is_safetensors:
            if self.num_parts == 1:  # there's only one .safetensors file
                return ("model.safetensors",)
            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
        if self.num_parts == 1:  # there's only one .bin file
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
        arch = self.hparams["architectures"][0]
        if arch == "GPTNeoXForCausalLM":
            return gguf.MODEL_ARCH.GPTNEOX
        if arch == "BloomForCausalLM":
            return gguf.MODEL_ARCH.BLOOM
        if arch == "MPTForCausalLM":
            return gguf.MODEL_ARCH.MPT
        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
        if arch in ("FalconForCausalLM", "RWForCausalLM"):
            return gguf.MODEL_ARCH.FALCON
        if arch == "GPTBigCodeForCausalLM":
            return gguf.MODEL_ARCH.STARCODER
        if arch == "GPTRefactForCausalLM":
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "GPT2LMHeadModel":
            return gguf.MODEL_ARCH.GPT2
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
        if arch == "PlamoForCausalLM":
            return gguf.MODEL_ARCH.PLAMO
        raise NotImplementedError(f'Architecture "{arch}" not supported!')

    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()
        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode('utf-8')
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor
        tokenizer_path = self.dir_model / 'tokenizer.model'
        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []
        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)
        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)
            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)
        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)
                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)


class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


class BloomModel(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams["n_layer"]
        tensors = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        has_lm_head = True
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        for name, data_torch in tensors.items():
            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
                has_lm_head = False
            name = re.sub(r'transformer\.', '', name)
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                # Map bloom-style qkv_linear to gpt-style qkv_linear
                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
                data = np.concatenate(
                    (
                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
                    (
                        qkv_bias[:, 0, :].reshape((n_embed,)),
                        qkv_bias[:, 1, :].reshape((n_embed,)),
                        qkv_bias[:, 2, :].reshape((n_embed,)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.bias")
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


class MPTModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
            self.gguf_writer.add_head_count_kv(kv_n_heads)
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
                if new_name is not None:
                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
            # note: MPT output is tied to (same as) wte in original model;
            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
            if new_name == "token_embd.weight":
                self.gguf_writer.add_tensor("output.weight", data)


class BaichuanModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")
        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        for i in range(block_count):
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                print(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                    self._reverse_hf_part(w, 2)
                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
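
    # The helpers below undo the permutation applied to the packed W_pack weights in the
    # HF checkpoint: _reverse_hf_permute_part() slices out the q/k/v block (n_part 0/1/2)
    # and then _reverse_hf_permute() restores the original contiguous rotary layout by
    # un-interleaving the two halves of each head; _reverse_hf_part() only slices
    # (used for v, which is not permuted).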

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head
        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def _reverse_hf_permute_part(
        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
    ) -> Tensor:
        r = weights.shape[0] // 3
        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)

    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
        r = weights.shape[0] // 3
        return weights[r * n_part:r * n_part + r, ...]


class FalconModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name
        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name
        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
        self.gguf_writer.add_name("Falcon")
        self.gguf_writer.add_context_length(2048)  # not in config.json
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name
        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name
        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
        head_dim = self.hparams["hidden_size"] // n_head
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
            if "query_key_value" in name:
                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
            data = data_torch.squeeze().numpy()
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class StarCoderModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]
        self.gguf_writer.add_name("StarCoder")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)


class RefactModel(Model):
    def set_gguf_parameters(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        block_count = self.hparams["n_layer"]
        self.gguf_writer.add_name("Refact")
        # refact uses Alibi. So this is from config.json which might be used by training.
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(ff_dim)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        n_head = self.hparams["n_head"]
        n_head_kv = 1
        head_dim = self.hparams["n_embd"] // n_head
        block_count = self.hparams["n_layer"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        tensors = dict(self.get_tensors())
        for i in range(block_count):
            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
                del tensors[f"transformer.h.{i}.attn.kv.weight"]
            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
                del tensors[f"transformer.h.{i}.attn.q.weight"]
            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
        for name, data_torch in tensors.items():
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class PersimmonModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]
        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
        # than the head size?
        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class StableLMModel(Model):
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)


class MixtralModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()


class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
        byte_encoder = bytes_to_unicode()
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
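
    # bpe() re-derives the merge that produced `token` from the tiktoken-style
    # mergeable_ranks table: it repeatedly joins the lowest-ranked adjacent pair,
    # stopping just below max_rank, so the two surviving parts are exactly the pair
    # that set_vocab() records as a GPT-2 style "merges" entry.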

    @staticmethod
    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            assert min_idx is not None
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts

    def set_vocab(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size
        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[self.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
        added_vocab = tokenizer.special_tokens
        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])

    def write_tensors(self):
        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class GPT2Model(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
                continue
            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight")):
                data_torch = data_torch.transpose(1, 0)
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
            # note: GPT2 output is tied to (same as) wte in original model
            if new_name == "token_embd.weight":
                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
                self.gguf_writer.add_tensor("output.weight", data)


class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]
        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)


class PlamoModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
        self.gguf_writer.add_name("PLaMo")
        self.gguf_writer.add_context_length(4096)  # not in config.json
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"] is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
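
    # The two helpers below re-order the 5120x5120 attention matrices: they split the
    # weights into 128-row head blocks and swap the (8, 5) block grouping to (5, 8)
    # (and the transposed equivalent for the output projection). write_tensors() applies
    # them to attn_q.weight / attn_output.weight so GQA broadcasting works in ggml_mul_mat.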

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(8, 5, 128, 5120)
        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def shuffle_attn_output_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(5120, 8, 5, 128)
        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            if "self_attn.rotary_emb.inv_freq" in name:
                continue
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            # shuffle for broadcasting of gqa in ggml_mul_mat
            if new_name.endswith("attn_q.weight"):
                data_torch = self.shuffle_attn_q_weight(data_torch)
            elif new_name.endswith("attn_output.weight"):
                data_torch = self.shuffle_attn_output_weight(data_torch)
            old_dtype = data_torch.dtype
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            data = data_torch.squeeze().numpy()
            n_dims = len(data.shape)
            data_dtype = data.dtype
            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)
            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)
            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


###### CONVERSION LOGIC ######

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--awq-path", type=Path, default=None,
        help="Path to scale awq cache file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
    )
    return parser.parse_args()
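
# Example invocation (hypothetical paths); by default this writes
# ggml-model-<outtype>.gguf next to the HF model:
#   python convert-hf-to-gguf.py --outtype f16 models/my-hf-model
# Pass --outfile to choose the output path explicitly, or --vocab-only to export
# only the vocabulary metadata.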

def main() -> None:
    args = parse_args()
    dir_model = args.model
    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")
    if not dir_model.is_dir():
        print(f'Error: {args.model} is not a directory', file=sys.stderr)
        sys.exit(1)
    ftype_map = {
        "f32": gguf.GGMLQuantizationType.F32,
        "f16": gguf.GGMLQuantizationType.F16,
    }
    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
    print(f"Loading model: {dir_model.name}")
    hparams = Model.load_hparams(dir_model)
    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
        print("Set model parameters")
        model_instance.set_gguf_parameters()
        print("Set model tokenizer")
        model_instance.set_vocab()
        if args.vocab_only:
            print(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
            print(f"Exporting model to '{fname_out}'")
            model_instance.write()
        print(f"Model successfully exported to '{fname_out}'")


if __name__ == '__main__':
    main()