convert-hf-to-gguf.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast

import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
    for k in keys:
        if k in d:
            return d[k]
    print(f"Could not find any of {keys}")
    sys.exit()
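
# Illustrative usage sketch added for this write-up (not part of the original script;
# the hparams dict below is made up): get_key_opts() lets callers probe several
# config.json spellings of the same hyperparameter and returns the first match.
#
#     hparams = {"n_embd": 2048, "n_head": 16}
#     n_embd = get_key_opts(hparams, ["hidden_size", "n_embd"])  # -> 2048
#
# If none of the keys are present, it prints a message and terminates the process.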

###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


class Model:
    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)

    def set_vocab(self):
        self._set_vocab_gpt2()

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for part_name in self.part_names:
            print(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                for name in model_part.keys():
                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
                    yield name, data

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams.get(
            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
        ))
        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
            self.gguf_writer.add_context_length(n_ctx)
        if (n_embd := self.hparams.get("hidden_size")) is not None:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)

        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def write(self):
        self.write_tensors()
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file()
        self.gguf_writer.close()

    def write_vocab(self):
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()

    @staticmethod
    def count_model_parts(dir_model: Path, prefix: str) -> int:
        num_parts = 0
        for filename in os.listdir(dir_model):
            if filename.endswith(prefix):
                num_parts += 1
        return num_parts

    @staticmethod
    def load_hparams(dir_model):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def from_model_architecture(model_architecture):
        if model_architecture == "GPTNeoXForCausalLM":
            return GPTNeoXModel
        if model_architecture == "BloomForCausalLM":
            return BloomModel
        if model_architecture == "MPTForCausalLM":
            return MPTModel
        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return BaichuanModel
        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
            return FalconModel
        if model_architecture == "GPTBigCodeForCausalLM":
            return StarCoderModel
        if model_architecture == "GPTRefactForCausalLM":
            return RefactModel
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
        if model_architecture == "Qwen2ForCausalLM":
            return Model
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "GPT2LMHeadModel":
            return GPT2Model
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
        if model_architecture == "PlamoForCausalLM":
            return PlamoModel
        if model_architecture == "CodeShellForCausalLM":
            return CodeShellModel
        if model_architecture == "OrionForCausalLM":
            return OrionModel
        if model_architecture == "InternLM2ForCausalLM":
            return InternLM2Model
        return Model

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

    def _get_part_names(self):
        if self.is_safetensors:
            if self.num_parts == 1:  # there's only one .safetensors file
                return ("model.safetensors",)
            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))

        if self.num_parts == 1:  # there's only one .bin file
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
        arch = self.hparams["architectures"][0]
        if arch == "GPTNeoXForCausalLM":
            return gguf.MODEL_ARCH.GPTNEOX
        if arch == "BloomForCausalLM":
            return gguf.MODEL_ARCH.BLOOM
        if arch == "MPTForCausalLM":
            return gguf.MODEL_ARCH.MPT
        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
        if arch in ("FalconForCausalLM", "RWForCausalLM"):
            return gguf.MODEL_ARCH.FALCON
        if arch == "GPTBigCodeForCausalLM":
            return gguf.MODEL_ARCH.STARCODER
        if arch == "GPTRefactForCausalLM":
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
        if arch == "Qwen2ForCausalLM":
            return gguf.MODEL_ARCH.QWEN2
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "GPT2LMHeadModel":
            return gguf.MODEL_ARCH.GPT2
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
        if arch == "PlamoForCausalLM":
            return gguf.MODEL_ARCH.PLAMO
        if arch == "CodeShellForCausalLM":
            return gguf.MODEL_ARCH.CODESHELL
        if arch == "OrionForCausalLM":
            return gguf.MODEL_ARCH.ORION
        if arch == "InternLM2ForCausalLM":
            return gguf.MODEL_ARCH.INTERNLM2

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode('utf-8')
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size

        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[QwenModel.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
        added_vocab = tokenizer.special_tokens
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        # only add special tokens when they were not already loaded from config.json
        if len(special_vocab.special_token_ids) == 0:
            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        # this one is usually not in config.json anyway
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
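
# Rough usage sketch added for illustration only; the script's actual entry point
# (argument parsing and a main() that drives these calls) is defined further down in
# the file and may differ in detail. The intended flow for the Model base class is
# roughly:
#
#     hparams = Model.load_hparams(dir_model)
#     model_class = Model.from_model_architecture(hparams["architectures"][0])
#     model = model_class(dir_model, ftype=1, fname_out=Path("out.gguf"), is_big_endian=False)
#     model.set_gguf_parameters()   # GGUF KV metadata from config.json
#     model.set_vocab()             # tokenizer -> GGUF vocab
#     model.write()                 # converts and writes all tensors, then the file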

class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])

class BloomModel(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams["n_layer"]
        tensors = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        has_lm_head = True
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

        for name, data_torch in tensors.items():
            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
                has_lm_head = False

            name = re.sub(r'transformer\.', '', name)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                # Map bloom-style qkv_linear to gpt-style qkv_linear
                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
                data = np.concatenate(
                    (
                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
                    (
                        qkv_bias[:, 0, :].reshape((n_embed,)),
                        qkv_bias[:, 1, :].reshape((n_embed,)),
                        qkv_bias[:, 2, :].reshape((n_embed,)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.bias")

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")

class MPTModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
            self.gguf_writer.add_head_count_kv(kv_n_heads)
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
                if new_name is not None:
                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

            # note: MPT output is tied to (same as) wte in original model;
            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
            if new_name == "token_embd.weight":
                self.gguf_writer.add_tensor("output.weight", data)

class OrionModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

class BaichuanModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

        for i in range(block_count):
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                print(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                    self._reverse_hf_part(w, 2)
                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def _reverse_hf_permute_part(
        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
    ) -> Tensor:
        r = weights.shape[0] // 3
        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)

    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
        r = weights.shape[0] // 3
        return weights[r * n_part:r * n_part + r, ...]
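
# Note added for this write-up (inferred from the helpers above, not from upstream
# docs): Baichuan checkpoints stack Q, K and V row-wise in a single W_pack weight of
# shape (3 * hidden_size, hidden_size). _reverse_hf_part() simply takes the n_part-th
# third of the rows, and _reverse_hf_permute_part() additionally undoes the interleaved
# rotary-embedding layout used by the HF modelling code so the rows land in the order
# llama.cpp expects:
#
#     q = w[0 * hidden_size : 1 * hidden_size]   # n_part = 0, then un-permuted
#     k = w[1 * hidden_size : 2 * hidden_size]   # n_part = 1, then un-permuted
#     v = w[2 * hidden_size : 3 * hidden_size]   # n_part = 2, copied as-is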

class FalconModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        self.gguf_writer.add_name("Falcon")
        self.gguf_writer.add_context_length(2048)  # not in config.json
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        head_dim = self.hparams["hidden_size"] // n_head
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
            if "query_key_value" in name:
                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
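
# Worked example added for this write-up (n_head = 8, n_head_kv = 2 chosen arbitrarily,
# not taken from a real Falcon config): the HF query_key_value rows come grouped per
# kv group as
#     [q0 q1 q2 q3 k0 v0 | q4 q5 q6 q7 k1 v1]
# and the transform above rewrites them contiguously as
#     [q0 q1 q2 q3 q4 q5 q6 q7 | k0 k1 | v0 v1]
# which is the layout the rest of the conversion (and GGML) works with.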

class StarCoderModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("StarCoder")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

class RefactModel(Model):
    def set_gguf_parameters(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("Refact")
        # refact uses Alibi. So this is from config.json which might be used by training.
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(ff_dim)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        n_head = self.hparams["n_head"]
        n_head_kv = 1
        head_dim = self.hparams["n_embd"] // n_head
        block_count = self.hparams["n_layer"]

        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        tensors = dict(self.get_tensors())
        for i in range(block_count):
            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
                del tensors[f"transformer.h.{i}.attn.kv.weight"]
            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
                del tensors[f"transformer.h.{i}.attn.q.weight"]
            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

        for name, data_torch in tensors.items():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

class PersimmonModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])

        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
        # than the head size?
        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)

        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

class StableLMModel(Model):
    def set_vocab(self):
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
            self._set_vocab_qwen()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)


class MixtralModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
        byte_encoder = bytes_to_unicode()
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

    @staticmethod
    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            assert min_idx is not None
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts
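
    # Illustrative example added for this write-up (the ranks below are made up, not
    # taken from a real Qwen tokenizer): bpe() replays the byte-level merges of a token
    # using the tokenizer's merge ranks; with max_rank set to the token's own rank it
    # stops one step early, recovering the two pieces that _set_vocab_qwen() records as
    # a merge rule.
    #
    #     ranks = {b"a": 0, b"b": 1, b"ab": 2, b"abb": 3}
    #     QwenModel.bpe(ranks, b"abb", max_rank=3)  # -> [b"ab", b"b"]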

    def set_vocab(self):
        self._set_vocab_qwen()

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])

    def write_tensors(self):
        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

class GPT2Model(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
                continue

            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight")):
                data_torch = data_torch.transpose(1, 0)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

            # note: GPT2 output is tied to (same as) wte in original model
            if new_name == "token_embd.weight":
                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
                self.gguf_writer.add_tensor("output.weight", data)

class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])

        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(4 * n_embd)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)
  977. class PlamoModel(Model):
  978. def set_vocab(self):
  979. self._set_vocab_sentencepiece()
  980. def set_gguf_parameters(self):
  981. hparams = self.hparams
  982. block_count = hparams["num_hidden_layers"]
  983. self.gguf_writer.add_name("PLaMo")
  984. self.gguf_writer.add_context_length(4096) # not in config.json
  985. self.gguf_writer.add_embedding_length(hparams["hidden_size"])
  986. self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
  987. self.gguf_writer.add_block_count(block_count)
  988. self.gguf_writer.add_head_count(hparams["num_attention_heads"])
  989. self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
  990. self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(8, 5, 128, 5120)
        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def shuffle_attn_output_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(5120, 8, 5, 128)
        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch
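
    # Note on the two shuffle helpers above (an interpretation, assuming the PLaMo-13B shapes
    # they assert: 5120 = 40 heads x 128 dims, exported here with 5 KV heads, i.e. 8 query
    # heads per KV group): the reshape/permute pair in shuffle_attn_q_weight reorders the
    # rows of the query projection so that the 8 query heads belonging to the same KV group
    # end up contiguous, which is the layout ggml_mul_mat expects when broadcasting each KV
    # head across its query heads. shuffle_attn_output_weight applies the matching
    # permutation to the columns (input side) of the output projection, so the overall
    # attention output is unchanged.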

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if "self_attn.rotary_emb.inv_freq" in name:
                continue

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            # shuffle for broadcasting of gqa in ggml_mul_mat
            if new_name.endswith("attn_q.weight"):
                data_torch = self.shuffle_attn_q_weight(data_torch)
            elif new_name.endswith("attn_output.weight"):
                data_torch = self.shuffle_attn_output_weight(data_torch)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class CodeShellModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("CodeShell")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_rope_freq_base(10000.0)
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)
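
    # Note: num_query_groups is CodeShell's name for the number of KV heads in its
    # grouped-query attention, so it is written directly as GGUF's head_count_kv. The RoPE
    # base and scaling values are hard-coded here rather than read from config.json; a
    # linear scaling factor of 1.0 is effectively "no scaling" and only records the type.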

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        tensors = dict(self.get_tensors())
        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()

        for name, data_torch in tensors.items():
            # we don't need these
            if name.endswith(".attn.rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # if the checkpoint has no separate lm_head, tie the output to the token embedding
            if not has_lm_head and name == "transformer.wte.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


class InternLM2Model(Model):
    def set_vocab(self):
        # (TODO): Is there a better way?
        # Copied from _set_vocab_sentencepiece; the only difference is that the character
        # \x00 is treated specially and converted into an emoji character to prevent it
        # from being mistakenly recognized as an empty string in C++.
        from sentencepiece import SentencePieceProcessor
        from sentencepiece import sentencepiece_model_pb2 as model

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        sentencepiece_model = model.ModelProto()
        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)
            if text == b"\x00":
                # (TODO): fixme
                # Hack: replace the \x00 character (encoded so the token list stays bytes).
                print(f"InternLM2 convert token '{text}' to '🐉'!")
                text = "🐉".encode("utf-8")

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

            for key in added_tokens_json:
                tokens.append(key.encode("utf-8"))
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_add_space_prefix(add_prefix)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        old_eos = special_vocab.special_token_ids["eos"]
        if "chat" in os.path.basename(self.dir_model.absolute()):
            # For the chat model, we replace the eos with '<|im_end|>'.
            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.")

        special_vocab.add_to_gguf(self.gguf_writer)

    def _try_get_sft_eos(self, tokenizer):
        unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
        im_end_list = tokenizer.encode('<|im_end|>')
        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
        if len(unused_145_list) == 1:
            eos_token = unused_145_list[0]
        if len(im_end_list) == 1:
            eos_token = im_end_list[0]
        return eos_token
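
    # The assert in _try_get_sft_eos checks that exactly one of '[UNUSED_TOKEN_145]' and
    # '<|im_end|>' encodes to a single token id, i.e. that the tokenizer defines exactly one
    # of the two end-of-turn markers; that token id is then used as eos for chat models.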

    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
        if n_head_kv is not None and n_head != n_head_kv:
            n_head = n_head_kv
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))
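
    # _hf_permute_qk appears to be the same per-head row permutation as the permute() helper
    # in convert.py: it reorders the rows of each Q/K head so that the rotary-embedding
    # dimension pairing used by HF's LLaMA-style checkpoints matches the layout llama.cpp
    # expects at inference time.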

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("InternLM2")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])

    def post_write_tensors(self, tensor_map, name, data_torch):
        old_dtype = data_torch.dtype

        # convert any unsupported data types to float32
        if data_torch.dtype not in (torch.float16, torch.float32):
            data_torch = data_torch.to(torch.float32)

        data = data_torch.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print(f"Can not map tensor {name!r}")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if self.ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        self.gguf_writer.add_tensor(new_name, data)

    def write_tensors(self):
        from einops import rearrange

        num_heads = self.hparams.get("num_attention_heads")
        num_kv_heads = self.hparams.get("num_key_value_heads")
        hidden_size = self.hparams.get("hidden_size")
        q_per_kv = num_heads // num_kv_heads
        head_dim = hidden_size // num_heads
        num_groups = num_heads // q_per_kv

        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            if re.match(qkv_pattern, name):
                bid = re.findall(qkv_pattern, name)[0]
                qkv = data_torch
                qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
                q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
                # The model weights of q and k require an additional reshape.
                q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
                k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
                v = rearrange(v, " o g n i -> o (g n i)").T
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
            else:
                self.post_write_tensors(tensor_map, name, data_torch)
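
    # Layout note for the fused wqkv split above (a sketch, not verified against every
    # InternLM2 checkpoint): the wqkv output dimension is laid out as num_groups blocks of
    # (q_per_kv + 2) heads -- q_per_kv query heads followed by one K and one V head per KV
    # group, which is why rearrange() is called with n=q_per_kv + 2 and the slices split at
    # q_per_kv and q_per_kv + 1. With the commonly published InternLM2-7B config (32 heads,
    # 8 KV heads, head_dim 128) that is 8 blocks of (4 + 2) * 128 = 768 rows, 6144 in total.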


###### CONVERSION LOGIC ######


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--awq-path", type=Path, default=None,
        help="Path to scale awq cache file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
    )

    return parser.parse_args()
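

# Example invocations (directory and file names below are placeholders):
#   python convert-hf-to-gguf.py /path/to/hf-model-dir --outtype f16 --outfile model-f16.gguf
# or, to export only the tokenizer/vocab:
#   python convert-hf-to-gguf.py /path/to/hf-model-dir --vocab-only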


def main() -> None:
    args = parse_args()

    dir_model = args.model

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")

    if not dir_model.is_dir():
        print(f'Error: {args.model} is not a directory', file=sys.stderr)
        sys.exit(1)

    ftype_map = {
        "f32": gguf.GGMLQuantizationType.F32,
        "f16": gguf.GGMLQuantizationType.F16,
    }

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

    print(f"Loading model: {dir_model.name}")

    hparams = Model.load_hparams(dir_model)

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

        print("Set model parameters")
        model_instance.set_gguf_parameters()

        print("Set model tokenizer")
        model_instance.set_vocab()

        if args.vocab_only:
            print(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
            print(f"Exporting model to '{fname_out}'")
            model_instance.write()

        print(f"Model successfully exported to '{fname_out}'")


if __name__ == '__main__':
    main()