convert-hf-to-gguf.py

#!/usr/bin/env python3

from __future__ import annotations

import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast

import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

from convert import HfVocab


# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
    for k in keys:
        if k in d:
            return d[k]
    print(f"Could not find any of {keys}")
    sys.exit()


###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6
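
# Convention followed by the dtype-conversion blocks in the write_tensors()
# implementations below:
#   ftype == 0 -> store all tensors as float32
#   ftype == 1 -> store 2-dim float32 ".weight" tensors as float16; 1-dim float16
#                 tensors are promoted back to float32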


class Model:
    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)

    def set_vocab(self):
        self._set_vocab_gpt2()

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for part_name in self.part_names:
            print(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                for name in model_part.keys():
                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
                    yield name, data

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams.get(
            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
        ))
        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
            self.gguf_writer.add_context_length(n_ctx)
        if (n_embd := self.hparams.get("hidden_size")) is not None:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

    def write(self):
        self.write_tensors()
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file()
        self.gguf_writer.close()

    def write_vocab(self):
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()

    @staticmethod
    def count_model_parts(dir_model: Path, prefix: str) -> int:
        num_parts = 0
        for filename in os.listdir(dir_model):
            if filename.endswith(prefix):
                num_parts += 1
        return num_parts

    @staticmethod
    def load_hparams(dir_model):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def from_model_architecture(model_architecture):
        if model_architecture == "GPTNeoXForCausalLM":
            return GPTNeoXModel
        if model_architecture == "BloomForCausalLM":
            return BloomModel
        if model_architecture == "MPTForCausalLM":
            return MPTModel
        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return BaichuanModel
        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
            return FalconModel
        if model_architecture == "GPTBigCodeForCausalLM":
            return StarCoderModel
        if model_architecture == "GPTRefactForCausalLM":
            return RefactModel
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
        if model_architecture == "Qwen2ForCausalLM":
            return Model
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "GPT2LMHeadModel":
            return GPT2Model
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
        if model_architecture == "PlamoForCausalLM":
            return PlamoModel
        if model_architecture == "CodeShellForCausalLM":
            return CodeShellModel
        if model_architecture == "OrionForCausalLM":
            return OrionModel
        if model_architecture == "InternLM2ForCausalLM":
            return InternLM2Model
        if model_architecture == "MiniCPMForCausalLM":
            return MiniCPMModel
        return Model

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

    def _get_part_names(self):
        if self.is_safetensors:
            if self.num_parts == 1:  # there's only one .safetensors file
                return ("model.safetensors",)
            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))

        if self.num_parts == 1:  # there's only one .bin file
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
        arch = self.hparams["architectures"][0]
        if arch == "GPTNeoXForCausalLM":
            return gguf.MODEL_ARCH.GPTNEOX
        if arch == "BloomForCausalLM":
            return gguf.MODEL_ARCH.BLOOM
        if arch == "MPTForCausalLM":
            return gguf.MODEL_ARCH.MPT
        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
        if arch in ("FalconForCausalLM", "RWForCausalLM"):
            return gguf.MODEL_ARCH.FALCON
        if arch == "GPTBigCodeForCausalLM":
            return gguf.MODEL_ARCH.STARCODER
        if arch == "GPTRefactForCausalLM":
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
        if arch == "Qwen2ForCausalLM":
            return gguf.MODEL_ARCH.QWEN2
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "GPT2LMHeadModel":
            return gguf.MODEL_ARCH.GPT2
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
        if arch == "PlamoForCausalLM":
            return gguf.MODEL_ARCH.PLAMO
        if arch == "CodeShellForCausalLM":
            return gguf.MODEL_ARCH.CODESHELL
        if arch == "OrionForCausalLM":
            return gguf.MODEL_ARCH.ORION
        if arch == "InternLM2ForCausalLM":
            return gguf.MODEL_ARCH.INTERNLM2
        if arch == "MiniCPMForCausalLM":
            return gguf.MODEL_ARCH.MINICPM

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode('utf-8')
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size

        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[QwenModel.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
        added_vocab = tokenizer.special_tokens
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        # only add special tokens when they were not already loaded from config.json
        if len(special_vocab.special_token_ids) == 0:
            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        # this one is usually not in config.json anyway
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_hf(self):
        path = self.dir_model
        added_tokens_path = self.dir_model
        vocab = HfVocab(
            path, added_tokens_path if added_tokens_path.exists() else None
        )
        tokens = []
        scores = []
        toktypes = []

        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        assert len(tokens) == vocab.vocab_size

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
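
# Rough driver sketch (the argparse-based main() is not part of this excerpt;
# paths and the ftype value are illustrative only):
#
#     hparams     = Model.load_hparams(dir_model)
#     model_class = Model.from_model_architecture(hparams["architectures"][0])
#     model       = model_class(dir_model, ftype, fname_out, is_big_endian=False)
#     model.set_gguf_parameters()
#     model.set_vocab()
#     model.write()  # or model.write_vocab() for a vocab-only file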


class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


class BloomModel(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams["n_layer"]
        tensors = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        has_lm_head = True
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

        for name, data_torch in tensors.items():
            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
                has_lm_head = False

            name = re.sub(r'transformer\.', '', name)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                # Map bloom-style qkv_linear to gpt-style qkv_linear
                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
                data = np.concatenate(
                    (
                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
                    (
                        qkv_bias[:, 0, :].reshape((n_embed,)),
                        qkv_bias[:, 1, :].reshape((n_embed,)),
                        qkv_bias[:, 2, :].reshape((n_embed,)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.bias")

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


class MPTModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
            self.gguf_writer.add_head_count_kv(kv_n_heads)
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
                if new_name is not None:
                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            # note: MPT output is tied to (same as) wte in original model;
            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
            if new_name == "token_embd.weight":
                self.gguf_writer.add_tensor("output.weight", data)


class OrionModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class BaichuanModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

        for i in range(block_count):
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                print(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                    self._reverse_hf_part(w, 2)
                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def _reverse_hf_permute_part(
        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
    ) -> Tensor:
        r = weights.shape[0] // 3
        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)

    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
        r = weights.shape[0] // 3
        return weights[r * n_part:r * n_part + r, ...]
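
# Illustration (not executed by the converter): for a toy q_proj weight of shape (8, 4)
# with n_head = 2, _reverse_hf_permute reshapes it to (2, 2, 2, 4), swaps the two middle
# axes and flattens back to (8, 4). Within each head the row order [0, 1, 2, 3] becomes
# [0, 2, 1, 3], i.e. the "first half / second half" rotary layout used by HF checkpoints
# is turned back into the interleaved layout the GGUF tensors expect.
# _reverse_hf_permute_part just selects the Q, K or V third of the packed W_pack matrix
# before applying that permutation; _reverse_hf_part selects a third without permuting.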


class FalconModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        self.gguf_writer.add_name("Falcon")
        self.gguf_writer.add_context_length(2048)  # not in config.json
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        head_dim = self.hparams["hidden_size"] // n_head
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
            if "query_key_value" in name:
                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class StarCoderModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("StarCoder")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)


class RefactModel(Model):
    def set_gguf_parameters(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("Refact")
        # refact uses Alibi, so this context length comes from config.json and may reflect the training setup
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(ff_dim)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        n_head = self.hparams["n_head"]
        n_head_kv = 1
        head_dim = self.hparams["n_embd"] // n_head
        block_count = self.hparams["n_layer"]

        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        tensors = dict(self.get_tensors())
        for i in range(block_count):
            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
                del tensors[f"transformer.h.{i}.attn.kv.weight"]
            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
                del tensors[f"transformer.h.{i}.attn.q.weight"]
            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

        for name, data_torch in tensors.items():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class PersimmonModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])

        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
        # than the head size?
        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)

        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


class StableLMModel(Model):
    def set_vocab(self):
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
            self._set_vocab_qwen()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)


class MixtralModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()


class MiniCPMModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        self.gguf_writer.add_name("MiniCPM")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_file_type(self.ftype)

    def set_vocab(self):
        self._set_vocab_hf()

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        n_head = self.hparams.get("num_attention_heads")
        n_kv_head = self.hparams.get("num_key_value_heads")
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # HF models permute some of the tensors, so we need to undo that
            if name.endswith("q_proj.weight"):
                data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
            if name.endswith("k_proj.weight"):
                data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
        byte_encoder = bytes_to_unicode()
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

    @staticmethod
    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            assert min_idx is not None
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts
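
    # Example (illustrative, not executed by the converter): with
    # mergeable_ranks = {b"ab": 0, b"abc": 1} and token = b"abc", bpe() starts from
    # [b"a", b"b", b"c"], merges the lowest-ranked adjacent pair first ([b"ab", b"c"]),
    # and stops once no remaining pair has a rank below max_rank, returning
    # [b"ab", b"c"]. _set_vocab_qwen() calls it with max_rank=rank to recover the two
    # sub-tokens whose merge produced each vocabulary entry.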
  956. def set_vocab(self):
  957. self._set_vocab_qwen()
  958. def set_gguf_parameters(self):
  959. self.gguf_writer.add_name("Qwen")
  960. self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
  961. self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
  962. self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
  963. self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
  964. self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
  965. self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
  966. self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
  967. self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])

    def write_tensors(self):
        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
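

# GPT2Model: GPT-2 checkpoints store the attention/MLP projections as Conv1D weights,
# so those tensors are transposed before export, and the output head is tied to the
# token embedding, so token_embd.weight is also written out as output.weight.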
class GPT2Model(Model):
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
                continue

            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight")):
                data_torch = data_torch.transpose(1, 0)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

            # note: GPT2 output is tied to (same as) wte in original model
            if new_name == "token_embd.weight":
                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
                self.gguf_writer.add_tensor("output.weight", data)
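

# Phi2Model: Phi-2 uses partial rotary embeddings, so only a fraction of each head's
# dimensions are rotated; the rope dimension count is derived from
# partial_rotary_factor times the per-head embedding size.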
class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])

        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])

        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))

        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(4 * n_embd)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)
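

# PlamoModel: PLaMo-13B. The context length and KV head count are hard-coded because
# config.json is missing or misstates them, and the attn_q / attn_output weights are
# shuffled so GQA broadcasting works with ggml_mul_mat (see the shuffle helpers below).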
class PlamoModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name("PLaMo")
        self.gguf_writer.add_context_length(4096)  # not in config.json
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"] is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(8, 5, 128, 5120)
        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def shuffle_attn_output_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(5120, 8, 5, 128)
        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if "self_attn.rotary_emb.inv_freq" in name:
                continue

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            # shuffle for broadcasting of gqa in ggml_mul_mat
            if new_name.endswith("attn_q.weight"):
                data_torch = self.shuffle_attn_q_weight(data_torch)
            elif new_name.endswith("attn_output.weight"):
                data_torch = self.shuffle_attn_output_weight(data_torch)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
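

# CodeShellModel: rope frequency/scaling parameters are written explicitly; if the
# checkpoint has no separate lm_head.weight / output.weight, the token embedding
# transformer.wte.weight is duplicated as output.weight at the end of write_tensors().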
class CodeShellModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("CodeShell")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_rope_freq_base(10000.0)
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        tensors = dict(self.get_tensors())
        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
        for name, data_torch in tensors.items():
            # we don't need these
            if name.endswith(".attn.rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Cannot map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "transformer.wte.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
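

# InternLM2Model: InternLM2 stores Q, K and V in a single fused wqkv tensor and its
# tokenizer contains a literal "\x00" piece; set_vocab() rewrites that piece and
# write_tensors() splits wqkv back into separate q/k/v weights before export.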
class InternLM2Model(Model):
    def set_vocab(self):
        # (TODO): Is there a better way?
        # Copied from _set_vocab_sentencepiece; the only difference is that we treat the
        # character \x00 specially and convert it into an emoji character, to prevent it
        # from being mistakenly recognized as an empty string in C++.
        from sentencepiece import SentencePieceProcessor
        from sentencepiece import sentencepiece_model_pb2 as model

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        sentencepiece_model = model.ModelProto()
        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)
            if text == b"\x00":
                # (TODO): fixme
                # Hack: replace the \x00 character.
                print(f"InternLM2 convert token '{text}' to '🐉'!")
                text = "🐉"

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_add_space_prefix(add_prefix)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        old_eos = special_vocab.special_token_ids["eos"]
        if "chat" in os.path.basename(self.dir_model.absolute()):
            # For the chat model, we replace the eos with '<|im_end|>'.
            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.")

        special_vocab.add_to_gguf(self.gguf_writer)
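
    # Chat/SFT checkpoints mark end-of-turn with either '[UNUSED_TOKEN_145]' or
    # '<|im_end|>'; exactly one of the two should encode to a single token id
    # (the assert enforces this), and that id is used as the eos token.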
    def _try_get_sft_eos(self, tokenizer):
        unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
        im_end_list = tokenizer.encode('<|im_end|>')
        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
        if len(unused_145_list) == 1:
            eos_token = unused_145_list[0]
        if len(im_end_list) == 1:
            eos_token = im_end_list[0]
        return eos_token
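
    # Apply the LLaMA-style Q/K permutation (reshape + swapaxes over the rotary halves)
    # so the rotary-embedding dimension ordering matches what llama.cpp expects; for K,
    # the KV head count is used instead of the full head count.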
    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
        if n_head_kv is not None and n_head != n_head_kv:
            n_head = n_head_kv
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("InternLM2")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])

    def post_write_tensors(self, tensor_map, name, data_torch):
        old_dtype = data_torch.dtype

        # convert any unsupported data types to float32
        if data_torch.dtype not in (torch.float16, torch.float32):
            data_torch = data_torch.to(torch.float32)

        data = data_torch.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print(f"Cannot map tensor {name!r}")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if self.ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        self.gguf_writer.add_tensor(new_name, data)

    def write_tensors(self):
        from einops import rearrange

        num_heads = self.hparams.get("num_attention_heads")
        num_kv_heads = self.hparams.get("num_key_value_heads")
        hidden_size = self.hparams.get("hidden_size")
        q_per_kv = num_heads // num_kv_heads
        head_dim = hidden_size // num_heads
        num_groups = num_heads // q_per_kv

        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
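
        # Each wqkv tensor packs, per KV group, q_per_kv query heads followed by one key
        # head and one value head, each of size head_dim; the rearrange below exposes that
        # (group, head, head_dim) structure so q, k and v can be sliced apart and written
        # as separate tensors.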
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            if re.match(qkv_pattern, name):
                bid = re.findall(qkv_pattern, name)[0]
                qkv = data_torch
                qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
                q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
                # The model weights of q and k require additional reshape.
                q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
                k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
                v = rearrange(v, " o g n i -> o (g n i)").T
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
            else:
                self.post_write_tensors(tensor_map, name, data_torch)


###### CONVERSION LOGIC ######


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--awq-path", type=Path, default=None,
        help="Path to scale awq cache file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
    )

    return parser.parse_args()


def main() -> None:
    args = parse_args()

    dir_model = args.model

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")

    if not dir_model.is_dir():
        print(f'Error: {args.model} is not a directory', file=sys.stderr)
        sys.exit(1)

    ftype_map = {
        "f32": gguf.GGMLQuantizationType.F32,
        "f16": gguf.GGMLQuantizationType.F16,
    }

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

    print(f"Loading model: {dir_model.name}")

    hparams = Model.load_hparams(dir_model)

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

        print("Set model parameters")
        model_instance.set_gguf_parameters()

        print("Set model tokenizer")
        model_instance.set_vocab()

        if args.vocab_only:
            print(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
            print(f"Exporting model to '{fname_out}'")
            model_instance.write()

        print(f"Model successfully exported to '{fname_out}'")


if __name__ == '__main__':
    main()
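
# Example invocation (the model path is illustrative):
#   python convert-hf-to-gguf.py --outtype f16 /path/to/hf-model
# writes /path/to/hf-model/ggml-model-f16.gguf unless --outfile points elsewhere;
# pass --vocab-only to export just the tokenizer.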