convert-hf-to-gguf.py

#!/usr/bin/env python3

from __future__ import annotations

import argparse
import contextlib
import json
import os
import re
import sys
from abc import ABC, abstractmethod
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast

import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

from convert import HfVocab


###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


AnyModel = TypeVar("AnyModel", bound="type[Model]")


class Model(ABC):
    _model_classes: dict[str, type[Model]] = {}

    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])

    @property
    @abstractmethod
    def model_arch(self) -> gguf.MODEL_ARCH:
        pass

    def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
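        """Return the value of the first key in `keys` that is present in self.hparams.

        If none of the keys is present, return None when `optional` is True,
        otherwise raise KeyError.
        """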
        key = next((k for k in keys if k in self.hparams), None)
        if key is not None:
            return self.hparams[key]
        if optional:
            return None
        raise KeyError(f"could not find any of: {keys}")

    def set_vocab(self):
        self._set_vocab_gpt2()

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
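        """Yield (name, tensor) pairs from every model part on disk.

        Safetensors parts are opened lazily via safetensors.safe_open;
        .bin parts are loaded with torch.load (mmap'ed, weights only).
        """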
        for part_name in self.part_names:
            print(f"gguf: loading model part '{part_name}'")
            ctx: ContextManager[Any]
            if self.is_safetensors:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                for name in model_part.keys():
                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
                    yield name, data

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.block_count)

        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)

        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        self.gguf_writer.add_embedding_length(n_embd)

        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)

        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_head_count(n_head)

        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)

        if (rope_theta := self.hparams.get("rope_theta")) is not None:
            self.gguf_writer.add_rope_freq_base(rope_theta)
        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)

        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
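        # self.ftype selects the output precision: 0 keeps everything in f32,
        # 1 stores 2-dim weight tensors as f16 (see the dtype handling below).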
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

    def write(self):
        self.write_tensors()
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file()
        self.gguf_writer.close()

    def write_vocab(self):
        self.gguf_writer.write_header_to_file()
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()

    @staticmethod
    def count_model_parts(dir_model: Path, prefix: str) -> int:
        num_parts = 0
        for filename in os.listdir(dir_model):
            if filename.endswith(prefix):
                num_parts += 1
        return num_parts

    @staticmethod
    def load_hparams(dir_model):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
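        """Class decorator that registers a Model subclass under one or more
        HF architecture names, e.g. @Model.register("GPTNeoXForCausalLM")."""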
        assert names

        def func(modelcls: type[Model]):
            for name in names:
                cls._model_classes[name] = modelcls
            return modelcls
        return func

    @classmethod
    def from_model_architecture(cls, arch):
        try:
            return cls._model_classes[arch]
        except KeyError:
            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

    def _get_part_names(self):
        if self.is_safetensors:
            if self.num_parts == 1:  # there's only one .safetensors file
                return ("model.safetensors",)
            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))

        if self.num_parts == 1:  # there's only one .bin file
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode('utf-8')
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size

        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[QwenModel.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
        added_vocab = tokenizer.special_tokens
        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}

        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        # only add special tokens when they were not already loaded from config.json
        if len(special_vocab.special_token_ids) == 0:
            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        # this one is usually not in config.json anyway
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
                    tokens.append(key.encode("utf-8"))
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_hf(self):
        path = self.dir_model
        added_tokens_path = self.dir_model
        vocab = HfVocab(
            path, added_tokens_path if added_tokens_path.exists() else None
        )
        tokens = []
        scores = []
        toktypes = []

        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        assert len(tokens) == vocab.vocab_size

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)


@Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model):
    model_arch = gguf.MODEL_ARCH.GPTNEOX

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


@Model.register("BloomForCausalLM")
class BloomModel(Model):
    model_arch = gguf.MODEL_ARCH.BLOOM

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams["n_layer"]
        tensors = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        has_lm_head = True
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))

        for name, data_torch in tensors.items():
            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
                has_lm_head = False

            name = re.sub(r'transformer\.', '', name)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
                # Map bloom-style qkv_linear to gpt-style qkv_linear
                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
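                # Illustrative example (not from the source): with n_head = 2, Bloom stores
                # the fused rows per head as [q0 k0 v0 q1 k1 v1]; the reshape/concatenate
                # below reorders them to [q0 q1 k0 k1 v0 v1], the GPT-2 style fused QKV layout.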
                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
                data = np.concatenate(
                    (
                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.weight")
            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
                data = np.concatenate(
                    (
                        qkv_bias[:, 0, :].reshape((n_embed,)),
                        qkv_bias[:, 1, :].reshape((n_embed,)),
                        qkv_bias[:, 2, :].reshape((n_embed,)),
                    ),
                    axis=0,
                )
                print("re-format attention.linear_qkv.bias")

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "word_embeddings.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


@Model.register("MPTForCausalLM")
class MPTModel(Model):
    model_arch = gguf.MODEL_ARCH.MPT

    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
            self.gguf_writer.add_head_count_kv(kv_n_heads)
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
                if new_name is not None:
                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


@Model.register("OrionForCausalLM")
class OrionModel(Model):
    model_arch = gguf.MODEL_ARCH.ORION

    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        # note: config provides rms norm but it is actually layer norm
        # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
class BaichuanModel(Model):
    model_arch = gguf.MODEL_ARCH.BAICHUAN

    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
        hf_repo = self.hparams.get("_name_or_path", "")

        ctx_length = 0
        if "max_sequence_length" in self.hparams:
            ctx_length = self.hparams["max_sequence_length"]
        elif "max_position_embeddings" in self.hparams:
            ctx_length = self.hparams["max_position_embeddings"]
        elif "model_max_length" in self.hparams:
            ctx_length = self.hparams["model_max_length"]
        else:
            print("gguf: can not find ctx length parameter.")
            sys.exit()

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_source_hf_repo(hf_repo)
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def write_tensors(self):
        # Collect tensors from generator object
        model_kv = dict(self.get_tensors())
        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

        for i in range(block_count):
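            # Baichuan fuses the attention projections into a single W_pack tensor of
            # shape (3 * hidden_size, hidden_size); split it back into q/k/v and undo
            # the HF rotary permutation on q and k so the tensor map below can handle it.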
            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
                print(f"Unpacking and permuting layer {i}")
                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
                    self._reverse_hf_part(w, 2)
                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
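        """Undo the rotary-embedding permutation applied to the Q/K projection
        weights in HF-format LLaMA-style checkpoints (inverse of the permute used
        when such weights are converted to the HF layout)."""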
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def _reverse_hf_permute_part(
        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
    ) -> Tensor:
        r = weights.shape[0] // 3
        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)

    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
        r = weights.shape[0] // 3
        return weights[r * n_part:r * n_part + r, ...]


@Model.register("FalconForCausalLM", "RWForCausalLM")
class FalconModel(Model):
    model_arch = gguf.MODEL_ARCH.FALCON

    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        self.gguf_writer.add_name("Falcon")
        self.gguf_writer.add_context_length(2048)  # not in config.json
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
            block_count = self.hparams["n_layer"]  # old name

        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        head_dim = self.hparams["hidden_size"] // n_head
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
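            # Illustrative example (not from the source): with n_head = 8, n_head_kv = 2
            # the fused rows per kv group are [q q q q k v]; after the transform the rows
            # are ordered [q x 8, k x 2, v x 2].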
  659. if "query_key_value" in name:
  660. qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
  661. q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
  662. k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
  663. v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
  664. data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
  665. data = data_torch.squeeze().numpy()
  666. # map tensor names
  667. new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
  668. if new_name is None:
  669. print(f"Can not map tensor {name!r}")
  670. sys.exit()
  671. n_dims = len(data.shape)
  672. data_dtype = data.dtype
  673. # if f32 desired, convert any float16 to float32
  674. if self.ftype == 0 and data_dtype == np.float16:
  675. data = data.astype(np.float32)
  676. # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
  677. if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
  678. data = data.astype(np.float32)
  679. # if f16 desired, convert any float32 2-dim weight tensors to float16
  680. if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
  681. data = data.astype(np.float16)
  682. print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
  683. self.gguf_writer.add_tensor(new_name, data)
  684. @Model.register("GPTBigCodeForCausalLM")
  685. class StarCoderModel(Model):
  686. model_arch = gguf.MODEL_ARCH.STARCODER
  687. def set_gguf_parameters(self):
  688. block_count = self.hparams["n_layer"]
  689. self.gguf_writer.add_name("StarCoder")
  690. self.gguf_writer.add_context_length(self.hparams["n_positions"])
  691. self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
  692. self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
  693. self.gguf_writer.add_block_count(block_count)
  694. self.gguf_writer.add_head_count(self.hparams["n_head"])
  695. self.gguf_writer.add_head_count_kv(1)
  696. self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
  697. self.gguf_writer.add_file_type(self.ftype)
  698. @Model.register("GPTRefactForCausalLM")
  699. class RefactModel(Model):
  700. model_arch = gguf.MODEL_ARCH.REFACT
  701. def set_gguf_parameters(self):
  702. hidden_dim = self.hparams["n_embd"]
  703. inner_dim = 4 * hidden_dim
  704. hidden_dim = int(2 * inner_dim / 3)
  705. multiple_of = 256
  706. ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
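        # LLaMA-style SwiGLU sizing: 2/3 of 4 * n_embd, rounded up to a multiple of 256
        # (e.g. n_embd = 768 would give ff_dim = 2048; value shown for illustration only).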
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("Refact")
        # refact uses Alibi. So this is from config.json which might be used by training.
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(ff_dim)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
        hidden_dim = int(2 * inner_dim / 3)
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        n_head = self.hparams["n_head"]
        n_head_kv = 1
        head_dim = self.hparams["n_embd"] // n_head
        block_count = self.hparams["n_layer"]

        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        tensors = dict(self.get_tensors())
        for i in range(block_count):
            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
                del tensors[f"transformer.h.{i}.attn.kv.weight"]
            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
                del tensors[f"transformer.h.{i}.attn.q.weight"]
            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

        for name, data_torch in tensors.items():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


@Model.register("PersimmonForCausalLM")
class PersimmonModel(Model):
    model_arch = gguf.MODEL_ARCH.PERSIMMON

    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = head_count
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])

        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
        # than the head size?
        # ref: https://github.com/ggerganov/llama.cpp/pull/4889
        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)

        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
        # self.gguf_writer.add_bos_token_id(71013)
        # self.gguf_writer.add_eos_token_id(71013)

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if name.endswith(".self_attention.rotary_emb.inv_freq"):
                continue
            old_dtype = data_torch.dtype
            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
            data = data_torch.to(torch.float32).squeeze().numpy()
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            n_dims = len(data.shape)
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model):
    model_arch = gguf.MODEL_ARCH.STABLELM

    def set_vocab(self):
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
            self._set_vocab_qwen()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))


@Model.register("MixtralForCausalLM")
class MixtralModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

    def set_vocab(self):
        self._set_vocab_sentencepiece()


@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model):
    model_arch = gguf.MODEL_ARCH.MINICPM

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        self.gguf_writer.add_name("MiniCPM")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_file_type(self.ftype)

    def set_vocab(self):
        self._set_vocab_hf()

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
            n_head //= n_kv_head

        return (
            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape)
        )

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        n_head = self.hparams.get("num_attention_heads")
        n_kv_head = self.hparams.get("num_key_value_heads")
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # HF models permute some of the tensors, so we need to undo that
            if name.endswith(("q_proj.weight",)):
                data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
            if name.endswith(("k_proj.weight",)):
                data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


@Model.register("QWenLMHeadModel")
class QwenModel(Model):
    model_arch = gguf.MODEL_ARCH.QWEN

    @staticmethod
    def token_bytes_to_string(b):
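        """Map raw token bytes to the printable-unicode form used by GPT-2 style
        BPE vocabularies (e.g. a space byte becomes 'Ġ'), via bytes_to_unicode()."""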
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
        byte_encoder = bytes_to_unicode()
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

    @staticmethod
    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
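        """Greedily BPE-merge `token`, stopping before any merge whose rank is >= `max_rank`.

        Illustrative example (hypothetical ranks, not from the source):
        bpe({b"a": 0, b"b": 1, b"ab": 2}, b"ab", max_rank=2) returns [b'a', b'b'],
        i.e. the two pieces whose merge produces the token itself, which is how
        _set_vocab_qwen reconstructs the merge list.
        """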
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            assert min_idx is not None
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts

    def set_vocab(self):
        self._set_vocab_qwen()

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])

    def write_tensors(self):
        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)


@Model.register("Qwen2ForCausalLM")
class Qwen2Model(Model):
    model_arch = gguf.MODEL_ARCH.QWEN2


@Model.register("GPT2LMHeadModel")
class GPT2Model(Model):
    model_arch = gguf.MODEL_ARCH.GPT2

    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
                continue
  986. if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
  987. data_torch = data_torch.transpose(1, 0)
  988. old_dtype = data_torch.dtype
  989. # convert any unsupported data types to float32
  990. if data_torch.dtype not in (torch.float16, torch.float32):
  991. data_torch = data_torch.to(torch.float32)
  992. data = data_torch.squeeze().numpy()
  993. # map tensor names
  994. new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
  995. if new_name is None:
  996. print(f"Can not map tensor {name!r}")
  997. sys.exit()
  998. n_dims = len(data.shape)
  999. data_dtype = data.dtype
  1000. # if f32 desired, convert any float16 to float32
  1001. if self.ftype == 0 and data_dtype == np.float16:
  1002. data = data.astype(np.float32)
  1003. # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
  1004. if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
  1005. data = data.astype(np.float32)
  1006. # if f16 desired, convert any float32 2-dim weight tensors to float16
  1007. if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
  1008. data = data.astype(np.float16)
  1009. print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
  1010. self.gguf_writer.add_tensor(new_name, data)
  1011. # note: GPT2 output is tied to (same as) wte in original model
  1012. if new_name == "token_embd.weight":
  1013. print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
  1014. self.gguf_writer.add_tensor("output.weight", data)
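
# Background note (an assumption based on the HF GPT-2 implementation, not stated in this
# script): HF GPT-2 uses Conv1D modules whose weights are stored as (n_in, n_out) rather
# than nn.Linear's (n_out, n_in), which is why the c_attn/c_proj/c_fc weights are
# transposed above; e.g. a c_attn weight of shape (n_embd, 3 * n_embd) becomes
# (3 * n_embd, n_embd). The final block duplicates token_embd.weight as output.weight
# because the original model ties its LM head to the token embedding matrix.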


@Model.register("PhiForCausalLM")
class Phi2Model(Model):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])

        rot_pct = self.find_hparam(["partial_rotary_factor"])
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])

        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))

        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(4 * n_embd)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)
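
# Worked example for the rope_dimension_count above (the numbers are Phi-2's published
# config values, not read by this script): with hidden_size = 2560,
# num_attention_heads = 32 and partial_rotary_factor = 0.4, int(0.4 * 2560) // 32 = 32,
# i.e. RoPE is applied to only 32 of the 80 dimensions of each attention head.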


@Model.register("PlamoForCausalLM")
class PlamoModel(Model):
    model_arch = gguf.MODEL_ARCH.PLAMO

    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name("PLaMo")
        self.gguf_writer.add_context_length(4096)  # not in config.json
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"] is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(8, 5, 128, 5120)
        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch

    def shuffle_attn_output_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
        data_torch = data_torch.reshape(5120, 8, 5, 128)
        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
        data_torch = torch.reshape(data_torch, (5120, 5120))
        return data_torch
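
    # Rough sketch of what the two shuffles above do (an interpretation, not documented in
    # the original script): the 5120x5120 Q projection is viewed as blocks of (8, 5, 128)
    # rows, which appears to be (queries per KV group, KV groups, head dim) given
    # add_head_count_kv(5) above, and the first two axes are swapped so that rows belonging
    # to the same KV group become contiguous for ggml_mul_mat's GQA broadcasting.
    # shuffle_attn_output_weight applies the matching permutation to the columns of the
    # output projection, so the overall product is unchanged.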

    def write_tensors(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            if "self_attn.rotary_emb.inv_freq" in name:
                continue

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            # shuffle for broadcasting of gqa in ggml_mul_mat
            if new_name.endswith("attn_q.weight"):
                data_torch = self.shuffle_attn_q_weight(data_torch)
            elif new_name.endswith("attn_output.weight"):
                data_torch = self.shuffle_attn_output_weight(data_torch)

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)


@Model.register("CodeShellForCausalLM")
class CodeShellModel(Model):
    model_arch = gguf.MODEL_ARCH.CODESHELL

    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("CodeShell")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_rope_freq_base(10000.0)
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        tensors = dict(self.get_tensors())
        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
        for name, data_torch in tensors.items():
            # we don't need these
            if name.endswith(".attn.rotary_emb.inv_freq"):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if self.ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)

            if not has_lm_head and name == "transformer.wte.weight":
                self.gguf_writer.add_tensor("output.weight", data)
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


@Model.register("InternLM2ForCausalLM")
class InternLM2Model(Model):
    model_arch = gguf.MODEL_ARCH.INTERNLM2

    def set_vocab(self):
        # (TODO): Is there a better way?
        # Copied from _set_vocab_sentencepiece; the only difference is that we treat the character
        # \x00 specially and convert it into an emoji character to prevent it from being mistakenly
        # recognized as an empty string in C++.
        from sentencepiece import SentencePieceProcessor
        from sentencepiece import sentencepiece_model_pb2 as model

        tokenizer_path = self.dir_model / 'tokenizer.model'

        tokens: list[bytes] = []
        scores: list[float] = []
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
            sys.exit(1)

        sentencepiece_model = model.ModelProto()
        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)
            if text == b"\x00":
                # (TODO): fixme
                # Hack here and replace the \x00 characters.
                print(f"InternLM2 convert token '{text}' to '🐉'!")
                text = "🐉"

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.is_unknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.is_control(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.is_unused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.is_byte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)

            for key in added_tokens_json:
                tokens.append(key.encode("utf-8"))
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_add_space_prefix(add_prefix)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        old_eos = special_vocab.special_token_ids["eos"]
        if "chat" in os.path.basename(self.dir_model.absolute()):
            # For the chat model, we replace the eos with '<|im_end|>'.
            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
in chat mode so that the conversation can end normally.")

        special_vocab.add_to_gguf(self.gguf_writer)

    def _try_get_sft_eos(self, tokenizer):
        unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
        im_end_list = tokenizer.encode('<|im_end|>')
        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
        if len(unused_145_list) == 1:
            eos_token = unused_145_list[0]
        if len(im_end_list) == 1:
            eos_token = im_end_list[0]
        return eos_token

    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
        if n_head_kv is not None and n_head != n_head_kv:
            n_head = n_head_kv
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))
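
    # Note on _hf_permute_qk (an interpretation, not documented in the original script):
    # HF LLaMA-style checkpoints store each head's rotary dimensions as two half blocks
    # (for the "rotate half" RoPE formulation), while the GGML graph expects the
    # interleaved pair layout. Viewing each head's rows as (2, head_dim // 2) and swapping
    # the axes restores the interleaved order; e.g. for head_dim = 4, rows stored as
    # (x0, x1, y0, y1) come out as (x0, y0, x1, y1).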

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("InternLM2")
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])

    def post_write_tensors(self, tensor_map, name, data_torch):
        old_dtype = data_torch.dtype

        # convert any unsupported data types to float32
        if data_torch.dtype not in (torch.float16, torch.float32):
            data_torch = data_torch.to(torch.float32)

        data = data_torch.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print(f"Can not map tensor {name!r}")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if self.ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
        self.gguf_writer.add_tensor(new_name, data)

    def write_tensors(self):
        from einops import rearrange

        num_heads = self.hparams.get("num_attention_heads")
        num_kv_heads = self.hparams.get("num_key_value_heads")
        hidden_size = self.hparams.get("hidden_size")
        q_per_kv = num_heads // num_kv_heads
        head_dim = hidden_size // num_heads
        num_groups = num_heads // q_per_kv

        block_count = self.hparams["num_hidden_layers"]
        model_kv = dict(self.get_tensors())
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"

        for name, data_torch in model_kv.items():
            # we don't need these
            if name.endswith(".rotary_emb.inv_freq"):
                continue

            if re.match(qkv_pattern, name):
                bid = re.findall(qkv_pattern, name)[0]
                qkv = data_torch
                qkv = rearrange(qkv.T, " o (g n i) -> o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
                q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
                # the model weights of q and k require an additional reshape
                q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
                k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
                v = rearrange(v, " o g n i -> o (g n i)").T
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
            else:
                self.post_write_tensors(tensor_map, name, data_torch)
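
# Shape sketch for the wqkv split above (hypothetical values, not from the original
# script): with num_heads = 32, num_kv_heads = 8 and head_dim = 128 (hidden_size = 4096),
# q_per_kv = 4 and the fused wqkv weight has 8 * (4 + 2) * 128 = 6144 output rows.
# The rearrange() exposes one bundle of (4 q heads + 1 k head + 1 v head) per KV group;
# slicing along the "n" axis then yields wq with 4096 rows and wk/wv with 1024 rows each,
# which are written as separate tensors via post_write_tensors().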


@Model.register("BertModel")
class BertModel(Model):
    model_arch = gguf.MODEL_ARCH.BERT

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.vocab_size = None

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_causal_attention(False)

        # get pooling path
        pooling_path = None
        module_path = self.dir_model / "modules.json"
        if module_path.is_file():
            with open(module_path, encoding="utf-8") as f:
                modules = json.load(f)
            for mod in modules:
                if mod["type"] == "sentence_transformers.models.Pooling":
                    pooling_path = mod["path"]
                    break

        # get pooling type
        if pooling_path is not None:
            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
                pooling = json.load(f)
            if pooling["pooling_mode_mean_tokens"]:
                pooling_type = gguf.PoolingType.MEAN
            elif pooling["pooling_mode_cls_token"]:
                pooling_type = gguf.PoolingType.CLS
            else:
                raise NotImplementedError("Only MEAN and CLS pooling types supported")
            self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
        path = self.dir_model
        added_tokens_path = self.dir_model if self.dir_model.exists() else None

        # use huggingface vocab to get all tokens
        vocab = HfVocab(path, added_tokens_path)
        tokens, scores, toktypes = zip(*vocab.all_tokens())
        assert len(tokens) == vocab.vocab_size
        self.vocab_size = vocab.vocab_size

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
        n_token_types = len(set(toktypes))
        self.gguf_writer.add_token_type_count(n_token_types)

        # convert to phantom space vocab
        def phantom(tok, typ):
            if tok.startswith(b"[") and tok.endswith(b"]"):
                return tok
            if tok.startswith(b"##"):
                return tok[2:]
            return b"\xe2\x96\x81" + tok
        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))

        # set up bos and eos tokens (cls and sep)
        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)

        # add vocab to gguf
        self.gguf_writer.add_tokenizer_model("bert")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        # handle special tokens
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
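
    # Example of the phantom-space conversion above (illustrative tokens, not from the
    # original script): WordPiece surface forms are rewritten into the SentencePiece-style
    # convention used by llama.cpp, where U+2581 (b"\xe2\x96\x81") marks a word-initial piece:
    #   "[CLS]"  stays   "[CLS]"         (bracketed special tokens are kept as-is)
    #   "##ing"  becomes "ing"           (the continuation marker is stripped)
    #   "hello"  becomes "\u2581hello"   (the word-start prefix is added)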

    def write_tensors(self):
        tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        tensors = dict(self.get_tensors())
        for name, data_torch in tensors.items():
            # we are only using BERT for embeddings so we don't need the pooling layer
            if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
                continue  # we don't need these

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            data = data_torch.squeeze().numpy()
            n_dims = len(data.shape)
            new_dtype: type[np.floating[Any]]

            if (
                self.ftype == 1 and name.endswith(".weight") and n_dims == 2
                and name != "embeddings.token_type_embeddings.weight"  # not used with get_rows, must be F32
            ):
                # if f16 desired, convert any float32 2-dim weight tensors to float16
                new_dtype = np.float16
            else:
                # if f32 desired, convert any float16 to float32
                new_dtype = np.float32

            print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")

            if data.dtype != new_dtype:
                data = data.astype(new_dtype)

            self.gguf_writer.add_tensor(new_name, data)


@Model.register("NomicBertModel")
class NomicBertModel(BertModel):
    model_arch = gguf.MODEL_ARCH.NOMIC_BERT

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # the HF config claims n_ctx=8192, but it uses RoPE scaling
        self.hparams["n_ctx"] = 2048

        # SwiGLU activation
        assert self.hparams["activation_function"] == "swiglu"
        # this doesn't do anything in the HF version
        assert self.hparams["causal"] is False
        # no bias tensors
        assert self.hparams["qkv_proj_bias"] is False
        assert self.hparams["mlp_fc1_bias"] is False
        assert self.hparams["mlp_fc2_bias"] is False
        # norm at end of layer
        assert self.hparams["prenorm"] is False
        # standard RoPE
        assert self.hparams["rotary_emb_fraction"] == 1.0
        assert self.hparams["rotary_emb_interleaved"] is False
        assert self.hparams["rotary_emb_scale_base"] is None

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

    def get_tensors(self):
        assert self.vocab_size is not None
        for name, data in super().get_tensors():
            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
                data = data[:self.vocab_size, :]
            yield name, data
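
# Illustrative numbers for the padding logic above (hypothetical, not from the original
# script): if the tokenizer reports vocab_size = 30522, the padded embedding matrix has
# (30522 + 63) // 64 * 64 = 30528 rows, and get_tensors() slices it back to 30522 rows so
# the GGUF tensor size matches the token list exactly.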


@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
    model_arch = gguf.MODEL_ARCH.GEMMA

    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_key_length(hparams["head_dim"])
        self.gguf_writer.add_value_length(hparams["head_dim"])
        self.gguf_writer.add_file_type(self.ftype)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
            if name.endswith("norm.weight"):
                data_torch = data_torch + 1
            data = data_torch.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()

            n_dims = len(data.shape)
            data_dtype = data.dtype

            data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

            self.gguf_writer.add_tensor(new_name, data)
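
# Note on the "+ 1" applied to norm weights above (see the modeling_gemma.py reference in
# write_tensors): Gemma stores RMSNorm weights as a zero-centered delta and computes
# output * (1 + weight) at runtime, so adding 1 here bakes the offset into the exported
# tensor and llama.cpp can apply a plain RMSNorm scale.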


@Model.register("Starcoder2ForCausalLM")
class StarCoder2Model(Model):
    model_arch = gguf.MODEL_ARCH.STARCODER2


###### CONVERSION LOGIC ######


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--awq-path", type=Path, default=None,
        help="Path to scale awq cache file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16"], default="f16",
        help="output format - use f32 for float32, f16 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
    )

    return parser.parse_args()
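
# Usage sketch (paths are placeholders; the flags are the ones defined in parse_args() above):
#
#   python convert-hf-to-gguf.py ./models/my-model --outtype f16 --outfile ./models/my-model-f16.gguf
#   python convert-hf-to-gguf.py ./models/my-model --vocab-only
#
# When --outfile is omitted, main() below writes ggml-model-<outtype>.gguf into the model
# directory.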


def main() -> None:
    args = parse_args()

    dir_model = args.model

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")

    if not dir_model.is_dir():
        print(f'Error: {args.model} is not a directory', file=sys.stderr)
        sys.exit(1)

    ftype_map = {
        "f32": gguf.GGMLQuantizationType.F32,
        "f16": gguf.GGMLQuantizationType.F16,
    }

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

    print(f"Loading model: {dir_model.name}")

    hparams = Model.load_hparams(dir_model)

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

        print("Set model parameters")
        model_instance.set_gguf_parameters()

        print("Set model tokenizer")
        model_instance.set_vocab()

        if args.vocab_only:
            print(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
            print(f"Exporting model to '{fname_out}'")
            model_instance.write()

        print(f"Model successfully exported to '{fname_out}'")


if __name__ == '__main__':
    main()